In [None]:
import os
import numpy as np
from tqdm import tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Connect to the Elasticsearch node listening on localhost:9200.
client = Elasticsearch(
    hosts="http://localhost:9200",
    # very generous per-request timeout (the client interprets the value as
    # seconds) -- presumably so slow bulk-indexing requests are never cut off
    request_timeout=100_000,
)

# Round-trip to the server: displaying the cluster info confirms the connection.
client.info()

In [11]:
# Create the "lecard" index: smartcn as the default analyzer plus a
# 768-dimensional dense vector field that is indexed for kNN search.
client.indices.create(
    index="lecard",
    mappings={
        "properties": {
            # name of the field that stores the document embedding
            "vector": {
                "type": "dense_vector",
                "dims": 768,
                # index the vectors (HNSW) so they can be searched with kNN
                "index": True,
                # l2_norm is used instead of the inner product, which only
                # accepts unit-length vectors
                "similarity": "l2_norm",
            }
        }
    },
    settings={
        "analysis": {
            "analyzer": {
                # the analyzer has to be registered under the name "default"
                # so that it is the one applied to the index's text fields
                "default": {"type": "smartcn"}
            }
        }
    },
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'lecard'})

In [None]:
# Full-text query for "one" across title + abstract of the demo English index;
# matching terms come back wrapped in <em class='text-primary'>...</em>.
client.search(
    index="test-english",
    query={
        "combined_fields": {
            "query": "one",
            "fields": ["title", "abstract"],
        }
    },
    highlight={
        "fields": {
            # both highlighted fields share the same tag configuration
            field_name: {
                "pre_tags": ["<em class='text-primary'>"],
                "post_tags": ["</em>"],
            }
            for field_name in ("title", "abstract")
        }
    },
)

In [14]:
# Approximate kNN lookup on the "vector" field of the lecard index using a
# random 768-dim query vector; only title/abstract are returned from _source
# and the cell evaluates to the list of hits.
# NOTE(review): the output recorded for this cell is a 400
# x_content_parse_exception on [knn] -- confirm the request against the
# cluster version in use. On Elasticsearch 8.4+ the dedicated kNN search
# endpoint is deprecated in favour of the `knn` option of client.search().
client.knn_search(
    index="lecard",
    knn={
        "field": "vector",
        "query_vector": np.random.rand(768).astype(np.float32).tolist(),
        "k": 10,
        # candidates gathered per shard before the best k are kept; the API
        # reference requires k to be smaller than num_candidates, and a larger
        # pool improves the accuracy of the final top k
        "num_candidates": 100,
    },
    source=["title", "abstract"],
)["hits"]["hits"]

BadRequestError: BadRequestError(400, 'x_content_parse_exception', '[1:14818] [knn-search] failed to parse field [knn]')

In [None]:
# add demo english documents
def gendata():
    """Yield ten demo bulk actions for the ``test-english`` index.

    Each action is a dict holding the target ``_index`` plus ``title``,
    ``abstract`` and ``body`` strings that embed the English word for the
    document number (``Zero`` through ``Nine``).
    """
    number_words = (
        "Zero", "One", "Two", "Three", "Four",
        "Five", "Six", "Seven", "Eight", "Nine",
        "Ten", "Eleven", "Twelve", "Thirteen", "Fourteen",
        "Fifteen", "Sixteen", "Seventeen", "Eighteen", "Nineteen",
    )

    # twenty words are available, but only the first ten documents are produced
    for word in number_words[:10]:
        yield {
            "_index": "test-english",
            "title": f"this is the {word} title",
            "abstract": f"this is the {word} abstract",
            "body": f"this is the {word} body"
        }

# Index the demo English documents: the bulk helper consumes the generator
# returned by gendata() and sends its actions to Elasticsearch via `client`.
bulk(client, gendata())

In [None]:
# add demo chinese documents
def gendata():
    """Yield six demo bulk actions for the ``test-chinese`` index.

    Each action is a dict holding the target ``_index`` plus ``title``,
    ``abstract`` and ``body`` strings that embed the Chinese numeral of the
    document (零 through 五).
    """
    # one document per numeral, in counting order
    for numeral in ("零", "一", "二", "三", "四", "五"):
        yield {
            "_index": "test-chinese",
            "title": f"这是第{numeral}篇文章的标题",
            "abstract": f"这是第{numeral}篇文章的摘要",
            "body": f"这是第{numeral}篇文章的正文"
        }
# Index the demo Chinese documents: the bulk helper consumes the generator
# returned by gendata() and sends its actions to Elasticsearch via `client`.
bulk(client, gendata())

In [12]:
# add lecard documents
def gendata(
    model="DPR",
    dim=768,
    collection_path=r"D:\Data\lecard\collection.tsv",
    embedding_root="data/encode",
):
    """Yield one bulk action per line of the lecard collection, with its embedding.

    Line ``i`` of the tab-separated collection file is paired with row ``i``
    of the embedding matrix stored at
    ``<embedding_root>/<model>/text_embeddings.mmp``.

    Args:
        model: Sub-directory of ``embedding_root`` holding the embedding file.
        dim: Number of float32 values per embedding row.
        collection_path: Path of the UTF-8, tab-separated collection file.
            Columns 1, 2 and 3 are used as title, abstract and body; column 0
            is not read (presumably a document id -- confirm against the data).
        embedding_root: Directory that contains the per-model sub-directories.

    Yields:
        dict: An action for ``elasticsearch.helpers.bulk`` targeting the
        ``lecard`` index, with ``title``, ``abstract``, ``body`` and ``vector``
        (the embedding row as a list of Python floats).

    Raises:
        IndexError: If a line has fewer than four tab-separated columns, or
            the collection has more lines than there are embedding rows.
    """
    # Read-only memory map viewed as (num_documents, dim), so the whole matrix
    # never has to be loaded into RAM.
    embeddings = np.memmap(
        os.path.join(embedding_root, model, "text_embeddings.mmp"),
        dtype=np.float32,
        mode="r"
    ).reshape(-1, dim)

    # The default path is a raw string: in a normal literal "\D", "\l" and "\c"
    # are invalid escape sequences, which Python warns about and plans to reject.
    with open(collection_path, encoding="utf-8") as f:
        # the progress bar total is the number of embedding rows
        for i, line in enumerate(tqdm(f, desc="indexing...", total=embeddings.shape[0])):
            fields = line.strip().split("\t")
            yield {
                "_index": "lecard",
                "title": fields[1],
                "abstract": fields[2],
                "body": fields[3],
                "vector": embeddings[i].tolist()
            }
# Index the lecard documents together with their embeddings; the run recorded
# below this cell returned (10718, []).
bulk(client, gendata())

indexing...: 100%|██████████| 10718/10718 [02:50<00:00, 62.77it/s] 


(10718, [])

In [None]:
# add demo vectors
def gendata():
    """Yield six demo bulk actions with random vectors for ``test-vector``.

    Each action holds the target ``_index``, Chinese ``title``/``abstract``/
    ``body`` strings embedding the document's numeral (零 through 五), and a
    ``vector`` of 768 random values drawn with ``np.random.rand``, cast to
    float16 and converted to a list of Python floats.
    """
    # one document per numeral, in counting order
    for numeral in ("零", "一", "二", "三", "四", "五"):
        yield {
            "_index": "test-vector",
            "title": f"这是第{numeral}篇文章的标题",
            "abstract": f"这是第{numeral}篇文章的摘要",
            "body": f"这是第{numeral}篇文章的正文",
            # a fresh random vector is drawn for every document
            "vector": np.random.rand(768).astype(np.float16).tolist()
        }
# Index the demo vector documents: the bulk helper consumes the generator
# returned by gendata() and sends its actions to Elasticsearch via `client`.
bulk(client, gendata())

In [None]:
# Sanity-check Chinese tokenization: run the analyze API against the
# "test-chinese" index for a sample phrase and display the resulting tokens.
client.indices.analyze(index="test-chinese", text="中国人民大学")

In [None]:
# Delete the whole "lecard" index. Destructive: the index must be created and
# populated again before the lecard search cells can be used.
client.indices.delete(index="lecard")

In [None]:
# Remove every document from "test-vector" (match_all) without deleting the index.
client.delete_by_query(index="test-vector", query={"match_all": {}})