In [1]:
import json
import textwrap
import numpy as np
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from utils.modeling import EmbeddingModel

# Client for the Elasticsearch instance listening on 127.0.0.1:9200.
client = Elasticsearch("http://127.0.0.1:9200")

# install chinese analyzer
# ES_HOME is only referenced by the (commented-out) plugin installation command below.
ES_HOME = "/data/peitian/Apps/elasticsearch-8.17.0"
# !bash {ES_HOME}/bin/elasticsearch-plugin install analysis-smartcn

# Queries the server once. This is not the last expression of the cell, so the
# returned value is not displayed.
client.info()

# Load the keyword arguments for the embedding model. The encoding is pinned to
# UTF-8 (matching how the law files are opened later in this notebook) so the
# JSON is decoded the same way regardless of the platform's default encoding.
with open("config/bge-m3.json", encoding="utf-8") as f:
    config = json.load(f)
model = EmbeddingModel(**config)

def pretty_response(response):
    """Print the content and score of every hit in a search response.

    Args:
        response: Mapping shaped like a search result. The hits are read from
            ``response["hits"]["hits"]``; each hit must provide ``_score`` and
            ``_source["content"]``.

    Output:
        A fixed notice when there are no hits. Otherwise, for each hit: a blank
        line, the content wrapped to 120 columns, and the score.
    """
    hits = response["hits"]["hits"]
    if not hits:
        print("Your search returned no results.")
        return

    for entry in hits:
        relevance = entry["_score"]
        body = entry["_source"]["content"]
        print()
        print("Content: " + textwrap.fill(body, 120))
        print(f"Score: {relevance}")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Create the index that stores the law articles ("law").

client.indices.create(
    index="law",
    settings={
        # Kept for reference: make smartcn the default analyzer
        # (needs the analysis-smartcn plugin to be installed).
        # 'analysis': {
        #     'analyzer': {
        #         # we must set the default analyzer
        #         "default": {
        #             "type": "smartcn"
        #         }
        #     }
        # },
        # "index.mapping.ignore_malformed": True
    },
    mappings={
        "properties": {
            # Exact-match metadata fields.
            "country": {"type": "keyword"},
            "code": {"type": "keyword"},
            # Full-text searchable article body.
            "content": {"type": "text"},
            # Embedding of the article body.
            "embedding": {
                "type": "dense_vector",
                "dims": 1024,
                # Index the vector so it can be searched (original note: enable hnsw).
                "index": True,
                # dot_product similarity only accepts unit-length vectors.
                "similarity": "dot_product",
            },
        }
    },
)

In [2]:
# Delete the "law" index.
# NOTE(review): this cell sits after the index-creation cell, so running the
# notebook top to bottom drops the index (and its explicit mapping) right after
# creating it, before the bulk-indexing cell runs — confirm the ordering is intended.

client.indices.delete(index="law")

ObjectApiResponse({'acknowledged': True})

In [None]:
import os
import json
# Directory that generate_laws() lists to find the law files (one JSON object per line).
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
base_dir = "/data/peitian/Data/legal/yidaiyilu/output"

def generate_laws(data_dir=None, country="俄罗斯", limit_per_file=1, encoder=None):
    """Yield one bulk-index action per law record found in ``data_dir``.

    Every regular file in the directory is read as JSON lines; the ``"text"``
    field of each record becomes the document content and is embedded.

    Args:
        data_dir: Directory to scan. Defaults to the notebook-level ``base_dir``.
        country: Value stored in the ``country`` field of every document.
        limit_per_file: Maximum number of records emitted per file. The default
            of 1 preserves the previous behaviour, where the loop stopped after
            the first record of each file.
            NOTE(review): that looks like a debugging leftover — pass ``None``
            to index every record.
        encoder: Object with an ``encode(text)`` method; the first element of
            its result must expose ``.tolist()``. Defaults to the notebook-level
            ``model``.

    Yields:
        dict: Action with the keys ``_index``, ``country``, ``code``,
        ``content`` and ``embedding``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` field.
    """
    # Resolve the defaults at call time so the notebook globals can be redefined
    # between calls.
    if data_dir is None:
        data_dir = base_dir
    if encoder is None:
        encoder = model

    # Sorted so that the emission order is deterministic across runs.
    for path in sorted(os.listdir(data_dir)):
        file_path = os.path.join(data_dir, path)
        # os.listdir also returns sub-directories; those cannot be opened as files.
        if not os.path.isfile(file_path):
            continue

        # The law code is the file name up to its first dot.
        code_name = path.split(".")[0]
        emitted = 0
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                if limit_per_file is not None and emitted >= limit_per_file:
                    break
                # Blank lines (e.g. a trailing newline) carry no record.
                if not line.strip():
                    continue
                content = json.loads(line)["text"]

                yield {
                    "_index": "law",
                    "country": country,
                    "code": code_name,
                    "content": content,
                    "embedding": encoder.encode(content)[0].tolist()
                }
                emitted += 1

# Send every action yielded by generate_laws() to Elasticsearch through the bulk helper.
helpers.bulk(client, generate_laws())

In [None]:
# Full-text search on the "content" field; the stored embedding is left out of
# the returned _source.
resp = client.search(
    index="law",
    size=10,
    _source={"excludes": "embedding"},
    query={"match": {"content": {"query": "不正当", "boost": 0.2}}},
    # Vector search clause, kept for reference:
    # knn={
    #     "field": "embedding",
    #     "query_vector": model.encode("这是啥")[0].tolist(),  # embed the query text so it can be compared to `embedding`
    #     "k": 5,
    #     "num_candidates": 10,
    #     "boost": 0.9
    # },
)
pretty_response(resp)

In [None]:
# Stored documents (_source) of the hits returned by the last search.
[hit["_source"] for hit in resp["hits"]["hits"]]

In [None]:
# Show the whole "hits" section of the last search response.
resp["hits"]