In [9]:
import os
import json
import numpy as np
from tqdm import tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Client for the Elasticsearch node listening on localhost:9200.
# The request timeout is set to a very large value, presumably so that
# long-running requests issued from this notebook are never cut off client-side.
elastic = Elasticsearch(hosts="http://localhost:9200", request_timeout=1000000)

# Connectivity check: displays the node/cluster metadata when the node answers.
elastic.info()

ObjectApiResponse({'name': 'PT-LAPTOP', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'VEFspzOtRu6pZW2yB-lWog', 'version': {'number': '8.3.2', 'build_type': 'zip', 'build_hash': '8b0b1f23fbebecc3c88e4464319dea8989f374fd', 'build_date': '2022-07-06T15:15:15.901688194Z', 'build_snapshot': False, 'lucene_version': '9.2.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [12]:
# Full-text query evaluated over the case name and the case body together.
keyword_query = {
    "combined_fields": {
        "query": "测试",
        "fields": ["case_name", "content"],
    }
}

# Terms aggregation that buckets the matching documents by court.
# NOTE(review): this aggregates on "court", while the requested fields and the
# disabled post_filter below refer to "court_name" -- confirm which field the
# "wenshu" index actually carries.
court_buckets = {
    "agg-court": {
        "terms": {"field": "court"},
    }
}

# Fields returned for each hit; publish_date is rendered with the
# "year_month_day" date format.
returned_fields = [
    "case_name",
    "content",
    {"field": "publish_date", "format": "year_month_day"},
    "court_name",
]

# Optional request parts, currently disabled. Pass them as extra keyword
# arguments to elastic.search(...) below to enable them.
#
# highlight={
#     "fields": {
#         "content": {
#             "pre_tags" : ["<strong>"],
#             "post_tags": ["</strong>"],
#             "number_of_fragments": 1,
#         }
#     }
# },
# post_filter={
#     "bool": {
#         "filter": [
#             {
#                 "terms": {"court_name": ["德阳市中级人民法院"]}
#             },
#             {
#                 "terms": {"case_type": ["管辖案件"]}
#             }
#         ]
#     }
# }

keyword_response = elastic.search(
    index="wenshu",
    size=2,
    from_=0,
    _source=True,
    fields=returned_fields,
    query=keyword_query,
    aggs=court_buckets,
)

# Only the total hit count is displayed.
keyword_response["hits"]["total"]

{'value': 223, 'relation': 'eq'}

In [None]:
# Random 768-dimensional float32 vector, used as a stand-in query embedding.
probe_vector = np.random.rand(768).astype(np.float32).tolist()

# Nearest-neighbour search on the "vector" field of the "lecard" index,
# returning only the title and abstract of each hit.
# NOTE(review): the dedicated kNN search API was deprecated in Elasticsearch 8.4
# in favour of the `knn` option of the search API -- revisit when upgrading.
knn_response = elastic.knn_search(
    index="lecard",
    knn={
        "field": "vector",
        "query_vector": probe_vector,
        "k": 10,
        "num_candidates": 10,
    },
    source=["title", "abstract"],
)
knn_response["hits"]

In [None]:
def gendata():
    """Yield one bulk-index action per line of the wenshu JSON-lines dump.

    Each line is parsed as a JSON object, tagged with the target index
    ``test-wenshu`` and given a ``vector`` entry taken from the row of the
    memory-mapped embedding matrix at the same position as the line.
    """
    encoder_name = "DPR"
    vector_size = 768
    embedding_file = os.path.join("data/encode", encoder_name, "wenshu", "text_embeddings.mmp")
    # Read-only memory map of float32 values, viewed as rows of `vector_size`.
    vectors = np.memmap(embedding_file, dtype=np.float32, mode="r").reshape(-1, vector_size)

    with open("/home/peitian_zhang/Data/wenshu/wenshu1.json", encoding="utf-8") as dump:
        # `total` only sizes the progress bar; it does not limit the loop.
        progress = tqdm(dump, desc="Indexing", total=100000)
        for row, raw_line in enumerate(progress):
            action = json.loads(raw_line.strip())
            action["_index"] = "test-wenshu"
            action["vector"] = vectors[row].tolist()
            yield action


# Stream every generated action to the cluster through the bulk helper.
bulk(elastic, gendata())

In [None]:
def gendata():
    """Yield each case from the ``p4.txt`` JSON-lines file, ready for indexing.

    Every line is parsed as a JSON object; the ``crawl_time`` and
    ``legal_base`` entries are removed before the case is yielded.
    """
    # Raw string: "\D", "\w" and "\p" are invalid escape sequences in a normal
    # string literal (a warning today, an error in a future Python version).
    # The resulting path value is unchanged.
    with open(r"D:\Data\wenshu\p4.txt", encoding="utf-8") as f:
        for line in f:
            case = json.loads(line.strip())
            # pop() with a default tolerates records that lack these keys,
            # so one incomplete record does not abort the whole indexing run.
            case.pop("crawl_time", None)
            case.pop("legal_base", None)
            yield case


# Index the cases one document at a time; `total` only sizes the progress bar.
for x in tqdm(gendata(), desc="Indexing", total=64936):
    elastic.index(index="wenshu", document=x)

In [None]:
# Index settings: the default analyzer must be set explicitly; here it is
# "smartcn" (Chinese analysis).
test_wenshu_settings = {
    "analysis": {
        "analyzer": {
            "default": {"type": "smartcn"},
        }
    }
}

# Field mappings, keyed by field name.
test_wenshu_properties = {
    "title": {"type": "text"},
    # NOTE(review): "judge_data" is mapped as a date -- possibly meant to be
    # "judge_date"; confirm against the indexed documents.
    "judge_data": {"type": "date"},
    "publish_type": {"type": "keyword"},
    # wildcard type so that the id supports wildcard search
    "id": {"type": "wildcard"},
    "court": {"type": "keyword"},
    "case_num": {"type": "keyword"},
    "content": {"type": "text"},
    "vector": {
        "type": "dense_vector",
        "dims": 768,
        # index the vectors so HNSW-based kNN search is available
        "index": True,
        # dot_product is not usable here: it only accepts unit-length vectors
        "similarity": "l2_norm",
    },
}

elastic.indices.create(
    index="test-wenshu",
    settings=test_wenshu_settings,
    mappings={"properties": test_wenshu_properties},
)

In [None]:
# Create the "wenshu" index: smartcn as the default analyzer, keyword-typed
# metadata fields, a full-text "content" field and a 768-dimensional dense
# vector indexed for kNN search.
elastic.indices.create(
    index="wenshu",
    settings={
        "analysis": {
            "analyzer": {
                # the default analyzer must be set explicitly
                "default": {"type": "smartcn"},
            }
        },
        # index-wide ignore_malformed, so malformed field values are ignored
        # rather than causing the whole document to be rejected
        "index.mapping.ignore_malformed": True,
    },
    mappings={
        "properties": {
            # exact-match (keyword) metadata fields
            "doc_id": {"type": "keyword"},
            "court_name": {"type": "keyword"},
            "court_id": {"type": "keyword"},
            "court_province": {"type": "keyword"},
            "court_city": {"type": "keyword"},
            "court_region": {"type": "keyword"},
            "court_district": {"type": "keyword"},
            "pub_prosecution_org": {"type": "keyword"},
            "case_type": {"type": "keyword"},
            "cause": {"type": "keyword"},
            # a missing comma after this entry previously made the cell a SyntaxError
            "trial_round": {"type": "keyword"},
            # full-text body, analyzed with the default analyzer
            "content": {"type": "text"},
            "vector": {
                "type": "dense_vector",
                "dims": 768,
                # index the vectors so HNSW-based kNN search is available
                "index": True,
                # dot_product is not usable here: it only accepts unit-length vectors
                "similarity": "l2_norm",
            },
        }
    },
)

In [None]:
# Run a sample Chinese phrase through the analyzer of the "test-chinese" index
# to inspect how it is tokenized.
# NOTE(review): no cell in view creates "test-chinese" -- confirm it exists.
elastic.indices.analyze(
    text="中国人民大学",
    index="test-chinese",
)

In [None]:
# Drop the whole "wenshu" index (destructive: run only when a rebuild is intended).
elastic.indices.delete(
    index="wenshu",
)

In [None]:
# Match-all query against "test-wenshu"; display the returned hit documents.
everything = {"match_all": {}}
sample_response = elastic.search(index="test-wenshu", query=everything)
sample_response["hits"]["hits"]

In [None]:
# Delete every document matched by a match-all query in "wenshu"; the index
# itself is not deleted by this call.
elastic.delete_by_query(index="wenshu", query={"match_all": {}})

In [None]:
# Match-all search on "wenshu" that returns only the listed fields for each
# hit, with the stored _source switched off.
wanted_fields = ["case_name", "content"]
fields_only_response = elastic.search(
    index="wenshu",
    query={"match_all": {}},
    fields=wanted_fields,
    source=False,
)
fields_only_response["hits"]