In [11]:
import os
import json
import numpy as np
from tqdm import tqdm
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Build the shared client for the Elasticsearch node on localhost:9200.
# NOTE(review): the request timeout is extremely large (1,000,000 s) —
# presumably so that long bulk/index requests never time out; confirm.
elastic = Elasticsearch(hosts="http://localhost:9200", request_timeout=1000000)

# Connectivity check: ask the node for its cluster/version metadata.
elastic.info()

ObjectApiResponse({'name': 'PT-LAPTOP', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'VEFspzOtRu6pZW2yB-lWog', 'version': {'number': '8.3.2', 'build_type': 'zip', 'build_hash': '8b0b1f23fbebecc3c88e4464319dea8989f374fd', 'build_date': '2022-07-06T15:15:15.901688194Z', 'build_snapshot': False, 'lucene_version': '9.2.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [38]:
# Full-text query for "纵火" across the title and content fields.
wenshu_query = {"combined_fields": {"query": "纵火", "fields": ["title", "content"]}}

# Terms aggregation that buckets the matching cases by court.
wenshu_aggs = {"agg-court": {"terms": {"field": "court"}}}

# Fields returned for each hit (the full _source is switched off below);
# judge_date is rendered with the year_month_day format.
wenshu_fields = [
    "_id",
    "title",
    "content",
    {"field": "judge_date", "format": "year_month_day"},
    "court",
]

wenshu_response = elastic.search(
    index="test-wenshu",
    size=20,
    _source=False,
    fields=wenshu_fields,
    query=wenshu_query,
    aggs=wenshu_aggs,
    # Optional court filter, currently disabled:
    # post_filter={
    #     "bool": {
    #         "filter": [
    #             {"terms": {"court": ["北京市第一中级人民法院"]}}
    #         ]
    #     }
    # },
)

# Display only the list of hits; the aggregation buckets are not shown here.
wenshu_response["hits"]["hits"]

[{'_index': 'test-wenshu',
  '_id': 'xyC6KoIBRcbCnSNid4Kb',
  '_score': 14.425627,
  'fields': {'title': ['张莉宇等租赁合同纠纷申诉、申请民事裁定书'],
   'judge_date': ['2014-11-20'],
   'court': ['北京市高级人民法院'],
   'content': ["<!DOCTYPE HTML PUBLIC -//W3C//DTD HTML 4.0 Transitional//EN'><HTML><HEAD><TITLE></TITLE></HEAD><BODY><div style='TEXT-ALIGN: center; LINE-HEIGHT: 25pt; MARGIN: 0.5pt 0cm; FONT-FAMILY: 黑体; FONT-SIZE: 18pt;'>北京市高级人民法院</div><div style='TEXT-ALIGN: center; LINE-HEIGHT: 25pt; MARGIN: 0.5pt 0cm; FONT-FAMILY: 黑体; FONT-SIZE: 18pt;'>民 事 裁 定 书</div><div id='1'  style='TEXT-ALIGN: right; LINE-HEIGHT: 25pt; MARGIN: 0.5pt 0cm;  FONT-FAMILY: 宋体;FONT-SIZE: 15pt; '>（2014）高民申字第4329号</div><div id='2'  style='LINE-HEIGHT: 25pt; TEXT-INDENT: 30pt; MARGIN: 0.5pt 0cm;FONT-FAMILY: 宋体; FONT-SIZE: 15pt;'>再审申请人（一审被告、反诉原告，二审上诉人）兼石莹之委托代理人：张莉宇，女，1974年5月7日出生，汉族，无业。</div><div style='LINE-HEIGHT: 25pt; TEXT-INDENT: 30pt; MARGIN: 0.5pt 0cm;FONT-FAMILY: 宋体; FONT-SIZE: 15pt;'>再审申请人（一审被告、反诉原告，二审上诉人）：石莹，男，1977年2月24日出生，

In [None]:
# Keyword search over the lecard collection, returning highlighted snippets.
# Highlighting is requested for the same two fields the query searches
# (title and abstract), so matched terms can actually be wrapped in tags.
# No post_filter is applied: lecard documents only carry title, abstract,
# body and vector, so there is no categorical field to filter on.
elastic.search(
    index="lecard",
    size=1,
    query={
        "combined_fields": {
            "query": "强奸",
            "fields": ["title", "abstract"],
        }
    },
    fields=["title", "abstract", "_id"],
    # Skip the full _source; only the fields listed above are returned.
    source=False,
    highlight={
        "fields": {
            "title": {
                "pre_tags": ["<em class='text-primary'>"],
                "post_tags": ["</em>"],
                "number_of_fragments": 1,
            },
            "abstract": {
                "pre_tags": ["<strong>"],
                "post_tags": ["</strong>"],
                "number_of_fragments": 1,
            },
        }
    },
)["hits"]

In [None]:
# kNN lookup on the "vector" field of lecard using a random 768-dimensional
# float32 probe.
# NOTE(review): the probe is drawn without a fixed seed, so the returned hits
# differ from run to run.
probe_vector = np.random.rand(768).astype(np.float32).tolist()

elastic.knn_search(
    index="lecard",
    knn=dict(field="vector", query_vector=probe_vector, k=10, num_candidates=10),
    # Only these two fields of each hit's _source are returned.
    source=["title", "abstract"],
)["hits"]

In [None]:
# add lecard documents
def gendata(
    collection_path=r"D:\Data\lecard\collection.tsv",
    embedding_root="data/encode",
    model="DPR",
    dim=768,
    index="lecard",
):
    """Yield one bulk-index action per line of the lecard collection.

    Each line of the tab-separated collection file is paired, by line
    position, with the row of the same position in the embedding matrix.

    Args:
        collection_path: Tab-separated file; columns 1, 2 and 3 (0-based) are
            used as title, abstract and body.
        embedding_root: Directory holding one sub-directory per model.
        model: Sub-directory of ``embedding_root`` that contains
            ``text_embeddings.mmp``.
        dim: Number of float32 values per embedding row.
        index: Name of the target Elasticsearch index.

    Yields:
        dict: An action with the keys ``_index``, ``title``, ``abstract``,
        ``body`` and ``vector`` (the embedding row as a plain list).

    Raises:
        IndexError: If a line has fewer than four columns, or the collection
            has more lines than there are embedding rows.
    """
    # Memory-map the flat float32 file read-only and view it as (rows, dim).
    embeddings = np.memmap(
        os.path.join(embedding_root, model, "text_embeddings.mmp"),
        dtype=np.float32,
        mode="r",
    ).reshape(-1, dim)

    with open(collection_path, encoding="utf-8") as f:
        # The progress bar total is the number of embedding rows.
        for i, line in enumerate(tqdm(f, desc="Indexing", total=embeddings.shape[0])):
            fields = line.strip().split("\t")
            yield {
                "_index": index,
                "title": fields[1],
                "abstract": fields[2],
                "body": fields[3],
                # Convert the numpy row into a JSON-serializable list.
                "vector": embeddings[i].tolist(),
            }
# Stream the generated actions to Elasticsearch through the notebook's
# `elastic` client.
bulk(elastic, gendata())

In [None]:
def gendata(path=r"D:\Data\wenshu1\wenshu1.json", index="test-wenshu", total=100000):
    """Yield one bulk-index action per JSON line of the wenshu dump.

    Args:
        path: File containing one JSON object per line.
        index: Name of the target Elasticsearch index, stored under the
            ``_index`` key of every yielded document.
        total: Expected number of lines; only used as the progress bar total.

    Yields:
        dict: The parsed JSON object with an added ``_index`` key.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        for line in tqdm(f, desc="Indexing", total=total):
            line = line.strip()
            if not line:
                # json.loads("") raises, so skip blank lines.
                continue
            case = json.loads(line)
            case["_index"] = index
            yield case
# Bulk-index every action produced by gendata() through the `elastic` client.
bulk(client=elastic, actions=gendata())

In [None]:
# create lecard index
# Index-wide default analyzer: fields without an explicit analyzer use smartcn.
lecard_settings = {"analysis": {"analyzer": {"default": {"type": "smartcn"}}}}

# Only the embedding field is mapped explicitly here.
lecard_mappings = {
    "properties": {
        "vector": {
            "type": "dense_vector",
            "dims": 768,
            # index=True enables the HNSW index for this field.
            "index": True,
            # l2_norm is used because inner product only allows unit-length vectors.
            "similarity": "l2_norm",
        }
    }
}

elastic.indices.create(
    index="lecard",
    settings=lecard_settings,
    mappings=lecard_mappings,
)

In [None]:
# Create the test-wenshu index with smartcn as the default analyzer and an
# explicit mapping for every field that is searched, filtered or aggregated.
elastic.indices.create(
    index="test-wenshu",
    settings={
        'analysis': {
            'analyzer': {
                # we must set the default analyzer
                "default": {
                    "type": "smartcn"
                }
            }
        }
    },
    mappings={
        "properties": {
            # Full-text fields, analyzed with the default analyzer.
            "title": {
                "type": "text",
            },
            # Judgment date; the name must match the `judge_date` key used by
            # the documents and by the search request's `fields` list.
            "judge_date": {
                "type": "date",
            },
            "publish_type": {
                "type": "keyword",
            },
            "id": {
                # supports wildcard search
                "type": "wildcard",
            },
            # keyword type so the field can be used in terms aggregations/filters
            "court": {
                "type": "keyword",
            },
            "case_num": {
                "type": "keyword",
            },
            "content": {
                "type": "text"
            },
            "vector": {
                "type": "dense_vector",
                "dims": 768,
                # enable hnsw
                "index": True,
                # inner product is prohibitive since it only allows unit-length vector
                "similarity": "l2_norm"
            }
        }
    }
)

In [None]:
# test chinese index
# Run a sample Chinese phrase through the index's analyzer to inspect the tokens.
# NOTE(review): "test-chinese" is not created in the cells visible here —
# confirm the index exists before running.
elastic.indices.analyze(index="test-chinese", text="中国人民大学")

In [None]:
# delete index
# Drops the whole lecard index (documents and mapping) via the `elastic` client.
elastic.indices.delete(index="lecard")

In [None]:
# Delete every document in test-wenshu (match_all query); the index itself
# is not removed by this call.
elastic.delete_by_query(index="test-wenshu", query=dict(match_all=dict()))