In [2]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Connection settings for the Elasticsearch endpoint on localhost.
ES_HOST = "http://localhost:9200"
# Passed through unchanged as request_timeout (presumably seconds -- confirm
# against the client documentation).
ES_REQUEST_TIMEOUT = 100000

# Build the client used by every cell below.
client = Elasticsearch(ES_HOST, request_timeout=ES_REQUEST_TIMEOUT)

# Ask the server for its basic info; getting a response back confirms that
# the connection works.
client.info()

ObjectApiResponse({'name': 'LAPTOP-Q7SNUCG2', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'yhlAun-SQlmUfzFTFaRwIg', 'version': {'number': '8.3.2', 'build_type': 'zip', 'build_hash': '8b0b1f23fbebecc3c88e4464319dea8989f374fd', 'build_date': '2022-07-06T15:15:15.901688194Z', 'build_snapshot': False, 'lucene_version': '9.2.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [None]:
# Create the "lecard" index for Chinese text.
# The analyzer registered under the name "default" must be set, so smartcn is
# declared there.
# NOTE(review): smartcn presumably needs the Smart Chinese analysis plugin
# installed on the server -- confirm.
smartcn_default_analysis = {
    'analysis': {
        'analyzer': {
            "default": {"type": "smartcn"},
        },
    },
}

client.indices.create(index="lecard", settings=smartcn_default_analysis)

In [None]:
# Search "one" across title and abstract of the English demo index, and wrap
# highlighted fragments of both fields in the same <em> tag.
searched_fields = ["title", "abstract"]

highlight_fields = {
    field_name: {
        "pre_tags": ["<em class='text-primary'>"],
        "post_tags": ["</em>"],
    }
    for field_name in searched_fields
}

client.search(
    index="test-english",
    query={
        "combined_fields": {
            "query": "one",
            "fields": searched_fields,
        }
    },
    highlight={"fields": highlight_fields},
)

In [None]:
# Run a match_all query against "lecard" and display only the list of hits
# from the response.
lecard_response = client.search(index="lecard", query={"match_all": {}})
lecard_response["hits"]["hits"]

In [None]:
# add demo english documents
def gendata():
    """Yield ten demo bulk actions for the ``test-english`` index.

    Each action carries the target ``_index`` plus ``title``, ``abstract`` and
    ``body`` fields whose text embeds the English word for the document
    number (Zero through Nine).
    """
    # The position in the tuple is the number being spelled out (0-19).
    number_words = (
        'Zero', 'One', 'Two', 'Three', 'Four',
        'Five', 'Six', 'Seven', 'Eight', 'Nine',
        'Ten', 'Eleven', 'Twelve', 'Thirteen', 'Fourteen',
        'Fifteen', 'Sixteen', 'Seventeen', 'Eighteen', 'Nineteen',
    )

    # Only the first ten words are turned into documents.
    for word in number_words[:10]:
        yield {
            "_index": "test-english",
            "title": f"this is the {word} title",
            "abstract": f"this is the {word} abstract",
            "body": f"this is the {word} body",
        }

# Pass the actions yielded by the gendata() defined just above to the bulk
# helper, using the shared client.
bulk(client, gendata())

In [None]:
# add demo chinese documents
def gendata():
    """Yield six demo bulk actions for the ``test-chinese`` index.

    The ``title``, ``abstract`` and ``body`` of each action embed the Chinese
    numeral for the document number, 零 (0) through 五 (5).
    """
    # One numeral per generated document, in order 0-5.
    numerals = "零一二三四五"

    for numeral in numerals:
        yield {
            "_index": "test-chinese",
            "title": f"这是第{numeral}篇文章的标题",
            "abstract": f"这是第{numeral}篇文章的摘要",
            "body": f"这是第{numeral}篇文章的正文",
        }
# Pass the actions yielded by the gendata() defined just above to the bulk
# helper, using the shared client.
bulk(client, gendata())

In [3]:
# add lecard documents
def gendata(path=r"D:\Data\lecard\collection.tsv"):
    """Yield one bulk action per row of a tab-separated collection file.

    Every non-empty line is split on tabs; columns 1, 2 and 3 become the
    ``title``, ``abstract`` and ``body`` of an action targeting the
    ``lecard`` index. Column 0 is read but not indexed.

    Args:
        path: Location of the TSV file, opened as UTF-8. Defaults to the
            local path this notebook was written against.

    Yields:
        dict: A bulk action with ``_index``, ``title``, ``abstract``, ``body``.

    Raises:
        IndexError: If a non-empty line has fewer than four columns.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            # Remove only the line terminator. A bare strip() would also eat a
            # trailing tab, silently dropping an empty last column.
            row = line.rstrip("\r\n")
            if not row:
                # Blank lines (e.g. at the end of the file) carry no document.
                continue
            fields = row.split("\t")
            yield {
                "_index": "lecard",
                "title": fields[1],
                "abstract": fields[2],
                "body": fields[3],
            }
# Pass the actions yielded by the gendata() defined just above to the bulk
# helper. The tuple recorded below is this call's return value (presumably
# the success count and a list of errors -- confirm against the helper docs).
bulk(client, gendata())

(10718, [])

In [None]:
# Run the "test-chinese" index's analysis on a sample phrase to inspect the
# tokens it produces.
# NOTE(review): in this notebook only "lecard" is created with the smartcn
# default analyzer -- confirm "test-chinese" has the analyzer you expect.
sample_text = "中国人民大学"
client.indices.analyze(index="test-chinese", text=sample_text)

In [None]:
# Delete the "test-chinese" demo index.
client.indices.delete(index="test-chinese")

In [8]:
# Delete every document matched by a match_all query from "lecard"; the index
# itself is not deleted by this call.
match_everything = {"match_all": {}}
client.delete_by_query(index="lecard", query=match_everything)

ObjectApiResponse({'took': 6867, 'timed_out': False, 'total': 9000, 'deleted': 9000, 'batches': 9, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0, 'failures': []})