# 索引构建

## 加载相关文件

In [1]:
import re
import time
import os
import csv
import json
import pickle
from typing import List, Dict, Tuple

In [2]:
with open("page_info.json", "r", encoding='utf-8') as f:
    url_info : Dict = json.load(f)

## 连接搜索引擎

In [3]:
from elasticsearch import Elasticsearch

es = Elasticsearch(hosts='http://elastic:6Z9lWX66IlNKRdFOdB*R@127.0.0.1:9200')
# es = Elasticsearch(hosts='http://127.0.0.1:9200')

## 设计索引结构

In [10]:
doc = {
    'settings': {
        'analysis': {
            'analyzer': 'ik_max_word',
            "search_analyzer": 'ik_max_word'
        }
    },
    'mappings':{
        'properties':{
            'url':{
                'type': 'text',
                'analyzer': 'ik_max_word',
            },
            'anchor_text':{
                'type': 'text',
                'analyzer': 'ik_max_word',
            },
            'title':{
                'type': 'text',
                'analyzer': 'ik_max_word',
            },
            'content': {
                'type': 'text',
                'analyzer': 'ik_max_word',
            }
        }
    }
}



In [13]:
es.indices.delete(index='index')

ObjectApiResponse({'acknowledged': True})

In [7]:
es.indices.create(index='test', body=doc)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'test'})

In [14]:
es.indices.create(index='index', body=doc)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'index'})

## 测试索引

In [8]:
test_urls = list(url_info.keys())[:10]
for url in test_urls:
    test = url_info[url]

    with open('new_pages/' + str(test["page"]), 'r', encoding='utf-8') as f:
        test["content"] = f.read()

    res = es.index(index='test', body=test)

In [11]:
query_all = {
    "query": {
        "match_all": {}
    }
}

my_query = {
    "explain": True,
    "query": {
        "multi_match": {
            "query":"公司",
            "fields":[
                "url",
                "content"
            ]
        }
    },
    "highlight": {
        "pre_tags" : ["<font color='red'>"],
        "post_tags" : ["</font>"],
        "fields" : {
            "title" : {}
        }
    }
}

In [12]:
es.search(index='main', body=my_query)

ObjectApiResponse({'took': 64, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 0.5753642, 'hits': [{'_shard': '[main][0]', '_node': 'aIoZDgcZS9qNgJgetXHgEw', '_index': 'main', '_id': 'AH1OJYwB10bB2mGG_s-B', '_score': 0.5753642, '_source': {'url': 'https://mzh.moegirl.org.cn/Template:Galgame公司', 'anchor_text': [], 'title': '模板:Galgame公司', 'page': 1701425953, 'content': "模板讨论简体模板:Galgame公司萌娘百科，万物皆可萌的百科全书！转载请标注来源页面的网页链接，并声明引自萌娘百科。内容不可商用。各种Galgame公司。若有遗漏或未来再有补充，欢迎随时编辑。由于Galgame公司太过繁多，对于以下类型的公司，若无条目记录则不许列其名：制作具有露骨性描写标题作品的公司，其他制作拔作的公司酌情收录；仅制作R-18G级别重口味作品的公司；只制作过少数几部作品且均不知名的公司。相关模板：{{游戏公司}}维基百科提示您 关于Galgame公司，在自由的百科全书维基百科上有相关条目。请参阅：日本成人游戏厂商列表折叠查 · 论 · 编Galgame公司日本AQUAPLUS系AQUAPLUS • LeafVISUAL ARTS系13cm • AMEDEO • Bonbee! • IMAGE CRAFT • Key • Lapis lazuli • ocelot • otherwise • PLAYM • Sirius • ZERO • EGO • 裸足少女 • Hamham Soft • tone work's • GLOVETY • IrisCIRCUS系CIRCUS • CIRCUS NORTHERN • 

## 插入数据

In [15]:
test_urls = list(url_info.keys())
for url in test_urls:
    test = url_info[url]

    with open('new_pages/' + str(test["page"]), 'r', encoding='utf-8') as f:
        test["content"] = f.read()

    res = es.index(index='index', id=test["page"], body=test)

测试查询

In [18]:
my_query = {
    # "explain": True,
    "query": {
        "multi_match": {
            "query":"公司",
            "fields":[
                "url^2",
                "content"
            ]
        }
    },
    "highlight": {
        "pre_tags" : ["<font color='red'>"],
        "post_tags" : ["</font>"],
        "fields" : {
            "title" : {}
        }
    },
    "_source":{
        "excludes":["content"]
    }
}
es.search(index='index', body=my_query)

ObjectApiResponse({'took': 2, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 5251, 'relation': 'eq'}, 'max_score': 12.089063, 'hits': [{'_index': 'index', '_id': '1701512531', '_score': 12.089063, '_source': {'url': 'https://mzh.moegirl.org.cn/TRIGGER(公司)', 'anchor_text': ['6\n外部链接'], 'title': 'TRIGGER', 'page': 1701512531, 'page_rank': 0.00010814988401399437}}, {'_index': 'index', '_id': '1701530273', '_score': 12.089063, '_source': {'url': 'https://mzh.moegirl.org.cn/晓(公司)', 'anchor_text': [], 'title': '晓', 'page': 1701530273, 'page_rank': 7.470820104215317e-05}}, {'_index': 'index', '_id': '1701517669', '_score': 12.089063, '_source': {'url': 'https://mzh.moegirl.org.cn/橘公司', 'anchor_text': ['橘公司'], 'title': '橘公司', 'page': 1701517669, 'page_rank': 5.925042784280388e-05}}, {'_index': 'index', '_id': '1701449489', '_score': 12.089063, '_source': {'url': 'https://mzh.moegirl.org.cn/魔女公司', 'anchor_text': ['魔女公司'], 'ti