# search books
Uses the Whoosh search engine.
[Whoosh documentation](https://whoosh.readthedocs.io/en/latest/index.html)

## 安装 whoosh 文本搜索引擎
- 安装 whoosh 文本搜索引擎
- 安装结巴中文分词
- 安装stanza分词
- 安装jiayan分词

In [211]:
#!pip install whoosh
#!pip install jieba
#!pip install stanza
#!pip install jiayan

## 创建搜索工具

In [301]:
import os
from whoosh.index import create_in
from whoosh.fields import *
from jieba.analyse import ChineseAnalyzer
import json

### 创建分词器

In [319]:
# jiayan 分词器
from whoosh.analysis import RegexAnalyzer, LowercaseFilter, StopFilter, StemFilter
from whoosh.analysis import Tokenizer, Token
from whoosh.lang.porter import stem
from jiayan import load_lm
from jiayan import CharHMMTokenizer
import site
import re

# Words dropped by the StopFilter: common English function words plus three
# high-frequency Chinese particles.
STOP_WORDS = frozenset({
    'a', 'an', 'and', 'are', 'as', 'at',
    'be', 'by', 'can', 'for', 'from', 'have',
    'if', 'in', 'is', 'it', 'may', 'not',
    'of', 'on', 'or', 'tbd', 'that', 'the',
    'this', 'to', 'us', 'we', 'when', 'will',
    'with', 'yet', 'you', 'your',
    '的', '了', '和',
})

# One or more characters in the range U+4E00..U+9FD5 (CJK ideographs).
accepted_chars = re.compile(r"[\u4E00-\u9FD5]+")

class JiayanChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with the Jiayan library.

    The Jiayan tokenizer objects are created on first use and cached on the
    instance, so building the analyzer itself stays cheap.
    """

    # Lazily created Jiayan tokenizers (None until first needed).
    chartokenizer = None
    wordtokenizer = None

    def __call__(self, text, tokenizer = 'word', **kargs):
        """Yield a Whoosh ``Token`` for every accepted segment of *text*.

        :param text: the string to segment.
        :param tokenizer: ``'char'`` selects Jiayan's ``CharHMMTokenizer``
            (which loads the ``jiayan.klm`` language model); any other value
            selects ``WordNgramTokenizer``.
        :param kargs: additional keyword arguments; accepted and ignored.

        A single ``Token`` object is mutated and yielded repeatedly, so
        consumers must read its attributes before advancing the generator.
        """
        if tokenizer == 'char':
            if self.chartokenizer is None:
                # The model is looked up in the first global site-packages
                # directory; site.getusersitepackages() is the alternative
                # for per-user installs.
                model_path = os.path.join(site.getsitepackages()[0], 'jiayan', 'data', 'jiayan.klm')
                self.chartokenizer = CharHMMTokenizer(load_lm(model_path))
            segmenter = self.chartokenizer
        else:
            if self.wordtokenizer is None:
                # Imported here because the surrounding file only imports
                # load_lm and CharHMMTokenizer from jiayan; without this the
                # default 'word' mode fails with NameError.
                from jiayan import WordNgramTokenizer
                self.wordtokenizer = WordNgramTokenizer()
            segmenter = self.wordtokenizer

        offset = 0
        token = Token()
        for segment in segmenter.tokenize(text):
            # Offsets are accumulated from segment lengths.
            # NOTE(review): this assumes the segments concatenate back to the
            # input text without dropped characters -- confirm for Jiayan.
            start_char = offset
            offset += len(segment)

            # Skip single-character segments that do not start with a CJK
            # ideograph (punctuation, stray symbols, lone letters or digits).
            if not accepted_chars.match(segment) and len(segment) <= 1:
                continue
            token.original = token.text = segment
            token.pos = start_char
            token.startchar = start_char
            token.endchar = offset
            yield token

def JiayanChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    """Build a Whoosh analyzer chain on top of ``JiayanChineseTokenizer``.

    The chain is: tokenizer -> lowercase filter -> stop-word filter -> stem filter.

    :param stoplist: words removed by the stop filter.
    :param minsize: minimum token length kept by the stop filter.
    :param stemfn: stemming function handed to the stem filter.
    :param cachesize: cache size handed to the stem filter.
    :returns: the composed analyzer.
    """
    tokenize = JiayanChineseTokenizer()
    lowercase = LowercaseFilter()
    drop_stops = StopFilter(stoplist=stoplist, minsize=minsize)
    stemmer = StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize)
    return tokenize | lowercase | drop_stops | stemmer

In [320]:
# Stanza 分词器
from whoosh.analysis import RegexAnalyzer, LowercaseFilter, StopFilter, StemFilter
from whoosh.analysis import Tokenizer, Token
from whoosh.lang.porter import stem
import re
import stanza

# Stop words handed to the StopFilter: English function words first, then
# three common Chinese particles.
STOP_WORDS = frozenset([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by',
    'can', 'for', 'from', 'have', 'if', 'in', 'is', 'it',
    'may', 'not', 'of', 'on', 'or', 'tbd', 'that', 'the',
    'this', 'to', 'us', 'we', 'when', 'will', 'with', 'yet',
    'you', 'your',
    '的', '了', '和',
])

# Pattern for a run of characters in U+4E00..U+9FD5 (CJK ideographs).
accepted_chars = re.compile(r"[\u4E00-\u9FD5]+")

class StanzaChineseTokenizer(Tokenizer):
    """Whoosh tokenizer that segments Chinese text with a Stanza pipeline.

    The pipeline is created on the first call and cached on the instance.
    """

    # Stanza pipeline, created lazily on first use.
    nlp = None

    def __call__(self, text, **kargs):
        """Yield a Whoosh ``Token`` for every accepted word of *text*.

        :param text: the string to segment.
        :param kargs: additional keyword arguments; accepted and ignored.

        A single ``Token`` object is mutated and yielded repeatedly, so
        consumers must read its attributes before advancing the generator.
        """
        # Run stanza.download('zh') once beforehand to fetch the Chinese models.
        if self.nlp is None:
            # Only segmentation is consumed below, so restrict the pipeline
            # to the tokenize processor instead of loading every default
            # Chinese processor. REUSE_RESOURCES keeps using the local
            # resources file rather than re-fetching it on each start.
            self.nlp = stanza.Pipeline(
                'zh',
                processors='tokenize',
                download_method=stanza.DownloadMethod.REUSE_RESOURCES,
            )
        doc = self.nlp(text)

        token = Token()
        for sentence in doc.sentences:
            for tok in sentence.tokens:
                # A token without words yields nothing. Every word of a
                # token is emitted (previously only the last one survived).
                for word in tok.words:
                    word_text = word._text

                    # Skip single-character words that do not start with a
                    # CJK ideograph (punctuation, lone letters or digits).
                    if not accepted_chars.match(word_text) and len(word_text) <= 1:
                        continue
                    token.original = token.text = word_text
                    token.pos = word._start_char
                    token.startchar = word._start_char
                    token.endchar = word._end_char
                    yield token


def StanzaChineseAnalyzer(stoplist=STOP_WORDS, minsize=1, stemfn=stem, cachesize=50000):
    """Build a Whoosh analyzer chain on top of ``StanzaChineseTokenizer``.

    The chain is: tokenizer -> lowercase filter -> stop-word filter -> stem filter.

    :param stoplist: words removed by the stop filter.
    :param minsize: minimum token length kept by the stop filter.
    :param stemfn: stemming function handed to the stem filter.
    :param cachesize: cache size handed to the stem filter.
    :returns: the composed analyzer.
    """
    pipeline = StanzaChineseTokenizer()
    pipeline = pipeline | LowercaseFilter()
    pipeline = pipeline | StopFilter(stoplist=stoplist, minsize=minsize)
    pipeline = pipeline | StemFilter(stemfn=stemfn, ignore=None, cachesize=cachesize)
    return pipeline

### 创建搜索范式

There are two ways to define a schema:
one is to instantiate `Schema` directly; the other is to subclass `SchemaClass`.

In [321]:
# Create the book schema by instantiating Schema directly. stored=True keeps each field's value in the index so it can be shown in search results.
#BookSchema = Schema(book_title=TEXT(stored=True),
#                    volume_title=TEXT(stored=True),
#                    chapter_title=TEXT(stored=True),
#                    content=TEXT(stored=True))

In [335]:
# Create the book schema by subclassing SchemaClass.
# Each assignment below overwrites the previous one, so only the LAST
# analyzer (Jieba) is actually used by the schema. Reorder or comment out
# lines to switch analyzers before rebuilding the index.
analyzer = StanzaChineseAnalyzer() # Stanza
analyzer = JiayanChineseAnalyzer() # Jiayan
analyzer = ChineseAnalyzer() # Jieba

class BookSchema(SchemaClass):
    """Index schema with four stored text fields sharing one analyzer."""
    # Title of the book.
    book_title = TEXT(stored=True, analyzer=analyzer)
    # Title of the volume within the book.
    volume_title = TEXT(stored=True, analyzer=analyzer)
    # Title of the chapter within the volume.
    chapter_title = TEXT(stored=True, analyzer=analyzer)
    # Text that is searched and shown in results.
    content = TEXT(stored=True, analyzer=analyzer)

### 建立索引

In [336]:
def decodebook(filename, ix):
    """Add every paragraph of one book JSON file to the index.

    The file is expected to look like::

        {"title": ..., "volumes": [
            {"title": ..., "chapters": [
                {"title": ..., "paragraphs": [{"content": ...}, ...]}
            ]}
        ]}

    One document is added per paragraph; chapters with a missing or empty
    ``"paragraphs"`` entry contribute nothing.

    :param filename: path of the UTF-8 encoded JSON file.
    :param ix: index object whose ``writer()`` returns a writer usable as a
        context manager (as Whoosh index writers are).
    :raises OSError: if the file cannot be opened.
    :raises json.JSONDecodeError: if the file is not valid JSON.
    :raises KeyError: if a required key is missing.
    """
    print(filename)

    # Parse the file before asking for a writer, so an unreadable or
    # malformed file never leaves a writer (and its index lock) dangling.
    with open(filename, 'r', encoding='utf-8') as file:
        book = json.load(file)
    book_title = book["title"]

    # As a context manager the writer commits on success and cancels if an
    # exception escapes, so a half-indexed book is never committed.
    with ix.writer() as writer:
        for volume in book["volumes"]:
            volume_title = volume["title"]
            for chapter in volume["chapters"]:
                chapter_title = chapter["title"]
                for paragraph in chapter.get("paragraphs") or ():
                    writer.add_document(
                        book_title=book_title,
                        volume_title=volume_title,
                        chapter_title=chapter_title,
                        content=paragraph["content"],
                    )

In [337]:
def CreateBookIndex(bookspath):
    """Create the 'BookIndex' index from the ``*.json`` files in *bookspath*.

    The index is created in ``<bookspath>/index/`` using ``BookSchema``, and
    every JSON file directly inside *bookspath* is indexed in sorted name
    order via ``decodebook``.

    :param bookspath: directory containing the book JSON files.
    """
    # Collect the book files, sorted by name.
    filenames = [
        os.path.join(bookspath, name)
        for name in sorted(os.listdir(bookspath))
        if name.endswith(".json")
    ]

    # The schema and index files live in the "index/" subdirectory.
    indexpath = os.path.join(bookspath, "index/")
    # exist_ok avoids the separate existence check and its race window.
    os.makedirs(indexpath, exist_ok=True)
    ix = create_in(indexpath, BookSchema, indexname='BookIndex')

    for filename in filenames:
        decodebook(filename, ix)

In [338]:
# Directory holding the book JSON files (machine-specific absolute path).
bookspath = "/Users/sunyafu/zebra/YIJING/Books/ready"

In [340]:
# Build the index; each indexed file name is printed as it is processed.
CreateBookIndex(bookspath)

/Users/sunyafu/zebra/YIJING/Books/ready/仪礼.json
/Users/sunyafu/zebra/YIJING/Books/ready/公孙龙子.json
/Users/sunyafu/zebra/YIJING/Books/ready/公羊传.json
/Users/sunyafu/zebra/YIJING/Books/ready/六韬.json
/Users/sunyafu/zebra/YIJING/Books/ready/列子.json
/Users/sunyafu/zebra/YIJING/Books/ready/司马法.json
/Users/sunyafu/zebra/YIJING/Books/ready/吕氏春秋.json
/Users/sunyafu/zebra/YIJING/Books/ready/吴子.json
/Users/sunyafu/zebra/YIJING/Books/ready/周易.json
/Users/sunyafu/zebra/YIJING/Books/ready/周礼.json
/Users/sunyafu/zebra/YIJING/Books/ready/商君书.json
/Users/sunyafu/zebra/YIJING/Books/ready/国语.json
/Users/sunyafu/zebra/YIJING/Books/ready/墨子.json
/Users/sunyafu/zebra/YIJING/Books/ready/大戴礼记.json
/Users/sunyafu/zebra/YIJING/Books/ready/孙子兵法.json
/Users/sunyafu/zebra/YIJING/Books/ready/孙膑兵法.json
/Users/sunyafu/zebra/YIJING/Books/ready/孝经.json
/Users/sunyafu/zebra/YIJING/Books/ready/孟子.json
/Users/sunyafu/zebra/YIJING/Books/ready/尉缭子.json
/Users/sunyafu/zebra/YIJING/Books/ready/尔雅.json
/Users/sunyafu/zebra/YIJIN

### 搜索

In [342]:
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.index import open_dir
from whoosh.query import compound, Term

In [343]:
def GetBookIndex(bookspath):
    """Open the existing 'BookIndex' index stored in ``<bookspath>/index/``.

    :param bookspath: directory that contains the ``index/`` subdirectory.
    :returns: the opened index object.
    """
    return open_dir(os.path.join(bookspath, "index/"), indexname='BookIndex')

In [346]:
# Open a searcher and look up the exact term "幽人" in the content field.
ix = GetBookIndex(bookspath)
with ix.searcher() as searcher:
    # The Term query is built directly rather than through a query parser
    # (the previously parsed query was never used and has been removed).
    results = searcher.search(Term('content', '幽人'), limit=None)
    print('一共发现%d个结果。\n' % len(results))

    # Maximum number of characters in a highlighted fragment.
    results.fragmenter.maxchars = 300
    # Amount of context shown before and after each hit.
    results.fragmenter.surround = 50
    for index, result in enumerate(results):
        # Books without volumes store an empty volume title.
        if result['volume_title']:
            print(f"{index}. {result['book_title']} {result['volume_title']}·{result['chapter_title']}")
        else:
            print(f"{index}. {result['book_title']} {result['chapter_title']}")
        print(f"{result['content']}\n")
        #print(f"... {result.highlights('content')} ...\n")

一共发现2个结果。

0. 周易 上经·第十卦 ䷉履 天泽履 乾上兑下
九二：履道坦坦，幽人贞吉。

1. 易传 象传上·履
素履之往，独行愿也。幽人贞吉，中不自乱也。眇能视，不足以有明也。跛能履，不足以与行也。咥人之凶，位不当也。武人为于大君，志刚也。愬愬终吉，志行也。夬履贞厉，位正当也。元吉在上，大有庆也。



In [345]:
# Open a searcher; the context manager closes it when the cell finishes
# (it was previously left open).
ix = GetBookIndex(bookspath)
with ix.searcher() as searcher:
    # Find documents whose content contains both "小人" and "大人".
    results = searcher.find("content", "小人 AND 大人", limit=None)
    print('一共发现%d个结果。\n' % len(results))

    # Highlights are read inside the block, while the searcher is still open.
    for index, result in enumerate(results):
        # Books without volumes store an empty volume title.
        if result['volume_title']:
            print(f"{index}. {result['book_title']} {result['volume_title']}·{result['chapter_title']}")
        else:
            print(f"{index}. {result['book_title']} {result['chapter_title']}")
        print(f"... {result.highlights('content')} ...\n")

一共发现17个结果。

0. 周易 上经·第十二卦 ䷋否 地天否 乾上坤下
... 六二：包承。<b class="match term0">小人</b>吉，<b class="match term1">大人</b>否，亨 ...

1. 孟子 告子章句上·第十五节
... 孟子曰：“从其大体为<b class="match term1">大人</b>，从其小体为<b class="match term0">小人</b> ...

2. 墨子 44章 大取
... 圣人之爱人也；其利人也，厚于圣人之利人也。<b class="match term1">大人</b>之爱<b class="match term0">小人</b>也，薄于<b class="match term0">小人</b>之爱<b class="match term1">大人</b>也；其利<b class="match term0">小人</b>也，厚于<b class="match term0">小人</b>之利<b class="match term1">大人</b>也。以臧为其亲也，而爱之，非爱其亲也；以臧为 ...

3. 论语 季氏篇
... 孔子曰：“君子有三畏：畏天命，畏<b class="match term1">大人</b>，畏圣人之言。<b class="match term0">小人</b>不知天命而不畏也，狎<b class="match term1">大人</b>，侮圣人之言 ...

4. 孟子 告子章句上·第十五节
... 公都子问曰：“钧是人也，或为<b class="match term1">大人</b>，或为<b class="match term0">小人</b>，何也 ...

5. 战国策 东周·杜赫欲重景翠于周
... 有鸟无鸟之际，然后能多得鸟矣。今君将施于<b class="match term1">大人</b>，<b class="match term1">大人</b>轻君；施于<b class="match term0">小人</b>，<b class="match term0">小人</b>无可以求，又费财焉。君必施于今之穷士，不必...为<b class="match term1">大人</b>者，故能得欲矣 ...

6. 易传 象传下·革

In [223]:
# Open a searcher and run a hand-built compound query:
# (book_title is 论语 OR 周易) AND (content has 大人 AND 小人).
ix = GetBookIndex(bookspath)
with ix.searcher() as searcher:
    # The query is assembled from Term objects; the parsed query that used
    # to be created here was overwritten before use and has been removed.
    title_query = [Term('book_title', '论语'), Term('book_title', '周易')]
    content_query = [Term('content', '大人'), Term('content', '小人')]
    query = compound.And([compound.Or(title_query), compound.And(content_query)])

    results = searcher.search(query, limit=None)
    print('一共发现%d个结果。\n' % len(results))

    # Maximum number of characters in a highlighted fragment.
    results.fragmenter.maxchars = 300
    # Amount of context shown before and after each hit.
    results.fragmenter.surround = 60

    for index, result in enumerate(results):
        # Books without volumes store an empty volume title.
        if result['volume_title']:
            print(f"{index}. {result['book_title']} {result['volume_title']}·{result['chapter_title']}")
        else:
            print(f"{index}. {result['book_title']} {result['chapter_title']}")
        #print(f"{result['content']}\n")
        print(f"... {result.highlights('content')} ...\n")

一共发现2个结果。

0. 周易 上经·第十二卦 ䷋否 地天否 乾上坤下
... 六二：包承。<b class="match term0">小人</b>吉，<b class="match term1">大人</b>否，亨 ...

1. 论语 季氏篇
... 孔子曰：“君子有三畏：畏天命，畏<b class="match term1">大人</b>，畏圣人之言。<b class="match term0">小人</b>不知天命而不畏也，狎<b class="match term1">大人</b>，侮圣人之言 ...



### Test HTML display

In [232]:
# Smoke test: render a small HTML fragment in the notebook output.
from IPython.display import display, HTML
display(HTML('<p>Hello, world!</p>'))

In [227]:
from IPython.display import display, HTML

# Open a searcher; the context manager closes it when the cell finishes
# (it was previously left open).
ix = GetBookIndex(bookspath)
with ix.searcher() as searcher:
    # Find documents whose content contains "幽人".
    # Known issue: searching "幽人贞吉" finds "幽人", but searching "幽人"
    # alone finds nothing, so something is probably wrong here.
    results = searcher.find("content", "幽人", limit=None)
    print('一共发现%d个结果。\n' % len(results))

    # Render each hit as HTML so the highlight markup is displayed.
    for index, result in enumerate(results):
        # Books without volumes store an empty volume title.
        if result['volume_title']:
            display(HTML(f"{index}. {result['book_title']} {result['volume_title']}·{result['chapter_title']}"))
        else:
            display(HTML(f"{index}. {result['book_title']} {result['chapter_title']}"))
        display(HTML(f"{result.highlights('content')}\n"))

一共发现17个结果。

