In [15]:
import os
import re

import Spider

def highlight(text: str, keyword: str):
    """Wrap every case-insensitive occurrence of ``keyword`` in ``text`` with ``*``.

    Each match keeps the casing it has in ``text``; only the surrounding
    asterisks are added.

    Args:
        text: The string to decorate.
        keyword: The term to highlight. It is matched literally (regex
            metacharacters are escaped) and without regard to case.

    Returns:
        ``text`` with each match replaced by ``*match*``. ``text`` is returned
        unchanged when ``keyword`` is empty or does not occur.
    """
    # An empty keyword matches at every position and would sprinkle '**'
    # between all characters, so there is nothing sensible to highlight.
    if not keyword:
        return text
    # Matching on the original text (rather than on text.lower()) keeps match
    # positions aligned with the characters that are actually wrapped.
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    return pattern.sub(lambda match: f'*{match.group(0)}*', text)

def score(item, keyword: str, *, title_weight: int = 5, content_weight: int = 3):
    """Compute a relevance score for ``item`` from keyword occurrence counts.

    Occurrences are counted case-insensitively as non-overlapping substrings
    (``str.count`` semantics).

    Args:
        item: An indexable record. ``item[1]`` is scored as the title and
            ``item[2]`` as the body text; both must be strings.
        keyword: The term to count.
        title_weight: Points per occurrence in ``item[1]``. Defaults to 5.
        content_weight: Points per occurrence in ``item[2]``. Defaults to 3.

    Returns:
        ``title_hits * title_weight + content_hits * content_weight``.
    """
    # Lowercase the keyword once instead of once per field.
    needle = keyword.lower()
    title_hits = item[1].lower().count(needle)
    content_hits = item[2].lower().count(needle)
    return title_hits * title_weight + content_hits * content_weight

In [16]:
import jieba

class MySearcherC6V0:
    """
    Search class carried over from lesson 5, with these upgrades:
    1. A ``scale`` constructor argument that repeats the document list.
    2. A cache, so the same keyword is not matched against the documents twice.
    3. Offline cache pre-filling, using guesses of what users will query.
    4. Those guesses are the words obtained by segmenting the documents.
    """
    def __init__(self, scale: int=1):
        self.docs = list()
        self.load_data()
        if scale > 1:
            # Repeat the document list to test search speed on a larger set.
            self.docs *= scale
        self.cache = dict()
        self.vocab = set()
        self.build_cache()

    def load_data(self, data_file_name='./news_list.pkl'):
        """Load the documents from ``data_file_name``, creating the file first if absent."""
        if not os.path.exists(data_file_name):
            Spider.pickle_save(data_file_name)
        self.docs = Spider.pickle_load(data_file_name)

    def search(self, keyword):
        """Return ``[doc_index, score, title]`` hits for ``keyword``, best score first.

        The lookup is case-insensitive and results are memoised per lowercased
        keyword; a repeated query returns the cached list object itself.
        """
        needle = keyword.lower()
        if needle in self.cache:
            return self.cache[needle]
        hits = [
            [doc_index, score(doc, needle), doc[1]]
            for doc_index, doc in enumerate(self.docs)
            if needle in (doc[1] + doc[2]).lower()
        ]
        hits.sort(key=lambda hit: hit[1], reverse=True)
        self.cache[needle] = hits
        return hits

    def render_search_result(self, keyword):
        """Print one numbered line per hit: rank, score and the highlighted title."""
        for rank, hit in enumerate(self.search(keyword), start=1):
            title = highlight(self.docs[hit[0]][1], keyword)
            print(f'{rank}[{hit[1]}] {title}')

    def build_cache(self):
        """Initialise the cache by segmenting the documents and searching every word."""
        for doc in self.docs:
            text = doc[1] + ' ' + doc[2]
            for token in jieba.cut(text, cut_all=True):
                self.search(token)
                self.vocab.add(token)


In [17]:
# Time construction of the baseline searcher; __init__ loads the documents and pre-fills the cache.
%time searcher_v0 = MySearcherC6V0()

Wall time: 1min 15s


In [18]:
class MySearcherC6V1(MySearcherC6V0):
    """
    Avoid warming the cache for the same word more than once.
    """
    def build_cache(self):
        """Segment every document and run ``search`` once per distinct word."""
        warmed = set()
        for doc in self.docs:
            text = doc[1] + ' ' + doc[2]
            for token in jieba.cut(text):
                if token in warmed:
                    continue
                self.search(token)
                self.vocab.add(token)
                warmed.add(token)

In [19]:
# Time the V1 build, which warms the cache only once per distinct word.
%time searcher_v1 = MySearcherC6V1()

Wall time: 1min 7s


In [20]:
class MySearcherC6V2(MySearcherC6V1):
    """
    Minimise the number of ``lower()`` calls.

    Each document gets a lowercased "title + ' ' + content" string appended
    once, before the cache is built, and ``search`` matches against that
    field instead of lowercasing the text on every query.
    """
    def __init__(self, scale: int=1):
        self.docs = list()
        self.load_data()
        if scale > 1:
            # Repeat the document list to test search speed on a larger set.
            self.docs *= scale
        self.cache = dict()
        self.vocab = set()
        # Must run before build_cache(): search() reads the appended field.
        self.lower_preprocess()
        self.build_cache()

    def search(self, keyword):
        """Return ``[doc_index, score, title]`` hits for ``keyword``, best score first.

        Matching is a substring test against the pre-lowercased field at
        ``doc[3]``. Results are memoised per lowercased keyword.
        """
        keyword = keyword.lower()
        if keyword in self.cache:
            return self.cache[keyword]
        hits = [
            [doc_index, score(doc, keyword), doc[1]]
            for doc_index, doc in enumerate(self.docs)
            if keyword in doc[3]
        ]
        hits.sort(key=lambda hit: hit[1], reverse=True)
        self.cache[keyword] = hits
        return hits

    def lower_preprocess(self):
        """Append the lowercased "title + ' ' + content" string to every document.

        ``self.docs *= scale`` repeats references to the same inner lists, so
        the same document object can appear several times in ``self.docs``.
        Each distinct object is extended exactly once; otherwise it would
        collect one redundant copy of the field per repetition.
        """
        processed_ids = set()
        for doc in self.docs:
            if id(doc) in processed_ids:
                continue
            processed_ids.add(id(doc))
            doc.append((doc[1] + ' ' + doc[2]).lower())

    def simple_test(self):
        """Smoke test: searching for 'tiktok' must yield more than one hit."""
        assert len(self.search('tiktok')) > 1


In [21]:
# Time the V2 build, which matches keywords against a pre-lowercased text field.
%time searcher_v2 = MySearcherC6V2()

Wall time: 10.2 s


In [22]:
class MySearcherC6V3(MySearcherC6V2):
    """
    Build the cache by sweeping over the words of each document.
    """
    def build_cache(self):
        """Initialise the cache from segmentation, using the documents to filter the vocabulary."""
        for doc_index, doc in enumerate(self.docs):
            seen_in_doc = set()
            for token in jieba.cut(doc[3]):
                if token not in seen_in_doc:
                    # First time this word shows up in this document: record one hit.
                    hit = [doc_index, score(doc, token), doc[1]]
                    self.cache.setdefault(token, []).append(hit)
                self.vocab.add(token)
                seen_in_doc.add(token)
        # Order every hit list by score, highest first.
        for hits in self.cache.values():
            hits.sort(key=lambda hit: hit[1], reverse=True)

In [23]:
# Time the V3 build, which fills the cache directly from each document's words.
%time searcher_v3 = MySearcherC6V3()

Wall time: 4.4 s


In [24]:
# Sanity check: asserts that searching for 'tiktok' returns more than one hit.
searcher_v3.simple_test()

In [25]:
class MySearcherC6V4(MySearcherC6V3):
    """
    Remove the document scan from ``search``.
    """
    def search(self, keyword):
        """Return the cached hits for the lowercased keyword, or an empty list."""
        return self.cache.get(keyword.lower(), [])

In [26]:
# Time the V4 build; V4 overrides only search(), so construction runs the same code as V3.
%time searcher_v4 = MySearcherC6V4()

Wall time: 4.57 s
