In [6]:
import os
import re

import Spider

def highlight(text: str, keyword: str):
    """Wrap every case-insensitive occurrence of ``keyword`` in ``text`` with ``*``.

    Each match keeps the casing it has in ``text``; ``keyword`` is matched
    literally (regex metacharacters are escaped).

    Args:
        text: The string to decorate.
        keyword: The substring to mark.

    Returns:
        ``text`` with each occurrence surrounded by asterisks, or ``text``
        unchanged when ``keyword`` is empty or does not occur.
    """
    if not keyword:
        # An empty pattern matches between every character; there is nothing
        # meaningful to mark, so leave the text alone.
        return text
    pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    # Substitute match-by-match so mixed-case occurrences ("TikTok", "tiktok")
    # are all marked while preserving their original spelling.
    return pattern.sub(lambda match: f'*{match.group(0)}*', text)

def score(item, keyword: str, title_weight: int = 5, content_weight: int = 3):
    """Score a document by case-insensitive occurrences of ``keyword``.

    Args:
        item: An indexable document record; ``item[1]`` and ``item[2]`` are
            the two strings that are searched (used as title and content by
            the callers in this notebook).
        keyword: The term to count.
        title_weight: Points awarded per occurrence in ``item[1]``.
        content_weight: Points awarded per occurrence in ``item[2]``.

    Returns:
        ``title_hits * title_weight + content_hits * content_weight``, or 0
        for an empty keyword.
    """
    if not keyword:
        # str.count('') returns len(s) + 1, which would produce a meaningless
        # non-zero score for an empty term.
        return 0
    needle = keyword.lower()  # lower the keyword once, not once per field
    title_hits = item[1].lower().count(needle)
    content_hits = item[2].lower().count(needle)
    return title_hits * title_weight + content_hits * content_weight

In [7]:
import jieba

class MySearcherC7V0:
    """
    Search class as upgraded in lesson 6:
    1. Avoid querying the same word repeatedly.
    2. Run lower() as few times as possible.
    3. Build the cache by sweeping the documents for words.
    4. Remove the document scan from search().
    """
    def __init__(self, scale: int=1):
        """Load the documents, optionally replicate them, and build the index.

        Args:
            scale: When greater than 1, the loaded document list is repeated
                ``scale`` times (used to test search speed on a larger set).
        """
        self.docs = list()
        self.load_data()
        if scale > 1:
            # Multiply the document set to test search speed. Every replica
            # gets its own row list: list repetition (`docs * scale`) would
            # repeat references to the same rows, and lower_preprocess()
            # would then append the lowered text `scale` times to each row.
            self.docs = [list(doc) for _ in range(scale) for doc in self.docs]
        self.cache = dict()   # word -> list of [doc_id, score, title]
        self.vocab = set()    # every distinct word seen while indexing
        self.lower_preprocess()
        self.build_cache()

    def load_data(self, data_file_name='./news_list.pkl'):
        """Load the documents from ``data_file_name`` into ``self.docs``.

        When the file does not exist, ``Spider.pickle_save(data_file_name)``
        is called first and the file is loaded afterwards.
        """
        # NOTE(review): presumably Spider.pickle_save produces the data file
        # and Spider.pickle_load unpickles it; unpickling can execute
        # arbitrary code, so only point this at trusted files -- confirm.
        if not os.path.exists(data_file_name):
            Spider.pickle_save(data_file_name)
        self.docs = Spider.pickle_load(data_file_name)

    def search(self, keyword):
        """Return the cached, score-sorted result list for ``keyword``.

        The lookup is case-insensitive; an unknown word yields an empty list.
        """
        return self.cache.get(keyword.lower(), [])

    def render_search_result(self, keyword):
        """Print one line per hit: running number, score and highlighted title."""
        for count, item in enumerate(self.search(keyword), start=1):
            print(f'{count}[{item[1]}] {highlight(self.docs[item[0]][1], keyword)}')

    def build_cache(self):
        """Initialise the cache by word segmentation (the documents filter the vocabulary)."""
        for doc_id, doc in enumerate(self.docs):
            doc_word_set = set()  # words already indexed for this document
            # doc[3] is the lower-cased text appended by lower_preprocess().
            for word in jieba.cut(doc[3]):
                if word in doc_word_set:
                    continue
                doc_word_set.add(word)
                self.vocab.add(word)
                result_item = [doc_id, score(doc, word), doc[1]]
                self.cache.setdefault(word, []).append(result_item)
        # Store every result list best-score-first so search() is a plain lookup.
        for postings in self.cache.values():
            postings.sort(key=lambda x: x[1], reverse=True)

    def lower_preprocess(self):
        """Append ``(doc[1] + ' ' + doc[2]).lower()`` to every document row."""
        for doc in self.docs:
            doc.append((doc[1] + ' ' + doc[2]).lower())

    def simple_test(self):
        """Smoke test: searching 'tiktok' must return more than one hit."""
        assert len(self.search('tiktok')) > 1

In [8]:
# Time the construction of the baseline searcher (data load, lower-casing and cache build).
%time searcher_v0 = MySearcherC7V0()


Wall time: 4.36 s


In [34]:
class MySearcherC7V1(MySearcherC7V0):
    """
    1. Load the custom segmentation dictionary during initialisation.
    2. Segment documents with jieba's search-engine mode, cut_for_search.

    3. Segment the query.
    4. Fetch the posting for every query term.
    5. Intersect the postings.
    6. Store postings as doc_id only.
    """
    def __init__(self, scale: int = 1):
        """Load the user dictionary, then run the parent initialisation.

        Args:
            scale: Forwarded to the parent class (document replication factor).
        """
        jieba.load_userdict('./dict.txt')
        super().__init__(scale)

    def build_cache(self):
        """Initialise the cache (build the index) by word segmentation.

        Every word maps to the set of doc_ids whose lower-cased text
        (``doc[3]``) contains it.
        """
        for doc_id, doc in enumerate(self.docs):
            doc_word_set = set()  # words already indexed for this document
            for word in jieba.cut_for_search(doc[3]):
                if word in doc_word_set:
                    continue
                doc_word_set.add(word)
                self.vocab.add(word)
                self.cache.setdefault(word, set()).add(doc_id)

    @staticmethod
    def _query_terms(query):
        """Segment ``query`` into unique, lower-cased, non-blank terms (first-seen order)."""
        # Whitespace-only tokens are dropped: they are not search terms, and
        # counting them would add the number of spaces to every score.
        # dict.fromkeys de-duplicates while keeping order, so a repeated term
        # is neither intersected nor scored twice.
        return list(dict.fromkeys(
            term for term in jieba.cut(query.lower()) if term.strip()))

    def search(self, query):
        """Return ``[doc_id, score]`` pairs for documents containing every query term.

        The result is sorted by score, highest first. An empty list is
        returned when the query has no terms or any term is not indexed.
        """
        result = None
        for keyword in self._query_terms(query):
            posting = self.cache.get(keyword)
            if posting is None:
                # One unknown term empties the intersection; stop early.
                result = set()
                break
            # `&` builds a new set, so the cached postings are never mutated.
            result = posting if result is None else result & posting
        if result is None:
            result = set()
        return self.rank(query, result)

    def rank(self, query, result_set):
        """Score every doc_id in ``result_set`` and sort by score, descending."""
        result = [[doc_id, self.score(self.docs[doc_id], query)]
                  for doc_id in result_set]
        result.sort(key=lambda x: x[1], reverse=True)
        return result

    def score(self, item, query):
        """Sum, over the unique query terms, 5 points per hit in ``item[1]`` and 3 per hit in ``item[2]``."""
        # Lower-case the two fields once instead of once per term.
        title_l = item[1].lower()
        content_l = item[2].lower()
        total = 0
        for keyword in self._query_terms(query):
            total += title_l.count(keyword) * 5 + content_l.count(keyword) * 3
        return total

In [35]:
# Time the construction of the V1 searcher (user dictionary load plus doc_id-set index build).
%time searcher_v1 = MySearcherC7V1()

Wall time: 1.78 s


In [36]:
# Multi-term query: the result is a list of [doc_id, score] pairs, highest score first.
searcher_v1.search('华为手机')

[[6, 235],
 [99, 131],
 [75, 116],
 [115, 102],
 [24, 99],
 [214, 98],
 [1, 93],
 [183, 90],
 [98, 77],
 [172, 68],
 [203, 68],
 [196, 48],
 [200, 21]]

In [37]:
# Print the same hits as numbered lines in the form "<n>[<score>] <title>".
searcher_v1.render_search_result('华为手机')

1[235] 传华为预计今年智能手机出货量同比减少60%，降至7000万部
2[131] 华为将在英国起诉汇丰，要拿到孟晚舟案关键文件
3[116] 华为推"智慧养猪"，任正非曾称如果养猪可能也是状元
4[102] 华为推“智慧养猪”，任正非：华为不靠手机也能活
5[99] 华为折叠屏手机华为MateX2将于下周一20时发布
6[98] 华为Mate X2镜头供应商为舜宇光学和欧菲光 屏幕为三星
7[93] 华为供应链公司：已向华为P50手机供货，供货时间有延后
8[90] 小米正式发布隔空充电技术 雷军称可实现单设备5瓦远距离充电
9[77] 除了欢迎拜登致电华为，任正非还谈了孟晚舟、退休时间、5G转让等
10[68] 外媒：夺回华为失去的市场，荣耀仍能重现辉煌
11[68] 争国内第一！荣耀想打败华为小米，靠3599的V40行么？
12[48] 小米国内机型不再支持自行安装GMS框架，国际版不受影响
13[21] 男子年会中奖“清空购物车”，不料里面有套房，老板：过分了
