In [17]:
import os
import jieba
import Spider

def highlight(item, query: str, side_len: int = 12) -> str:
    positions = list()
    query_words = list(jieba.cut(query))  # 把生成器强制转换为列表
    i = 0
    content_lower = item[2].lower()
    word_start_map = list()
    word_end_map = list()
    last_word_end = -1
    len_content_lower = len(content_lower)
    segments = list()
    for keyword in query_words:
        idx = content_lower.find(keyword.lower())
        positions.append(idx)
    for keyword in jieba.cut(content_lower):
        # 用于实现提取摘要时“整词切分”，避免出现截取摘要时首尾的词被截断
        current_word_start = last_word_end + 1
        current_word_end = current_word_start + len(keyword) - 1
        for _ in range(current_word_start, current_word_end+1):
            word_start_map.append(current_word_start)
            word_end_map.append(current_word_end)
        last_word_end = current_word_end
    positions.sort()
    while i < len(positions):
        start_pos = max(positions[i] - side_len, 0)
        end_pos = min(positions[i] + side_len, len_content_lower-1)
        # 用于实现合并相邻且有部分重合的摘要
        while (i < len(positions) - 1) and (positions[i+1] - positions[i] < side_len*2):
            end_pos = min(positions[i+1] + side_len, len_content_lower-1)
            i += 1
        start_ellipsis = '...' if start_pos > 0 else ''
        end_ellipsis = '...' if end_pos < len_content_lower else ''
        segments.append(start_ellipsis + item[2][word_start_map[start_pos]: word_end_map[end_pos]] + end_ellipsis)
        i += 1
    result = text = item[1] + '\n' + ''.join(segments)
    text_lower = text.lower()
    for keyword in query_words:
        # 高亮部分
        idx = text_lower.find(keyword.lower())
        if idx >= 0:
            ori_word = text[idx:idx+(len(keyword))]
            result = result.replace(ori_word, f'<span style="color:red";>{ori_word}</span>')
    return result


class MySearcherC11V0:
    """
    第十次课升级的搜索类版本：
    用文档频率(DF)对词进行加权
    """
    def __init__(self):
        self.docs = list()  # 所有文档原始数据
        self.load_data()
        self.cache = dict()
        self.vocab = set()  # 索引词表
        self.lower_preprocess()
        jieba.load_userdict('./dict.txt')
        self.df = dict()
        self.build_cache()

    def load_data(self, data_file_name='./news_list.pkl'):
        if os.path.exists(data_file_name):
            self.docs = Spider.pickle_load(data_file_name)
        else:
            Spider.pickle_save(data_file_name)
            self.docs = Spider.pickle_load(data_file_name)

    def search(self, query):
        result = None
        for keyword in jieba.cut(query.lower()):
            if keyword in self.cache:
                if result is None:
                    result = self.cache[keyword]
                else:
                    result = result & self.cache[keyword]
            else:
                result = set()
                break
        if result is None:
            result = set()
        sorted_result = self.rank(query, result)
        return sorted_result

    def rank(self, query, result_set):
        result = list()
        for doc_id in result_set:
            result.append([doc_id, self.score(self.docs[doc_id], query)])
        result.sort(key=lambda x: x[1], reverse=True)
        return result

    def render_search_result(self, query):
        """
        返回带有高亮和摘要的查询结果
        """
        count = 0
        result = ''
        for item in self.search(query):
            count += 1
            result += f'{count}[{item[1]}] {highlight(self.docs[item[0]], query)}\n'
        return result

    def score(self, item, query):
        score = 0
        for keyword in jieba.cut(query.lower()):
            title_score = item[1].lower().count(keyword.lower())
            content_score = item[2].lower().count(keyword.lower())
            score += (title_score * 5 + content_score * 3) / len(item[2]) / self.df[keyword]
        return score

    def build_cache(self):
        """
        用分词（用文档过滤词库）的方式构建索引
        """
        doc_id = 0
        for doc in self.docs:
            doc_word_set = set()
            for word in jieba.cut_for_search(doc[3]):
                if word not in doc_word_set:
                    result_item = doc_id
                    if word not in self.cache:
                        self.cache[word] = {result_item}
                    else:
                        self.cache[word].add(result_item)
                    self.vocab.add(word)
                    doc_word_set.add(word)
                    if word in self.df:
                        self.df[word] += 1
                    else:
                        self.df[word] = 1
            doc_id += 1

    def lower_preprocess(self):
        for doc_id in range(len(self.docs)):
            self.docs[doc_id].append(
                (self.docs[doc_id][1] + ' ' + self.docs[doc_id][2]).lower())


In [18]:
from math import log10
class MySearcherC11V1(MySearcherC11V0):
    """
    改善文档频和文档长度加权的影响
    改善IDF权值
    """
    def score(self, item, query):
        score = 0
        for keyword in jieba.cut(query.lower()):
            title_score = item[1].lower().count(keyword.lower())
            content_score = item[2].lower().count(keyword.lower())
            tf = (title_score * 2 + content_score * 1) / log10(len(item[2]))
            idf = log10(len(self.docs) / log10(self.df[keyword] + 0.01))
            score += tf * idf
        return score

In [19]:
%%time
searcher_v1 = MySearcherC11V1()

Wall time: 2.49 s


In [20]:
print(searcher_v1.render_search_result('华为5G'))

1[45.33659621431827] 传<span style="color:red";>华为</span>预计今年智能手机出货量同比减少60%，降至7000万部
...，据供应链消息人士透露，<span style="color:red";>华为</span>已通知其供应商，预计......，因为美国政府禁止其进口<span style="color:red";>5G</span>机型的零部件。许多供应...
2[27.96582616358055] <span style="color:red";>华为</span>将在英国起诉汇丰，要拿到孟晚舟案关键文件
...消息，当地时间12日，中国<span style="color:red";>华为</span>公司首席财务官孟晚舟......，也有关于<span style="color:red";>华为</span>在欧洲拓展<span style="color:red";>5G</span>业务不是很好的消息。一...
3[25.321727082223212] <span style="color:red";>华为</span>推"智慧养猪"，任正非曾称如果养猪可能也是状元
...生猪养殖业真香，连电信巨头<span style="color:red";>华为</span>都想进去分一杯羹了。近......发展论坛上，<span style="color:red";>华为</span>就发表了《<span style="color:red";>5G</span>引领现代猪场AI使能智...
4[24.167628831967857] <span style="color:red";>华为</span>推“智慧养猪”，任正非：<span style="color:red";>华为</span>不靠手机也能活
...近日，任正非首次公开提及<span style="color:red";>华为</span>“南泥湾”计划，即生产自......发展论坛上，<span style="color:red";>华为</span>就发表了《<span style="color:red";>5G</span>引领现代AI使能智慧养...
5[19.654983631278615] <span style="color:re

In [25]:
class MySearcherC11V2(MySearcherC11V0):
    """
    BM25打分算法
    """
    def __init__(self):
        self.avg_dl = 0
        super().__init__()

    def build_cache(self):
        """
        用分词（用文档过滤词库）的方式构建索引
        """
        doc_id = 0
        doc_length_sum = 0
        for doc in self.docs:
            doc_word_set = set()
            doc_length_sum += len(doc[3])
            for word in jieba.cut_for_search(doc[3]):
                if word not in doc_word_set:
                    result_item = doc_id
                    if word not in self.cache:
                        self.cache[word] = {result_item}
                    else:
                        self.cache[word].add(result_item)
                    self.vocab.add(word)
                    doc_word_set.add(word)
                    if word in self.df:
                        self.df[word] += 1
                    else:
                        self.df[word] = 1
            doc_id += 1
        self.avg_dl = doc_length_sum / len(self.docs)


    def score(self, item, query, k1 = 2, b = 0.75):
        score = 0
        for keyword in jieba.cut(query.lower()):
            f = item[2].lower().count(keyword.lower())
            dl = len(item[2])
            tf = (f * (k1 + 1)) / (f + k1 * (1 - b + b * (dl / self.avg_dl)))
            idf = log10((len(self.docs) - self.df[keyword] + 0.5) / (self.df[keyword] + 0.5))
            score += tf * idf
        return score

In [26]:
%%time
searcher_v2 = MySearcherC11V2()

Wall time: 1.58 s


In [27]:
print(searcher_v2.render_search_result('华为5G手机'))

1[6.638733073087041] <span style="color:red";>华为</span>供应链公司：已向<span style="color:red";>华为</span>P50<span style="color:red";>手机</span>供货，供货时间有延后
据<span style="color:red";>华为</span><span style="color:red";>手机</span>供应链公司，该公司已逐......，因为美国政府禁止其进口<span style="color:red";>5G</span>机型的零部件。许多供应...
2[6.514435315643027] 传<span style="color:red";>华为</span>预计今年智能<span style="color:red";>手机</span>出货量同比减少60%，降至7000万部
...，据供应链消息人士透露，<span style="color:red";>华为</span>已通知其供应商，预计......7000万至8000万部智能<span style="color:red";>手机</span>的零部件。而且<span style="color:red";>华为</span>的零部......，因为美国政府禁止其进口<span style="color:red";>5G</span>机型的零部件。许多供应...
3[6.154567506099941] 除了欢迎拜登致电<span style="color:red";>华为</span>，任正非还谈了孟晚舟、退休时间、<span style="color:red";>5G</span>转让等
...任正非接受中外媒体采访，就<span style="color:red";>华为</span>发展、产业、个人生活......发生了很多事，有关于<span style="color:red";>华为</span><span style="color:red";>手机</span>出货量的报道，也有关于<span style="color:red";>华为</span>在欧洲拓展<span style="color:red";>5G</span>业务不是很好的消息。一...
4[5.9880