In [1]:
import os
import csv
import math
import jieba
import string
import pickle
from collections import Counter
from collections import defaultdict

# 增加字段大小限制，设置为更大的值（比如 10MB）
csv.field_size_limit(10000000)

# 读取倒排索引
def load_index(filename="Text_Index.pkl"):
    print("加载倒排索引...")
    with open(filename, 'rb') as f:
        inverted_index, doc_tfs, idf = pickle.load(f)
    print("倒排索引加载完成。")
    return inverted_index, doc_tfs, idf

# 打开并读取 Html_Content.csv 文件
csv_filename = "Html_Content.csv"

# 读取文件中的内容
with open(csv_filename, mode='r', encoding='utf-8') as file:
    csv_reader = csv.reader(file)
    # 跳过表头（如果有的话）
    next(csv_reader, None)
    # 将所有内容读入列表
    html_data = list(csv_reader)

# 加载倒排索引
inverted_index, doc_tfs, idf = load_index()

加载倒排索引...
倒排索引加载完成。


In [2]:
from urllib.parse import urlparse

# 中文分词
def tokenize(text):
    return list(jieba.cut(text))

# 计算PageRank的函数（采用简化的PageRank算法）
def compute_pagerank(html_folder, top_docs, d=0.85, max_iter=100, tol=1e-6):
    # 构建网页之间的链接关系
    link_graph = defaultdict(list)
    doc_urls = []
    for doc_id, score in top_docs:
        # 获取网页的URL和锚文本
        webpage_title = html_data[doc_id][0]    # 第一列是 标题
        webpage_url = html_data[doc_id][1]      # 第二列是 URL
        doc_urls.append(webpage_url)
        webpage_anchors = html_data[doc_id][2]  # 第三列是 锚文本
        
        # 构建本地文件路径
        local_file_path = os.path.join(html_folder, f"{webpage_title}.html")
        
        # 从本地HTML文件中解析链接
        if os.path.exists(local_file_path):
            with open(local_file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                links = extract_links_from_html(content)  # 提取HTML中的链接
                for link in links:
                    # 如果该链接在我们关注的网页列表中，则记录下来
                    if link in doc_urls:
                        link_graph[webpage_url].append(link)
    
    # 初始化PageRank
    num_docs = len(doc_urls)
    pagerank = {url: 1 / num_docs for url in doc_urls}  # 每个页面的初始PageRank值为 1/N
    for _ in range(max_iter):
        new_pagerank = {}
        for url in doc_urls:
            inbound_links = [key for key, links in link_graph.items() if url in links]
            rank_sum = sum(pagerank[link] / len(link_graph[link]) for link in inbound_links)
            new_pagerank[url] = (1 - d) / num_docs + d * rank_sum
        
        # 检查是否收敛
        if all(abs(new_pagerank[url] - pagerank[url]) < tol for url in doc_urls):
            break
        
        pagerank = new_pagerank
    
    return pagerank

# 从HTML中提取所有的链接（简化版本，仅提取href链接）
def extract_links_from_html(content):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(content, 'html.parser')
    links = soup.find_all('a', href=True)
    return [link['href'] for link in links]

# 计算查询的TF-IDF向量
def compute_query_tfidf(query, idf):
    query_tokens = tokenize(query)
    query_tf = defaultdict(int)
    
    # 计算TF
    for word in query_tokens:
        query_tf[word] += 1
    total_words = len(query_tokens)
    for word in query_tf:
        query_tf[word] /= total_words
    
    # 计算TF-IDF
    query_tfidf = {}
    for word, tf in query_tf.items():
        query_tfidf[word] = tf * idf.get(word, 0)  # 默认IDF为0
    
    return query_tfidf

# 计算文档与查询的相关度（余弦相似度）
def compute_cosine_similarity(query_tfidf, doc_tfidf):
    dot_product = sum(query_tfidf.get(word, 0) * doc_tfidf.get(word, 0) for word in query_tfidf)
    query_norm = math.sqrt(sum(val**2 for val in query_tfidf.values()))
    doc_norm = math.sqrt(sum(val**2 for val in doc_tfidf.values()))
    if query_norm * doc_norm == 0:
        return 0
    return dot_product / (query_norm * doc_norm)

# 执行搜索
def search(query, inverted_index, doc_tfs, idf, top_n=10):
    
    # 计算查询的TF-IDF向量
    query_tfidf = compute_query_tfidf(query, idf)
    
    # 计算每个文档与查询的相关度
    scores = []
    for doc_id, doc_tfidf in enumerate(doc_tfs):
        score = compute_cosine_similarity(query_tfidf, doc_tfidf)
        scores.append((doc_id, score))
    
    # 按相关度排序并返回前top_n个文档
    scores.sort(key=lambda x: x[1], reverse=True)
    top_docs = scores[:100]
    
    # 计算PageRank
    html_folder = './HtmlFile/'
    pagerank = compute_pagerank(html_folder, top_docs)
    
    # 综合文档评分与PageRank
    combined_scores = []
    for doc_id, score in top_docs:
        webpage_url = html_data[doc_id][1]  # 获取文档URL
        pagerank_score = pagerank.get(webpage_url, 0)  # 获取该网页的PageRank
        combined_score = score + pagerank_score  # 综合评分
        combined_scores.append((doc_id, combined_score))
    
    # 按综合评分重新排序并返回前top_n个文档
    combined_scores.sort(key=lambda x: x[1], reverse=True)
    top_combined_docs = combined_scores[:top_n]
    
    return top_combined_docs

In [3]:
# 定义循环次数计数器
search_count = 0

# 初始化查询日志
query_log = []

# html 文件存放在当前目录下的 HtmlFile 文件夹
html_folder = './HtmlFile/'

# 确保 HtmlFile 文件夹存在
if not os.path.exists(html_folder):
    print("HtmlFile 文件夹不存在，请检查文件路径。")
    exit()

# 定义一个用于过滤标点符号的函数
def is_valid_word(word):
    invalid_symbols = ['，', '%', '‰', '、']
    invalid_words = ['与', '本', '的', '由', '多', '1', '新']
    
    if any(symbol in word for symbol in invalid_symbols) or word in invalid_words:
        return False
    
    return all(char not in string.punctuation for char in word)

# 分词并统计双词
def extract_bigrams(text):
    words = list(jieba.cut(text))  # 切割文本
    bigrams = []
    for i in range(len(words) - 1):
        bigram = f"{words[i]}{words[i+1]}"
        if is_valid_word(words[i]) and is_valid_word(words[i+1]) and words[i] != words[i+1]:
            bigrams.append(bigram)
    return bigrams

# 读取用户信息
def load_user_info(file_path):
    users = {}
    if os.path.exists(file_path):
        with open(file_path, mode='r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            for row in reader:
                account, password, nickname, preferences = row
                users[account] = {'password': password, 'nickname': nickname, 'preferences': preferences}
    return users

# 保存用户信息
def save_user_info(file_path, users):
    with open(file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        for account, info in users.items():
            writer.writerow([account, info['password'], info['nickname'], info['preferences']])

# 用户注册
def register(users):
    print("用户注册")
    account = input("请输入账号: ")
    if account in users:
        print("账户已存在，请选择其他账户名。")
        return None
    password = input("请输入密码: ")
    nickname = input("请输入昵称: ")
    preferences = input("请输入您的偏好设置 (例如: '学术, 科技'): ")
    users[account] = {'password': password, 'nickname': nickname, 'preferences': preferences}
    print(f"注册成功！欢迎，{nickname}")
    return account

# 用户登录
def login(users):
    print("用户登录")
    account = input("请输入账号: ")
    if account not in users:
        print("账号不存在，请先注册。")
        return None
    password = input("请输入密码: ")
    if users[account]['password'] == password:
        print(f"登录成功！欢迎，{users[account]['nickname']}")
        return account, users[account]['preferences']
    else:
        print("密码错误，请重新尝试。")
        return None

# 主程序
def main():
    # 加载用户信息
    user_info_file = 'User_Information.csv'
    users = load_user_info(user_info_file)
    
    # 注册或登录
    current_user = None
    preferences = None
    while current_user is None:
        action = input("请选择操作: 1. 注册 2. 登录 (输入 'exit' 退出): ")
        if action == '1':
            current_user = register(users)
            preferences = users[current_user]['preferences'] if current_user else None
        elif action == '2':
            result = login(users)
            if result:
                current_user, preferences = result
        elif action.lower() == 'exit':
            print("退出程序")
            return
        else:
            print("无效选项，请选择 1 或 2。")
    
    # 保存用户信息
    save_user_info(user_info_file, users)
    
    # 初始化查询日志
    query_log.clear()
    
    # 循环进行查询
    global search_count
    while True:
        # 根据循环次数决定搜索内容
        if search_count == 0:
            query = "南开大学ESI学科发展报告pdf"  # 文档查询测试
        elif search_count == 1:
            query = "南开大学新校区"  # 短语查询测试
        elif search_count == 2:
            query = "新冠肺炎防控*漫画"  # 通配查询测试
        elif search_count == 3:
            query = "exit"
        else:
            query = input("请输入查询词（输入 'exit' 退出）：")
        # query = input("请输入查询词（输入 'exit' 退出）：")

        # 如果用户输入 "exit"，退出循环
        if query.lower() == 'exit':
            print("\n退出程序。")
            break
        
        # 输出查询内容
        print(f"\n正在搜索：{query}")
        
        # 记录查询日志
        query_log.append(query)
        
        # 如果用户已登录，则将用户偏好附加到查询词
        if preferences:
            query = f"{query} {preferences}"

        # 执行搜索（假设search是一个已定义的搜索函数）
        top_docs = search(query, inverted_index, doc_tfs, idf, top_n=10)
        
        # 输出搜索结果
        print("搜索结果：")
        
        all_bigrams = []  # 用于收集所有搜索结果中的双词
        
        for doc_id, score in top_docs:
            webpage_title = html_data[doc_id][0]    # 假设第一列是 标题
            webpage_url = html_data[doc_id][1]      # 假设第二列是 URL
            webpage_anchors = html_data[doc_id][2]  # 假设第三列是 锚文本
            webpage_body = html_data[doc_id][3]     # 假设第四列是 正文

            all_bigrams.extend(extract_bigrams(webpage_anchors))
            all_bigrams.extend(extract_bigrams(webpage_body))

            # 构建本地文件路径（假设网页标题对应文件名）
            local_file_path = os.path.join(html_folder, f"{webpage_title}.html")
            
            if os.path.exists(local_file_path):
                local_file_path = local_file_path.replace(' ', '%20')
                local_file_url = f"file:///{os.path.abspath(local_file_path)}"
                print(f"网页标题: {webpage_title}, 网页链接: {webpage_url}\n本地文件链接: {local_file_url}")
            else:
                print(f"网页标题: {webpage_title}, 网页链接: {webpage_url}\n本地文件不存在")
        
        # 输出查询日志
        print("\n查询日志：")
        for i, log in enumerate(query_log, 1):
            # if log.endswith(preferences):
            #     log = log[:-len(preferences)]
            print(f"{i}. {log}")
        
        # 输出当前用户的昵称和偏好
        if current_user:
            print(f"\n当前用户: {users[current_user]['nickname']}")
            print(f"用户偏好: {preferences}")

        # 统计双词
        bigram_counts = Counter(all_bigrams)
        common_bigrams = bigram_counts.most_common(10)
        print("\n推荐搜索：")
        
        # 输出推荐搜索短语
        for i in range(0, len(common_bigrams), 10):
            line = "、".join([bigram for bigram, _ in common_bigrams[i:i+10]])
            print(line)
        
        search_count += 1

In [4]:
if __name__ == "__main__":
    main()

用户登录


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ASUS\AppData\Local\Temp\jieba.cache


登录成功！欢迎，徐亚民

正在搜索：南开大学ESI学科发展报告pdf


Loading model cost 0.538 seconds.
Prefix dict has been built successfully.


搜索结果：
网页标题: 南开大学ESI全球前1%学科增至15个 “微生物学”首次上榜-南开要闻-南开大学, 网页链接: http://news.nankai.edu.cn/ywsd/system/2021/03/28/030045131.shtml
本地文件链接: file:///d:\Programs\InformationRetrieval\HtmlFile\南开大学ESI全球前1%学科增至15个%20“微生物学”首次上榜-南开要闻-南开大学.html
网页标题: 南开大学“社会科学总论”学科首次进入全球前1%-南开要闻-南开大学, 网页链接: http://news.nankai.edu.cn/ywsd/system/2020/05/18/030039232.shtml
本地文件链接: file:///d:\Programs\InformationRetrieval\HtmlFile\南开大学“社会科学总论”学科首次进入全球前1%-南开要闻-南开大学.html
网页标题: 南开大学工程科学跻身ESI全球前1‰-南开要闻-南开大学, 网页链接: http://news.nankai.edu.cn/ywsd/system/2023/11/25/030059003.shtml
本地文件链接: file:///d:\Programs\InformationRetrieval\HtmlFile\南开大学工程科学跻身ESI全球前1‰-南开要闻-南开大学.html
网页标题: 南开大学环境科学与生态学跻身ESI全球前1‰-南开要闻-南开大学, 网页链接: http://news.nankai.edu.cn/ywsd/system/2024/07/17/030062633.shtml
本地文件链接: file:///d:\Programs\InformationRetrieval\HtmlFile\南开大学环境科学与生态学跻身ESI全球前1‰-南开要闻-南开大学.html
网页标题: 中央广电总台国际在线：南开大学环境科学与生态学跻身ESI全球前1‰-媒体南开-南开大学, 网页链接: http://news.nankai.edu.cn/mtnk/system/2024/07/18/030062653.shtml
本地文件链接: file:///d:\Programs\Info