<a href="https://colab.research.google.com/github/njucs/notebook/blob/master/PaperCopilot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 scikit-learn sentence-transformers google-search-results transformers

In [2]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from googlesearch import search
from transformers import pipeline
import json
import os
import time
import datetime
import random

# 替换成你的API密钥，或者使用环境变量
API_KEY = os.getenv('SILICONFLOW_API_KEY', 'sk-aerqsdefgpftbyerwomejvrrhgfkpgrbouhzwklhwbruuitc')
BASE_URL = "https://api.siliconflow.cn/v1/chat/completions"


# 定义大模型调用类
class CallLLM:
    def __init__(self):
        self.url = BASE_URL
        self.headers = {
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        }

    def call(self, model_name, prompt):
        payload = {
            "model": model_name,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ],
            "stream": False,
            "max_tokens": 1024, # 可以适当增加 max_tokens
            "stop": ["null"],
            "temperature": 0.7,
            "top_p": 0.7,
            "top_k": 50,
            "frequency_penalty": 0.5,
            "n": 1,
            "response_format": {"type": "text"}
        }

        try:
            response = requests.post(self.url, json=payload, headers=self.headers)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"API 请求错误: {e}")
            return None


# 定义大模型类
class Model:
    def __init__(self, model_name, call_llm):
        self.model_name = model_name
        self.call_llm = call_llm

    def generate_response(self, input_text):
        response_data = self.call_llm.call(self.model_name, input_text)
        if response_data and "choices" in response_data and response_data["choices"]:
          return response_data["choices"][0]["message"]["content"]
        else:
            return None

# 1. 使用大模型抽取论文列表
def extract_papers_with_llm(html_content, llm_model):
    prompt = f"""
    你是一个信息抽取专家，请从以下 HTML 文本中识别并提取论文列表，论文列表一般包含每篇论文的标题和作者信息。
    请以 JSON 格式返回提取结果，JSON 格式如下：
    [
      {{
        "title": "论文标题1",
        "authors": "作者1, 作者2"
      }},
      {{
        "title": "论文标题2",
        "authors": "作者3"
      }},
       ...
    ]
    如果找不到论文列表，请返回空列表 []。

    HTML 文本：
    {html_content}
    """

    response_text = llm_model.generate_response(prompt)
    if response_text:
        try:
            papers = json.loads(response_text)
            if isinstance(papers, list):
               return papers
        except json.JSONDecodeError:
            print("JSON 解析错误")
    return []

# 2. 文本预处理
def preprocess_text(text):
    text = re.sub(r"[^\w\s]", "", text)  # 去除标点符号
    text = text.lower()
    return text

# 3. 语义相似度计算
def semantic_similarity(query, papers, embedding_model):
    query_embedding = embedding_model.encode(preprocess_text(query))
    filtered_papers = []
    for paper in papers:
        title = paper["title"]
        title_embedding = embedding_model.encode(preprocess_text(title))
        similarity = cosine_similarity(query_embedding.reshape(1, -1), title_embedding.reshape(1, -1))[0][0]
        if similarity > 0.5:  # 相似度阈值，可以根据需要调整
            filtered_papers.append({"title": paper["title"], "authors": paper["authors"], "similarity": similarity})
    return filtered_papers

# 4. 论文内容搜索
def search_paper_content(title):
    search_results = search(title, num_results=3)
    for url in search_results:
      try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        text_content = ' '.join([p.text for p in soup.find_all('p')])
        if text_content:
            return text_content
      except requests.exceptions.RequestException as e:
         print(f"Failed to fetch {url}: {e}")
    return None

# 5. 论文观点归纳
def summarize_paper_content(text, summarizer):
    try:
        summary = summarizer(text, max_length=200, min_length=50)[0]['summary_text'] # 可以调整长度
        return summary
    except:
      return "Summary Failed"

if __name__ == "__main__":
    # 替换成你想要爬取的网页链接
    url = "https://2024.aclweb.org/program/main_conference_papers/" # 使用示例网页，请替换
    response = requests.get(url)
    response.raise_for_status()
    html_content = response.text

    call_llm = CallLLM()
    llm_model = Model("THUDM/glm-4-9b-chat", call_llm)
    papers = extract_papers_with_llm(html_content, llm_model)
    print(f"Found {len(papers)} papers using LLM.")
    if not papers:
        print("No papers found with LLM. Please try using normal crawl.")

    query = "自然语言处理中的语义理解"
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')  # 使用 Sentence-BERT 模型
    filtered_papers = semantic_similarity(query, papers, embedding_model)
    print(f"Found {len(filtered_papers)} relevant papers.")

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn") # 使用 BART 模型进行摘要

    for paper in filtered_papers:
        print(f"Title: {paper['title']}")
        print(f"Authors: {paper['authors']}")
        paper_content = search_paper_content(paper['title'])
        if paper_content:
            summary = summarize_paper_content(paper_content, summarizer)
            print(f"Summary: {summary}")
        else:
           print("Paper Content Not Found")
        print("-" * 40)

Found 0 papers.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Found 0 relevant papers.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
