<a href="https://colab.research.google.com/github/njucs/notebook/blob/master/PaperCopilot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 scikit-learn sentence-transformers google-search-results transformers

In [3]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from googlesearch import search
from transformers import pipeline
import json
import os

# 替换成你的API密钥，或者使用环境变量
API_KEY = os.getenv('SILICONFLOW_API_KEY', 'sk-aerqsdefgpftbyerwomejvrrhgfkpgrbouhzwklhwbruuitc')
BASE_URL = "https://api.siliconflow.cn/v1/chat/completions"


# 定义大模型调用类
class CallLLM:
    def __init__(self):
        self.url = BASE_URL
        self.headers = {
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json"
        }

    def call(self, model_name, prompt):
        payload = {
            "model": model_name,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ],
            "stream": False,
            "max_tokens": 1024, # 可以适当增加 max_tokens
            "stop": ["null"],
            "temperature": 0.7,
            "top_p": 0.7,
            "top_k": 50,
            "frequency_penalty": 0.5,
            "n": 1,
            "response_format": {"type": "text"}
        }

        try:
            response = requests.post(self.url, json=payload, headers=self.headers)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"API 请求错误: {e}")
            return None


# 定义大模型类
class Model:
    def __init__(self, model_name, call_llm):
        self.model_name = model_name
        self.call_llm = call_llm

    def generate_response(self, input_text):
        response_data = self.call_llm.call(self.model_name, input_text)
        if response_data and "choices" in response_data and response_data["choices"]:
          return response_data["choices"][0]["message"]["content"]
        else:
            return None

# 1. 使用大模型抽取相关论文列表
def extract_relevant_papers_with_llm(html_content, query, llm_model):
    prompt = f"""
    你是一个信息抽取专家，请从以下 HTML 文本中识别并提取与用户查询相关的论文列表。
    论文列表一般包含每篇论文的标题和作者信息。
    用户查询：{query}
    请以 JSON 格式返回提取结果，JSON 格式如下：
    [
      {{
        "title": "论文标题1",
        "authors": "作者1, 作者2"
      }},
      {{
        "title": "论文标题2",
        "authors": "作者3"
      }},
       ...
    ]
    如果找不到相关论文，请返回空列表 []。

    HTML 文本：
    {html_content}
    """

    response_text = llm_model.generate_response(prompt)
    print("related papers: ", response_text)
    if response_text:
        try:
            papers = json.loads(response_text)
            if isinstance(papers, list):
               return papers
        except json.JSONDecodeError:
            print("JSON 解析错误")
    return []


# 3. 论文内容搜索
def search_paper_content(title):
    search_results = search(title, num_results=3)
    for url in search_results:
      try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
        text_content = ' '.join([p.text for p in soup.find_all('p')])
        if text_content:
            return text_content
      except requests.exceptions.RequestException as e:
         print(f"Failed to fetch {url}: {e}")
    return None

# 4. 论文观点归纳
def summarize_paper_content(text, summarizer):
    try:
        summary = summarizer(text, max_length=200, min_length=50)[0]['summary_text'] # 可以调整长度
        return summary
    except:
      return "Summary Failed"

if __name__ == "__main__":
    # 替换成你想要爬取的网页链接
    url = "https://2024.aclweb.org/program/main_conference_papers/"
    response = requests.get(url)
    response.raise_for_status()
    html_content = response.text

    query = "多模态关系抽取"

    call_llm = CallLLM()
    llm_model = Model("THUDM/glm-4-9b-chat", call_llm)
    filtered_papers = extract_relevant_papers_with_llm(html_content, query, llm_model)
    print(f"Found {len(filtered_papers)} relevant papers using LLM.")

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn") # 使用 BART 模型进行摘要

    for paper in filtered_papers:
        print(f"Title: {paper['title']}")
        print(f"Authors: {paper['authors']}")
        paper_content = search_paper_content(paper['title'])
        if paper_content:
            summary = summarize_paper_content(paper_content, summarizer)
            print(f"Summary: {summary}")
        else:
           print("Paper Content Not Found")
        print("-" * 40)

related papers:  
```json
[
  {
    "title": "Unsupervised Multimodal Clustering for Semantics Discovery in Multimodal Utterances",
    "authors": "Hanlei Zhang, Hua Xu, Fei Long, Xin Wang, Kai Gao"
  },
  {
    "title": "Exploring Chain-of-Thought for Multi-modal Metaphor Detection",
    "authors": "Yanzhi Xu, Yueying Hua, Shichen Li, Zhongqing Wang"
  },
  {
    "title": "DeVAn: Dense Video Annotation for Video-Language Models",
    "authors": "Tingkai Liu, Yunzhe Tao, Haogeng Liu, Qihang Fang, Ding Zhou, Huaibo Huang, Ran He, Hongxia Yang"
  },
  {
    "title": "MinPrompt: Graph-based Minimal Prompt Data Augmentation for Few-shot Question Answering",
    "authors": "Xiusi Chen, Jyun-Yu Jiang, Wei-Cheng Chang, Cho-Jui Hsieh, Hsiang-Fu Yu, Wei Wang"
  },
  {
    "title": "SportsMetrics: Blending Text and Numerical Data to Understand Information Fusion in LLMs",
    "authors": "Yebowen Hu, Kaiqiang Song, Sangwoo Cho, Xiaoyang Wang, Hassan Foroosh, Dong Yu, Fei Liu"
  },
  {
    "title"

Device set to use cpu
