# Summarize and extract information from arXiv papers

install required dependencies

In [None]:
%pip install langchain arxiv pypdf anthropic --upgrade

Read the `.env` file and load its `KEY=VALUE` entries into the process environment

In [37]:
import os


def _parse_env_line(raw_line):
    """Parse one line of a ``.env`` file into a ``(key, value)`` pair.

    Returns ``None`` for blank lines, ``#`` comment lines, lines without an
    ``=`` sign, and lines with an empty key. Surrounding whitespace (including
    the trailing newline) is stripped, the line is split on the FIRST ``=``
    only so values may themselves contain ``=``, and one pair of matching
    surrounding quotes is removed from the value.
    """
    line = raw_line.strip()
    if not line or line.startswith("#") or "=" not in line:
        return None
    key, value = line.split("=", 1)
    key, value = key.strip(), value.strip()
    if not key:
        return None
    if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
        value = value[1:-1]
    return key, value


# Update os.environ in place instead of rebinding it to a plain dict:
# the real os.environ mapping is what propagates values to the process
# environment, and other code expects it to keep its special type.
# Entries from the file override variables that are already set.
with open("../.env", "r") as env_file:
    for raw_line in env_file:
        parsed = _parse_env_line(raw_line)
        if parsed is not None:
            os.environ[parsed[0]] = parsed[1]

imports

In [41]:
import arxiv
from langchain.document_loaders import PyPDFLoader
from anthropic import Anthropic

from langchain.chat_models import ChatAnthropic
from langchain.prompts import HumanMessagePromptTemplate

Download the arXiv paper as a PDF and convert it to text

In [56]:
# arXiv identifier of the paper to summarize
paper_id = "2309.00267"

# Look the paper up by its id and take the first search result.
search = arxiv.Search(id_list=[paper_id])
paper = next(search.results())
print(f"title: {paper.title}")

# Save the PDF under a name derived from the id; the value returned by
# download_pdf is what gets handed to the PDF loader afterwards.
pdf_filename = f"{paper_id}.pdf"
downloaded_file = paper.download_pdf(filename=pdf_filename)

title: RLAIF: Scaling Reinforcement Learning from Human Feedback with AI Feedback


In [57]:
# Read the downloaded PDF and split it into document chunks.
loader = PyPDFLoader(downloaded_file)
pages = loader.load_and_split()

# The Anthropic client is only used here to count tokens.
client = Anthropic()
max_total_tokens = 10000
total_tokens = 0
collected_text = []

# Gather chunk text in order until the running token count passes the budget.
# NOTE: the check happens after a chunk has been added, so the chunk that
# crosses max_total_tokens is still part of the final text.
for page in pages:
    page_text = page.page_content
    total_tokens += client.count_tokens(page_text)
    collected_text.append(page_text + "\n")
    if total_tokens > max_total_tokens:
        break

# Every collected chunk is followed by a newline.
paper_content = "".join(collected_text)

create prompt

In [67]:
# Instruction text sent to the model. It asks for four extractions, each
# wrapped in its own XML-style tag, and embeds the paper text via {paper}.
task_instructions = """please complete the following tasks for the <paper>:
1. Extract the objective and contribution of the paper in one sentence. (In <objective> tags)
2. Extract the implementation details as step-by-step instructions focus on technical details. (In <implementation> tags)
3. Extract the key insights and learnings of the paper as bullet points. (In <insights> tags)
4. Extract the results of the paper in one sentence. (In <results> tags)

<paper>{paper}</paper>
"""

prompt_template = HumanMessagePromptTemplate.from_template(task_instructions)

# Fill the {paper} placeholder with the text collected from the PDF.
prompt = prompt_template.format(paper=paper_content)

In [68]:
# temperature=0 and a 1024-token cap on the generated answer.
chat = ChatAnthropic(temperature=0, max_tokens=1024)

# Send the single human message; the reply's text is read from res.content.
messages = [prompt]
res = chat(messages)

write result to file

In [69]:
# Replace the XML-style tags in the model's answer with markdown headings.
# Each opening tag becomes a "### <Title>" heading line and each closing tag
# is removed. "summary" is handled as well in case the reply contains it.
section_headings = {
    "summary": "### Summary\n",
    "objective": "### Objective\n",
    "implementation": "### Implementation\n",
    "insights": "### Insights\n",
    "results": "### Results\n",
}

md = res.content
for tag, heading in section_headings.items():
    md = md.replace(f"<{tag}>", heading).replace(f"</{tag}>", "")

# Write as UTF-8 explicitly: the answer may contain non-ASCII characters, and
# the platform's default encoding is not guaranteed to be able to encode them.
with open("paper.md", "w", encoding="utf-8") as f:
    f.write(md)