# Summarize and extract information from arXiv papers

Install the required dependencies.

In [None]:
# Install the listed packages into the kernel's environment; `--upgrade` moves
# any that are already installed to their latest release.
%pip install tokenizers langchain arxiv pypdf huggingface_hub easyllm --upgrade

Load environment variables from the `../.env` file.

In [3]:
import os
import warnings


def parse_env_lines(lines):
    """Parse ``KEY=VALUE`` lines into a dict of environment variables.

    Args:
        lines: Iterable of strings, e.g. an open ``.env`` file.

    Returns:
        dict mapping variable names to values. Surrounding whitespace and the
        trailing newline are stripped; only the first ``=`` separates key from
        value, so values may themselves contain ``=``. Blank lines, ``#``
        comment lines, lines without ``=`` and lines with an empty key are
        skipped.
    """
    variables = {}
    for raw_line in lines:
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        key, separator, value = line.partition("=")
        key = key.strip()
        if not separator or not key:
            continue
        variables[key] = value.strip()
    return variables


env_path = "../.env"
if os.path.exists(env_path):
    with open(env_path, "r", encoding="utf-8") as env_file:
        # Update in place rather than rebinding `os.environ` to a plain dict,
        # so the values are also exported to the process environment.
        os.environ.update(parse_env_lines(env_file))
else:
    # Variables may already be exported in the shell; warn instead of failing.
    warnings.warn(f"{env_path} not found; relying on existing environment variables")

Import the required libraries.

In [8]:
import arxiv
from langchain.document_loaders import PyPDFLoader
from tokenizers import Tokenizer
from langchain import HuggingFaceHub
from easyllm.prompt_utils import build_llama2_prompt


Download the arXiv paper as a PDF and convert it to text.

In [29]:
paper_id = "2309.05463"

# get paper by id
# Query through an explicit client: `Search.results()` is deprecated in recent
# releases of the `arxiv` package in favour of `Client.results(search)`.
search = arxiv.Search(id_list=[paper_id])
paper = next(arxiv.Client().results(search), None)
if paper is None:
    # Fail with a readable message instead of a bare StopIteration.
    raise ValueError(f"No arXiv paper found for id {paper_id!r}")
print(f"title: {paper.title}")

# download paper; the returned value is handed to the PDF loader later on
downloaded_file = paper.download_pdf(filename=f"{paper_id}.pdf")

title: Textbooks Are All You Need II: phi-1.5 technical report


In [47]:
loader = PyPDFLoader(downloaded_file)
pages = loader.load_and_split()

# count tokens of the paper
# Alternative model/budget:
# model_id = "tiiuae/falcon-180B-chat"
# max_total_tokens = 1500
model_id = "meta-llama/Llama-2-7b-chat-hf"
max_total_tokens = 3000
tokenizer = Tokenizer.from_pretrained(model_id)
total_tokens = 0
selected_pages = []

for page in pages:
    tokens_per_page = len(tokenizer.encode(page.page_content))
    # Check the budget *before* adding the chunk; checking afterwards lets the
    # collected text overshoot `max_total_tokens` by up to one whole chunk.
    if total_tokens + tokens_per_page > max_total_tokens:
        break
    total_tokens += tokens_per_page
    selected_pages.append(page.page_content)

if not selected_pages:
    # Either the PDF yielded no text or the first chunk alone exceeds the budget.
    raise ValueError(
        f"No paper content fits within max_total_tokens={max_total_tokens}"
    )

# Each selected chunk is followed by a newline, matching a per-chunk `+ "\n"`.
paper_content = "".join(content + "\n" for content in selected_pages)

Create the prompt.

In [51]:
# Task instructions followed by the paper text wrapped in <paper> tags.
# Written line by line so the trailing spaces and newlines are explicit.
prompt_template = (
    "please complete the following tasks for the <paper>. Your response should be a markdown document with one headings per task.\n"
    "1. Extract the objective and contribution of the paper in one sentence. \n"
    "2. Extract the implementation details as step-by-step instructions focus on technical details. \n"
    "3. Extract the key insights and learnings of the paper as bullet points.\n"
    "4. Extract the results of the paper in one sentence.\n"
    "\n"
    "<paper>{paper}</paper>\n"
)

# Fill in the paper text, then wrap the result with the Llama 2 prompt helper.
prompt = build_llama2_prompt(prompt_template.format(paper=paper_content))

In [52]:
# Generation settings passed to the model via `model_kwargs`.
generation_kwargs = dict(temperature=0.01, max_new_tokens=1024, do_sample=True)
llm = HuggingFaceHub(repo_id=model_id, model_kwargs=generation_kwargs)

# Run the prompt through the model; the response is written to disk afterwards.
res = llm(prompt)

Write the result to a file.

In [53]:
# Write the model response as UTF-8 explicitly: the platform default encoding
# can fail on non-ASCII characters in the generated markdown.
with open("llama-paper.md", "w", encoding="utf-8") as f:
    f.write(res)