In [None]:
import os
import re
import requests

import nest_asyncio
from llama_parse import LlamaParse

nest_asyncio.apply()

In [23]:
# Path of the paper to parse (relative path).
file = '1706.03762v7.pdf'
# The API key is read from the environment; os.environ[...] raises KeyError if
# LLAMAINDEX_API_KEY is not set. Results are requested with result_type="markdown".
parser = LlamaParse(api_key=os.environ['LLAMAINDEX_API_KEY'], result_type="markdown")

# `text` holds whatever load_data returns; the recorded output of the next cell
# shows it to be a list of Document objects.
text = parser.load_data(file)

Started parsing the file under job_id 822444e4-acd3-4cc9-8ef1-8dc8fbf8def1


In [24]:
# Show the parsed result: a bare last expression is rendered as the cell's output.
text

[Document(id_='b90a4a2d-5786-44f9-9fa7-1515df64be0c', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='# Attention Is All You Need\n\nAshish Vaswani* Noam Shazeer* Niki Parmar* Jakob Uszkoreit*\n\nGoogle Brain Google Brain Google Research Google Research\n\navaswani@google.com noam@google.com nikip@google.com usz@google.com\n\nLlion Jones* Aidan N. Gomez*† Łukasz Kaiser*\n\nGoogle Research University of Toronto Google Brain\n\nllion@google.com aidan@cs.toronto.edu lukaszkaiser@google.com\n\nIllia Polosukhin*‡\n\nillia.polosukhin@gmail.com\n\n# Abstract\n\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence a

In [19]:
# Create the download directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs("cited_papers", exist_ok=True)

def preprocess_text_list(text_list):
    """Concatenate the ``text`` of every item in ``text_list`` into one string.

    Items without a ``text`` attribute are skipped. The remaining texts are
    joined in their original order, separated by a blank line.
    """
    chunks = []
    for item in text_list:
        if not hasattr(item, 'text'):
            continue
        chunks.append(item.text)
    return "\n\n".join(chunks)
    
def extract_references(text):
    """Return the part of ``text`` that follows its references section heading.

    A heading-style line is preferred: a line containing only "References" or
    "Bibliography", optionally preceded by markdown ``#`` marks or a section
    number and optionally wrapped in ``*`` emphasis. This avoids cutting at an
    incidental mention such as "see references [3]" or at the substring inside
    "preferences". When no such line exists, the function falls back to the
    first case-insensitive occurrence of either word anywhere in the text.

    Args:
        text: Full document text as a single string.

    Returns:
        Everything after the matched heading (or word), or ``""`` when the
        text is empty or contains neither word.
    """
    if not text:
        return ""

    heading_pattern = re.compile(
        r"^[ \t]*(?:#{1,6}[ \t]*)?(?:\d+\.?[ \t]+)?\**"
        r"(?:references|bibliography)"
        r"\**[ \t]*:?[ \t\r]*$",
        re.IGNORECASE | re.MULTILINE,
    )
    heading = heading_pattern.search(text)
    if heading:
        return text[heading.end():]

    # Fallback: no stand-alone heading line, so cut at the first mention.
    references_section = re.search(r"(?i)(references|bibliography)(.*)", text, re.DOTALL)
    if references_section:
        return references_section.group(2)
    return ""

def extract_dois_and_arxiv_ids(references_text):
    """Find the DOIs and arXiv identifiers mentioned in ``references_text``.

    Args:
        references_text: Text of a reference list; may span many lines.

    Returns:
        A ``(dois, arxiv_ids)`` tuple of lists. Each list holds the unique
        matches in order of first appearance, so repeated runs on the same
        text give the same order. An arXiv id keeps its version suffix
        (e.g. ``1705.03122v2``) when the citation includes one.
    """
    if not references_text:
        return [], []

    # One pattern covers both "arXiv:NNNN.NNNNN" and
    # "arXiv preprint arXiv:NNNN.NNNNN", because the latter ends with the former.
    arxiv_pattern = r"arXiv\s*:\s*(\d{4}\.\d{4,5}(?:v\d+)?)"
    doi_pattern = r"\b10\.\d{4,9}/[-._;()/:A-Z0-9]+\b"
    # Collapse line breaks and whitespace runs so an id wrapped onto the next
    # line after "arXiv:" still matches.
    references_text = re.sub(r"\s+", " ", references_text)

    arxiv_ids = re.findall(arxiv_pattern, references_text, re.IGNORECASE)
    dois = re.findall(doi_pattern, references_text, re.IGNORECASE)

    # dict.fromkeys drops duplicates while keeping first-seen order; a set
    # would return the ids in an arbitrary order.
    return list(dict.fromkeys(dois)), list(dict.fromkeys(arxiv_ids))

def download_paper_by_doi(doi):
    """Try to download the PDF for ``doi`` into ``cited_papers/``.

    The DOI resolver is asked for ``application/pdf``. The response is saved
    only when the request succeeds with HTTP 200 and the body looks like a
    PDF, because a 200 response may be an HTML landing page rather than the
    paper. Network errors are reported instead of raised, so one bad DOI does
    not stop a loop over many.

    Args:
        doi: DOI string such as ``10.1000/xyz123``.

    Returns:
        None. The outcome is reported with ``print``.
    """
    url = f"https://doi.org/{doi}"
    headers = {"Accept": "application/pdf"}
    try:
        # A timeout keeps one unresponsive server from hanging the whole run.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to download DOI: {doi} ({exc})")
        return

    if response.status_code != 200:
        print(f"Failed to download DOI: {doi} (HTTP {response.status_code})")
        return
    # Look for the PDF signature near the start of the body before saving.
    if b"%PDF" not in response.content[:1024]:
        print(f"Failed to download DOI: {doi} (response is not a PDF)")
        return

    os.makedirs("cited_papers", exist_ok=True)
    # DOIs may contain characters that are not valid in file names on every
    # platform (e.g. '/' and ':'), so replace them all.
    safe_name = re.sub(r'[\\/:*?"<>|]', "_", doi)
    with open(f"cited_papers/{safe_name}.pdf", "wb") as f:
        f.write(response.content)
    print(f"Downloaded paper for DOI: {doi}")

def download_paper_by_arxiv_id(arxiv_id):
    """Try to download the arXiv PDF for ``arxiv_id`` into ``cited_papers/``.

    The response is saved only when the request succeeds with HTTP 200 and
    the body looks like a PDF. Network errors are reported instead of raised,
    so one bad id does not stop a loop over many.

    Args:
        arxiv_id: arXiv identifier such as ``1706.03762`` or
            ``1705.03122v2``; a leading ``arXiv:`` prefix is tolerated.

    Returns:
        None. The outcome is reported with ``print``.
    """
    url = f"https://arxiv.org/pdf/{arxiv_id.replace('arXiv:', '')}.pdf"
    try:
        # A timeout keeps one stalled request from hanging the whole run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to download arXiv ID: {arxiv_id} ({exc})")
        return

    if response.status_code != 200:
        print(f"Failed to download arXiv ID: {arxiv_id} (HTTP {response.status_code})")
        return
    # Look for the PDF signature near the start of the body, so an HTML error
    # page is not stored under a .pdf name.
    if b"%PDF" not in response.content[:1024]:
        print(f"Failed to download arXiv ID: {arxiv_id} (response is not a PDF)")
        return

    os.makedirs("cited_papers", exist_ok=True)
    with open(f"cited_papers/{arxiv_id.replace(':', '_')}.pdf", "wb") as f:
        f.write(response.content)
    print(f"Downloaded paper for arXiv ID: {arxiv_id}")


In [20]:
# extract_references runs a regex search and therefore needs a single string,
# while the parser returns a list of Document objects. Flatten that list
# first; a value that is already a string is used unchanged. A separate name
# is used so `text` itself is not overwritten.
paper_text = text if isinstance(text, str) else preprocess_text_list(text)
references = extract_references(paper_text)
if not references:
    print("No references found in the parsed text.")
else:
    dois, arxiv_ids = extract_dois_and_arxiv_ids(references)
    print(f"Found {len(dois)} DOIs and {len(arxiv_ids)} arXiv IDs.")

    for doi in dois:
        download_paper_by_doi(doi)

    for arxiv_id in arxiv_ids:
        download_paper_by_arxiv_id(arxiv_id)

Found 0 DOIs and 16 arXiv IDs.
Downloaded paper for arXiv ID: 1508.04025
Downloaded paper for arXiv ID: 1705.03122v2
Downloaded paper for arXiv ID: 1705.04304
Downloaded paper for arXiv ID: 1701.06538
Downloaded paper for arXiv ID: 1511.06114
Downloaded paper for arXiv ID: 1508.07909
Downloaded paper for arXiv ID: 1703.10722
Downloaded paper for arXiv ID: 1602.02410
Downloaded paper for arXiv ID: 1703.03130
Downloaded paper for arXiv ID: 1609.08144
Downloaded paper for arXiv ID: 1610.02357
Downloaded paper for arXiv ID: 1308.0850
Downloaded paper for arXiv ID: 1601.06733
Downloaded paper for arXiv ID: 1608.05859
Downloaded paper for arXiv ID: 1610.10099v2
Downloaded paper for arXiv ID: 1607.06450


In [30]:
# Name the text export after the source file: same path, extension swapped for .txt.
output_file = os.path.splitext(file)[0] + '.txt'
# Join the text of every parsed document, separated by a blank line.
combined_text = "\n\n".join(doc.text for doc in text)
# Display the joined text as the cell output.
combined_text


'# Attention Is All You Need\n\nAshish Vaswani* Noam Shazeer* Niki Parmar* Jakob Uszkoreit*\n\nGoogle Brain Google Brain Google Research Google Research\n\navaswani@google.com noam@google.com nikip@google.com usz@google.com\n\nLlion Jones* Aidan N. Gomez*† Łukasz Kaiser*\n\nGoogle Research University of Toronto Google Brain\n\nllion@google.com aidan@cs.toronto.edu lukaszkaiser@google.com\n\nIllia Polosukhin*‡\n\nillia.polosukhin@gmail.com\n\n# Abstract\n\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significa

In [31]:
# Save the combined text to output_file. The encoding is given explicitly as
# UTF-8; the text contains non-ASCII characters (e.g. author names).
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(combined_text)

In [33]:
# Parse every downloaded PDF and save its text next to it as <name>.txt.
dir_path = './cited_papers'
# PDFs for which the parser returned no documents, kept so they can be retried.
failed_files = []

# sorted() gives a stable processing order; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(dir_path)):
    if filename.endswith('.pdf'):
        file_path = os.path.join(dir_path, filename)
        output_file = os.path.splitext(file_path)[0] + '.txt'
        text = parser.load_data(file_path)
        if not text:
            # Nothing was parsed. In the recorded run the parser printed
            # "Error while parsing the file ..." and the loop carried on.
            # Skip the write so an empty .txt is not mistaken for a parsed paper.
            failed_files.append(file_path)
            continue
        combined_text = "\n\n".join(doc.text for doc in text)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(combined_text)

if failed_files:
    print(f"Parsing produced no text for {len(failed_files)} file(s): {failed_files}")

Started parsing the file under job_id d16d1dd6-9acf-4c5b-8b39-faa4326a43a3
.Started parsing the file under job_id a5c3134e-8278-46bc-8db8-b3f3d467171e
..............Error while parsing the file './cited_papers/1308.0850.pdf': 
Started parsing the file under job_id 95af0db2-7b5b-40ac-b286-b6c85b64e3fa
.Started parsing the file under job_id ce2284ef-f8e6-499c-abbd-b7c76ee3a697
.Started parsing the file under job_id 0479d823-f18c-4ad5-b924-d3e1497ec9be
......Started parsing the file under job_id c00cd5e2-19e3-405e-88aa-d4cbcc1cd496
.Started parsing the file under job_id f51dc71d-2c46-46d9-87e3-bfddfd3809bf
..........Started parsing the file under job_id 2cdc8676-5cc0-4f9d-a6bf-defd02bfccbf
.Started parsing the file under job_id 5e21b2a0-23aa-4aaa-9ddd-1cec7feb8fa7
.Started parsing the file under job_id 9b6e8cda-e74e-4620-9908-c430546c8d00
........Started parsing the file under job_id ae5dc232-693a-4f0e-a7b5-39fc1f25195d
Started parsing the file under job_id 8dd19429-22c9-4d75-bee4-b74296c

In [35]:
# Retry a single paper; the recorded bulk run reported an error while parsing this file.
file = './cited_papers/1308.0850.pdf'
output_file = os.path.splitext(file)[0] + '.txt'
text = parser.load_data(file)
if text:
    combined_text = "\n\n".join(doc.text for doc in text)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(combined_text)
else:
    # Do not write an empty .txt when the parser returns no documents.
    print(f"Parsing produced no text for {file}")

Started parsing the file under job_id 246a2041-771d-4ff3-a208-8397ec05f20f


In [36]:
# Re-parse a single paper. NOTE(review): presumably a retry after a failure in
# the bulk run, but the captured output is truncated, so confirm.
file = './cited_papers/1601.06733.pdf'
output_file = os.path.splitext(file)[0] + '.txt'
text = parser.load_data(file)
if text:
    combined_text = "\n\n".join(doc.text for doc in text)
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(combined_text)
else:
    # Do not write an empty .txt when the parser returns no documents.
    print(f"Parsing produced no text for {file}")

Started parsing the file under job_id 3b73464d-db31-4c88-9112-5f7c722b1a63
