In [1]:
!pip install fitz --quiet
!pip install PyMuPDF --quiet
from tqdm import tqdm
import fitz
import os
from concurrent.futures import ThreadPoolExecutor
import json

# Publication years to process; each year maps to one input folder and one
# output JSON file in the driver loop below.
years = range(2013, 2025)

# Directory that receives the per-year JSON files.
output_folder = "references_by_year"
# exist_ok=True avoids the check-then-create race of a separate
# os.path.exists() test and is a no-op when the directory is already there.
os.makedirs(output_folder, exist_ok=True)

def _split_reference_entries(section_text):
    """Split raw reference-section text into one string per entry.

    A new entry starts at any non-blank line that begins with "[" followed
    by a digit (for example "[12] ..."). Every other non-blank line is
    treated as a continuation of the current entry and joined to it with a
    single space. Text that precedes the first "[n]" marker (typically the
    section heading) ends up as its own leading entry, and text with no
    markers at all is returned as a single entry.
    """
    entries = []
    current = ""
    for raw_line in section_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        # line[1:2] is "" for a lone "[", so isdigit() is safely False.
        if line.startswith("[") and line[1:2].isdigit():
            if current:
                entries.append(current.strip())
            current = line
        else:
            current += " " + line
    if current:
        entries.append(current.strip())
    return entries


def extract(path):
    """Extract the reference list from the PDF at *path*.

    The section is assumed to start at the first case-insensitive occurrence
    of "references" (or, on a page without that word, "citations") and to
    run to the end of the document. This is a plain substring heuristic, so
    an earlier mention of either word in the body text will start the
    section early.

    Args:
        path: Filesystem path of the PDF to open with PyMuPDF.

    Returns:
        A list of strings, one per detected entry (see
        ``_split_reference_entries``); empty if no marker word is found.

    Raises:
        Whatever ``fitz.open`` / page loading raises for unreadable files;
        callers are expected to handle that.
    """
    chunks = []
    section_started = False
    # The context manager closes the document even if a page fails to load.
    with fitz.open(path) as doc:
        for page_number in range(doc.page_count):
            text = doc.load_page(page_number).get_text("text")
            if section_started:
                # Once inside the section, keep whole pages: re-searching for
                # the marker here would drop entries that precede a later
                # occurrence of the word (e.g. inside a cited title).
                chunks.append(text)
                continue
            lowered = text.lower()
            start = lowered.find("references")
            if start == -1:
                start = lowered.find("citations")
            # str.find signals "not found" with -1 (not 1).
            if start != -1:
                section_started = True
                chunks.append(text[start:])
    return _split_reference_entries("".join(chunks))
        
    
# Process each year's folder of PDFs and write one JSON file per year, holding
# a list of {"paper": <file name>, "references": [<entry>, ...]} records.
for year in years:
    print(f"Processing {year}")
    year_folder = f"full_papers/papers/{year}"
    pdf_files = [file for file in os.listdir(year_folder) if file.endswith(".pdf")]

    def process_file(pdf_file):
        """Extract references for one PDF in the current year's folder.

        Best-effort: a PDF that cannot be processed is reported and recorded
        with an empty reference list instead of aborting the whole run.
        """
        pdf_path = os.path.join(year_folder, pdf_file)
        try:
            r = extract(pdf_path)
        except Exception as exc:
            # `except Exception` (not a bare `except`) lets KeyboardInterrupt
            # and SystemExit propagate; tqdm.write prints the failure without
            # corrupting the progress bar.
            tqdm.write(f"Failed to extract references from {pdf_path}: {exc!r}")
            return {"paper": pdf_file, "references": []}
        return {"paper": pdf_file, "references": r}

    # NOTE(review): the PyMuPDF documentation states that multithreaded use is
    # not supported; each worker opens its own document here, but confirm this
    # is safe for the installed version (a process pool is the documented
    # alternative).
    with ThreadPoolExecutor(max_workers=8) as executor:
        # executor.map preserves input order, so records line up with pdf_files.
        references_by_year = list(tqdm(executor.map(process_file, pdf_files), total=len(pdf_files)))

    output_file = os.path.join(output_folder, f"references_{year}.json")
    with open(output_file, 'w') as out_file:
        json.dump(references_by_year, out_file, indent=4)


Processing 2022


100%|██████████| 2831/2831 [18:30<00:00,  2.55it/s] 
