In [None]:
# !pip install unidecode
# !pip install pymupdf
import sys
import os
import fitz
sys.path.append(os.path.abspath(os.path.join(os.getcwd(),"..", "helper")))
from preprocess_text import clean_title, extract_conclusion_discussion, extract_limitations
import json
import re
from tqdm import tqdm
from difflib import get_close_matches


# Publication years to process; range() excludes its upper bound, so 2017-2020.
years = range(2017,2021)
# Root folder holding one "data_<year>" sub-folder of abstract metadata per year.
abstracts_folder = os.path.abspath("../retrieved_abstracts")
# Root folder holding one "<year>" sub-folder of paper PDFs per year.
pdfs_folder = os.path.abspath("../full_papers/papers")
# Destination for the generated "input_<year>.json" files (relative to the CWD).
output_folder  = "test_inputs"


def _load_abstracts(year):
    """Return a ``{clean_title(title): abstract}`` map for ``year``.

    Years before 2018 are read from a single JSON document
    (``papers_data.json``); later years from a JSON-lines file
    (``papers_data.jsonl``). Records that are not dicts, or that lack a
    "title" or "abstract" key, are ignored.

    Loading is best-effort: a parse error is reported on stdout and whatever
    was collected up to that point is still returned. A missing file still
    raises ``FileNotFoundError``.
    """
    abstracts_map = {}
    year_dir = os.path.join(abstracts_folder, f"data_{year}")

    if year < 2018:
        abstracts = os.path.join(year_dir, "papers_data.json")
        with open(abstracts, "r", encoding="utf-8") as handle:
            try:
                for p in json.load(handle):
                    if isinstance(p, dict) and "title" in p and "abstract" in p:
                        abstracts_map[clean_title(p["title"])] = p["abstract"]
            except Exception as e:
                print(f"{e} for {year}")
        return abstracts_map

    abstracts = os.path.join(year_dir, "papers_data.jsonl")
    skipped = 0
    with open(abstracts, "r", encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                # Blank separator lines are not records; skip them quietly.
                continue
            try:
                p = json.loads(line)
                if isinstance(p, dict) and "title" in p and "abstract" in p:
                    abstracts_map[clean_title(p["title"])] = p["abstract"]
            except Exception:
                # Keep going on a bad record, but count it so the loss is
                # visible (a bare ``except:`` would also trap Ctrl-C).
                skipped += 1
    if skipped:
        print(f"Skipped {skipped} unreadable abstract records for {year}")
    return abstracts_map


print(abstracts_folder)
# Create the destination up front so a long run cannot fail at write time.
os.makedirs(output_folder, exist_ok=True)

for year in years:
    print(f"Processing {year}")
    pdfs = os.path.join(pdfs_folder, str(year))
    output = os.path.join(output_folder, f"input_{year}.json")

    abstracts_map = _load_abstracts(year)
    # Candidate list for fuzzy matching; built once per year, not once per PDF.
    known_titles = list(abstracts_map)

    data = []
    for pdf_name in tqdm(os.listdir(pdfs)):
        # Only PDFs are papers; this also strips just the real extension
        # rather than every ".pdf" substring in the name.
        title, ext = os.path.splitext(pdf_name)
        if ext.lower() != ".pdf":
            continue
        clean = clean_title(title)

        # Exact lookup first; fall back to the closest known title
        # (difflib similarity of at least 0.7).
        abstract = abstracts_map.get(clean)
        if not abstract:
            match = get_close_matches(clean, known_titles, n=1, cutoff=0.7)
            if match:
                abstract = abstracts_map[match[0]]
        if not isinstance(abstract, str):
            # No match at all, or the matched record carries a null /
            # non-text abstract that could not be stripped below.
            print(f"No abstract match {title}")
            continue

        pdf_path = os.path.join(pdfs, pdf_name)
        try:
            with fitz.open(pdf_path) as doc:
                txt = "\n".join(page.get_text("text") for page in doc)
        except Exception as e:
            # One damaged PDF should not discard the rest of the year.
            print(f"Could not read {pdf_path}: {e}")
            continue

        # Papers without a detected conclusion/discussion section are dropped.
        found = extract_conclusion_discussion(txt)
        if not found:
            continue
        final_input = abstract.strip() + "\n\n" + found.strip()

        # "limitations" stays None when nothing is extracted.
        # NOTE(review): extract_limitations presumably returns a dict of
        # section name -> text; only its values are kept. Confirm in helper.
        found_limitations = None
        limitations = extract_limitations(txt)
        if limitations:
            found_limitations = list(limitations.values())
        data.append({"title": title, "input": final_input, "limitations": found_limitations})

    with open(output, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4)
    print(f"For year {year} saved {len(data)} papers to {output}")