In [None]:
# !pip install openai
!pip install --upgrade PyMuPDF
from API_KEY import API_KEY
import os
import re
import fitz
import json

# Name of the output directory; it is created here if it does not exist yet.
output_folder = "limitations_2024"

# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of a separate os.path.exists() test (same form the later cells use).
os.makedirs(output_folder, exist_ok=True)

# Matches an inline "Limitation(s)" / "Challenge(s)" cue word followed by the
# passage that comes after it (case-insensitive; DOTALL so the passage may
# span line breaks).
#   group(1): the cue word as written in the text
#   group(2): the next 100-1500 characters after the separator
# The passage quantifier is greedy on purpose: a lazy `{100,1500}?` with
# nothing after it always stops at the 100-character minimum, so the upper
# bound never took effect.
inline_limitations = re.compile(
    r"\b(Limitations?|Challenges?)[:\s]+(.{100,1500})",
    flags=re.IGNORECASE | re.DOTALL
)

# Pattern for "heading line + section body", assembled from three pieces.
#   group(1): the heading text as written in the document
#   group(2): everything up to the next heading-like line or the end of text

# Start of text or a line break, then an optional "N." style section number.
_heading_prefix = r"(?:^|\n)(?:\d{0,2}[\.]\s*)?"

# Accepted heading spellings, tried in this order.
_heading_titles = "|".join([
    r"limitations",
    r"limitation",
    r"conclusions and limitations",
    r"future work",
    r"conclusion (?:and|&) future work",
    r"limitations (?:and|&) future work",
    r"conclusion",
    r"conclusions",
    r"discussion",
    r"conclusion (?:and|&) discussion",
    r"conclusions",
    r"research limitations",
    r"study limitations",
    r"challenges",
])

# Optional colon, the line break(s) after the heading, then the lazily matched
# body; the lookahead stops it at a short line that looks like the next
# (optionally numbered) heading, or at the end of the text.
_section_body = (
    r"(?::)?\s*\n+(.*?)"
    r"(?=\n\s*(?:\d{1,2}[\.]+\s*)?[A-Z][A-Za-z0-9, \-]{3,60}\n|\Z)"
)

find_limitations = re.compile(
    _heading_prefix + "(" + _heading_titles + ")" + _section_body,
    flags=re.IGNORECASE | re.DOTALL
)

def text_from_pdf(path):
    """Return the text of the PDF at ``path``, one line of output per page.

    For every page, the fifth field (index 4) of each entry returned by
    ``page.get_text("blocks")`` is taken -- the block's text in PyMuPDF's
    documented "blocks" layout -- and the fields are joined with single
    spaces. Each page's string is terminated with a newline.
    """
    page_strings = []
    with fitz.open(path) as doc:
        for page in doc:
            block_texts = (blk[4] for blk in page.get_text("blocks"))
            page_strings.append(" ".join(block_texts) + "\n")
    return "".join(page_strings)

# Headings that count as an explicit limitations-style section. When one of
# them was captured, find_section() does not look for inline limitations.
keywords = [
    "limitations",
    "conclusions and limitations",
    "future work",
    "challenges",
    "limitation",
    "study limitations",
    "research limitations",
    "limitations and future work",
    "conclusion and future work",
    "conclusion & future work",
    # `find_limitations` also accepts the "&" spelling of this heading.
    "limitations & future work",
]


def find_section(txt):
    """Collect the recognised sections of a paper, keyed by heading.

    Every match of ``find_limitations`` contributes one entry mapping the
    lower-cased, stripped heading to the stripped section body; a later match
    with the same heading overwrites an earlier one.

    If none of the headings in ``keywords`` was captured, the first available
    conclusion/discussion-type section is searched with ``inline_limitations``
    and, on a hit, the passage following the cue word is stored under the key
    ``"inline"``.

    Args:
        txt: Full plain text of the paper.

    Returns:
        dict mapping heading (or ``"inline"``) to section text.
    """
    paper_sections = {}
    for found in find_limitations.finditer(txt):
        title = found.group(1).strip().lower()
        text = found.group(2).strip()
        # Bodies opening with this question are skipped -- presumably
        # checklist boilerplate rather than a real section (verify against
        # the source PDFs).
        if not text.lower().startswith("question: does the paper discuss the limitations"):
            paper_sections[title] = text

    if any(k in paper_sections for k in keywords):
        return paper_sections

    # Each title needs its own comma: adjacent string literals are silently
    # concatenated, which previously hid "conclusions" from this fallback.
    fallback_titles = (
        "conclusion",
        "conclusions and limitations",
        "conclusions",
        "discussion",
        "conclusion and discussion",
        "conclusion & discussion",
    )
    for section_title in fallback_titles:
        section_text = paper_sections.get(section_title)
        if not section_text:
            continue
        match = inline_limitations.search(section_text)
        if match:
            # group(2) is the passage; group(1) is only the cue word
            # ("Limitations"/"Challenges") that introduced it.
            paper_sections["inline"] = match.group(2).strip()
        # Only the first section that exists is inspected, hit or miss.
        break
    return paper_sections
    
def extract_limitations(path):
    """Extract the recognised sections from one PDF file.

    Args:
        path: Path of the PDF to read.

    Returns:
        A ``(file_name, sections)`` tuple: the base name of ``path`` and the
        dict returned by ``find_section`` for the document's plain text
        (each page's ``get_text("text")`` output followed by a newline).
    """
    # The context manager closes the document even when extraction raises;
    # previously the handle was opened and never closed.
    with fitz.open(path) as doc:
        txt = "".join(page.get_text("text") + "\n" for page in doc)
    return os.path.basename(path), find_section(txt)

In [None]:
# !pip install unidecode
import sys
import os

import os
import json
import unidecode
import re
from tqdm import tqdm
from difflib import get_close_matches 

# Create the output directory first so the final dump cannot fail on a
# missing folder.
os.makedirs(name="test_inputs", exist_ok=True)

# Inputs: the folder of PDFs and the JSONL file read line by line below.
pdf_folder = "Papers"
abstracts_file = "../retrieved_abstracts/data_2024/papers_data.jsonl"

# Output: JSON file written at the end of this cell.
output_file = "test_inputs/abstract_conclusion_test.json"

def clean_title(title):
    """Normalise a title into a lower-case ASCII lookup key.

    The title is transliterated with ``unidecode`` and lower-cased; every
    character other than ``a-z``, ``0-9`` and space becomes a space; runs of
    whitespace are then collapsed to one space and the ends are trimmed.
    """
    ascii_lower = unidecode.unidecode(title).lower()
    only_alnum = re.sub(r"[^a-z0-9 ]", " ", ascii_lower)
    return re.sub(r"\s+", " ", only_alnum).strip()

# Normalised title -> abstract, read from the JSONL file one record per line.
# Loading is best effort: lines that are not valid JSON, or records without a
# usable title/abstract, are skipped instead of aborting the run.
abstract_map = {}
with open(abstracts_file, "r", encoding="utf-8") as f:
    for line in f:
        try:
            paper = json.loads(line)
        except json.JSONDecodeError:
            # Only malformed JSON is ignored; a bare `except:` would also
            # swallow KeyboardInterrupt and genuine programming errors.
            continue
        if not isinstance(paper, dict):
            continue
        title = paper.get("title")
        abstract = paper.get("abstract")
        # Keep string values only: the title is normalised as text and the
        # abstract is later used with .strip(), which a None/number would break.
        if isinstance(title, str) and isinstance(abstract, str):
            abstract_map[clean_title(title)] = abstract

# Build one {"paper", "input"} record per PDF whose abstract and a
# conclusion-type section can both be found, then dump the list as JSON.
dataset = []

# Candidate titles for fuzzy matching, built once rather than once per PDF.
known_titles = list(abstract_map)

# Section headings accepted as the "conclusion" part, in order of preference.
conclusion_titles = (
    "conclusion",
    "discussion",
    "conclusions",
    "conclusion and discussion",
    "conclusion & discussion",
    "conclusions and limitations",
)

for file in tqdm(os.listdir(pdf_folder)):
    if not file.endswith(".pdf"):
        continue

    # Drop only the trailing extension; str.replace would also delete any
    # ".pdf" that happens to occur inside the file name.
    raw_title = file[: -len(".pdf")]
    cleaned = clean_title(raw_title)

    abstract = abstract_map.get(cleaned)
    if not abstract:
        # No exact hit: fall back to the closest known title, if close enough.
        matches = get_close_matches(cleaned, known_titles, n=1, cutoff=0.7)
        if not matches:
            continue
        abstract = abstract_map[matches[0]]

    try:
        _, sections = extract_limitations(os.path.join(pdf_folder, file))
    except Exception as e:
        # Best effort: one unreadable PDF must not stop the run, but say which
        # file was skipped (tqdm.write keeps the progress bar intact).
        tqdm.write(f"Skipping {file}: {e}")
        continue

    # First non-empty section among the accepted headings.
    conclusion = next(
        (sections[t] for t in conclusion_titles if sections.get(t)),
        None,
    )
    if not conclusion:
        continue

    dataset.append({
        "paper": raw_title,
        "input": abstract.strip() + "\n\n" + conclusion.strip(),
    })

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=4)


In [None]:
import os
import json
import unidecode
import re
from difflib import get_close_matches
from tqdm import tqdm

# Create the output directory first so the final dump cannot fail on a
# missing folder.
os.makedirs(name="test_inputs", exist_ok=True)

# Inputs: the folder of PDFs and the JSON file loaded below.
pdf_folder = "Papers"
tokenized_file = "../tokenized_data/tokenized_2024.json"

# Output: JSON file written at the end of this cell.
output_file = "test_inputs/tokenized_test.json"

def clean_title(title):
    """Normalise a title into a lower-case ASCII lookup key.

    The title is transliterated with ``unidecode`` and lower-cased; every
    character other than ``a-z``, ``0-9`` and space becomes a space; runs of
    whitespace are then collapsed to one space and the ends are trimmed.
    """
    ascii_lower = unidecode.unidecode(title).lower()
    only_alnum = re.sub(r"[^a-z0-9 ]", " ", ascii_lower)
    return re.sub(r"\s+", " ", only_alnum).strip()

# Load the whole JSON document; its values are iterated below.
with open(tokenized_file, "r", encoding="utf-8") as f:
    tokenized_data = json.load(f)

# Normalised title -> tokens, for every value that is a dict carrying both a
# "title" and a "tokens" key; anything else is ignored.
tokenized_map = dict(
    (clean_title(record["title"]), record["tokens"])
    for record in tokenized_data.values()
    if isinstance(record, dict) and "title" in record and "tokens" in record
)

# Build one {"paper", "input"} record per PDF whose tokens can be found by
# title, then dump the list as JSON.
dataset = []

# Candidate titles for fuzzy matching, built once rather than once per PDF.
known_titles = list(tokenized_map)

for file in tqdm(os.listdir(pdf_folder)):
    if not file.endswith(".pdf"):
        continue

    # Drop only the trailing extension; str.replace would also delete any
    # ".pdf" that happens to occur inside the file name.
    raw_title = file[: -len(".pdf")]
    cleaned = clean_title(raw_title)

    tokens = tokenized_map.get(cleaned)
    if not tokens:
        # No exact hit: fall back to the closest known title, if close enough.
        matches = get_close_matches(cleaned, known_titles, n=1, cutoff=0.7)
        if not matches:
            continue
        tokens = tokenized_map[matches[0]]

    dataset.append({
        "paper": raw_title,
        # Tokens are joined with single spaces into one input string.
        "input": " ".join(tokens),
    })

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=4)

