In [None]:
# !pip install openai
# !pip install --upgrade PyMuPDF
from API_KEY import API_KEY
import os
import re
import fitz
import json
from openai import OpenAI

# Directory that receives every dataset file written by this notebook.
output_folder = "limitations_2024"

# exist_ok=True creates the folder only when it is missing and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)

# Fallback pattern for text that mentions limitations inline rather than under
# a dedicated heading.
#   group(1) - the keyword that matched ("Limitation(s)" / "Challenge(s)")
#   group(2) - the 100-1500 characters that follow the keyword
# The quantifier is greedy on purpose: a lazy `{100,1500}?` at the very end of
# a pattern always stops at the 100-character minimum.
inline_limitations = re.compile(
    r"\b(Limitations?|Challenges?)[:\s]+(.{100,1500})",
    flags=re.IGNORECASE | re.DOTALL
)

# Locates limitation/conclusion-style section headings and captures the text
# that follows each one.
#   group(1) - the heading text (matched case-insensitively)
#   group(2) - the section body, matched lazily up to the next heading-like
#              line or the end of the text
# Pattern pieces, in order:
#   1. start of text or a newline, then an optional section number
#      (up to two digits followed by a dot, e.g. "7. ")
#   2. one of the known heading names ("conclusions" is listed twice; the
#      duplicate is redundant)
#   3. an optional colon, optional whitespace and at least one newline, then
#      the body; the lookahead ends the body before a line that optionally
#      starts with a number and consists of 4-61 letters, digits, commas,
#      spaces or hyphens
# NOTE(review): re.IGNORECASE also applies to the lookahead, so `[A-Z]` there
# matches lower-case letters too and any short punctuation-free line can end
# a section - confirm whether only capitalised headings were intended.
find_limitations = re.compile(
    r"(?:^|\n)(?:\d{0,2}[\.]\s*)?"
    r"(limitations|limitation|conclusions and limitations|future work|conclusion (?:and|&) future work|limitations (?:and|&) future work|conclusion|conclusions|discussion|conclusion (?:and|&) discussion|conclusions|research limitations|study limitations|challenges)"
    r"(?::)?\s*\n+(.*?)(?=\n\s*(?:\d{1,2}[\.]+\s*)?[A-Z][A-Za-z0-9, \-]{3,60}\n|\Z)",
    flags=re.IGNORECASE | re.DOTALL
)

def text_from_pdf(path):
    """Return the text of the PDF at *path*, one newline-terminated chunk per page.

    Each page is read with PyMuPDF's "blocks" extraction; element 4 of every
    block tuple is joined with single spaces to form that page's chunk.
    """
    page_chunks = []
    with fitz.open(path) as pdf:
        for page in pdf:
            block_texts = (blk[4] for blk in page.get_text("blocks"))
            page_chunks.append(" ".join(block_texts) + "\n")
    return "".join(page_chunks)

# Section titles that count as a dedicated limitations section; find_section()
# only falls back to inline matching when none of these was found.
keywords = [
    "limitations",
    "conclusions and limitations",
    "future work",
    "challenges",
    "limitation",
    "study limitations",
    "research limitations",
    "limitations and future work",
    "conclusion and future work",
    "conclusion & future work",
]
def find_section(txt):
    """Split paper text into the limitation/conclusion sections it contains.

    Every match of the module-level ``find_limitations`` pattern adds one
    entry that maps the lower-cased heading to the stripped section body.
    When none of the headings in ``keywords`` was found, the first non-empty
    conclusion/discussion section is searched with ``inline_limitations`` and
    the text following the matched keyword is stored under ``"inline"``.

    Args:
        txt: Full text of a paper.

    Returns:
        dict mapping section title (str) to section text (str).
    """
    paper_sections = {}
    for found in find_limitations.finditer(txt):
        title = found.group(1).strip().lower()
        text = found.group(2).strip()
        # Bodies that start with this checklist question are not real
        # limitation sections, so they are dropped.
        if not text.lower().startswith("question: does the paper discuss the limitations"):
            paper_sections[title] = text

    if any(k in paper_sections for k in keywords):
        return paper_sections

    # Each title needs its own comma: adjacent string literals are silently
    # concatenated into a single (never matching) title.
    fallback_titles = (
        "conclusion",
        "conclusions and limitations",
        "conclusions",
        "discussion",
        "conclusion and discussion",
        "conclusion & discussion",
    )
    for section_title in fallback_titles:
        section_text = paper_sections.get(section_title)
        if section_text:
            match = inline_limitations.search(section_text)
            if match:
                # group(1) is only the keyword; the limitation text is group(2).
                paper_sections["inline"] = match.group(2).strip()
            # Only the first non-empty fallback section is inspected.
            break
    return paper_sections
    
def extract_limitations(path):
    """Read a PDF and return its file name together with its parsed sections.

    Args:
        path: Path of the PDF file.

    Returns:
        Tuple ``(title_paper, paper_sections)`` where ``title_paper`` is the
        base name of *path* (extension included) and ``paper_sections`` is the
        dict produced by ``find_section`` for the document's plain text.
    """
    # The context manager closes the document even if extraction fails; the
    # original left the handle open for every processed paper.
    with fitz.open(path) as doc:
        txt = "".join(page.get_text("text") + "\n" for page in doc)
    title_paper = os.path.basename(path)
    paper_sections = find_section(txt)
    return title_paper, paper_sections

In [1]:
# Bullet point list, use LLMs to summarize
# Shared OpenAI client used by bullet_list(); the key is imported from the
# local API_KEY module at the top of the notebook.
client = OpenAI(api_key = API_KEY)
def bullet_list(text):
    """Ask the chat model to condense *text* into 3-6 bullet points.

    Sends one chat-completion request through the module-level ``client`` and
    returns the content of the first choice with surrounding whitespace
    removed.
    """
    system_message = {"role": "system", "content":f"You are a research assistant that summarizes academic limitation sections."}
    user_message = {"role": "user", "content":f"Convert the following text into 3-6 clear bullet points:\n {text}"}
    completion = client.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[system_message, user_message],
        max_tokens=300,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()
    

In [None]:
# Year of the paper collection handled by this cell.
year = 2024

# Input locations. Despite the "_folder" suffix, the abstracts and tokenized
# entries point at single files.
full_papers_folder = f"full_papers/papers/{year}"
abstracts_folder = f"retrieved_abstracts/data_{year}/papers_data.jsonl"
tokenized_papers_folder = f"tokenized_data/tokenized_{year}.json"

# Every PDF file name found directly inside the papers folder.
papers = [fname for fname in os.listdir(full_papers_folder) if fname.endswith(".pdf")]

# Accumulators filled by the extraction loop, one per output dataset.
limitations_input_conclusion = []      # abstract + conclusion -> limitations
limitations_input_tokenized_text = []  # tokenized paper -> limitations
limitations_input_full_paper = []      # full paper text -> limitations
limitations_only = []                  # limitations targets only

def clean_title(title):
    """Normalize a paper title (or file name) for exact-match comparison.

    The title is lower-cased, every character other than ``a-z``, ``0-9`` and
    space is replaced by a space, runs of whitespace are collapsed to one
    space and the ends are stripped.

    Args:
        title: Raw title string.

    Returns:
        The normalized title.
    """
    title = title.lower()
    title = re.sub(r"[^a-z0-9 ]", " ", title)
    # Whitespace is collapsed last: replacing punctuation with spaces can
    # itself create double or trailing spaces.
    return re.sub(r"\s+", " ", title).strip()

def load_json_title(title_n, folder):
    """Find the record whose title matches *title_n* in a JSON file.

    Args:
        title_n: Title to look for; compared after ``clean_title``
            normalization.
        folder: Path of the JSON file to search (a file, despite the name).

    Returns:
        The first dict whose normalized ``"title"`` equals the normalized
        *title_n*, or ``None`` when there is no such record.
    """
    title_n = clean_title(title_n)
    try:
        with open(folder, "r", encoding="utf-8") as f:
            data = json.load(f)
    except UnicodeDecodeError:
        # Retry with a single-byte encoding when the file is not valid UTF-8.
        with open(folder, "r", encoding="latin-1") as f:
            data = json.load(f)
    # The top level may be a list of records or a dict of records keyed by
    # id; iterating a dict directly would yield only its keys.
    records = data.values() if isinstance(data, dict) else data
    for record in records:
        if isinstance(record, dict) and clean_title(record.get("title") or "") == title_n:
            return record
    return None

def load_jsonl_title(title_n, folder):
    """Find the record whose title matches *title_n* in a JSON Lines file.

    Args:
        title_n: Title to look for; compared after ``clean_title``
            normalization.
        folder: Path of the ``.jsonl`` file to search (a file, despite the
            name).

    Returns:
        The first dict whose normalized ``"title"`` equals the normalized
        *title_n*, or ``None`` when there is no such record.
    """
    title_n = clean_title(title_n)
    with open(folder, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                # Report the file actually being read; the original message
                # referenced `path`, which is not a parameter of this function.
                print(f"Failed to read line {line_no} of {folder}")
                continue
            if isinstance(data, dict) and clean_title(data.get("title") or "") == title_n:
                return data
    return None

# Headings that count as a dedicated limitations section, in priority order.
# Kept under its own name so the module-level `keywords` list read by
# find_section() is not overwritten while the loop runs.
limitation_headings = ["limitations", "future work", "challenges"]

# Headings whose text can serve as the conclusion part of the first dataset.
conclusion_headings = [
    "conclusion",
    "conclusions",
    "discussion",
    "conclusion and discussion",
    "conclusion & discussion",
    "conclusions and limitations",
]

for paper in papers:
    path = os.path.join(full_papers_folder, paper)
    name, sections = extract_limitations(path)

    # A paper is only used when it also has a conclusion/discussion section.
    if not any(h in sections for h in conclusion_headings):
        continue

    for heading in limitation_headings:
        section_text = sections.get(heading)
        if not section_text or section_text.strip().lower() == "question: does the paper discuss the limitations":
            continue

        bullets = bullet_list(section_text)
        print(name + "\n" + bullets)

        # `name` is the PDF file name; drop the extension so that it can
        # match the titles stored in the abstract and tokenized files.
        lookup_title = os.path.splitext(name)[0]
        abstract_data = load_jsonl_title(lookup_title, abstracts_folder)
        tokenized_data = load_json_title(lookup_title, tokenized_papers_folder)
        full_papers_data = text_from_pdf(path)

        # First dataset, abstracts and conclusion
        if abstract_data:
            conclusion_text = next(
                (sections[h] for h in conclusion_headings if sections.get(h)),
                "",
            )
            abstract = abstract_data.get("abstract") or ""
            # Skip the entry instead of failing on a missing conclusion body.
            if conclusion_text:
                given_input = abstract + "\n" + conclusion_text
                limitations_input_conclusion.append({"paper": name, "input": given_input, "target": section_text, "target_bullets": bullets})

        # Second dataset, tokenized papers
        if tokenized_data:
            tokens = tokenized_data.get("tokens", [])
            if isinstance(tokens, list):
                tokens = " ".join(tokens)
            limitations_input_tokenized_text.append({"paper": name, "input": tokens, "target": section_text, "target_bullets": bullets})

        # Third dataset, full papers
        if full_papers_data:
            limitations_input_full_paper.append({"paper": name, "input": full_papers_data, "target": section_text, "target_bullets": bullets})

        limitations_only.append({"paper": name, "target": section_text, "target_bullets": bullets})
        # Only the first limitations-style section of a paper is used.
        break
            

In [None]:
# Destination paths, one per dataset built by the extraction loop.
output_file1 = os.path.join(output_folder, f"limitations_training_1_{year}.json")
output_file2 = os.path.join(output_folder, f"limitations_training_2_{year}.json")
output_file3 = os.path.join(output_folder, f"limitations_only_{year}.json")
output_file4 = os.path.join(output_folder, f"limitations_training_3_{year}.json")

# Pair every path with its dataset and dump them in the same order as before.
for out_path, dataset in (
    (output_file1, limitations_input_conclusion),
    (output_file2, limitations_input_tokenized_text),
    (output_file3, limitations_only),
    (output_file4, limitations_input_full_paper),
):
    with open(out_path, 'w') as out_file:
        json.dump(dataset, out_file, indent=4)

In [None]:
# abstract + conclusion working
# !pip install unidecode
import os
import json
import unidecode
import re
from tqdm import tqdm
from difflib import get_close_matches

# Year of the paper collection handled by this cell.
year = 2024
# Inputs: the limitations-only dataset, the abstracts JSON Lines file and the
# folder holding the PDFs.
limitations_file = f"limitations_2024/limitations_only_{year}.json"
abstracts_file = f"retrieved_abstracts/data_{year}/papers_data.jsonl"
pdf_folder = f"full_papers/papers/{year}"
# Outputs: the full dataset (same path as the first training file written by
# the extraction cell, so that file is replaced) and the input/target-only
# variant.
output_full = f"limitations_2024/limitations_training_1_{year}.json"
output_training_only = f"limitations_2024/limitations_training_bart_1_{year}.json"

def clean_title(title):
    """Normalize *title* for matching.

    The title is passed through ``unidecode``, lower-cased, stripped of every
    character other than ``a-z``, ``0-9`` and space (each replaced by a
    space), and finally has whitespace runs collapsed and its ends trimmed.
    """
    ascii_lower = unidecode.unidecode(title).lower()
    spaced = re.sub(r"[^a-z0-9 ]", " ", ascii_lower)
    return re.sub(r"\s+", " ", spaced).strip()

# Limitation targets produced by the extraction cell.
with open(limitations_file, "r", encoding="utf-8") as f:
    limitation_entries = json.load(f)

# Normalized title -> abstract, built from the JSON Lines abstracts file.
abstract_map = {}
with open(abstracts_file, "r", encoding="utf-8") as f:
    for line in f:
        try:
            paper = json.loads(line)
        except json.JSONDecodeError:
            # Best effort: malformed lines are skipped, but only JSON errors
            # are swallowed (a bare `except:` would also hide Ctrl-C and bugs).
            continue
        if not isinstance(paper, dict):
            continue
        title = paper.get("title")
        if isinstance(title, str) and "abstract" in paper:
            abstract_map[clean_title(title)] = paper["abstract"]

abstract_conclusion_dataset = []
training_only_dataset = []

# Candidate titles for fuzzy matching, built once instead of once per entry.
known_titles = list(abstract_map)

for entry in tqdm(limitation_entries):
    raw_title = entry["paper"].replace(".pdf", "")
    cleaned = clean_title(raw_title)

    # Find the abstract of the paper: exact title first, closest title second.
    abstract = abstract_map.get(cleaned)
    if not abstract:
        matches = get_close_matches(cleaned, known_titles, n=1, cutoff=0.7)
        abstract = abstract_map[matches[0]] if matches else None
    if not abstract:
        continue

    # Load the conclusion or discussion
    pdf_path = os.path.join(pdf_folder, entry["paper"])
    if not os.path.exists(pdf_path):
        continue

    try:
        _, sections = extract_limitations(pdf_path)
    except Exception as exc:
        # Best effort: one unreadable PDF must not stop the run, but the
        # failure is reported instead of being dropped silently.
        tqdm.write(f"Skipping {pdf_path}: {exc}")
        continue

    conclusion = (
        sections.get("conclusion")
        or sections.get("conclusions")
        or sections.get("discussion")
        or sections.get("conclusion and discussion")
        or sections.get("conclusion & discussion")
        or sections.get("conclusions and limitations")
    )

    if not conclusion:
        continue
    combined_input = abstract.strip() + "\n\n" + conclusion.strip()

    # Full dataset
    abstract_conclusion_dataset.append({
        "paper": raw_title,
        "input": combined_input,
        "target": entry["target"],
        "target_bullets": entry["target_bullets"]
    })
    # Only for training dataset
    training_only_dataset.append({
        "input": combined_input,
        "target": entry["target_bullets"]
    })

# Write once after the loop: rewriting both files on every iteration is
# quadratic in I/O and leaves no output at all when every entry is skipped.
os.makedirs(os.path.dirname(output_full), exist_ok=True)

with open(output_full, "w", encoding="utf-8") as f:
    json.dump(abstract_conclusion_dataset, f, indent=4)

with open(output_training_only, "w", encoding="utf-8") as f:
    json.dump(training_only_dataset, f, indent=4)


In [None]:
# Tokenized working

# NOTE(review): `tokenized_file` is not assigned in any visible cell, so this
# cell raised NameError. The path mirrors `tokenized_papers_folder` from the
# extraction cell - confirm it is the intended file.
tokenized_file = f"tokenized_data/tokenized_{year}.json"

with open(limitations_file, "r", encoding="utf-8") as f:
    limitation_entries = json.load(f)

with open(tokenized_file, "r", encoding="utf-8") as f:
    tokenized_data = json.load(f)

# Accept both layouts: a dict of records keyed by id, or a plain list.
tokenized_records = (
    tokenized_data.values() if isinstance(tokenized_data, dict) else tokenized_data
)

# Normalized title -> tokens for every well-formed record.
tokenized_map = {
    clean_title(record["title"]): record["tokens"]
    for record in tokenized_records
    if isinstance(record, dict) and "title" in record and "tokens" in record
}

# NOTE(review): no visible cell assigns output paths for the tokenized
# dataset. `output_training` was undefined and `output_full` still held the
# abstract+conclusion path from the previous cell, which this cell would have
# overwritten. The names below follow the `limitations_training_<n>` /
# `limitations_training_bart_<n>` pattern used for datasets 1 and 3 - confirm.
output_full = f"limitations_2024/limitations_training_2_{year}.json"
output_training = f"limitations_2024/limitations_training_bart_2_{year}.json"

full_dataset = []
training_dataset = []

# Candidate titles for fuzzy matching, built once instead of once per entry.
known_tokenized_titles = list(tokenized_map)

for entry in tqdm(limitation_entries):
    raw_title = entry["paper"].replace(".pdf", "")
    cleaned = clean_title(raw_title)

    # Exact title first, closest title second.
    tokens = tokenized_map.get(cleaned)
    if not tokens:
        matches = get_close_matches(cleaned, known_tokenized_titles, n=1, cutoff=0.7)
        if not matches:
            continue
        tokens = tokenized_map[matches[0]]
    if isinstance(tokens, list):
        tokens = " ".join(tokens)

    # Full dataset
    full_dataset.append({
        "paper": raw_title,
        "input": tokens,
        "target": entry["target"],
        "target_bullets": entry["target_bullets"]
    })

    # Version for training
    training_dataset.append({
        "input": tokens,
        "target": entry["target_bullets"]
    })

# Write once after the loop rather than rewriting both files per entry.
os.makedirs(os.path.dirname(output_full), exist_ok=True)

with open(output_full, "w", encoding="utf-8") as f:
    json.dump(full_dataset, f, indent=4)

with open(output_training, "w", encoding="utf-8") as f:
    json.dump(training_dataset, f, indent=4)

In [None]:
# Create training data for the full papers
# Year of the collection and the files read/written by this cell.
year = 2024
input_file = f"limitations_2024/limitations_training_3_{year}.json"
output_file = f"limitations_2024/limitations_training_bart_3_{year}.json"

with open(input_file, "r", encoding="utf-8") as f:
    full_data = json.load(f)

# Reduce every complete record to the input/target pair used for training;
# records lacking either key are left out.
training_data = []
for record in full_data:
    if "input" not in record or "target_bullets" not in record:
        continue
    training_data.append({"input": record["input"], "target": record["target_bullets"]})

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(training_data, f, indent=4)