# 📦 0. Imports and Setup

In [19]:

import requests
import json
import re
import nltk
import time
from nltk.tokenize import sent_tokenize

# Ensure the punkt sentence tokenizer is available.
# Look in the local NLTK data path first and only download on a miss, so the
# notebook does not contact the download server on every run.
# NOTE(review): recent NLTK releases may look up 'punkt_tab' instead of
# 'punkt' for sent_tokenize -- confirm against the installed version.
import os

import nltk

try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# API key sent as a Bearer token by query_api().
# The CORE_API_KEY environment variable takes precedence when it is set.
# NOTE(review): the literal fallback is a credential committed to source;
# rotate it and drop the fallback once the environment variable is in use.
apikey = os.environ.get('CORE_API_KEY', 'bgJyXuHdGkrBKt4VsCvR0LeiwE8x39WZ')


[nltk_data] Downloading package punkt to /Users/murat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# 🛠️ 1. Helper Functions

In [10]:

def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def safe_get(text):
    """Return ``text`` stripped, or None when it is not a non-blank string."""
    if not isinstance(text, str):
        return None
    stripped = text.strip()
    if not stripped:
        return None
    return stripped

def get_abstract_or_intro(paper):
    """Return the paper's abstract, or its introduction as a fallback.

    Args:
        paper: Mapping read through the "abstract" and "fullText" keys.

    Returns:
        The stripped abstract when one is present. Otherwise the text under an
        "Introduction" / "Background" heading in the full text, prefixed with
        "Introduction: ", provided it is longer than 100 characters. None when
        neither is available.
    """
    abstract = safe_get(paper.get("abstract"))
    if abstract:
        return abstract

    # The key may be present with a None value, so `.get(key, "")` is not enough.
    full_text = paper.get("fullText") or ""
    if not isinstance(full_text, str):
        return None

    # Search the text with its line structure intact: the pattern relies on
    # newlines to find the heading and the end of the section, so whitespace
    # must only be collapsed *after* the match, not before.
    full_text = full_text.replace("\r\n", "\n").replace("\r", "\n")

    intro_match = re.search(
        # Heading at the start of a line, optionally numbered ("1.", "1)").
        r'(?:^|\n)[ \t]*(?:\d+[.)]?[ \t]*)?'
        # Only the heading word itself is matched case-insensitively.
        r'(?i:INTRODUCTION|BACKGROUND)'
        r'[ \t]*[:.]?\s+'
        r'(.*?)'
        # Stop at a blank line or at a line that is an upper-case heading.
        r'(?=\n[ \t]*\n'
        r'|\n[ \t]*(?:\d+[.)]?[ \t]*)?[A-Z][A-Z .&-]{2,}[ \t]*(?:\n|\Z))',
        full_text,
        re.DOTALL
    )
    if intro_match:
        intro = clean_text(intro_match.group(1))
        if len(intro) > 100:
            return "Introduction: " + intro

    return None

def query_api(search_url, query, offset=0, limit=20):
    """Fetch one page of search results, retrying up to three times.

    Args:
        search_url: Endpoint URL the GET request is sent to.
        query: Search string, sent as the ``q`` query parameter.
        offset: Index of the first result to return.
        limit: Maximum number of results per page.

    Returns:
        A ``(data, elapsed_seconds)`` tuple. ``data`` is the decoded JSON
        object; on any failure it is ``{"results": []}`` and the elapsed time
        is 0.
    """
    headers = {"Authorization": "Bearer " + apikey}
    # Let requests build the query string so characters such as '&' or '#'
    # inside the query are escaped instead of corrupting the URL.
    params = {"q": query, "limit": limit, "offset": offset}
    max_attempts = 3

    for attempt in range(1, max_attempts + 1):
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(
                search_url, headers=headers, params=params, timeout=60
            )
        except requests.RequestException as exc:
            print(f"⚠️ Request failed ({exc}), retrying...")
        else:
            if response.status_code == 200:
                try:
                    data = response.json()
                except ValueError:
                    # Every JSON decode error type is a ValueError subclass.
                    print(f"⚠️ JSON decode error at offset {offset} for query '{query}'")
                    return {"results": []}, 0
                if not isinstance(data, dict):
                    print(f"⚠️ Unexpected response shape at offset {offset} for query '{query}'")
                    return {"results": []}, 0
                print(f"Query: {query} | Offset: {offset} | Results: {len(data.get('results', []))}")
                return data, response.elapsed.total_seconds()
            print(f"⚠️ Error {response.status_code}, retrying...")

        # No point waiting once the last attempt has already failed.
        if attempt < max_attempts:
            time.sleep(2)

    print(f"⚠️ Failed after retries for query='{query}', offset={offset}")
    return {"results": []}, 0


# 🧐 2. Section Extraction Logic

In [11]:

def _split_into_sections(full_text):
    """Group blank-line-separated paragraphs under the heading preceding them."""
    sections = []
    current = {"heading": "", "content": ""}

    # Split on blank lines BEFORE normalising whitespace; collapsing newlines
    # first would leave a single paragraph and no heading could be detected.
    for chunk in re.split(r'\n\s*\n|\r\n\s*\r\n', full_text):
        paragraph = re.sub(r'\s+', ' ', chunk).strip()
        if not paragraph:
            continue

        # A short paragraph that is all upper-case or starts with an
        # arabic/roman numeral is treated as a section heading.
        is_heading = len(paragraph) < 100 and (
            paragraph.isupper()
            or re.match(r'^\d+[\.\s]+\w+|^[IVX]+[\.\s]+\w+', paragraph)
        )
        if is_heading:
            if current["content"]:
                sections.append(current)
            current = {"heading": paragraph, "content": ""}
        elif current["content"]:
            current["content"] += " " + paragraph
        else:
            current["content"] = paragraph

    if current["content"]:
        sections.append(current)
    return sections


def extract_sections(full_text):
    """Extract conclusions, limitations and future-work text from a paper.

    Sections whose heading matches a category are taken whole. For any
    category with no matching heading, sentences mentioning it are collected
    together with one sentence of context on either side.

    Args:
        full_text: Full text of the paper; blank lines separate paragraphs.

    Returns:
        Dict with the keys "conclusions", "limitations" and "future_work".
        Each value is the extracted text, "No <category> content found" when
        nothing matched, or "Full text not available" when ``full_text`` is
        empty.
    """
    if not full_text:
        return {
            "conclusions": "Full text not available",
            "limitations": "Full text not available",
            "future_work": "Full text not available"
        }

    sections = _split_into_sections(full_text)

    patterns = {
        "limitations": [
            r'\b(?:limitation|shortcoming|drawback|weakness|constraint)s?\b',
            r'\bcurrent\s+(?:limitation|constraint|shortcoming)s?\b',
            r'\blimiting\s+factor[s]?\b',
            r'\bsources?\s+of\s+error\b'
        ],
        "future_work": [
            r'\bfuture\s+(?:work|research|direction|study|investigation)\b',
            r'\bfurther\s+(?:work|research|study|development)\b',
            r'\bopen\s+(?:question|issue|challenge|problem|area)s?\b',
            r'\bwe\s+plan\s+to\b'
        ],
        "conclusions": [
            r'\bconclusion[s]?\b',
            r'\bconcluding\s+remarks\b',
            r'\bin\s+conclusion\b',
            r'\bthis\s+study\s+(?:shows|demonstrates|confirms|indicates)\b'
        ]
    }

    output = {"limitations": "", "future_work": "", "conclusions": ""}

    # Pass 1: take whole sections whose heading names the category.
    for section in sections:
        heading_lower = section["heading"].lower()
        for key, regex_list in patterns.items():
            if any(re.search(p, heading_lower) for p in regex_list):
                output[key] += f"Section: {section['heading']}\n{section['content']}\n\n"

    # Pass 2: sentence-level fallback for categories without a heading match.
    # Tokenise lazily and only once; the sentences are shared by all categories.
    sentences_by_section = None
    for key, regex_list in patterns.items():
        if output[key]:
            continue
        if sentences_by_section is None:
            sentences_by_section = [sent_tokenize(s["content"]) for s in sections]

        matched_sentences = []
        for sentences in sentences_by_section:
            for i, sentence in enumerate(sentences):
                lowered = sentence.lower()
                if any(re.search(p, lowered) for p in regex_list):
                    # Keep one sentence of context before and after the hit.
                    matched_sentences.append(" ".join(sentences[max(0, i - 1):i + 2]))
        if matched_sentences:
            output[key] = "Auto-extracted mentions:\n" + "\n".join(matched_sentences)

    for key in output:
        if not output[key]:
            output[key] = f"No {key.replace('_', ' ')} content found"

    return output


# 🚀 3. Main Loop for Querying and Saving

In [4]:
# 📦 Globals
# Module-level pipeline state shared by the stage functions below, each of
# which declares the list it fills with `global`.
all_papers = []  # raw paper records appended by collect_papers()
filtered_papers = []  # structured records appended by filter_papers()
future_work_analysis = []  # entries appended by analyze_future_work()

# 🚀 1. Collect Papers
def collect_papers(max_papers=2000, limit=20):
    """Query the search API topic by topic and collect papers with full text.

    Papers are appended to the global ``all_papers`` list, de-duplicated by
    their "id", and the whole list is then written to
    ``core_raw_fulltext_collected.jsonl`` (one JSON object per line).

    Args:
        max_papers: Stop once ``all_papers`` holds this many papers.
        limit: Page size requested from the API.
    """
    global all_papers
    search_url = "https://api.core.ac.uk/v3/search/works"
    topic_queries = [
        "artificial intelligence", "machine learning", "deep learning", "data science", "AI applications",
        "natural language processing", "language models", "NLP", "text mining", "information extraction",
        "computer vision", "image recognition", "object detection", "vision transformers",
        "biomedical informatics", "health informatics", "clinical AI", "medical imaging", "EHR", "genomics",
        "AI ethics", "explainable AI", "fairness in machine learning", "AI in education", "social computing",
        "support vector machines", "random forests", "decision trees", "unsupervised learning", "feature selection",
        "AI systems", "distributed learning", "edge AI", "federated learning", "hardware-aware ML"
    ]

    # Seed with papers already held in the global list so that calling this
    # function again does not append the same papers a second time.
    seen_ids = {p.get("id") for p in all_papers if p.get("id") is not None}

    for index, query in enumerate(topic_queries):
        if len(all_papers) >= max_papers:
            break
        if index:
            time.sleep(5)  # pause between topics

        offset = 0
        while len(all_papers) < max_papers:
            data, _ = query_api(search_url, query, offset=offset, limit=limit)
            results = data.get("results", [])
            if not results:
                break

            for paper in results:
                if len(all_papers) >= max_papers:
                    break  # never overshoot the cap mid-page
                paper_id = paper.get("id")
                # A record without an id cannot be de-duplicated; skip it
                # rather than failing on the missing key.
                if paper_id is None or paper_id in seen_ids:
                    continue
                if not paper.get("fullText"):
                    continue
                seen_ids.add(paper_id)
                all_papers.append(paper)

            offset += limit
            if len(all_papers) < max_papers:
                time.sleep(2)  # pause between pages

    # Save raw papers immediately
    with open("core_raw_fulltext_collected.jsonl", "w", encoding="utf-8") as f:
        for paper in all_papers:
            f.write(json.dumps(paper, ensure_ascii=False) + "\n")

    print(f"✅ Collected and saved {len(all_papers)} raw papers.")

# 🧠 2. Filter and Structure Papers
def filter_papers():
    """Build structured records from ``all_papers`` and save them.

    A paper is kept only when it has an abstract (or usable introduction),
    has full text, and at least one of the conclusions / limitations /
    future-work categories yielded content. The result replaces the global
    ``filtered_papers`` list and is written to
    ``core_fulltext_dataset_filtered.jsonl`` (one JSON object per line).
    """
    global filtered_papers
    # Build a fresh list so that re-running this stage does not append the
    # same records to the results of the previous run.
    records = []

    for paper in all_papers:
        # Cheap check first: skip section extraction for papers that would be
        # dropped anyway.
        abstract_or_intro = get_abstract_or_intro(paper)
        if not abstract_or_intro:
            continue

        full_text = paper.get("fullText") or ""
        if not full_text:
            # extract_sections would only return "Full text not available"
            # placeholders, which the "No " check below does not catch.
            continue

        sections = extract_sections(full_text)
        if all(sections[k].startswith("No ") for k in ["conclusions", "future_work", "limitations"]):
            continue

        records.append({
            "abstract": abstract_or_intro,
            "conclusions": sections["conclusions"],
            "limitations": sections["limitations"],
            "future_work": sections["future_work"]
        })

    filtered_papers = records

    # Save filtered papers
    with open("core_fulltext_dataset_filtered.jsonl", "w", encoding="utf-8") as f:
        for paper in filtered_papers:
            f.write(json.dumps(paper, ensure_ascii=False) + "\n")

    print(f"✅ Filtered and saved {len(filtered_papers)} papers with required sections.")

# 🧪 3. Analyze Future Work Sections
def analyze_future_work():
    """Collect the future-work text of every filtered paper and save it.

    Reads the global ``filtered_papers`` list, replaces the global
    ``future_work_analysis`` list with one entry per paper that has real
    future-work content, and writes the entries to
    ``future_work_analysis.jsonl`` (one JSON object per line).
    """
    global future_work_analysis
    # Placeholder values produced when nothing was extracted. Compare against
    # the start of the text: a substring test would also discard genuine
    # content that merely contains the phrase "No future work".
    placeholders = ("No future work", "Full text not available")
    snippet_length = 300

    # Build a fresh list so that re-running this stage does not duplicate
    # entries from the previous run.
    entries = []
    for paper in filtered_papers:
        future_work_text = paper.get("future_work", "")
        if not future_work_text or future_work_text.startswith(placeholders):
            continue

        abstract = paper["abstract"]
        snippet = abstract[:snippet_length]
        if len(abstract) > snippet_length:
            snippet += "..."  # mark only abstracts that were actually cut
        entries.append({
            "abstract_snippet": snippet,
            "future_work_summary": future_work_text
        })

    future_work_analysis = entries

    # Save future work analysis
    with open("future_work_analysis.jsonl", "w", encoding="utf-8") as f:
        for entry in future_work_analysis:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"✅ Analyzed and saved {len(future_work_analysis)} future work entries.")

# ▶️ Full Pipeline
def run_full_pipeline():
    """Run the collect, filter and analyze stages in order, then report."""
    stages = (collect_papers, filter_papers, analyze_future_work)
    for stage in stages:
        stage()
    print("🏁 Full pipeline complete!")

# ▶️ 4. Run the Main Function

In [5]:
# Run all three stages in order: collect, filter, analyze.
run_full_pipeline()
# Alternative kept for manual use: run only the filtering stage.
# filter_papers()


NameError: name 'query_api' is not defined