## Research Paper Summarizer

This is a notebook-native pipeline that fetches papers/articles and summarizes them with an LLM.

**Modes Supported**
- Local files (txt, md, html, pdf)
- Topic search (searches 3 relevant papers using arXiv API)
- Paper title
- Direct link

**Output**
- TL;DR, key points, entities/terms, one paragraph summary, questions
- Saves `outputs/*.md` plus an index CSV (`outputs/summaries_index.csv`)

### imports

In [1]:
import os
import re
import textwrap
import requests
from readability import Document
import feedparser

from typing import List, Dict, Optional, Tuple

import pandas as pd

from bs4 import BeautifulSoup

import fitz
import tiktoken 
from dotenv import load_dotenv

import yaml

In [2]:
# Create the output folder up front; exist_ok=True makes re-running this cell safe.
os.makedirs('outputs', exist_ok=True)

### configuration

In [3]:
# Load pipeline settings from params.yaml (path is relative to the current working directory).
with open("params.yaml", "r") as f:
    params=yaml.safe_load(f)

# Unpack the "config" section into module-level constants; a missing key raises KeyError here.
cfg = params["config"]
USE_LLM=cfg["use_llm"]  # only echoed by the print below in this notebook
USE_MODEL=cfg["use_model"]  # default model name for summarize_paper
MAX_TOKENS = int(cfg["max_tokens"])  # per-chunk budget handed to chunk_text
OVERLAP_TOKENS = int(cfg["overlap_tokens"])  # overlap budget handed to chunk_text
HEADERS = cfg["headers"]  # HTTP headers used by fetch_url
MAX_ARXIV_RESULTS = int(cfg["max_arxiv_results"])  # default top_n for run_topic_search
PAPER_SORT_STRATEGY = cfg["paper_sort_strategy"]  # forwarded to arxiv_search as sortBy

print(f"The system will use {USE_LLM} and the model is {USE_MODEL}")

The system will use openai and the model is gpt-4o-mini


In [4]:
## get the openai api key
def require_env():
    """
    Call load_dotenv() and return the value of the OPENAI_API_KEY environment variable.

    Prints a confirmation when the key is present.

    Raises:
        RuntimeError: if OPENAI_API_KEY is unset or empty.
    """
    load_dotenv()
    key = os.getenv("OPENAI_API_KEY")
    if key:
        print("API KEY FOUND")
        return key
    raise RuntimeError("Openai API Key not found")

In [5]:
# Fail fast: require_env raises RuntimeError when OPENAI_API_KEY is missing.
api_key=require_env()

API KEY FOUND


### utilities: token counting and chunking

In [6]:
# --- token helpers (robust) ---
def count_tokens(text: str) -> int:
    """
    Return the number of tokens in text using tiktoken's "o200k_base" encoding.

    If anything goes wrong while encoding, fall back to a rough estimate of
    one token per four characters, never returning less than 1.
    """
    try:
        encoder = tiktoken.get_encoding("o200k_base")
        n_tokens = len(encoder.encode(text))
    except Exception:
        # fallback heuristic ~4 chars/token
        n_tokens = max(1, len(text) // 4)
    return n_tokens

def chunk_text(
    text: str,
    max_tokens: int,
    overlap_tokens: int
) -> List[str]:
    """
    Split text into whitespace-delimited chunks with word-aligned overlap.

    Token cost is estimated per word as max(1, len(word) // 4), which avoids
    running a tokenizer on every word. The same estimate is used for both the
    chunk budget and the overlap budget so the accounting stays consistent.

    Args:
        text: Text to split; it is tokenised on whitespace.
        max_tokens: Approximate token budget per chunk. A single word whose
            estimate exceeds the budget still becomes its own chunk.
        overlap_tokens: Approximate number of trailing tokens of a finished
            chunk to repeat at the start of the next one. Values <= 0 disable
            overlap; large values are capped so the next chunk still fits in
            max_tokens and always makes progress.

    Returns:
        The list of chunk strings (words joined by single spaces). Empty or
        whitespace-only text yields an empty list.
    """
    chunks: List[str] = []
    current: List[str] = []
    token_count = 0
    for w in text.split():
        approx = max(1, len(w) // 4)  # heuristic to avoid calling tiktoken per word
        if current and token_count + approx > max_tokens:
            chunks.append(" ".join(current))
            # Leave room for the incoming word, and never go negative: a plain
            # slice like s[-0:] would silently carry over the whole chunk.
            budget = max(0, min(overlap_tokens, max_tokens - approx))
            carried: List[str] = []
            carried_tokens = 0
            # Walk backwards so the overlap is the tail of the finished chunk,
            # taken in whole words rather than cut mid-word.
            for prev in reversed(current):
                cost = max(1, len(prev) // 4)
                if carried_tokens + cost > budget:
                    break
                carried.append(prev)
                carried_tokens += cost
            carried.reverse()
            current = carried
            token_count = carried_tokens
        current.append(w)
        token_count += approx
    if current:
        chunks.append(" ".join(current))
    return chunks


### Content extraction - URL/HTML/PDF/LOCAL

In [7]:
# HTTP request headers from the config; fetch_url passes these to requests.get.
headers = HEADERS

# html
def html_to_text(html: str) -> str:
    """
    Reduce an HTML document to plain text.

    The markup is first passed through readability's Document.summary(), then
    script/style/img elements are removed, the remaining text is extracted
    with newline separators, and runs of blank lines are collapsed.
    """
    article_html = Document(html).summary()
    tree = BeautifulSoup(article_html, "html.parser")
    for tag in tree.find_all(["script", "style", "img"]):
        tag.decompose()
    raw_text = tree.get_text(separator="\n")
    # Collapse any run of 2+ newlines to exactly one blank line.
    return re.sub(r"\n{2,}", "\n\n", raw_text).strip()


# URL
def fetch_url(url: str) -> str:
    """
    Download url and return its text content.

    The response is treated as a PDF when the URL ends in ".pdf" or the
    Content-Type header mentions "application/pdf"; otherwise the body is
    converted with html_to_text. HTTP error statuses raise via
    raise_for_status().
    """
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    content_type = (response.headers.get("Content-Type", "") or "").lower()

    looks_like_pdf = url.lower().endswith(".pdf") or "application/pdf" in content_type
    if not looks_like_pdf:
        return html_to_text(response.text)

    # open PDF from bytes
    with fitz.open(stream=response.content, filetype="pdf") as pdf:
        page_texts = [page.get_text() for page in pdf]
    return "\n\n".join(page_texts)

# local
def load_local_text(path: str) -> str:
    """
    Read a local file and return its text content.

    - Paths ending in ".pdf" are opened with fitz and the per-page text is
      joined with blank lines.
    - Any other file is read as bytes and decoded as UTF-8; if the bytes are
      not valid UTF-8 they are decoded as latin-1 instead, so no characters
      are silently dropped.
    - Content containing "<html" or "<p" (case-insensitive) is treated as
      HTML and passed through html_to_text.
    - Everything else is returned as plain text with runs of blank lines
      collapsed and surrounding whitespace stripped.

    Args:
        path: Filesystem path of the file to load.

    Returns:
        The extracted text.
    """
    # PDFs
    if path.lower().endswith(".pdf"):
        with fitz.open(path) as doc:
            pages = [p.get_text() for p in doc]
        return "\n\n".join(pages)

    # Read bytes then decode robustly
    with open(path, "rb") as f:
        raw = f.read()
    try:
        # Strict decode: errors="ignore" would never raise, which would make
        # the fallback below unreachable and silently discard bytes.
        s = raw.decode("utf-8")
    except UnicodeDecodeError:
        # latin-1 maps every byte to a character, so this cannot fail.
        s = raw.decode("latin-1")

    # HTML-ish?
    lowered = s.lower()
    if "<html" in lowered or "<p" in lowered:
        return html_to_text(s)

    # Plain text cleanup
    return re.sub(r"\n{2,}", "\n\n", s).strip()

### arXiv Search Helper

In [8]:
ARXIV_API = "http://export.arxiv.org/api/query"

def arxiv_search(
    query: str,
    max_results: int,
    sortBy: str
) -> List[Dict]:
    """
    Query the arXiv API and return one dict per feed entry.

    The search runs against all fields ("all:<query>"), starts at offset 0,
    and is sorted by sortBy in descending order.

    Returns:
        A list of dicts with keys "title", "summary", "authors" (list of
        names), "link", "pdf" (href of the first application/pdf link, or
        None) and "published" (empty string when absent).
    """
    query_fields = {
        "search_query": f"all:{query}",
        "start": 0,
        "max_results": max_results,
        "sortBy": sortBy,
        "sortOrder": "descending"
    }
    encoded_pairs = []
    for key, value in query_fields.items():
        encoded_pairs.append(f"{key}={requests.utils.quote(str(value))}")
    request_url = ARXIV_API + "?" + "&".join(encoded_pairs)
    parsed = feedparser.parse(request_url)

    def _to_record(item) -> Dict:
        # Find the PDF link first, then assemble the record.
        pdf_href = None
        for link in item.links:
            if getattr(link, "type", "") == "application/pdf":
                pdf_href = link.href
                break
        return {
            "title": item.title,
            "summary": item.summary,
            "authors": [author.name for author in getattr(item, "authors", [])],
            "link": item.link,
            "pdf": pdf_href,
            "published": getattr(item, "published", "")
        }

    return [_to_record(item) for item in parsed.entries]

def build_text_from_arxiv_entry(entry: Dict) -> str:
    """
    Render an arXiv result dict as a small plain-text document.

    The output has Title / Authors / Published header lines, a blank line,
    and then the abstract (the entry's "summary"). Missing keys render as
    empty values.
    """
    header = (
        f"Title: {entry.get('title','')}\n"
        f"Authors: {', '.join(entry.get('authors', []))}\n"
        f"Published: {entry.get('published','')}\n"
    )
    abstract = entry.get("summary","")
    # header already ends with a newline, so joining yields the blank separator line.
    return "\n".join((header, "Abstract:", abstract))

### prompt

In [9]:
# Instruction block describing the five-section summary format.
# summarize_paper sends it as the system message, and build_prompt also embeds
# it in the user prompt ahead of the text to summarize.
system_prompt = """You are a research assistant. Summarize the following text into structured sections:

1) TL;DR (2–3 bullet points)
2) Key Points (5–10 bullets)
3) Entities & Terms (important names, orgs, dates, technical terms)
4) One-paragraph Summary (5–7 sentences, neutral and precise)
5) Questions to Explore (3–5 thoughtful questions)

Be concise and faithful to the source. Avoid speculation.
Text:
"""

In [10]:
# Tone and audience come from the "prompt" section of params.yaml; a missing key raises KeyError.
prompt_params = params["prompt"]
TONE = prompt_params["tone"]
AUDIENCE = prompt_params["audience"]

In [11]:
def build_prompt(
    text: str,
    tone: str,
    audience: str
) -> str:
    """
    Assemble the user prompt for one chunk of text.

    The result is four newline-separated parts (tone line, audience line,
    the module-level system_prompt, then the text) followed by a trailing
    newline.
    """
    lines = [
        f"TONE: {tone}",
        f"AUDIENCE: {audience}",
        f"{system_prompt}",
        f"{text}",
    ]
    return "\n".join(lines) + "\n"


### LLM Backend

In [16]:
from openai import OpenAI

def summarize_paper(
    text: str,
    tone: str,
    audience: str,
    model: str = USE_MODEL,
    api_key: Optional[str] = None,
) -> str:
    """
    Summarize text chunk by chunk with the OpenAI chat completions API.

    The text is split with chunk_text using MAX_TOKENS / OVERLAP_TOKENS, each
    chunk is summarized independently, and the per-chunk answers are joined
    into one Markdown document under a "# Summary" heading.

    Args:
        text: Full text to summarize.
        tone: Tone passed through to build_prompt.
        audience: Audience passed through to build_prompt.
        model: Chat model name; defaults to USE_MODEL.
        api_key: OpenAI key; falls back to the OPENAI_API_KEY environment variable.

    Returns:
        Markdown with one "## Chunk N" section per chunk, separated by "---".

    Raises:
        RuntimeError: if no API key is supplied or found in the environment.
    """
    key = api_key or os.getenv("OPENAI_API_KEY", "")
    if not key:
        raise RuntimeError("OPENAI_API_KEY not set.")
    llm = OpenAI(api_key=key)

    sections = []
    pieces = chunk_text(text, max_tokens=MAX_TOKENS, overlap_tokens=OVERLAP_TOKENS)
    for number, piece in enumerate(pieces, start=1):
        # NOTE(review): build_prompt already embeds system_prompt, so the
        # instructions reach the model twice (system and user message).
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": build_prompt(piece, tone=tone, audience=audience)},
        ]
        completion = llm.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.2,
        )
        answer = completion.choices[0].message.content
        sections.append(f"## Chunk {number}\n\n{answer}")

    body = "\n\n---\n\n".join(sections)
    return "# Summary\n\n" + body


### helper functions

In [17]:
def merge_and_save(md: str, source: str) -> str:
    """
    Write a Markdown summary to outputs/<sanitized source>.md and return the path.

    The file name is derived from source by replacing every run of
    non-alphanumeric characters with "_", truncating to 80 characters and
    trimming leading/trailing underscores; "summary" is used when nothing is
    left. An existing file with the same name is overwritten.

    Args:
        md: Markdown content to write.
        source: Label (file name, URL, title, ...) used to build the file name.

    Returns:
        The relative path of the written file, e.g. "outputs/<name>.md".
    """
    safe = re.sub(r"[^a-zA-Z0-9]+", "_", source)[:80].strip("_") or "summary"
    # Do not rely on an earlier notebook cell having created the folder.
    os.makedirs("outputs", exist_ok=True)
    path = f"outputs/{safe}.md"
    with open(path, "w", encoding="utf-8") as f:
        f.write(md)
    return path

def summarize_text_block(text: str, tone: str, audience: str) -> str:
    """Summarize text by delegating to summarize_paper with its default model and API key."""
    summary_md = summarize_paper(text, tone=tone, audience=audience)
    return summary_md

def run_local_files(paths: List[str], tone: str, audience: str) -> List[Dict]:
    """
    Summarize each local file and return one result row per path.

    A row always has "mode" ("local") and "source" (the path). On success it
    also has "markdown_path"; if any step raises, it has "error" with the
    exception message instead, and processing continues with the next path.
    """
    results = []
    for file_path in paths:
        record = {"mode":"local", "source": file_path}
        try:
            content = load_local_text(file_path)
            summary_md = summarize_text_block(content, tone, audience)
            record["markdown_path"] = merge_and_save(
                summary_md, f"file_{os.path.basename(file_path)}"
            )
        except Exception as err:
            # Best effort: record the failure rather than abort the batch.
            record["error"] = str(err)
        results.append(record)
    return results

def run_topic_search(topic: str, tone: str, audience: str, top_n: int = MAX_ARXIV_RESULTS) -> List[Dict]:
    """
    Search arXiv for topic and summarize each returned entry.

    Only the entry metadata and abstract (via build_text_from_arxiv_entry)
    are summarized. Each row has "mode" ("topic"), "source" (entry link) and
    "title", plus "markdown_path" on success or "error" on failure. A failure
    of the search itself is not caught here.
    """
    results = []
    for paper in arxiv_search(topic, max_results=top_n, sortBy=PAPER_SORT_STRATEGY):
        try:
            abstract_text = build_text_from_arxiv_entry(paper)
            summary_md = summarize_text_block(abstract_text, tone, audience)
            saved_to = merge_and_save(summary_md, f"topic_{topic}_{paper['title']}")
            row = {"mode":"topic", "source": paper["link"], "title": paper["title"], "markdown_path": saved_to}
        except Exception as err:
            # Best effort: keep going and report the failure for this entry.
            row = {"mode":"topic", "source": paper.get("link",""), "title": paper.get("title",""), "error": str(err)}
        results.append(row)
    return results

def run_paper_title(title: str, tone: str, audience: str) -> List[Dict]:
    """
    Look up a paper on arXiv by title and summarize the first result.

    Returns a single-row list. The row has "mode" ("title") and "source";
    when no entry is found it carries an "error" message, otherwise it has
    "title" plus either "markdown_path" (success) or "error" (failure while
    summarizing or saving). A failure of the search itself is not caught here.
    """
    matches = arxiv_search(title, max_results=1, sortBy=PAPER_SORT_STRATEGY)
    if not matches:
        return [{"mode":"title", "source": title, "error":"No results found on arXiv"}]

    paper = matches[0]
    try:
        abstract_text = build_text_from_arxiv_entry(paper)
        summary_md = summarize_text_block(abstract_text, tone, audience)
        saved_to = merge_and_save(summary_md, f"title_{paper['title']}")
        row = {"mode":"title", "source": paper["link"], "title": paper["title"], "markdown_path": saved_to}
    except Exception as err:
        row = {"mode":"title", "source": paper.get("link",""), "title": paper.get("title",""), "error": str(err)}
    return [row]

def run_direct_links(links: List[str], tone: str, audience: str) -> List[Dict]:
    """
    Fetch and summarize each URL, returning one result row per link.

    A row always has "mode" ("link") and "source" (the URL). On success it
    also has "markdown_path"; if fetching, summarizing or saving raises, it
    has "error" with the exception message and the loop moves on.
    """
    results = []
    for link in links:
        record = {"mode":"link", "source": link}
        try:
            content = fetch_url(link)
            summary_md = summarize_text_block(content, tone, audience)
            record["markdown_path"] = merge_and_save(summary_md, f"url_{link}")
        except Exception as err:
            # Best effort: record the failure rather than abort the batch.
            record["error"] = str(err)
        results.append(record)
    return results

### run

In [18]:
# Demo run: summarize one arXiv PDF by direct link; each row records the source and the saved path (or an error).
rows = run_direct_links(["https://arxiv.org/pdf/2010.11929.pdf"], tone=TONE, audience=AUDIENCE)

In [19]:
# Persist the run index as CSV (index=False omits the DataFrame index column),
# then leave the DataFrame as the last expression so the notebook displays it.
pd.DataFrame(rows).to_csv("outputs/summaries_index.csv", index=False)
pd.DataFrame(rows)

Unnamed: 0,mode,source,markdown_path
0,link,https://arxiv.org/pdf/2010.11929.pdf,outputs/url_https_arxiv_org_pdf_2010_11929_pdf.md
