### PART-3

In [2]:
# Install dependencies if needed
# pip install datasets transformers

from datasets import load_dataset
from transformers import PegasusTokenizer

# 1. Load & subsample 5 000 examples
raw = load_dataset("ccdv/arxiv-summarization", split="train")
raw = raw.shuffle(seed=42).select(range(5000))

# 2. Load Pegasus-XSum tokenizer.
# The tokenizer's own pad token is kept on purpose: aliasing pad to eos makes
# padding indistinguishable from the real end-of-sequence token, so label
# padding could no longer be masked without also masking the genuine EOS.
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")

# 3. Define max lengths
max_input_length  = 512   # Pegasus-XSum max supported input
max_target_length = 256

# Label id that the Hugging Face seq2seq loss skips (see references).
LABEL_IGNORE_INDEX = -100

# 4. Tokenization function
def tokenize_fn(batch):
    """Tokenize a batch of articles/abstracts into fixed-length model inputs.

    Args:
        batch: mapping with "article" and "abstract" lists of strings.

    Returns:
        dict with "input_ids" and "attention_mask" for the articles (padded or
        truncated to ``max_input_length``) and "labels" for the abstracts
        (padded or truncated to ``max_target_length``), where every padding
        position in the labels is replaced by ``LABEL_IGNORE_INDEX`` so that
        padding does not contribute to the training loss.
    """
    # Encode articles
    enc = tokenizer(
        batch["article"],
        truncation=True,
        padding="max_length",
        max_length=max_input_length,
    )
    # Encode abstracts as labels
    dec = tokenizer(
        batch["abstract"],
        truncation=True,
        padding="max_length",
        max_length=max_target_length,
    )
    pad_id = tokenizer.pad_token_id
    labels = [
        [tok if tok != pad_id else LABEL_IGNORE_INDEX for tok in seq]
        for seq in dec["input_ids"]
    ]
    return {
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "labels": labels,
    }

# 5. Tokenize in batches; the original text columns are dropped from the result
tokenized = raw.map(tokenize_fn, batched=True, remove_columns=raw.column_names)

# 6. 80/10/10 split: hold out 20 % first, then cut the hold-out in half
split_1 = tokenized.train_test_split(test_size=0.20, seed=42)
split_2 = split_1["test"].train_test_split(test_size=0.50, seed=42)

# NOTE: this dict is named `datasets`, the same name as the Hugging Face
# library; later cells look it up under this name, so it is kept as is.
datasets = dict(
    train=split_1["train"],        # 4 000 samples
    validation=split_2["train"],   #   500 samples
    test=split_2["test"],          #   500 samples
)

# Report the size of every split
print(dict(zip(datasets, map(len, datasets.values()))))
# -> {'train': 4000, 'validation': 500, 'test': 500}


{'train': 4000, 'validation': 500, 'test': 500}


In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

# 1. Rebuild the raw test split (with 'article' & 'abstract') using the same
#    shuffle/select/split seeds as the tokenization cell.
raw = load_dataset("ccdv/arxiv-summarization", split="train")
raw = raw.shuffle(seed=42).select(range(5000))
split1 = raw.train_test_split(test_size=0.20, seed=42)
split2 = split1["test"].train_test_split(test_size=0.50, seed=42)
raw_test = split2["test"]

# 2. Put the text columns back next to the tokenized-only test rows.
#    Requires `datasets["test"]` from the tokenization cell.
#    NOTE(review): rows are paired purely by position — this assumes both
#    pipelines yield the same row order; confirm before relying on it.
tokenized_test = datasets["test"].add_column("article", raw_test["article"])
tokenized_test = tokenized_test.add_column("abstract", raw_test["abstract"])

# 3. Tokenizer shared by both models
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")

# 4. Two independent model instances
#    A. Plain Pegasus checkpoint
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/pegasus-xsum",
    device_map="auto",
)

#    B. A second copy of the checkpoint, wrapped with the LoRA adapter
#       stored in ./lora-pegasus-xsum
ft_base_model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/pegasus-xsum",
    device_map="auto",
)
finetuned_model = PeftModel.from_pretrained(
    ft_base_model,
    "./lora-pegasus-xsum",
    device_map="auto",
)

# 5. Summarization helper
def generate_summaries(model, articles):
    """Summarise every article with ``model`` and return the decoded texts.

    The model is switched to eval mode first. Each article is tokenized on its
    own with the module-level ``tokenizer`` (truncated to 512 tokens), moved to
    ``model.device``, and decoded with 4-beam search up to 256 tokens.

    Args:
        model: object exposing ``eval()``, ``device`` and ``generate()``.
        articles: iterable of article strings.

    Returns:
        list of summary strings, one per article, in input order.
    """
    model.eval()

    def _summarise_one(article):
        encoded = tokenizer(
            article,
            truncation=True,
            padding="longest",
            max_length=512,
            return_tensors="pt",
        ).to(model.device)
        generated = model.generate(
            **encoded,
            max_length=256,
            num_beams=4,
            early_stopping=True,
        )
        # Only the first returned sequence is decoded
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_summarise_one(article) for article in articles]

# 6. Take the first 10 test rows and summarise them with both models
first_rows    = tokenized_test[:10]          # slice rows, then pick columns
articles      = first_rows["article"]
ground_truths = first_rows["abstract"]

base_summaries      = generate_summaries(base_model, articles)
finetuned_summaries = generate_summaries(finetuned_model, articles)


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
!pip install together

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Defaulting to user installation because normal site-packages is not writeable
Collecting together
  Downloading together-1.5.8-py3-none-any.whl (88 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.7/88.7 KB[0m [31m484.2 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m:01[0m
Collecting rich<15.0.0,>=13.8.1
  Downloading rich-14.0.0-py3-none-any.whl (243 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.2/243.2 KB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m[31m1.3 MB/s[0m eta [36m0:00:01[0m
Collecting pillow<12.0.0,>=11.1.0
  Downloading pillow-11.2.1-cp310-cp310-manylinux_2_28_x86_64.whl (4.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m:01[0m:01[0m
Installing collected packages: pillow, rich, together
  Attempting uninstall: pillow
    Found existing installation: pillow 10.4.0
    Uninstall

In [9]:
import os
from getpass import getpass

# SECURITY: never hardcode the API key in the notebook — it is saved with the
# file and echoed into the cell output. The key that was previously written
# here must be treated as leaked and revoked.
# Use the key from the environment if present; otherwise prompt without echo.
if not os.environ.get("TOGETHER_API_KEY"):
    os.environ["TOGETHER_API_KEY"] = getpass("TOGETHER_API_KEY: ")

env: TOGETHER_API_KEY=<redacted>


In [10]:
from together import Together

# Module-level Together client; evaluate_with_llm() sends its judge requests
# through this object.
client = Together()  # reads TOGETHER_API_KEY automatically

In [11]:
# Inputs for the LLM judge, all produced by the inference cell:
#   articles            - the 10 original texts
#   finetuned_summaries - the 10 fine-tuned summaries
#   base_summaries      - the 10 base-model summaries
originals = articles

# Choose which summaries to evaluate here:
to_eval = list(zip(originals, finetuned_summaries))

In [13]:
import json

In [24]:
def generate_summaries(model, articles):
    """Summarise each article with ``model`` and return the decoded texts.

    This redefines (and therefore shadows) the helper of the same name from
    the earlier inference cell, so it puts the model in eval mode exactly as
    that definition does.

    Args:
        model: object exposing ``eval()``, ``device`` and ``generate()``.
        articles: iterable of article strings.

    Returns:
        list of summary strings, one per article, in input order.
    """
    model.eval()  # kept consistent with the earlier definition of this helper
    outs = []
    for art in articles:
        # One article per call, truncated to 512 tokens, on the model's device
        inputs = tokenizer(art, truncation=True, padding="longest", max_length=512, return_tensors="pt").to(model.device)
        sum_ids = model.generate(**inputs, max_length=256, num_beams=4, early_stopping=True)
        outs.append(tokenizer.decode(sum_ids[0], skip_special_tokens=True))
    return outs

# Summarise the first 10 test articles with the base and fine-tuned models
articles = tokenized_test[:10]["article"]   # slice rows, then pick the column
base_summaries = generate_summaries(base_model, articles)
finetuned_summaries = generate_summaries(finetuned_model, articles)

# Evaluation helpers
def truncate_to_tokens(text: str, max_chars: int = 2000) -> str:
    """Clip ``text`` to ``max_chars`` characters, appending "…" when clipped.

    Despite the function name, the limit is counted in characters, not tokens.
    Text that already fits is returned unchanged.
    """
    if len(text) > max_chars:
        return text[:max_chars] + "…"
    return text

def extract_json(content: str) -> dict:
    """Pull the first JSON object out of an LLM reply.

    Lookup order:
      1. an object inside a markdown fence labelled ``json``;
      2. otherwise the first position in ``content`` at which a complete JSON
         object can be decoded. Text after the object is ignored, so trailing
         prose containing braces no longer breaks parsing.

    Args:
        content: raw reply text.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: with message "No JSON found" if ``content`` has no "{".
        json.JSONDecodeError: (a ValueError subclass) if braces are present but
            nothing decodes as a JSON object.
    """
    # Local imports keep the helper usable on a fresh kernel, where no earlier
    # cell has imported `re`.
    import json
    import re

    fence = re.search(r"```json\s*(\{.*?\})\s*```", content, flags=re.DOTALL)
    if fence:
        return json.loads(fence.group(1))

    start = content.find("{")
    if start == -1:
        raise ValueError("No JSON found")

    decoder = json.JSONDecoder()
    last_error = None
    while start != -1:
        try:
            # raw_decode parses one value starting at `start` and stops there
            obj, _end = decoder.raw_decode(content, start)
            return obj
        except json.JSONDecodeError as err:
            last_error = err
            start = content.find("{", start + 1)
    raise last_error

def evaluate_with_llm(orig: str, summ: str, model_name="meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo") -> dict:
    """Ask an LLM judge to score a summary on fluency, factuality and coverage.

    The original text is clipped to 2000 characters before being placed in the
    prompt. The request goes through the module-level ``client``; if the reply
    is empty, it is retried once after a one-second pause.

    Args:
        orig: source article text.
        summ: generated summary to be judged.
        model_name: chat model identifier passed to the client.

    Returns:
        dict decoded from the judge's JSON reply by ``extract_json``.

    Raises:
        ValueError: if no JSON object can be extracted from the reply
            (including when both attempts return an empty reply).
    """
    # Local import: no earlier cell visible in this notebook imports `time`.
    import time

    orig_trunc = truncate_to_tokens(orig, 2000)
    prompt = f"""
You are an expert evaluator. Rate the following summary on Fluency, Factuality, and Coverage (1–5).

Original Text (truncated):
{orig_trunc}

Generated Summary:
{summ}

Respond only with a JSON object inside a markdown code fence labelled json:
```json
{{
  "fluency": <int>,
  "fluency_justification": "<brief>",
  "factuality": <int>,
  "factuality_justification": "<brief>",
  "coverage": <int>,
  "coverage_justification": "<brief>"
}}
``` """
    messages = [{"role": "user", "content": prompt}]
    text = ""
    for attempt in range(2):  # first try + one retry
        if attempt:
            time.sleep(1)
        resp = client.chat.completions.create(model=model_name, messages=messages)
        # Guard against a missing (None) content field before stripping
        text = (resp.choices[0].message.content or "").strip()
        if text:
            break
    return extract_json(text)

# Run evaluation on 10 samples
# Imported here because no earlier cell visible in this notebook imports them.
import time

import pandas as pd

score_keys = ("fluency", "factuality", "coverage")

records = []
# Walk the `articles` list built above instead of indexing
# tokenized_test["article"][i], which re-reads the whole column per sample.
for i, (orig, summ) in enumerate(zip(articles, finetuned_summaries)):
    try:
        scores = evaluate_with_llm(orig, summ)
    except Exception as e:
        print(f"Sample {i} failed:", e)
        continue
    scores["index"] = i
    records.append(scores)
    # .get(): a reply that omits a score prints None instead of aborting the run
    print(f"Sample {i} →", {k: scores.get(k) for k in score_keys})
    time.sleep(1)  # pause between judge requests

# Aggregate results
df = pd.DataFrame(records)
if df.empty:
    # Every sample failed: report an empty result instead of raising KeyError
    avg_scores = pd.Series(dtype=float)
else:
    avg_scores = df[list(score_keys)].mean()
print("Average Judge Scores:", avg_scores.to_dict())

Sample 0 → {'fluency': 4, 'factuality': 5, 'coverage': 2}
Sample 1 → {'fluency': 3, 'factuality': 4, 'coverage': 2}
Sample 2 → {'fluency': 1, 'factuality': 1, 'coverage': 1}
Sample 3 → {'fluency': 4, 'factuality': 5, 'coverage': 2}
Sample 4 → {'fluency': 4, 'factuality': 5, 'coverage': 2}
Sample 5 → {'fluency': 1, 'factuality': 2, 'coverage': 1}
Sample 6 → {'fluency': 1, 'factuality': 1, 'coverage': 1}
Sample 7 → {'fluency': 2, 'factuality': 3, 'coverage': 1}
Sample 8 → {'fluency': 5, 'factuality': 4, 'coverage': 2}
Sample 9 → {'fluency': 4, 'factuality': 2, 'coverage': 2}
Average Judge Scores: {'fluency': 2.9, 'factuality': 3.2, 'coverage': 1.6}


In [26]:
records = []

# Walk the `articles` list from the inference cell instead of indexing
# tokenized_test["article"][i], which re-reads the whole column per sample.
for i, (orig, summ) in enumerate(zip(articles, finetuned_summaries)):
    try:
        scores = evaluate_with_llm(orig, summ)
    except Exception as e:
        print(f"Sample {i} failed:", e)
        continue

    # Keep both scores and justifications. .get() is used so a judge reply
    # that omits a field yields None / "" instead of an uncaught KeyError.
    record = {"index": i}
    for metric in ("fluency", "factuality", "coverage"):
        record[metric] = scores.get(metric)
        record[metric + "_justification"] = scores.get(metric + "_justification", "")
    records.append(record)

    # Print everything
    print(f"\n--- Sample {i} ---")
    for label, metric in (
        ("Fluency:    ", "fluency"),
        ("Factuality: ", "factuality"),
        ("Coverage:   ", "coverage"),
    ):
        justification = record[metric + "_justification"]
        print(f"{label}{record[metric]}")
        print(f"  ↳ {justification}")



--- Sample 0 ---
Fluency:    4
  ↳ The summary is well-written and easy to understand, but it contains a repeated sentence which makes it slightly less fluent.
Factuality: 5
  ↳ The summary accurately represents the main ideas of the original text without introducing any errors or inaccuracies.
Coverage:   2
  ↳ The summary only covers a small portion of the original text, missing important details about the scheduling problems and machine environments.

--- Sample 1 ---
Fluency:    3
  ↳ The summary is mostly coherent, but it contains repetitive sentences and lacks a clear structure, making it somewhat difficult to follow.
Factuality: 4
  ↳ The summary accurately reports on the detection of the acceleration of gravity and the experimental approaches used, but it omits some details and context from the original text.
Coverage:   2
  ↳ The summary only covers a limited portion of the original text, focusing on the detection of the acceleration of gravity and the experimental approaches

## PART-B

In [80]:
import os
import logging
from typing import Dict, List, Any, Optional, Tuple
import pandas as pd
import requests
from langchain.pydantic_v1 import BaseModel, Field
from langchain_core.messages import HumanMessage, AIMessage
from langchain.tools import tool
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, END
from transformers import PegasusTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel
import torch
from datetime import datetime
import json

In [116]:
# research_agent.py

import os
import json
import re
import requests
import logging
from typing import List, Dict, Any

from datasets import load_dataset, DatasetDict
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from peft import PeftModel
from together import Together
from langgraph.graph import StateGraph, START, END

# --- 0. Logging & Together.ai Client Setup ---
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The key must come from the environment; fail fast when it is missing or empty.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY")
if TOGETHER_API_KEY is None or TOGETHER_API_KEY == "":
    raise RuntimeError("Please set TOGETHER_API_KEY in your environment")

client = Together(api_key=TOGETHER_API_KEY)

# --- 1. Load & Split ArXiv Summarization Dataset ---
raw = (
    load_dataset("ccdv/arxiv-summarization", split="train")
    .shuffle(seed=42)
    .select(range(5000))
)

# 80/10/10 split: hold out 20 %, then divide the hold-out evenly.
splits: DatasetDict = raw.train_test_split(test_size=0.20, seed=42)
holdout = splits["test"].train_test_split(test_size=0.50, seed=42)
# First half of the hold-out becomes validation, second half the final test set
splits["validation"] = holdout["train"]
splits["test"] = holdout["test"]

# 2. Pegasus tokenizer plus two model instances: the plain checkpoint, and a
#    second copy wrapped with the LoRA adapter stored in ./lora-pegasus-xsum
pegasus_tokenizer = PegasusTokenizer.from_pretrained(
    "google/pegasus-xsum"
)

base_pegasus = PegasusForConditionalGeneration.from_pretrained(
    "google/pegasus-xsum"
)

peft_base = PegasusForConditionalGeneration.from_pretrained(
    "google/pegasus-xsum"
)
finetuned_pegasus = PeftModel.from_pretrained(
    peft_base,
    "./lora-pegasus-xsum",
)
# Only the adapter-wrapped model is explicitly switched to eval mode here
finetuned_pegasus.eval()

# 3. Map tokenization function onto each split (keeping text)
max_input_length, max_target_length = 512, 256

def tokenize_batch(batch):
    """Tokenize a batch of articles/abstracts, keeping the raw text alongside.

    Args:
        batch: mapping with "article" and "abstract" lists of strings.

    Returns:
        dict with "input_ids" / "attention_mask" for the articles (padded or
        truncated to ``max_input_length``), "labels" for the abstracts (padded
        or truncated to ``max_target_length``, with padding positions replaced
        by -100 so the seq2seq loss skips them), and the unchanged "article"
        and "abstract" text.
    """
    enc = pegasus_tokenizer(
        batch["article"],
        truncation=True,
        padding="max_length",
        max_length=max_input_length
    )
    dec = pegasus_tokenizer(
        batch["abstract"],
        truncation=True,
        padding="max_length",
        max_length=max_target_length
    )
    pad_id = pegasus_tokenizer.pad_token_id
    # -100 is the label id the Hugging Face seq2seq loss ignores (see references)
    labels = [
        [tok if tok != pad_id else -100 for tok in seq]
        for seq in dec["input_ids"]
    ]
    return {
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "labels": labels,
        # keep text for inference
        "article": batch["article"],
        "abstract": batch["abstract"]
    }

tokenized_splits = splits.map(
    tokenize_batch,
    batched=True,
    remove_columns=[]  # keep all
)

# 4. Helper functions
# part3_search_agent.py  (excerpt)

import xmltodict
from lxml import etree
from xml.parsers.expat import ExpatError
import logging

logger = logging.getLogger(__name__)

def parse_arxiv_xml(xml_str: str) -> List[Dict[str, Any]]:
    """
    Parse arXiv Atom XML into a list of paper dicts.

    Each dict has the keys 'paperId', 'title', 'abstract' (taken from the
    feed's 'summary'), 'authors' (list of names), 'year' (int, 0 if it cannot
    be read from 'published') and 'url'.

    Parsing is attempted with xmltodict first; on an ExpatError the document
    is re-parsed through lxml's recovering parser and handed to xmltodict
    again. Any other failure is logged and yields an empty list.

    Args:
        xml_str: the Atom feed as text.

    Returns:
        List of normalized paper dicts; [] when nothing could be parsed.
    """
    def _as_list(node):
        """Wrap a lone child in a list; xmltodict only returns a list when an
        element is repeated (see references), so single entries/authors arrive
        as a bare dict."""
        if node is None:
            return []
        return node if isinstance(node, list) else [node]

    def _extract(parsed):
        """Normalize every feed entry of a parsed document to our schema."""
        feed = parsed.get("feed") or {}
        out = []
        for e in _as_list(feed.get("entry")):
            if not isinstance(e, dict):
                continue
            published = (e.get("published") or "")[:4]
            out.append({
                "paperId":  e.get("id", ""),
                "title":    (e.get("title") or "").strip(),
                # xmltodict gives you 'summary', rename to 'abstract'
                "abstract": (e.get("summary") or "").strip(),
                # A single <author> is a dict, several are a list of dicts
                "authors":  [
                    a.get("name")
                    for a in _as_list(e.get("author"))
                    if isinstance(a, dict)
                ],
                "year":     int(published) if published.isdecimal() else 0,
                "url":      e.get("id", ""),
            })
        return out

    # 1) Try strict xmltodict parse
    try:
        return _extract(xmltodict.parse(xml_str))
    except ExpatError as e:
        logger.warning(f"xmltodict failed: {e}; trying lxml recovery")
    except Exception as e:
        logger.error(f"Unexpected error in xmltodict parse: {e}")
        return []

    # 2) Fallback via lxml recovery
    try:
        parser = etree.XMLParser(recover=True)
        root = etree.fromstring(xml_str.encode("utf-8"), parser=parser)
        clean_bytes = etree.tostring(root, encoding="utf-8")
        return _extract(xmltodict.parse(clean_bytes))
    except Exception as e2:
        logger.error(f"lxml fallback failed: {e2}")
        return []



def dedupe_papers(papers):
    """Drop duplicate papers, keeping the first occurrence of each.

    A paper is identified by its "paperId", or by its "title" when the id is
    missing or empty. Papers with neither are discarded. Input order is kept.
    """
    first_by_key = {}
    for paper in papers:
        key = paper.get("paperId") or paper.get("title")
        if not key:
            continue
        # setdefault leaves an already-stored (earlier) paper untouched
        first_by_key.setdefault(key, paper)
    return list(first_by_key.values())

# simplistic extractors

def extract_section(text, section_name):
    """Return the text that follows a section heading, up to the next blank line.

    The heading is matched case-insensitively as a newline followed by
    ``section_name``. The name is treated as literal text (regex
    metacharacters are escaped), so names such as "Methods (A)" or "C++" work.

    Args:
        text: document text to search.
        section_name: literal heading text to look for at the start of a line.

    Returns:
        Everything between the heading and the next "\\n\\n" (or the next
        occurrence of the heading, whichever comes first); "" if the heading
        is not found.
    """
    heading = re.compile("\n" + re.escape(section_name), flags=re.IGNORECASE)
    parts = heading.split(text)
    return parts[1].split("\n\n")[0] if len(parts) > 1 else ""

def extract_contributions(text):
    """Return the first "we propose/introduce/present ..." clause in ``text``.

    The match is case-insensitive and runs up to and including the next full
    stop. Returns "" when no such clause exists.
    """
    pattern = re.compile(r"we (?:propose|introduce|present)[^.]+\.", re.IGNORECASE)
    hit = pattern.search(text)
    if hit is None:
        return ""
    return hit.group(0)

def extract_limitations(text):
    """Return the first "limitation(s) include/are ..." clause in ``text``.

    The match is case-insensitive and runs up to and including the next full
    stop. Returns "" when no such clause exists.
    """
    pattern = re.compile(r"limitations? (?:include|are)[^.]+\.", re.IGNORECASE)
    hit = pattern.search(text)
    if hit is None:
        return ""
    return hit.group(0)

# 5. LangGraph agent nodes

def expand_keywords_node(state):
    """Ask the LLM to expand the seed keywords into a list of search terms.

    Reads ``state['seed_keywords']`` and sends it to the chat model through
    the module-level ``client``. The reply is parsed as a JSON array; if it is
    not valid JSON, or is JSON but not an array, every double-quoted string in
    the reply is used instead.

    Args:
        state: graph state containing 'seed_keywords'.

    Returns:
        {"expanded_keywords": list of non-empty keyword strings}
    """
    msgs = [
        {"role": "system", "content": "Respond ONLY with a JSON array of strings."},
        {"role": "user", "content": f"Expand keywords into 8-12 terms: {state['seed_keywords']}"}
    ]
    resp = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
        messages=msgs, temperature=0
    )
    # Guard against a missing (None) content field
    raw = resp.choices[0].message.content or ""
    logger.info(f"Keywords raw: {raw}")
    try:
        arr = json.loads(raw)
    except json.JSONDecodeError:
        arr = None
    if not isinstance(arr, list):
        # Not a JSON array: fall back to every double-quoted string in the reply
        arr = re.findall(r"\"(.*?)\"", raw)
    # The search node slices this list and formats each item into a query,
    # so keep only non-empty strings.
    keywords = [kw.strip() for kw in arr if isinstance(kw, str) and kw.strip()]
    return {"expanded_keywords": keywords}


def search_papers_node(state):
    """Query the arXiv API for the first five expanded keywords.

    For each keyword up to 10 results are requested, parsed with
    ``parse_arxiv_xml`` and collected; the combined list is de-duplicated with
    ``dedupe_papers``. A keyword whose request fails (network error, timeout
    or HTTP error status) is logged and skipped so the remaining keywords are
    still searched.

    Args:
        state: graph state containing 'expanded_keywords'.

    Returns:
        {"raw_papers": list of unique paper dicts}
    """
    papers = []
    for kw in state['expanded_keywords'][:5]:
        try:
            r = requests.get(
                "http://export.arxiv.org/api/query",
                params={"search_query": f"all:{kw}", "max_results": 10},
                timeout=30,  # without a timeout a stalled connection blocks forever
            )
            r.raise_for_status()
        except requests.RequestException as err:
            logger.warning(f"arXiv query failed for {kw!r}: {err}")
            continue
        papers += parse_arxiv_xml(r.text)
    return {"raw_papers": dedupe_papers(papers)}


def rank_papers_node(state):
    """Score every paper in ``state["raw_papers"]`` and return the best five.

    score = 0.5 * (citationCount / 1000) + 0.5 * (year / 2025)

    A missing or ``None`` 'citationCount' / 'year' counts as 0. The score is
    written into each paper dict under "score" (the input dicts are modified
    in place). Papers with equal scores keep their input order.

    Args:
        state: graph state containing 'raw_papers' (iterable of dicts).

    Returns:
        {"top_papers": up to five paper dicts, highest score first}
    """
    def _score(paper):
        # `or 0` also covers keys that are present but set to None
        cites = paper.get("citationCount") or 0
        year = paper.get("year") or 0
        return 0.5 * (cites / 1000.0) \
             + 0.5 * (year / 2025)

    scored = []
    for p in state["raw_papers"]:
        p["score"] = _score(p)
        scored.append(p)
    # Pick top 5
    top = sorted(scored, key=lambda x: x["score"], reverse=True)[:5]
    return {"top_papers": top}

def summarize_papers_node(state):
    """Summarise the abstract of every paper in ``state['top_papers']``.

    Each abstract is summarised with the fine-tuned Pegasus model (input
    truncated to 512 tokens, 4-beam search, up to 256 output tokens) and also
    mined with the regex extractors for methodology, contributions and
    limitations.

    Returns:
        {'paper_summaries': list of dicts with 'title', 'authors', 'year',
         'summary', 'methodology', 'contributions', 'limitations'}
    """
    def _describe(paper):
        abstract = paper['abstract']
        encoded = pegasus_tokenizer(
            abstract, truncation=True, max_length=512, return_tensors='pt'
        ).to(finetuned_pegasus.device)
        generated = finetuned_pegasus.generate(**encoded, max_length=256, num_beams=4)
        summary = pegasus_tokenizer.decode(generated[0], skip_special_tokens=True)
        return {
            'title': paper['title'],
            'authors': paper['authors'],
            'year': paper['year'],
            'summary': summary,
            'methodology': extract_section(abstract, 'Methods'),
            'contributions': extract_contributions(abstract),
            'limitations': extract_limitations(abstract),
        }

    return {'paper_summaries': [_describe(paper) for paper in state['top_papers']]}

# 6. Orchestrate agents
graph = StateGraph(dict)

# Pipeline stages in execution order: (node name, node function)
_stages = [
    ('expand', expand_keywords_node),
    ('search', search_papers_node),
    ('rank', rank_papers_node),
    ('summ', summarize_papers_node),
]
for _name, _fn in _stages:
    graph.add_node(_name, _fn)

# Strictly linear flow: START -> expand -> search -> rank -> summ -> END
_route = [START] + [_name for _name, _ in _stages] + [END]
for _src, _dst in zip(_route, _route[1:]):
    graph.add_edge(_src, _dst)

agent = graph.compile()

# 7. Run demos
if __name__ == '__main__':
    # A) Run the four-stage agent graph on a seed keyword and dump its report
    result = agent.invoke({'seed_keywords': 'xai'})
    print("=== Multi-Agent Report ===")
    print(json.dumps(result.get('paper_summaries', []), indent=2))

    # B) Compare base vs. fine-tuned Pegasus on 10 held-out test articles
    print("=== Direct Pegasus Summaries (10 samples) ===")
    samples = tokenized_splits['test']['article'][:10]
    for idx, article in enumerate(samples, start=1):
        # The same encoded inputs (placed on the base model's device) feed both models
        inputs = pegasus_tokenizer(
            article, truncation=True, max_length=512, return_tensors='pt'
        ).to(base_pegasus.device)
        base_summary, fine_summary = (
            pegasus_tokenizer.decode(
                summarizer.generate(**inputs, max_length=256, num_beams=4)[0],
                skip_special_tokens=True,
            )
            for summarizer in (base_pegasus, finetuned_pegasus)
        )
        print(f"Sample {idx}:")
        print(f"Base Model:    {base_summary}")
        print(f"Fine-tuned:    {fine_summary}")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-16 08:29:58,573 - __main__ - INFO - Keywords raw: ["Explainable AI", "Artificial Intelligence Explainability", "Model Interpretability", "Machine Learning Transparency", "Deep Learning Explainability", "Neural Network Interpretability", "Feature Importance", "Partial Dependence Plots", "SHAP Values", "LIME", "Tree Ex

=== Multi-Agent Report ===
[
  {
    "title": "The Definitions of Interpretability and Learning of Interpretable Models",
    "authors": [
      "Weishen Pan",
      "Changshui Zhang"
    ],
    "year": 2021,
    "summary": "We propose a mathematical definition for the human-interpretable model, which can provide an entire decision-making process that is human-understandable. Experiments on image datasets show the advantages of our proposed model in two aspects: 1) The completely human-interpretable model can provide an entire decision-making process that is human-understandable; 2) The completely human-interpretable model is more robust against adversarial attacks.",
    "methodology": "",
    "contributions": "we propose a mathematical definition for the human-interpretable model.",
    "limitations": ""
  },
  {
    "title": "Bi-interpretation in weak set theories",
    "authors": [
      "Alfredo Roque Freire",
      "Joel David Hamkins"
    ],
    "year": 2020,
    "summary": "In 