In [None]:
# Install Transformers & Dataset Libraries
!pip install transformers datasets torch nltk rouge_score -q


In [None]:
!pip install ir_datasets transformers datasets sentencepiece torch biopython -q


In [None]:
import ir_datasets

# Open the "pmc/v1/trec-cds-2014" collection through ir_datasets.
trec_cds = ir_datasets.load("pmc/v1/trec-cds-2014")

# Walk the whole document iterator and keep only the three fields used
# downstream, one plain dict per document.
trec_cds_docs = [
    dict(doc_id=entry.doc_id, title=entry.title, abstract=entry.abstract)
    for entry in trec_cds.docs_iter()
]


In [None]:
import tarfile
import os

# Location of the uploaded archive and the directory it is unpacked into.
tar_path = "/content/ohsumed-first-20000-docs.tar.gz"  # Update this if filename differs
extract_path = "/content/OHSUMED"

# Extract the tar.gz file.
with tarfile.open(tar_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The "data" filter rejects members that would be written outside
        # extract_path (absolute paths, "..", escaping links) and avoids the
        # deprecation warning raised when no filter is given.
        tar.extractall(extract_path, filter="data")
    else:
        # Older interpreters have no extraction filters; only use trusted archives.
        tar.extractall(extract_path)

print("✅ Extraction complete! Files are in:", extract_path)

# List extracted files
print("Extracted Files:", os.listdir(extract_path))


In [None]:
import os

# Print the extracted tree: one header line per directory, followed by one
# indented line for every file directly inside it.
for root, _subdirs, files in os.walk(extract_path):
    listing = [f"📂 Folder: {root}"]
    listing.extend(f"  📄 File: {file}" for file in files)
    print("\n".join(listing))


In [None]:
import os

# Root directory of the extracted OHSUMED dataset
root_dir = "/content/OHSUMED/ohsumed-first-20000-docs"


def parse_ohsumed_text(content, doc_id):
    """Turn the raw text of one OHSUMED file into a document record.

    The first line of the stripped text is used as the title and the remaining
    lines, joined with single spaces, as the abstract.

    Args:
        content: Full text read from the file.
        doc_id: Identifier stored with the record (the loader passes the filename).

    Returns:
        A dict with the keys "doc_id", "title" and "abstract". An empty or
        whitespace-only file yields the placeholders "Untitled" and
        "No abstract available."; a file with a single line yields the
        abstract placeholder only.
    """
    lines = content.strip().split("\n")
    # str.split always returns at least one element, so a length check can
    # never detect a missing title; fall back on the empty string instead.
    title = lines[0].strip() or "Untitled"
    abstract = " ".join(lines[1:]).strip() or "No abstract available."
    return {"doc_id": doc_id, "title": title, "abstract": abstract}


# Store extracted documents
ohsumed_docs = []

# Recursively walk through all folders and collect text files
for root, dirs, files in os.walk(root_dir):
    # Sorting in place makes os.walk descend in a stable order, so the slices
    # taken from the document list later are the same on every run.
    dirs.sort()
    for file in sorted(files):
        file_path = os.path.join(root, file)

        # Only read/decode failures are expected here; anything else should surface.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        except (OSError, UnicodeError) as e:
            print(f"❌ Skipping file {file_path}: {e}")
            continue

        # Using filename as document ID
        ohsumed_docs.append(parse_ohsumed_text(content, doc_id=file))

print(f"✅ Successfully loaded {len(ohsumed_docs)} documents from OHSUMED.")


In [None]:
# Build the combined corpus: every OHSUMED record first, followed by every
# TREC CDS 2014 record, in a new list that leaves both inputs untouched.
documents = []
documents.extend(ohsumed_docs)
documents.extend(trec_cds_docs)

print(f"✅ Total documents available: {len(documents)}")


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Fetch the tokenizer first and then the seq2seq model, both from the same
# "facebook/bart-large-cnn" checkpoint.
tokenizer, model = (
    loader.from_pretrained("facebook/bart-large-cnn")
    for loader in (BartTokenizer, BartForConditionalGeneration)
)


In [None]:
def summarize_text(text, max_length=150, min_length=50, length_penalty=2.0, num_beams=4):
    """Generate an abstractive summary of ``text`` with the loaded BART model.

    Uses the module-level ``tokenizer`` and ``model``. The input is truncated
    to at most 1024 tokens before generation.

    Args:
        text: Passage to summarize.
        max_length: Passed to ``model.generate`` as ``max_length``.
        min_length: Passed to ``model.generate`` as ``min_length``.
        length_penalty: Passed to ``model.generate`` as ``length_penalty``.
        num_beams: Passed to ``model.generate`` as ``num_beams``.

    Returns:
        The decoded summary string, or "" when ``text`` is None, empty or
        whitespace-only.
    """
    # With nothing to summarize, min_length would still force the model to
    # produce text unrelated to any input, so skip generation.
    if not text or not text.strip():
        return ""

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest", max_length=1024)
    summary_ids = model.generate(
        **inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
    )
    # Only one input was encoded, so the first sequence is the summary.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize a few sample documents with the BART model
num_samples = 3  # Adjust as needed
for i, doc in enumerate(documents[:num_samples]):
    print(f"\n🔹 Document {i+1}: {doc['title']}\n")
    print(f"📄 Full Abstract:\n{doc['abstract']}\n")

    summary = summarize_text(doc["abstract"])
    # summarize_text generates new text with BART rather than selecting
    # sentences, so the output is labelled abstractive. This also keeps it
    # distinguishable from the extractive summaries printed later on.
    print(f"✅ Abstractive Summary:\n{summary}\n{'-'*80}")


In [None]:
!pip install evaluate -q

In [None]:
import evaluate

# ROUGE scorer from the `evaluate` package.
rouge = evaluate.load("rouge")

# Score the BART summaries of the first three documents.
# NOTE(review): each reference is the very abstract that was summarized, so the
# scores measure overlap with the source text, not with an independent summary.
references = [doc["abstract"] for doc in documents[:3]]
hypotheses = [summarize_text(abstract) for abstract in references]

results = rouge.compute(predictions=hypotheses, references=references)

# Emit the header and one "name: score" line per metric, four decimals each.
report = ["🔍 ROUGE Scores:"]
report.extend(f"{key}: {value:.4f}" for key, value in results.items())
print("\n".join(report))


In [None]:
!pip install transformers datasets torch nltk rouge_score evaluate bert-extractive-summarizer -q


In [None]:
from summarizer import Summarizer

# Create the extractive summarizer from the `summarizer` package using its
# default settings; the instance is reused by every call further down.
# NOTE(review): this was labelled "BERTSUMEXT", but `summarizer` is presumably
# the bert-extractive-summarizer package installed above, which is a different
# method from the BertSumExt model — confirm and rename if so.
bert_summarizer = Summarizer()


In [None]:
def extractive_summarize(text, num_sentences=3):
    """Return an extractive summary of ``text`` produced by ``bert_summarizer``.

    Args:
        text: Passage to summarize.
        num_sentences: Forwarded to the summarizer as its ``num_sentences``
            keyword; defaults to 3.

    Returns:
        Whatever the module-level ``bert_summarizer`` returns for this input.
    """
    return bert_summarizer(text, num_sentences=num_sentences)

# Preview the extractive summarizer on the leading documents of the corpus.
num_samples = 4  # Change this to summarize more abstracts
preview_docs = documents[:num_samples]

for i, doc in enumerate(preview_docs):
    # Show the source first so it appears even while the summary is computed.
    print(f"\n🔹 Document {i+1}: {doc['title']}\n")
    print(f"📄 Full Abstract:\n{doc['abstract']}\n")

    summary = extractive_summarize(doc["abstract"])
    print(f"✅ Extractive Summary:\n{summary}\n{'-'*80}")


In [None]:
import evaluate

# ROUGE scorer from the `evaluate` package.
rouge = evaluate.load("rouge")

# Score the extractive summaries of the first `num_samples` documents.
# NOTE(review): the references are the abstracts being summarized, so the
# scores reflect overlap with the source rather than with a gold summary.
references = [doc["abstract"] for doc in documents[:num_samples]]
hypotheses = list(map(extractive_summarize, references))

results = rouge.compute(predictions=hypotheses, references=references)

print("🔍 ROUGE Scores:")
for key, value in results.items():
    # One line per metric, rounded to four decimals.
    print(f"{key}: {value:.4f}")
