<a href="https://colab.research.google.com/github/mehakbangwal/hybrid_text_summarizer/blob/main/hybrid_text_summarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install all third-party dependencies in a single resolver pass.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install sumy transformers datasets rouge-score nltk beautifulsoup4


Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m52.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadability: filename=brea

In [None]:
import re
import nltk
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

from transformers import T5Tokenizer, T5ForConditionalGeneration
from rouge_score import rouge_scorer

# Fetch the NLTK resources this notebook relies on: tokenizer data
# (punkt / punkt_tab), the English stop-word list and WordNet for the lemmatizer.
# punkt_tab is fetched here as well so tokenization works on a fresh kernel
# without a separate patch-up cell.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Module-level helpers shared by preprocess_text; built once so the
# function does not rebuild them on every call.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized content words.

    Steps: strip HTML markup, lowercase, replace every character that is not
    ``a-z`` or whitespace with a space, tokenize, drop English stop words and
    lemmatize what remains.

    Args:
        text: Raw input string, possibly containing HTML.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML
    text = text.lower()
    # Substitute a space (not the empty string) so that words joined by
    # punctuation or digits, e.g. "decision-making", are not glued together.
    text = re.sub(r'[^a-z\s]', ' ', text)  # Remove numbers/special chars
    words = word_tokenize(text)
    # After the regex only letters remain, so no separate punctuation filter is needed.
    cleaned_words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return ' '.join(cleaned_words)

def get_sentences(text):
    """Return the sentences that NLTK's ``sent_tokenize`` finds in ``text``."""
    sentences = sent_tokenize(text)
    return sentences


In [None]:
# Extractive using TextRank
def extractive_summary(text, num_sentences=3):
    """Pick ``num_sentences`` sentences from ``text`` with sumy's TextRank summarizer.

    Args:
        text: Plain text to summarize.
        num_sentences: Sentence count passed to the summarizer (default 3).

    Returns:
        The selected sentences joined by single spaces.
    """
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    top_sentences = TextRankSummarizer()(document, num_sentences)
    return " ".join(map(str, top_sentences))

# Abstractive using T5
# (tokenizer, model) pairs keyed by checkpoint name, so repeated calls reuse
# the already-loaded weights instead of reloading them every time.
_T5_CACHE = {}


def abstractive_summary(text):
    """Generate an abstractive summary of ``text`` with the ``t5-small`` checkpoint.

    The tokenizer and model are loaded on first use and cached for later calls.
    The input is prefixed with ``"summarize: "``, truncated to 512 tokens and
    decoded with 4-beam search (30 to 120 generated tokens).

    Args:
        text: Text to summarize.

    Returns:
        The decoded summary, or ``""`` when ``text`` is empty or whitespace-only
        (there is nothing to summarize, so the model is not invoked).
    """
    if not text or not text.strip():
        return ""

    model_name = "t5-small"
    if model_name not in _T5_CACHE:
        _T5_CACHE[model_name] = (
            T5Tokenizer.from_pretrained(model_name),
            T5ForConditionalGeneration.from_pretrained(model_name),
        )
    tokenizer, model = _T5_CACHE[model_name]

    input_text = "summarize: " + text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True)

    summary_ids = model.generate(input_ids, num_beams=4, min_length=30, max_length=120, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
def evaluate_summary(reference, candidate):
    """Score ``candidate`` against ``reference`` with ROUGE-1, ROUGE-2 and ROUGE-L.

    Stemming is enabled on the scorer. Returns whatever ``RougeScorer.score``
    produces for the three metrics.
    """
    metrics = ['rouge1', 'rouge2', 'rougeL']
    return rouge_scorer.RougeScorer(metrics, use_stemmer=True).score(reference, candidate)


In [None]:
# Additional NLTK tokenizer resource, fetched separately from 'punkt'.
nltk.download("punkt_tab")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Your raw text (replace or add file support later)
raw_text = """
Artificial Intelligence (AI) is transforming industries by automating processes, analyzing data, and improving decision-making.
However, it raises concerns around bias, privacy, and job displacement.
To address these, researchers are working on ethical frameworks and explainable AI.
"""

# Step 1: Preprocess
# The cleaned text is kept for inspection only. It has stop words removed and
# words lemmatized, so it is no longer natural language and must not be fed to T5.
clean_text = preprocess_text(raw_text)

# Step 2: Run Extractive and Abstractive Summarization
# Both summarizers receive the original sentences; given the cleaned token
# stream, T5 merely echoes its input back instead of summarizing.
extractive = extractive_summary(raw_text)
abstractive = abstractive_summary(raw_text)

# Step 3: Print Results
print("🔹 Extractive Summary:\n", extractive)
print("\n🔹 Abstractive Summary:\n", abstractive)

# Step 4: Evaluate
# NOTE: the source text itself serves as the ROUGE reference here, so the
# scores measure overlap with the input rather than with a gold summary.
scores_ex = evaluate_summary(raw_text, extractive)
scores_ab = evaluate_summary(raw_text, abstractive)
for label, scores in (("Extractive", scores_ex), ("Abstractive", scores_ab)):
    print(f"\n📊 ROUGE Evaluation ({label}):")
    for k, v in scores.items():
        print(f"{k}: Precision={v.precision:.2f}, Recall={v.recall:.2f}, F1={v.fmeasure:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

🔹 Extractive Summary:
 Artificial Intelligence (AI) is transforming industries by automating processes, analyzing data, and improving decision-making. However, it raises concerns around bias, privacy, and job displacement. To address these, researchers are working on ethical frameworks and explainable AI.

🔹 Abstractive Summary:
 artificial intelligence ai transforming industry automating process analyzing data improving decision making however raise concern around bias privacy job displacement address researcher working ethical framework explainable ai.

📊 ROUGE Evaluation (Extractive):
rouge1: Precision=1.00, Recall=1.00, F1=1.00
rouge2: Precision=1.00, Recall=1.00, F1=1.00
rougeL: Precision=1.00, Recall=1.00, F1=1.00

📊 ROUGE Evaluation (Abstractive):
rouge1: Precision=1.00, Recall=0.73, F1=0.84
rouge2: Precision=0.62, Recall=0.44, F1=0.52
rougeL: Precision=1.00, Recall=0.73, F1=0.84


In [None]:
# PyMuPDF provides the `fitz` module used for PDF reading.
# %pip installs into the environment of the running kernel.
%pip install PyMuPDF


Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.5


In [None]:
from google.colab import files
import fitz  # PyMuPDF for PDFs

uploaded = files.upload()  # Choose PDF or TXT
# A cancelled upload yields an empty mapping; fail with a clear message
# instead of an IndexError on the lookup below.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and choose a PDF or TXT file.")
filename = next(iter(uploaded))  # name of the first uploaded file

def extract_text_from_file(filename):
    """Read the text content of a PDF or TXT file.

    The extension is matched case-insensitively, so ``report.PDF`` and
    ``notes.TXT`` are accepted as well.

    Args:
        filename: Path of the file to read.

    Returns:
        For PDFs, the text of every page concatenated in page order; for TXT
        files, the file content decoded as UTF-8.

    Raises:
        ValueError: If the file is neither a ``.pdf`` nor a ``.txt`` file.
    """
    lowered = filename.lower()
    if lowered.endswith(".pdf"):
        with fitz.open(filename) as doc:
            # join avoids building the result by repeated string concatenation
            return "".join(page.get_text() for page in doc)
    if lowered.endswith(".txt"):
        with open(filename, 'r', encoding='utf-8') as file:
            return file.read()
    raise ValueError("Unsupported file format. Please upload a PDF or TXT file.")

# Pull the text out of the uploaded file; raw_text feeds the summarization cells below.
raw_text = extract_text_from_file(filename=filename)
print("✅ File uploaded and text extracted!")


Saving Artificial intelligence (AI) refers.txt to Artificial intelligence (AI) refers.txt
✅ File uploaded and text extracted!


In [None]:
# Step 1: Preprocess
# Kept for inspection only: the cleaned text has stop words removed and words
# lemmatized, so it is not suitable input for the T5 summarizer.
clean_text = preprocess_text(raw_text)

# Step 2: Run Summarizers
# Both summarizers work on the original sentences, matching what the Gradio
# app's summarize() does.
extractive = extractive_summary(raw_text)
abstractive = abstractive_summary(raw_text)

# Step 3: Display Results
print("🔹 Extractive Summary:\n", extractive)
print("\n🔹 Abstractive Summary:\n", abstractive)


🔹 Extractive Summary:
 It is a field of research in computer science that develops and studies methods and software that enable machines to perceive their environment and use learning and intelligence to take actions that maximize their chances of achieving defined goals. High-profile applications of AI include advanced web search engines (e.g., Google Search); recommendation systems (used by YouTube, Amazon, and Netflix); virtual assistants (e.g., Google Assistant, Siri, and Alexa); autonomous vehicles (e.g., Waymo); generative and creative tools (e.g., ChatGPT and AI art); and superhuman play and analysis in strategy games (e.g., chess and Go). The emergence of advanced generative AI in the midst of the AI boom and its ability to create and modify content exposed several unintended consequences and harms in the present and raised concerns about the risks of AI and its long-term effects in the future, prompting discussions about regulatory policies to ensure the safety and benefits of

In [None]:
# Dependencies for the standalone Gradio app cell below. rouge-score is
# included because that cell imports rouge_score and fails with
# ModuleNotFoundError on a fresh runtime without it.
%pip install gradio pymupdf transformers nltk sumy rouge-score



Collecting gradio
  Downloading gradio-5.29.1-py3-none-any.whl.metadata (16 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-p

In [None]:
import gradio as gr
import fitz  # PyMuPDF
import nltk
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from rouge_score import rouge_scorer

# Tokenizer data, presumably used by sumy's Tokenizer("english").
# punkt_tab is fetched too so this cell also works on a fresh runtime.
nltk.download("punkt")
nltk.download("punkt_tab")

# Load T5 model and tokenizer once
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

def extractive_summary(text, num_sentences=3):
    """Return a TextRank extractive summary of ``text``.

    Args:
        text: Plain text to summarize.
        num_sentences: Sentence count handed to the sumy summarizer (default 3).

    Returns:
        The chosen sentences, converted to strings and joined with spaces.
    """
    rank_sentences = TextRankSummarizer()
    document = PlaintextParser.from_string(text, Tokenizer("english")).document
    picked = [str(sentence) for sentence in rank_sentences(document, num_sentences)]
    return " ".join(picked)

def abstractive_summary(text):
    """Summarize ``text`` with the module-level T5 ``model`` and ``tokenizer``.

    The text is stripped, newlines are replaced by spaces, and the result is
    prefixed with ``"summarize: "`` and truncated to 512 tokens before generation.

    Returns:
        The first generated sequence decoded without special tokens.
    """
    flattened = text.strip().replace("\n", " ")
    prompt = "summarize: " + flattened
    encoded = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

    generation_options = dict(
        max_length=150,
        min_length=30,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    generated = model.generate(encoded, **generation_options)
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

def evaluate_summary(reference, candidate):
    """Format the ROUGE-1/2/L scores of ``candidate`` against ``reference``.

    Returns:
        One line per metric, giving precision, recall and F1 rounded to two
        decimals, joined by newlines.
    """
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    report_lines = []
    for metric, score in rouge.score(reference, candidate).items():
        report_lines.append(
            f"{metric}: Precision={score.precision:.2f}, Recall={score.recall:.2f}, F1={score.fmeasure:.2f}"
        )
    return "\n".join(report_lines)

def _read_uploaded_text(file):
    """Return the text of an uploaded PDF/TXT, or ``None`` for any other extension.

    ``file`` may be a filesystem path string, or an upload object exposing
    ``.name`` and optionally the raw bytes in ``.data``.
    """
    payload = getattr(file, "data", None)
    path = file if isinstance(file, str) else getattr(file, "name", "")
    ext = str(path).rsplit(".", 1)[-1].lower()

    if ext == "pdf":
        if payload is not None:
            opened = fitz.open(stream=payload, filetype="pdf")
        else:
            opened = fitz.open(path)
        # The context manager closes the document once the pages have been read.
        with opened as doc:
            return "".join(page.get_text() for page in doc)
    if ext == "txt":
        if payload is not None:
            return payload.decode("utf-8")
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    return None


def summarize(file, direct_text, num_sentences):
    """Gradio callback: summarize typed text or an uploaded PDF/TXT file.

    Typed text takes priority when it is longer than 10 characters after
    stripping; otherwise the uploaded file is read.

    Args:
        file: Uploaded file (path string or upload object), or ``None``.
        direct_text: Text typed into the UI, possibly empty.
        num_sentences: Number of sentences for the extractive summary.

    Returns:
        A 4-tuple ``(extractive_summary, abstractive_summary,
        extractive_rouge, abstractive_rouge)``. When there is nothing to
        summarize, the same user-facing message is repeated in all four slots.
    """
    if direct_text and len(direct_text.strip()) > 10:
        text_to_summarize = direct_text.strip()
    elif file is not None:
        text_to_summarize = _read_uploaded_text(file)
        if text_to_summarize is None:
            return ("Unsupported file type",) * 4
        # e.g. an empty TXT file or a PDF with no extractable text: avoid
        # sending blank input to the summarizers.
        if not text_to_summarize.strip():
            return ("No readable text was found in the uploaded file.",) * 4
    else:
        return ("Please upload a file or enter some text to summarize.",) * 4

    ext_sum = extractive_summary(text_to_summarize, num_sentences)
    abs_sum = abstractive_summary(text_to_summarize)

    ext_eval = evaluate_summary(text_to_summarize, ext_sum)
    abs_eval = evaluate_summary(text_to_summarize, abs_sum)

    return ext_sum, abs_sum, ext_eval, abs_eval

# Build the Gradio UI. Components are created inside nested Blocks/Row/Column
# contexts, so their creation order below determines the page layout.
with gr.Blocks(theme=gr.themes.Base(primary_hue="blue")) as demo:
    # Page header and short usage description
    gr.Markdown("<h1 style='text-align:center; color:#003366;'>📘 Hybrid Text Summarizer</h1>")
    gr.Markdown("<p style='text-align:center;'>Upload a PDF or TXT file, or enter text directly to get extractive and abstractive summaries with evaluation.</p>")

    with gr.Row():
        # First column: inputs (file upload, free text, sentence count) and the trigger button
        with gr.Column():
            file_input = gr.File(label="📁 Upload File (.pdf / .txt)", file_types=[".pdf", ".txt"])
            direct_text_input = gr.Textbox(label="✍️ Or enter text directly here", lines=10, placeholder="Type or paste text here to summarize...")
            sentence_slider = gr.Slider(1, 10, value=3, step=1, label="🧠 Sentences for Extractive Summary")
            summarize_button = gr.Button("🔍 Summarize Now")

        # Second column: the two summaries, each followed by its ROUGE report
        with gr.Column():
            ext_output = gr.Textbox(label="🧾 Extractive Summary", lines=10)
            ext_eval_output = gr.Textbox(label="📊 ROUGE Scores (Extractive)", lines=6)
            abs_output = gr.Textbox(label="📄 Abstractive Summary", lines=10)
            abs_eval_output = gr.Textbox(label="📊 ROUGE Scores (Abstractive)", lines=6)

    # Wire the button to summarize(). The outputs list follows the order of
    # summarize's return tuple (ext_sum, abs_sum, ext_eval, abs_eval), which
    # differs from the on-screen order of the output boxes.
    summarize_button.click(
        summarize,
        inputs=[file_input, direct_text_input, sentence_slider],
        outputs=[ext_output, abs_output, ext_eval_output, abs_eval_output]
    )

    # Footer credit line
    gr.Markdown("<p style='text-align: center;'>✨ Built using Gradio, Hugging Face, PyMuPDF, and Sumy</p>")

# Start the app server for the interface defined above
demo.launch()




ModuleNotFoundError: No module named 'rouge_score'