In [2]:
import numpy as np
import pandas as pd
import networkx as nx
import nltk
import re
from nltk.tokenize import word_tokenize
from rouge import Rouge
from pyiwn import IndoWordNet, Language
from tqdm import tqdm  # ✅ Import tqdm for progress bars

# Ensure NLTK resources are downloaded.
# NOTE: this runs as a module-level side effect every time the file is
# imported or executed. Presumably 'punkt' is the data that word_tokenize
# relies on -- verify against the installed NLTK version.
nltk.download('punkt', quiet=True)

class HindiTextSummarizer:
    """Extractive summarizer for Hindi text.

    Sentences are scored with PageRank over a graph whose edge weights are
    derived from IndoWordNet synsets shared between sentences, then selected
    greedily in rank order until the word budget is used up.
    """

    def __init__(self, summary_length=100):
        """Create a summarizer.

        Args:
            summary_length: Maximum number of whitespace-separated words a
                generated summary may contain.
        """
        self.summary_length = summary_length
        # Not used by any method of this class; kept so existing code that
        # reads the attribute keeps working.
        self.rouge = Rouge()
        # Constructing IndoWordNet loads the Hindi synset data, so prefer to
        # reuse one summarizer instance across documents.
        self.iwn = IndoWordNet(lang=Language.HINDI)

    def preprocess_text(self, text):
        """Split *text* into unique, tokenized sentences.

        Args:
            text: Raw article text. ``None``/NaN yields an empty list; any
                other value is converted with ``str``.

        Returns:
            A list of sentences (tokens joined by single spaces) in document
            order. Sentences with fewer than three words and exact duplicates
            are dropped.
        """
        if pd.isna(text):
            return []

        text = re.sub(r'\s+', ' ', str(text)).strip()

        cleaned_sentences = []
        seen = set()  # O(1) duplicate detection; the list keeps the order
        for sentence in re.split(r'[।!?.]', text):
            sentence = sentence.strip()
            if len(sentence.split()) < 3:
                continue
            try:
                words = word_tokenize(sentence, language="hindi")
            except Exception:
                # Best effort: fall back to whitespace tokens when the
                # tokenizer (or its language data) is unavailable.
                words = sentence.split()
            cleaned_sentence = " ".join(words)
            if cleaned_sentence and cleaned_sentence not in seen:
                seen.add(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)

        return cleaned_sentences

    def _synset_sets(self, words):
        """Return ``{word: set_of_synsets}`` using one WordNet lookup per word."""
        lookup = {}
        for word in words:
            try:
                lookup[word] = set(self.iwn.synsets(word))
            except Exception:
                # A failed (or unhashable) lookup contributes nothing to the
                # score, matching the previous skip-on-error behavior.
                lookup[word] = set()
        return lookup

    def wordnet_similarity(self, s1, s2):
        """Score how many synsets the words of two sentences share.

        Every pair ``(word1, word2)`` of distinct words from the two sentences
        adds the number of synsets common to both words. The total is divided
        by ``sqrt(|words1|) * sqrt(|words2|)``.

        Args:
            s1: First sentence (whitespace-separated tokens).
            s2: Second sentence (whitespace-separated tokens).

        Returns:
            The normalized score, or ``0`` when either sentence has no words.
        """
        words1 = set(s1.split())
        words2 = set(s2.split())

        denominator = np.sqrt(len(words1)) * np.sqrt(len(words2))
        if denominator == 0:
            return 0

        # Look every word up once instead of once per word pair. The two
        # sentences are resolved by separate lookups on purpose, so the
        # comparison below sees the same objects a per-pair lookup would.
        # NOTE(review): the intersection relies on pyiwn Synset objects being
        # hashable and comparing equal across separate lookups -- confirm;
        # otherwise this score is always 0.
        synsets1 = self._synset_sets(words1)
        synsets2 = [found for found in self._synset_sets(words2).values() if found]

        similarity_score = 0
        for first in synsets1.values():
            if not first:
                continue
            for second in synsets2:
                similarity_score += len(first & second)

        return similarity_score / denominator

    def build_similarity_matrix(self, sentences):
        """Build the pairwise sentence-similarity matrix.

        Args:
            sentences: List of preprocessed sentences.

        Returns:
            A square ``numpy`` array with ``wordnet_similarity`` scores off
            the diagonal and zeros on it.
        """
        num_sentences = len(sentences)
        similarity_matrix = np.zeros((num_sentences, num_sentences))

        # The score is symmetric, so compute the upper triangle and mirror it.
        for i in range(num_sentences):
            for j in range(i + 1, num_sentences):
                score = self.wordnet_similarity(sentences[i], sentences[j])
                similarity_matrix[i, j] = score
                similarity_matrix[j, i] = score
        return similarity_matrix

    def rank_sentences(self, sentences, similarity_matrix):
        """Rank sentences by PageRank over the similarity graph.

        Args:
            sentences: List of preprocessed sentences.
            similarity_matrix: Square matrix from ``build_similarity_matrix``.

        Returns:
            A list of ``(score, sentence)`` tuples sorted in descending order
            (ties are broken by the sentence text, also descending).
        """
        graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(graph, alpha=0.85)

        sorted_sentences = sorted(
            ((scores[i], sentence) for i, sentence in enumerate(sentences)),
            reverse=True
        )
        return sorted_sentences

    def generate_summary(self, text):
        """Produce an extractive summary of *text*.

        Sentences are taken in rank order. A sentence is added only if it
        fits in the remaining word budget and its similarity to every
        sentence already chosen is below 0.7.

        Args:
            text: Raw article text.

        Returns:
            The chosen sentences joined by spaces (in rank order), or ``""``
            when no usable sentence is found.
        """
        sentences = self.preprocess_text(text)
        if not sentences:
            return ""

        similarity_matrix = self.build_similarity_matrix(sentences)
        ranked_sentences = self.rank_sentences(sentences, similarity_matrix)

        # preprocess_text returns unique sentences, so each one maps to a
        # single row of the matrix; reuse those scores for the redundancy
        # check instead of recomputing them.
        position = {sentence: i for i, sentence in enumerate(sentences)}

        summary = []
        chosen = []  # matrix indices of the sentences in `summary`
        word_count = 0
        for _score, sentence in ranked_sentences:
            length = len(sentence.split())
            # Cheap budget check first.
            if word_count + length > self.summary_length:
                continue
            row = similarity_matrix[position[sentence]]
            if all(row[k] < 0.7 for k in chosen):
                summary.append(sentence)
                chosen.append(position[sentence])
                word_count += length
                if word_count >= self.summary_length:
                    break

        return " ".join(summary)

def compute_average_rouge(summarizer, df_subset):
    """Summarize every document in *df_subset* and average the ROUGE scores.

    Args:
        summarizer: Object exposing ``generate_summary(text) -> str``.
        df_subset: DataFrame with ``article`` and ``summary`` columns; the
            ``summary`` column holds the reference summaries.

    Returns:
        A dict keyed by ``"rouge-1"``, ``"rouge-2"`` and ``"rouge-l"``; each
        value is a dict with the mean ``"p"``, ``"r"`` and ``"f"`` over the
        documents that could be scored (``0`` when none could).
    """
    rouge = Rouge()
    rouge_1, rouge_2, rouge_l = [], [], []
    total = len(df_subset)

    for idx, row in enumerate(df_subset.itertuples(), start=1):
        generated_summary = summarizer.generate_summary(row.article)

        if generated_summary.strip():
            try:
                scores = rouge.get_scores(generated_summary, row.summary)[0]
                per_metric = (scores["rouge-1"], scores["rouge-2"], scores["rouge-l"])
            except Exception as exc:
                # Best effort: a document that cannot be scored is left out of
                # the averages, but the failure is reported instead of hidden.
                print(f"  Skipped document {idx}: ROUGE scoring failed ({exc!r})")
            else:
                # Append all three together so the lists never get out of sync.
                rouge_1.append(per_metric[0])
                rouge_2.append(per_metric[1])
                rouge_l.append(per_metric[2])

        # Reported for every tenth document, including ones that were skipped.
        if idx % 10 == 0:
            print(f"  🔹 Processed {idx}/{total} documents...")

    def avg(metric_scores):
        """Average precision/recall/F1 over the collected score dicts."""
        if not metric_scores:
            return {"p": 0, "r": 0, "f": 0}
        return {
            key: np.mean([s[key] for s in metric_scores])
            for key in ("p", "r", "f")
        }

    return {
        "rouge-1": avg(rouge_1),
        "rouge-2": avg(rouge_2),
        "rouge-l": avg(rouge_l)
    }

def main():
    """Evaluate the summarizer on growing prefixes of ``test.csv``.

    Reads up to 200 rows, keeps rows that have a reference summary and an
    article of at least 200 words, then reports average ROUGE scores for the
    first 30, 50 and 100 remaining documents. The scores are printed and
    written to ``wordnet_rouge_scores.csv``.
    """
    try:
        df = pd.read_csv("test.csv", encoding="utf-8", nrows=200)
    except Exception as e:
        print(f"❌ Error reading dataset: {e}")
        return

    df = df.dropna(subset=["summary"])
    df = df[df["article"].str.split().str.len() >= 200]

    subset_sizes = [30, 50, 100]
    results = []

    # Build the summarizer once: it keeps no per-document state, and every
    # construction reloads the IndoWordNet synset data.
    summarizer = HindiTextSummarizer(summary_length=100)

    for size in subset_sizes:
        print(f"\n📄 Evaluating on {size} documents...")
        df_subset = df.iloc[:size]
        if len(df_subset) < size:
            # Filtering can leave fewer rows than requested; say so rather
            # than letting the header above mislead.
            print(f"  Only {len(df_subset)} documents available (requested {size}).")
        avg_scores = compute_average_rouge(summarizer, df_subset)

        for metric in ["rouge-1", "rouge-2", "rouge-l"]:
            score = avg_scores[metric]
            print(f"✅ {metric.upper()} → P: {score['p']:.4f}, R: {score['r']:.4f}, F1: {score['f']:.4f}")

        results.append({
            "Subset Size": size,
            "ROUGE-1 Precision": avg_scores["rouge-1"]["p"],
            "ROUGE-1 Recall": avg_scores["rouge-1"]["r"],
            "ROUGE-1 F1": avg_scores["rouge-1"]["f"],
            "ROUGE-2 Precision": avg_scores["rouge-2"]["p"],
            "ROUGE-2 Recall": avg_scores["rouge-2"]["r"],
            "ROUGE-2 F1": avg_scores["rouge-2"]["f"],
            "ROUGE-L Precision": avg_scores["rouge-l"]["p"],
            "ROUGE-L Recall": avg_scores["rouge-l"]["r"],
            "ROUGE-L F1": avg_scores["rouge-l"]["f"]
        })

    results_df = pd.DataFrame(results)
    results_df.to_csv("wordnet_rouge_scores.csv", index=False)
    print("\n📁 Saved scores to wordnet_rouge_scores.csv ✅")

# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


2025-07-05:16:52:50,989 INFO     [iwn.py:43] Loading hindi language synsets...



📄 Evaluating on 30 documents...
  🔹 Processed 10/30 documents...
  🔹 Processed 20/30 documents...


2025-07-05:17:36:12,589 INFO     [iwn.py:43] Loading hindi language synsets...


  🔹 Processed 30/30 documents...
✅ ROUGE-1 → P: 0.2314, R: 0.6084, F1: 0.3319
✅ ROUGE-2 → P: 0.1035, R: 0.3240, F1: 0.1551
✅ ROUGE-L → P: 0.1900, R: 0.5012, F1: 0.2727

📄 Evaluating on 50 documents...
  🔹 Processed 10/50 documents...
  🔹 Processed 20/50 documents...
  🔹 Processed 30/50 documents...
  🔹 Processed 40/50 documents...


2025-07-05:18:27:25,622 INFO     [iwn.py:43] Loading hindi language synsets...


  🔹 Processed 50/50 documents...
✅ ROUGE-1 → P: 0.2551, R: 0.6352, F1: 0.3583
✅ ROUGE-2 → P: 0.1315, R: 0.3698, F1: 0.1901
✅ ROUGE-L → P: 0.2150, R: 0.5290, F1: 0.3008

📄 Evaluating on 100 documents...
  🔹 Processed 10/63 documents...
  🔹 Processed 20/63 documents...
  🔹 Processed 30/63 documents...
  🔹 Processed 40/63 documents...
  🔹 Processed 50/63 documents...
  🔹 Processed 60/63 documents...
✅ ROUGE-1 → P: 0.2707, R: 0.6436, F1: 0.3737
✅ ROUGE-2 → P: 0.1442, R: 0.3965, F1: 0.2059
✅ ROUGE-L → P: 0.2274, R: 0.5398, F1: 0.3140

📁 Saved scores to wordnet_rouge_scores.csv ✅
