In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import nltk
import re
import chardet
from nltk.tokenize import word_tokenize
from rouge import Rouge
from tqdm import tqdm

# Download the NLTK 'punkt' resource when this module is loaded (quiet=True
# suppresses the progress output). It is presumably needed by the
# word_tokenize call in HindiTextSummarizer.preprocess_text.
nltk.download('punkt', quiet=True)

class HindiTextSummarizer:
    """Extractive, graph-based (PageRank) summarizer for Hindi text.

    Sentences are the graph nodes, pairwise word-overlap similarities are the
    edge weights, and the highest-ranked sentences are concatenated until a
    word budget is exhausted.
    """

    # Compiled once: collapse whitespace runs / split on sentence terminators
    # (danda, '!', '?', '.').
    _WHITESPACE_RE = re.compile(r'\s+')
    _SENTENCE_END_RE = re.compile(r'[।!?.]')

    # Minimum number of whitespace-separated tokens a sentence must have.
    _MIN_SENTENCE_WORDS = 3

    # Jaccard similarity above which a candidate is considered a near
    # duplicate of an already selected summary sentence.
    _REDUNDANCY_THRESHOLD = 0.7

    def __init__(self, summary_length=300):
        """Create a summarizer.

        Args:
            summary_length: Maximum number of words allowed in a summary.
        """
        self.summary_length = summary_length
        self.rouge = Rouge()

    def preprocess_text(self, text):
        """Split ``text`` into cleaned, de-duplicated sentences.

        Whitespace is collapsed, the text is split on '।', '!', '?' and '.',
        sentences with fewer than three words are dropped, and each remaining
        sentence is re-joined from its tokens with single spaces.

        Args:
            text: Raw document; any value is converted with ``str``. A
                missing value (``None``/NaN) is accepted.

        Returns:
            List of unique sentences in order of first appearance; an empty
            list when ``text`` is missing.
        """
        if pd.isna(text):
            return []

        text = self._WHITESPACE_RE.sub(' ', str(text)).strip()

        cleaned_sentences = []
        seen = set()  # O(1) duplicate check; the list keeps the order
        for sentence in self._SENTENCE_END_RE.split(text):
            sentence = sentence.strip()
            if len(sentence.split()) < self._MIN_SENTENCE_WORDS:
                continue
            try:
                words = word_tokenize(sentence, language="hindi")
            except Exception:
                # Best-effort fallback when the tokenizer cannot be used
                # (e.g. no tokenizer data for the requested language).
                # Unlike a bare ``except`` this lets KeyboardInterrupt and
                # SystemExit propagate.
                words = sentence.split()
            cleaned_sentence = " ".join(words)
            if cleaned_sentence and cleaned_sentence not in seen:
                seen.add(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)

        return cleaned_sentences

    def build_similarity_matrix(self, sentences, similarity_measure="common_tokens"):
        """Compute pairwise sentence similarities.

        Only the strict upper triangle (``i < j``) is filled; the diagonal and
        the lower triangle stay zero.

        Args:
            sentences: Sequence of whitespace-tokenizable sentence strings.
            similarity_measure: ``"common_tokens"``, ``"cosine"`` or
                ``"jaccard"``.

        Returns:
            ``numpy.ndarray`` of shape ``(len(sentences), len(sentences))``.

        Raises:
            ValueError: If ``similarity_measure`` is not a supported name
                (previously this silently produced an all-zero matrix).
        """
        measures = {
            "common_tokens": self.common_tokens_similarity,
            "cosine": self.cosine_similarity,
            "jaccard": self.jaccard_similarity,
        }
        try:
            similarity = measures[similarity_measure]
        except KeyError:
            raise ValueError(
                f"Unknown similarity measure {similarity_measure!r}; "
                f"expected one of {sorted(measures)}"
            ) from None

        num_sentences = len(sentences)
        similarity_matrix = np.zeros((num_sentences, num_sentences))

        for i in range(num_sentences):
            for j in range(i + 1, num_sentences):
                similarity_matrix[i][j] = similarity(sentences[i], sentences[j])

        return similarity_matrix

    def common_tokens_similarity(self, s1, s2):
        """Return shared-word count normalised by ``log(|s1|+1) + log(|s2|+1)``.

        Word sets are built from whitespace-separated tokens. Returns 0 when
        the denominator is zero (both sentences empty).
        """
        words1 = set(s1.split())
        words2 = set(s2.split())
        common_tokens = len(words1 & words2)
        denominator = np.log(len(words1) + 1) + np.log(len(words2) + 1)
        return common_tokens / denominator if denominator != 0 else 0

    def cosine_similarity(self, s1, s2):
        """Return set-based cosine similarity ``|A ∩ B| / sqrt(|A| * |B|)``.

        Returns 0 when either sentence has no words.
        """
        words1 = set(s1.split())
        words2 = set(s2.split())
        intersection = len(words1 & words2)
        denominator = np.sqrt(len(words1)) * np.sqrt(len(words2))
        return intersection / denominator if denominator != 0 else 0

    def jaccard_similarity(self, s1, s2):
        """Return Jaccard similarity ``|A ∩ B| / |A ∪ B|`` of the word sets.

        Returns 0 when both sentences are empty.
        """
        words1 = set(s1.split())
        words2 = set(s2.split())
        intersection = len(words1 & words2)
        union = len(words1 | words2)
        return intersection / union if union != 0 else 0

    def rank_sentences(self, sentences, similarity_matrix):
        """Rank sentences with PageRank over the similarity graph.

        Args:
            sentences: Sentences, index-aligned with ``similarity_matrix``.
            similarity_matrix: Square array of edge weights.

        Returns:
            List of ``(score, sentence)`` tuples sorted in descending order.
        """
        graph = nx.from_numpy_array(similarity_matrix)
        scores = nx.pagerank(graph, alpha=0.85)
        return sorted(
            ((scores[i], sentence) for i, sentence in enumerate(sentences)),
            reverse=True,
        )

    def generate_summary(self, text, similarity_measure="common_tokens"):
        """Build an extractive summary of ``text``.

        Sentences are visited from highest to lowest rank. A sentence whose
        Jaccard similarity to an already chosen sentence exceeds 0.7 is
        skipped; selection stops at the first sentence that would push the
        summary past ``self.summary_length`` words.

        Args:
            text: Raw document text.
            similarity_measure: Measure passed to ``build_similarity_matrix``.

        Returns:
            The chosen sentences joined with single spaces, in rank order;
            ``""`` when no usable sentence is found.
        """
        sentences = self.preprocess_text(text)
        if not sentences:
            return ""

        similarity_matrix = self.build_similarity_matrix(sentences, similarity_measure)
        ranked_sentences = self.rank_sentences(sentences, similarity_matrix)

        summary = []
        word_count = 0
        for _, sentence in ranked_sentences:
            is_redundant = any(
                self.jaccard_similarity(sentence, chosen) > self._REDUNDANCY_THRESHOLD
                for chosen in summary
            )
            if is_redundant:
                continue
            sentence_length = len(sentence.split())
            if word_count + sentence_length > self.summary_length:
                # Budget exhausted: stop instead of trying lower-ranked
                # sentences.
                break
            summary.append(sentence)
            word_count += sentence_length
        return " ".join(summary)

    def evaluate_summary(self, generated_summary, reference_summary):
        """Score a generated summary against a reference with ROUGE.

        Args:
            generated_summary: Candidate summary text.
            reference_summary: Reference summary text.

        Returns:
            The value returned by ``self.rouge.get_scores``, or ``None`` when
            either input is missing/blank or scoring raises (the error is
            printed).
        """
        if pd.isna(generated_summary) or pd.isna(reference_summary):
            return None
        generated_summary = str(generated_summary).strip()
        reference_summary = str(reference_summary).strip()
        if not generated_summary or not reference_summary:
            return None
        try:
            return self.rouge.get_scores(generated_summary, reference_summary)
        except Exception as e:
            print(f"ROUGE evaluation error: {e}")
            return None

def _score_document(summarizer, article, reference_summary, similarity_measures):
    """Summarize one article with each measure and collect its ROUGE rows.

    Returns a list with one dict per measure for which ROUGE scores were
    obtained; measures whose evaluation returned nothing are omitted.
    """
    rows = []
    for measure in similarity_measures:
        generated_summary = summarizer.generate_summary(article, similarity_measure=measure)
        rouge_scores = summarizer.evaluate_summary(generated_summary, reference_summary)
        if not rouge_scores:
            continue
        # Extract all metrics (precision, recall, f1) for each ROUGE score
        rouge1 = rouge_scores[0]['rouge-1']
        rouge2 = rouge_scores[0]['rouge-2']
        rougeL = rouge_scores[0]['rouge-l']
        rows.append({
            "Similarity Measure": measure,
            "ROUGE-1 Precision": rouge1['p'],
            "ROUGE-1 Recall": rouge1['r'],
            "ROUGE-1 F1": rouge1['f'],
            "ROUGE-2 Precision": rouge2['p'],
            "ROUGE-2 Recall": rouge2['r'],
            "ROUGE-2 F1": rouge2['f'],
            "ROUGE-L Precision": rougeL['p'],
            "ROUGE-L Recall": rougeL['r'],
            "ROUGE-L F1": rougeL['f']
        })
    return rows


def main():
    """Evaluate the summarizer on ``test.csv`` and export average ROUGE scores.

    Reads up to 700 rows, keeps rows that have a summary and an article of at
    least 200 whitespace-separated words, then reports per-measure average
    ROUGE precision/recall/F1 for growing prefixes of the data. The table is
    printed and written to ``compiled_summary_rouge_scores_detailed.csv``.
    """
    try:
        hindi_data = pd.read_csv("test.csv", encoding="utf-8", nrows=700)
    except FileNotFoundError:
        print("Dataset not found.")
        return
    except pd.errors.EmptyDataError:
        print("The CSV file is empty.")
        return

    hindi_data = hindi_data.dropna(subset=["summary"])
    hindi_data = hindi_data[hindi_data["article"].str.split().str.len() >= 200]

    summarizer = HindiTextSummarizer(summary_length=100)
    similarity_measures = ["common_tokens", "cosine", "jaccard"]
    document_counts = [30, 50, 100, 200, 300, 500]

    compilation_results = []
    # Every subset is a prefix of the same frame, so per-document scores are
    # cached by position instead of being recomputed for each count.
    scored_documents = {}
    # Subset sizes already reported; avoids emitting identical rows labelled
    # with counts larger than the number of documents actually available.
    reported_sizes = set()

    for count in document_counts:
        subset = hindi_data.head(count)
        actual_count = len(subset)
        if actual_count in reported_sizes:
            print(f"\nSkipping {count} documents: only {actual_count} available (already reported).")
            continue
        reported_sizes.add(actual_count)
        if actual_count < count:
            print(f"\nOnly {actual_count} of the requested {count} documents are available.")

        print(f"\nProcessing {actual_count} documents...")
        results = []

        progress = tqdm(subset.iterrows(), total=actual_count, desc=f"Docs: {actual_count}")
        for position, (_, row) in enumerate(progress):
            if position not in scored_documents:
                article, summary = row["article"], row["summary"]
                if pd.isna(article) or pd.isna(summary):
                    scored_documents[position] = []
                else:
                    scored_documents[position] = _score_document(
                        summarizer, article, summary, similarity_measures
                    )
            results.extend(scored_documents[position])

        result_df = pd.DataFrame(results)
        if result_df.empty:
            continue

        avg_scores = result_df.groupby("Similarity Measure").mean().reset_index()
        avg_scores["Document Count"] = actual_count
        compilation_results.append(avg_scores)

    if not compilation_results:
        # pd.concat raises ValueError on an empty list.
        print("No ROUGE scores could be computed; nothing to report.")
        return

    final_table = pd.concat(compilation_results, ignore_index=True)
    # Reorder columns for better readability
    metrics = ["Precision", "Recall", "F1"]
    columns_order = ["Document Count", "Similarity Measure"] + [
        f"{rouge} {metric}"
        for rouge in ("ROUGE-1", "ROUGE-2", "ROUGE-L")
        for metric in metrics
    ]
    final_table = final_table[columns_order]

    print("\nCompilation Table of Average ROUGE Scores (Precision, Recall, F1):\n")
    print(final_table.to_string(index=False, float_format="%.3f"))

    final_table.to_csv("compiled_summary_rouge_scores_detailed.csv", index=False)
    print("\nDetailed results exported to compiled_summary_rouge_scores_detailed.csv")

# Run the evaluation only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main()


Processing 100 documents...


Docs: 100: 100%|██████████| 100/100 [00:28<00:00,  3.47it/s]



Processing 200 documents...


Docs: 200: 100%|██████████| 100/100 [01:06<00:00,  1.51it/s]



Processing 300 documents...


Docs: 300: 100%|██████████| 100/100 [01:27<00:00,  1.15it/s]



Processing 500 documents...


Docs: 500: 100%|██████████| 100/100 [01:25<00:00,  1.17it/s]


Compilation Table of Average ROUGE Scores (Precision, Recall, F1):

 Document Count Similarity Measure  ROUGE-1 Precision  ROUGE-1 Recall  ROUGE-1 F1  ROUGE-2 Precision  ROUGE-2 Recall  ROUGE-2 F1  ROUGE-L Precision  ROUGE-L Recall  ROUGE-L F1
            100      common_tokens              0.272           0.594       0.368              0.141           0.351       0.198              0.228           0.493       0.307
            100             cosine              0.278           0.601       0.375              0.142           0.366       0.202              0.233           0.504       0.315
            100            jaccard              0.266           0.573       0.357              0.128           0.326       0.180              0.219           0.470       0.294
            200      common_tokens              0.272           0.594       0.368              0.141           0.351       0.198              0.228           0.493       0.307
            200             cosine              0.2


