In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import nltk
import re
from nltk.tokenize import word_tokenize
from rouge import Rouge
from scipy.spatial import distance
import fasttext.util

# Ensure NLTK resources are downloaded
# (quiet=True suppresses the downloader's console output).
# NOTE(review): 'stopwords' is fetched here but is not referenced by any code
# visible in this file — confirm whether it is still needed.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# Load FastText model for Hindi
# The loaded object is bound to the module-level name `model`, which
# HindiTextSummarizer.sentence_to_vector reads directly as a global.
# NOTE(review): if_exists='ignore' presumably skips the download when the
# model file is already present — confirm against the fasttext.util docs.
print("⏳ Loading FastText model...")
fasttext.util.download_model('hi', if_exists='ignore')
model = fasttext.load_model('cc.hi.300.bin')
print("✅ FastText model loaded successfully!")

class HindiTextSummarizer:
    """Extractive summarizer for Hindi text based on sentence-graph ranking.

    Sentences are embedded by averaging the word vectors returned by the
    module-level ``model`` object, compared pairwise with a selectable
    similarity measure, ranked with PageRank, and then greedily selected
    until a word budget is reached.

    Attributes:
        summary_length: Target number of words in a generated summary.
        rouge: ``Rouge`` scorer instance used by ``evaluate_summary``.
    """

    # Only the first MAX_SENTENCES sentences of a document are ranked.
    MAX_SENTENCES = 30
    # A candidate whose Jaccard overlap with an already selected sentence
    # exceeds this value is treated as a near-duplicate and skipped.
    DUPLICATE_THRESHOLD = 0.8
    # Divisors used when mapping a distance d to a similarity exp(-d / scale).
    EUCLIDEAN_SCALE = 15.0
    MANHATTAN_SCALE = 150.0
    # While fewer than UNDERFILL_RATIO * summary_length words are selected,
    # one sentence may push the total up to OVERFLOW_RATIO * summary_length.
    UNDERFILL_RATIO = 0.7
    OVERFLOW_RATIO = 1.2

    def __init__(self, summary_length=150):
        """Create a summarizer.

        Args:
            summary_length: Target summary size in whitespace-separated words.
        """
        self.summary_length = summary_length
        self.rouge = Rouge()

    def preprocess_text(self, text):
        """Split raw text into a list of unique, cleaned sentences.

        Args:
            text: Raw document text; a missing value (per ``pd.isna``)
                yields an empty list.

        Returns:
            Sentences in document order, each re-joined from its tokens with
            single spaces. Sentences with fewer than two whitespace-separated
            words and exact repeats of an earlier sentence are dropped.
        """
        if pd.isna(text):
            return []

        text = re.sub(r'\s+', ' ', str(text)).strip()

        cleaned_sentences = []
        seen = set()  # O(1) duplicate check; the list preserves order
        # NOTE(review): '.' is a sentence delimiter here, so decimals and
        # dotted abbreviations are split as well — confirm this is intended.
        for sentence in re.split(r'[।!?.]', text):
            sentence = sentence.strip()
            if len(sentence.split()) < 2:
                continue
            try:
                words = word_tokenize(sentence, language="hindi")
            except Exception:
                # Best-effort: fall back to whitespace tokens when the NLTK
                # tokenizer raises (e.g. a missing language resource).
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                words = sentence.split()
            cleaned_sentence = " ".join(words)
            if cleaned_sentence and cleaned_sentence not in seen:
                seen.add(cleaned_sentence)
                cleaned_sentences.append(cleaned_sentence)

        return cleaned_sentences

    def jaccard_similarity(self, s1, s2):
        """Return the Jaccard overlap of the whitespace-token sets of two strings.

        Returns 0 when both strings contain no tokens.
        """
        words1 = set(s1.split())
        words2 = set(s2.split())
        union = len(words1 | words2)
        return len(words1 & words2) / union if union != 0 else 0

    def sentence_to_vector(self, sentence):
        """Embed a sentence as the mean of its word vectors.

        Word vectors are looked up in the module-level ``model``. Words whose
        lookup raises are skipped; if no word yields a vector, a zero vector
        of ``model.get_dimension()`` entries is returned.
        """
        word_vectors = []
        for word in sentence.split():
            try:
                word_vectors.append(model[word])
            except Exception:
                # Best-effort: ignore words the model cannot embed, but let
                # KeyboardInterrupt/SystemExit propagate.
                continue

        if not word_vectors:
            return np.zeros(model.get_dimension())

        return np.mean(word_vectors, axis=0)

    def _cosine_from_vectors(self, vec1, vec2):
        """Cosine similarity of two vectors, clamped below at 0.0."""
        if np.allclose(vec1, 0) or np.allclose(vec2, 0):
            return 0.0

        dot_product = np.dot(vec1, vec2)
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        if norm_product == 0:
            return 0.0

        return max(0.0, dot_product / norm_product)

    def _euclidean_from_vectors(self, vec1, vec2):
        """Similarity exp(-d / EUCLIDEAN_SCALE) for the Euclidean distance d."""
        if np.allclose(vec1, 0) or np.allclose(vec2, 0):
            return 0.0
        return np.exp(-distance.euclidean(vec1, vec2) / self.EUCLIDEAN_SCALE)

    def _manhattan_from_vectors(self, vec1, vec2):
        """Similarity exp(-d / MANHATTAN_SCALE) for the city-block distance d."""
        if np.allclose(vec1, 0) or np.allclose(vec2, 0):
            return 0.0
        return np.exp(-distance.cityblock(vec1, vec2) / self.MANHATTAN_SCALE)

    def cosine_similarity_ft(self, s1, s2):
        """Cosine similarity of two sentences' mean word vectors.

        Returns 0.0 if either sentence embeds to (approximately) the zero
        vector; negative cosine values are clamped to 0.0.
        """
        return self._cosine_from_vectors(
            self.sentence_to_vector(s1), self.sentence_to_vector(s2)
        )

    def euclidean_similarity_ft(self, s1, s2):
        """Euclidean distance between sentence vectors mapped to a similarity.

        Returns 0.0 if either sentence embeds to (approximately) the zero
        vector, otherwise exp(-distance / 15.0).
        """
        return self._euclidean_from_vectors(
            self.sentence_to_vector(s1), self.sentence_to_vector(s2)
        )

    def manhattan_similarity_ft(self, s1, s2):
        """Manhattan distance between sentence vectors mapped to a similarity.

        Returns 0.0 if either sentence embeds to (approximately) the zero
        vector, otherwise exp(-distance / 150.0).
        """
        return self._manhattan_from_vectors(
            self.sentence_to_vector(s1), self.sentence_to_vector(s2)
        )

    def build_similarity_matrix_ft(self, sentences, similarity_measure):
        """Build the pairwise sentence-similarity matrix.

        Args:
            sentences: List of sentence strings.
            similarity_measure: "cosine", "euclidean" or "manhattan"; any
                other value falls back to cosine.

        Returns:
            A square ``numpy`` array with 1.0 on the diagonal and the chosen
            similarity in every off-diagonal cell.
        """
        vector_similarity = {
            "cosine": self._cosine_from_vectors,
            "euclidean": self._euclidean_from_vectors,
            "manhattan": self._manhattan_from_vectors,
        }.get(similarity_measure, self._cosine_from_vectors)

        # Embed every sentence once instead of once per compared pair.
        vectors = [self.sentence_to_vector(sentence) for sentence in sentences]

        num_sentences = len(sentences)
        similarity_matrix = np.eye(num_sentences)  # self-similarity is 1.0

        # All three measures are symmetric, so compute the upper triangle
        # only and mirror each value.
        for i in range(num_sentences):
            for j in range(i + 1, num_sentences):
                sim = vector_similarity(vectors[i], vectors[j])
                similarity_matrix[i, j] = sim
                similarity_matrix[j, i] = sim

        return similarity_matrix

    def rank_sentences(self, sentences, similarity_matrix):
        """Rank sentences by PageRank score over the similarity graph.

        Args:
            sentences: List of sentence strings.
            similarity_matrix: Square matrix of pairwise similarities, row i
                corresponding to ``sentences[i]``.

        Returns:
            A list of ``(score, sentence)`` tuples sorted in descending
            order. If graph construction or PageRank raises, the score is
            the row sum of the similarity matrix instead.
        """
        try:
            # Add small epsilon to avoid zero connectivity
            similarity_matrix = similarity_matrix + 1e-8
            graph = nx.from_numpy_array(similarity_matrix)
            scores = nx.pagerank(graph, alpha=0.85, max_iter=100, tol=1e-6)
        except Exception:
            # Deliberate best-effort fallback: rank by sum of similarities.
            # KeyboardInterrupt/SystemExit are not intercepted.
            scores = np.sum(similarity_matrix, axis=1)

        return sorted(
            ((scores[i], sentence) for i, sentence in enumerate(sentences)),
            reverse=True,
        )

    def generate_summary_ft(self, text, similarity_measure="cosine"):
        """Generate an extractive summary of ``text``.

        Args:
            text: Raw document text.
            similarity_measure: Passed to ``build_similarity_matrix_ft``.

        Returns:
            The selected sentences joined by single spaces, in rank order
            (highest-ranked first), or "" when the text yields no sentences.
        """
        sentences = self.preprocess_text(text)
        if not sentences:
            return ""

        # Rank only the leading sentences to bound the pairwise comparisons.
        sentences = sentences[:self.MAX_SENTENCES]

        similarity_matrix = self.build_similarity_matrix_ft(sentences, similarity_measure)
        ranked_sentences = self.rank_sentences(sentences, similarity_matrix)

        summary = []
        word_count = 0
        overflow_limit = int(self.summary_length * self.OVERFLOW_RATIO)

        for _score, sentence in ranked_sentences:
            # Skip near-duplicates of sentences that were already selected.
            if any(
                self.jaccard_similarity(sentence, chosen) > self.DUPLICATE_THRESHOLD
                for chosen in summary
            ):
                continue

            sentence_word_count = len(sentence.split())
            fits_budget = word_count + sentence_word_count <= self.summary_length
            # While the summary is still well short of the target, allow one
            # sentence to overshoot the budget up to the overflow limit.
            fits_overflow = (
                word_count < self.summary_length * self.UNDERFILL_RATIO
                and sentence_word_count <= overflow_limit - word_count
            )
            if not (fits_budget or fits_overflow):
                break

            summary.append(sentence)
            word_count += sentence_word_count

        return " ".join(summary)

    def evaluate_summary(self, generated_summary, reference_summary):
        """Score a generated summary against a reference with ROUGE.

        Returns:
            The value returned by ``self.rouge.get_scores``, or ``None`` when
            either input is missing or blank, or when scoring raises (the
            error is printed).
        """
        if pd.isna(generated_summary) or pd.isna(reference_summary):
            return None

        generated_summary = str(generated_summary).strip()
        reference_summary = str(reference_summary).strip()

        if not generated_summary or not reference_summary:
            return None

        try:
            return self.rouge.get_scores(generated_summary, reference_summary)
        except Exception as e:
            print(f"ROUGE evaluation error: {e}")
            return None

from tqdm import tqdm  # Add this import at the top

def compute_average_rouge(summarizer, df_subset, similarity_measure):
    """Summarize every row of ``df_subset`` and average the ROUGE scores.

    Args:
        summarizer: Object providing ``generate_summary_ft`` and
            ``evaluate_summary``.
        df_subset: DataFrame with "article" and "summary" columns.
        similarity_measure: Similarity name forwarded to the summarizer.

    Returns:
        A dict keyed by "rouge-1", "rouge-2" and "rouge-l", each mapping
        "p", "r" and "f" to the mean over all rows that produced a score,
        or ``None`` if no row produced a score.
    """
    print(f"Generating and evaluating summaries using {similarity_measure} similarity...")

    per_document = []
    progress = tqdm(df_subset.iterrows(), total=len(df_subset), desc="Progress", leave=False)
    for _, record in progress:
        article, reference = record["article"], record["summary"]
        candidate = summarizer.generate_summary_ft(article, similarity_measure=similarity_measure)
        result = summarizer.evaluate_summary(candidate, reference)
        if result:
            # Keep the first entry of the returned score list.
            per_document.append(result[0])

    if not per_document:
        return None

    # Mean of precision / recall / F for each ROUGE variant.
    return {
        metric: {
            stat: np.mean([doc[metric][stat] for doc in per_document])
            for stat in ("p", "r", "f")
        }
        for metric in ("rouge-1", "rouge-2", "rouge-l")
    }


def main():
    """Evaluate the summarizer on growing prefixes of ``test.csv``.

    Reads up to 600 rows, then for each subset size and each similarity
    measure prints the average ROUGE scores and collects them into
    ``rouge_scores_summary.csv``. Returns early with a message when the
    dataset file is missing or empty.
    """
    try:
        hindi_data = pd.read_csv("test.csv", encoding="utf-8", nrows=600)
    except FileNotFoundError:
        print("Error: Dataset not found.")
        return
    except pd.errors.EmptyDataError:
        print("Error: The CSV file is empty.")
        return

    subset_sizes = [30, 50, 100, 200, 500]
    summary_length = 100
    fasttext_measures = ["cosine", "euclidean", "manhattan"]

    metric_names = ("rouge-1", "rouge-2", "rouge-l")
    # (column label suffix, key in the averaged score dict)
    stat_columns = (("Precision", "p"), ("Recall", "r"), ("F1", "f"))
    banner = "=" * 100

    results = []

    for size in subset_sizes:
        print("\n" + banner)
        print(f"Evaluating summaries for the first {size} documents")
        print(banner)

        # Rows lacking either an article or a reference summary are dropped.
        df_subset = hindi_data.iloc[:size].dropna(subset=["article", "summary"])
        summarizer = HindiTextSummarizer(summary_length=summary_length)

        for measure in fasttext_measures:
            print(f"\nSimilarity method: {measure}")
            avg_scores = compute_average_rouge(summarizer, df_subset, measure)

            if not avg_scores:
                print("ROUGE score computation failed. This may be due to empty or invalid summaries.")
                continue

            print("Average ROUGE Scores:")
            for metric in metric_names:
                score = avg_scores[metric]
                print(f"{metric.upper()} -> Precision: {score['p']:.4f}, Recall: {score['r']:.4f}, F1-Score: {score['f']:.4f}")

            # One output row per (subset size, similarity measure) pair.
            row = {"Subset Size": size, "Similarity": measure}
            for metric in metric_names:
                for label, key in stat_columns:
                    row[f"{metric.upper()} {label}"] = avg_scores[metric][key]
            results.append(row)

    # Save results to CSV
    pd.DataFrame(results).to_csv("rouge_scores_summary.csv", index=False)
    print("\nSaved average ROUGE scores to 'rouge_scores_summary.csv'")


# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

⏳ Loading FastText model...




✅ FastText model loaded successfully!

Evaluating summaries for the first 30 documents

Similarity method: cosine
Generating and evaluating summaries using cosine similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.1758, Recall: 0.4259, F1-Score: 0.2408
ROUGE-2 -> Precision: 0.0550, Recall: 0.1766, F1-Score: 0.0835
ROUGE-L -> Precision: 0.1451, Recall: 0.3446, F1-Score: 0.1969

Similarity method: euclidean
Generating and evaluating summaries using euclidean similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.1780, Recall: 0.4372, F1-Score: 0.2459
ROUGE-2 -> Precision: 0.0560, Recall: 0.1755, F1-Score: 0.0846
ROUGE-L -> Precision: 0.1469, Recall: 0.3541, F1-Score: 0.2010

Similarity method: manhattan
Generating and evaluating summaries using manhattan similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.1818, Recall: 0.4517, F1-Score: 0.2520
ROUGE-2 -> Precision: 0.0580, Recall: 0.1855, F1-Score: 0.0879
ROUGE-L -> Precision: 0.1520, Recall: 0.3727, F1-Score: 0.2091

Evaluating summaries for the first 50 documents

Similarity method: cosine
Generating and evaluating summaries using cosine similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.1714, Recall: 0.4163, F1-Score: 0.2356
ROUGE-2 -> Precision: 0.0480, Recall: 0.1479, F1-Score: 0.0717
ROUGE-L -> Precision: 0.1375, Recall: 0.3303, F1-Score: 0.1877

Similarity method: euclidean
Generating and evaluating summaries using euclidean similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.1717, Recall: 0.4218, F1-Score: 0.2379
ROUGE-2 -> Precision: 0.0471, Recall: 0.1423, F1-Score: 0.0703
ROUGE-L -> Precision: 0.1381, Recall: 0.3331, F1-Score: 0.1897

Similarity method: manhattan
Generating and evaluating summaries using manhattan similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.1742, Recall: 0.4310, F1-Score: 0.2421
ROUGE-2 -> Precision: 0.0480, Recall: 0.1502, F1-Score: 0.0722
ROUGE-L -> Precision: 0.1414, Recall: 0.3467, F1-Score: 0.1953

Evaluating summaries for the first 100 documents

Similarity method: cosine
Generating and evaluating summaries using cosine similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.2028, Recall: 0.4704, F1-Score: 0.2729
ROUGE-2 -> Precision: 0.0771, Recall: 0.2326, F1-Score: 0.1119
ROUGE-L -> Precision: 0.1707, Recall: 0.3938, F1-Score: 0.2284

Similarity method: euclidean
Generating and evaluating summaries using euclidean similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.2090, Recall: 0.4925, F1-Score: 0.2841
ROUGE-2 -> Precision: 0.0815, Recall: 0.2432, F1-Score: 0.1186
ROUGE-L -> Precision: 0.1727, Recall: 0.4038, F1-Score: 0.2332

Similarity method: manhattan
Generating and evaluating summaries using manhattan similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.2110, Recall: 0.4896, F1-Score: 0.2853
ROUGE-2 -> Precision: 0.0828, Recall: 0.2442, F1-Score: 0.1199
ROUGE-L -> Precision: 0.1811, Recall: 0.4168, F1-Score: 0.2434

Evaluating summaries for the first 200 documents

Similarity method: cosine
Generating and evaluating summaries using cosine similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.2401, Recall: 0.5370, F1-Score: 0.3213
ROUGE-2 -> Precision: 0.1157, Recall: 0.3338, F1-Score: 0.1675
ROUGE-L -> Precision: 0.2080, Recall: 0.4634, F1-Score: 0.2774

Similarity method: euclidean
Generating and evaluating summaries using euclidean similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.2440, Recall: 0.5464, F1-Score: 0.3270
ROUGE-2 -> Precision: 0.1204, Recall: 0.3409, F1-Score: 0.1734
ROUGE-L -> Precision: 0.2122, Recall: 0.4715, F1-Score: 0.2832

Similarity method: manhattan
Generating and evaluating summaries using manhattan similarity...


                                                         

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.2441, Recall: 0.5441, F1-Score: 0.3265
ROUGE-2 -> Precision: 0.1187, Recall: 0.3326, F1-Score: 0.1703
ROUGE-L -> Precision: 0.2152, Recall: 0.4759, F1-Score: 0.2866

Evaluating summaries for the first 500 documents

Similarity method: cosine
Generating and evaluating summaries using cosine similarity...


                                                           

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.2340, Recall: 0.5335, F1-Score: 0.3156
ROUGE-2 -> Precision: 0.1159, Recall: 0.3323, F1-Score: 0.1679
ROUGE-L -> Precision: 0.2006, Recall: 0.4560, F1-Score: 0.2701

Similarity method: euclidean
Generating and evaluating summaries using euclidean similarity...


                                                           

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.2360, Recall: 0.5379, F1-Score: 0.3186
ROUGE-2 -> Precision: 0.1180, Recall: 0.3324, F1-Score: 0.1702
ROUGE-L -> Precision: 0.2021, Recall: 0.4588, F1-Score: 0.2721

Similarity method: manhattan
Generating and evaluating summaries using manhattan similarity...


                                                           

Average ROUGE Scores:
ROUGE-1 -> Precision: 0.2373, Recall: 0.5376, F1-Score: 0.3196
ROUGE-2 -> Precision: 0.1185, Recall: 0.3289, F1-Score: 0.1701
ROUGE-L -> Precision: 0.2057, Recall: 0.4636, F1-Score: 0.2763

Saved average ROUGE scores to 'rouge_scores_summary.csv'


