In [6]:
# Connect to T4 GPU
!nvidia-smi  # Verify GPU connection

Thu May  1 09:12:34 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [1]:
import psutil

# Check memory usage
memory_info = psutil.virtual_memory()
print(f"Total Memory: {memory_info.total / (1024 ** 2):.2f} MB")
print(f"Available Memory: {memory_info.available / (1024 ** 2):.2f} MB")
print(f"Used Memory: {memory_info.used / (1024 ** 2):.2f} MB")
print(f"Memory Percentage: {memory_info.percent}%")

Total Memory: 12977.95 MB
Available Memory: 11640.71 MB
Used Memory: 1031.46 MB
Memory Percentage: 10.3%


In [2]:
!python --version

Python 3.11.12


In [3]:
!pip install pandas numpy matplotlib seaborn scikit-learn tensorflow==2.15.0
!pip install indic-nlp-library
!pip install ipython
!pip install xgboost



Collecting tensorflow==2.15.0
  Downloading tensorflow-2.15.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting ml-dtypes~=0.2.0 (from tensorflow==2.15.0)
  Downloading ml_dtypes-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3 (from tensorflow==2.15.0)
  Downloading protobuf-4.25.7-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.15.0)
  Downloading wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting tensorboard<2.16,>=2.15 (from tensorflow==2.15

In [1]:

# ============================================
#         Import Required Libraries
# ============================================

# After !pip installation restart the session
import scipy
import scipy.sparse
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
import joblib
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import ComplementNB
from xgboost import XGBClassifier
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from indicnlp.tokenize import indic_tokenize
from indicnlp.normalize.indic_normalize import DevanagariNormalizer
import seaborn as sns

In [2]:
# ======================================
#     ENHANCED MARATHI TEXT PROCESSOR
# ======================================
class MarathiTextProcessor:
    def __init__(self, remove_nuktas=False, nasals_mode='do_nothing',
                 normalize_chandras=False, normalize_vowel_endings=False):
        self.normalizer = DevanagariNormalizer(
            lang='mr',
            remove_nuktas=remove_nuktas,
            nasals_mode=nasals_mode,
            do_normalize_chandras=normalize_chandras,
            do_normalize_vowel_ending=normalize_vowel_endings
        )
        # Custom stop words to exclude
        self.stop_words = {'वाचा सत्य', 'व्हायरल', 'नाही', 'स्क्रिप्टेड', 'नव्हता','नव', 'यरल', 'सत', 'हत'}

    def preprocess(self, text, remove_punctuation=True):
        text = str(text)

        # Step 1: Basic cleaning
        if remove_punctuation:
            text = re.sub(r'[^\u0900-\u097F\s]', '', text)
        else:
            text = re.sub(r'[^\u0900-\u097F\u0964\u0965\s]', '', text)

        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        # Step 2: Script normalization
        text = self.normalizer.normalize(text)

        # Step 3: Tokenization and stop word removal
        tokens = indic_tokenize.trivial_tokenize(text)
        filtered_tokens = [
            token for token in tokens
            if token not in self.stop_words
        ]

        return ' '.join(filtered_tokens)

In [3]:
# ======================================
#       DATA LOADING FUNCTIONS
# ======================================
def load_and_preview_examples():
    """Load and preview the Marathi news datasets"""
    lokmat = pd.read_csv('lokmat_marathi_articles.csv').assign(source='Lokmat', label=1)
    ndtv = pd.read_csv('ndtv_marathi_articles.csv').assign(source='NDTV', label=1)
    factcrescendo = pd.read_csv('factcrescendo_marathi_articles.csv').assign(source='FactCrescendo', label=0)

    # Detect topics
    def detect_topic(text):
        text = str(text).lower()
        topic_keywords = {
            'Politics': ['राजकारण', 'मंत्री', 'पक्ष', 'निवडणूक'],
            'Health': ['आरोग्य', 'वैद्यकीय', 'रोग', 'विषाणू'],
            'Technology': ['तंत्रज्ञान', 'मोबाइल', 'अॅप', 'डिजिटल'],
            'Sports': ['क्रिकेट', 'फुटबॉल', 'स्पर्धा', 'पथक']
        }
        for topic, keywords in topic_keywords.items():
            if any(keyword in text for keyword in keywords):
                return topic
        return 'Others'

    for df in [lokmat, ndtv, factcrescendo]:
        df['topic'] = df['Title'].apply(detect_topic)

    # Combine and return
    positive = pd.concat([lokmat, ndtv], ignore_index=True)
    negative = factcrescendo.copy()

    print("✅ Positive Samples:", len(positive), "| Sources:", positive['source'].unique())
    print("❌ Negative Samples:", len(negative), "| Sources:", negative['source'].unique())
    return positive, negative

def prepare_datasets(positive_data, negative_data):
    """Prepare datasets with metadata"""
    data_all = pd.concat([positive_data, negative_data], ignore_index=True)
    data_all['text'] = (
        data_all['Title'].fillna('') + ' ' +
        data_all['Content'].fillna('') + ' ' +
        data_all['Paragraph'].fillna('')
    )
    return data_all[['text', 'label', 'source', 'topic']].dropna(subset=['text'])

In [4]:
# ==============================================
#    TEXT STATISTICS AND READABILITY ANALYSIS
# ==============================================
def calculate_text_stats(text_series):
    """Calculate comprehensive statistics for Marathi text"""
    stats = []
    for text in text_series:
        tokens = indic_tokenize.trivial_tokenize(text)
        words = [t for t in tokens if t.strip() and not t.isspace()]
        sentences = [s for s in text.split('।') if s.strip()]

        # Syllable counting
        syllable_counts = []
        polysyllabic_words = 0
        for word in words:
            syllables = max(1, len(re.findall(r'[\u0900-\u097F\u0951\u0952]', word)))
            syllable_counts.append(syllables)
            if syllables > 2:
                polysyllabic_words += 1

        stats.append({
            'num_tokens': len(tokens),
            'num_words': len(words),
            'num_unique_words': len(set(words)),
            'num_sentences': len(sentences),
            'avg_word_length': np.mean([len(w) for w in words]) if words else 0,
            'avg_sentence_length': len(words)/len(sentences) if sentences else 0,
            'avg_syllables_per_word': np.mean(syllable_counts) if syllable_counts else 0,
            'polysyllabic_words': polysyllabic_words
        })
    return pd.DataFrame(stats)

def compute_readability_scores(stats_df):
    """Calculate Marathi Readability Score (MRS)"""
    stats_df['mrs'] = -2.34 + 2.14 * stats_df['avg_word_length'] + 0.01 * stats_df['polysyllabic_words']
    return stats_df

def generate_text_statistics_table(original_texts, processed_texts):
    """Generate Table I comparison with proper labels"""
    # Calculate statistics
    original_stats = calculate_text_stats(original_texts)
    processed_stats = calculate_text_stats(processed_texts)

    # Create DataFrames with proper index names
    original_df = pd.DataFrame({
        "# Tokens": [
            original_stats["num_tokens"].max(),
            original_stats["num_tokens"].min(),
            original_stats["num_tokens"].mean()
        ],
        "# Words": [
            original_stats["num_words"].max(),
            original_stats["num_words"].min(),
            original_stats["num_words"].mean()
        ],
        "# Unique Words": [
            original_stats["num_unique_words"].max(),
            original_stats["num_unique_words"].min(),
            original_stats["num_unique_words"].mean()
        ],
        "# Sentences": [
            original_stats["num_sentences"].max(),
            original_stats["num_sentences"].min(),
            original_stats["num_sentences"].mean()
        ]
    }, index=["Max", "Min", "Average"])

    processed_df = pd.DataFrame({
        "# Tokens": [
            processed_stats["num_tokens"].max(),
            processed_stats["num_tokens"].min(),
            processed_stats["num_tokens"].mean()
        ],
        "# Words": [
            processed_stats["num_words"].max(),
            processed_stats["num_words"].min(),
            processed_stats["num_words"].mean()
        ],
        "# Unique Words": [
            processed_stats["num_unique_words"].max(),
            processed_stats["num_unique_words"].min(),
            processed_stats["num_unique_words"].mean()
        ],
        "# Sentences": [
            processed_stats["num_sentences"].max(),
            processed_stats["num_sentences"].min(),
            processed_stats["num_sentences"].mean()
        ]
    }, index=["Max", "Min", "Average"])

    # Create final table with section headers
    final_table = pd.concat({
        "Original Articles": original_df,
        "Processed Articles": processed_df
    }, names=['Article Type', 'Metric'])

    # Format display
    print("TABLE I")
    print("\nTHE TABLE SHOWS THE MAXIMUM, MINIMUM, AND AVERAGE VALUES FOR TOKENS, WORDS, UNIQUE WORDS, AND SENTENCES\n")

    return final_table.reset_index(level='Article Type')

def generate_complete_readability_table(df, text_col='text', label_col='label',
                                     source_col='source', topic_col='topic'):
    """Generate three separate tables for readability scores"""
    # Calculate statistics
    stats_df = calculate_text_stats(df[text_col])
    df_with_mrs = df.copy()
    df_with_mrs['mrs'] = compute_readability_scores(stats_df)['mrs']

    # Build comparison data
    source_mrs = df_with_mrs.groupby(source_col)['mrs'].mean().sort_values(ascending=False)
    topic_comparison = df_with_mrs.pivot_table(index=topic_col, columns=label_col,
                                            values='mrs', aggfunc='mean')
    class_mrs = df_with_mrs.groupby(label_col)['mrs'].mean()

    # =====================================
    # Table 1: Source-wise Scores
    # =====================================
    source_table = pd.DataFrame({
        'Source': source_mrs.index,
        'MRS Score': [round(score, 2) for score in source_mrs.values]
    })

    # =====================================
    # Table 2: Topic-wise Comparison
    # =====================================
    topic_table = pd.DataFrame({
        'Topic': topic_comparison.index,
        'False (HRS)': [round(score, 2) for score in topic_comparison[0].values],
        'True (HRS)': [round(score, 2) for score in topic_comparison[1].values]
    })

    # =====================================
    # Table 3: Class-wise Comparison
    # =====================================
    class_table = pd.DataFrame({
        'Class': ['False', 'True'],
        'MRS Score': [round(class_mrs[0], 2), round(class_mrs[1], 2)]
    })

    # =====================================
    #         Display all tables
    # =====================================
    print("\nTABLE II: MARATHI READABILITY SCORES (MRS) COMPARISON")

    # Common style settings
    table_style = [
        {'selector': 'th',
         'props': [('background-color', '#f2f2f2'), ('text-align', 'center')]},
        {'selector': 'td',
         'props': [('text-align', 'center')]},
        {'selector': '',
         'props': [('border', '1px solid black')]}
    ]

    print("\n1. Source-wise Scores:")
    source_styler = source_table.style.set_table_styles(table_style)
    if hasattr(source_styler, 'hide_index'):
        source_styler = source_styler.hide_index()
    display(source_styler)

    print("\n2. Topic-wise Comparison:")
    topic_styler = topic_table.style.set_table_styles(table_style)
    if hasattr(topic_styler, 'hide_index'):
        topic_styler = topic_styler.hide_index()
    display(topic_styler)

    print("\n3. Class-wise Comparison:")
    class_styler = class_table.style.set_table_styles(table_style)
    if hasattr(class_styler, 'hide_index'):
        class_styler = class_styler.hide_index()
    display(class_styler)

In [5]:
# ======================================
#        MAIN EXECUTION PIPELINE
# ======================================
if __name__ == "__main__":
    # Initialize text processor
    marathi_processor = MarathiTextProcessor()

    # Load and prepare data
    positive, negative = load_and_preview_examples()
    data_all = prepare_datasets(positive, negative)

    # Split data
    X = data_all[['text', 'source', 'topic']]
    y = data_all['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocess text (MUST HAPPEN FIRST)
    print("\n🔄 Preprocessing text...")
    X_train_processed = X_train['text'].apply(marathi_processor.preprocess)
    X_test_processed = X_test['text'].apply(marathi_processor.preprocess)

✅ Positive Samples: 2223 | Sources: ['Lokmat' 'NDTV']
❌ Negative Samples: 2086 | Sources: ['FactCrescendo']

🔄 Preprocessing text...


In [6]:
# ==================================
#          GENERATE TABLES
# ==================================

# Generate Table I
print("\n📊 Text Statistics Comparison\n")
display(generate_text_statistics_table(X_train['text'], X_train_processed))

# Generate Table II
print("\n🔍 Readability Analysis")
generate_complete_readability_table(
    X_train.assign(label=y_train),
    text_col='text',
    label_col='label',
    source_col='source',
    topic_col='topic'
)


📊 Text Statistics Comparison

TABLE I

THE TABLE SHOWS THE MAXIMUM, MINIMUM, AND AVERAGE VALUES FOR TOKENS, WORDS, UNIQUE WORDS, AND SENTENCES



Unnamed: 0_level_0,Article Type,# Tokens,# Words,# Unique Words,# Sentences
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Max,Original Articles,4972.0,4971.0,1939.0,109.0
Min,Original Articles,1.0,0.0,0.0,0.0
Average,Original Articles,235.56832,234.651291,149.015376,1.056281
Max,Processed Articles,3698.0,3698.0,1721.0,109.0
Min,Processed Articles,1.0,0.0,0.0,0.0
Average,Processed Articles,183.742385,183.736873,127.54018,1.049318



🔍 Readability Analysis

TABLE II: MARATHI READABILITY SCORES (MRS) COMPARISON

1. Source-wise Scores:


Unnamed: 0,Source,MRS Score
0,Lokmat,10.35
1,FactCrescendo,10.35
2,NDTV,10.33



2. Topic-wise Comparison:


Unnamed: 0,Topic,False (HRS),True (HRS)
0,Health,10.13,10.51
1,Others,10.33,10.34
2,Politics,11.27,10.34
3,Sports,9.87,10.39
4,Technology,10.17,10.93



3. Class-wise Comparison:


Unnamed: 0,Class,MRS Score
0,False,10.35
1,True,10.34


In [7]:
# ====================================
#          Vectorization
# ====================================
marathi_stop_words = ['वाचा सत्य', 'व्हायरल', 'नाही', 'स्क्रिप्टेड', 'नव्हता','नव', 'यरल', 'सत', 'हत']

vectorizer = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 3),
    min_df=5,
    max_df=0.7,
    analyzer='word',
    sublinear_tf=True,
    stop_words=marathi_stop_words  # Explicitly exclude these words
)

In [9]:
# ==========================================
# Enhanced Classifier Dictionary (Optimized)
# ==========================================
dict_classifiers = {
    "Logistic Regression": LogisticRegression(
        max_iter=10000,  # Increased for complex Marathi features
        class_weight='balanced',
        solver='saga',
        penalty='elasticnet',
        l1_ratio=0.7,    # More L1 regularization for feature selection
        random_state=42
    ),
    "SVM (Linear)": LinearSVC(  # Changed to LinearSVC for better efficiency
        C=0.5,           # More regularization
        class_weight='balanced',
        max_iter=10000,
        random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        n_estimators=300,  # Reduced to prevent overfitting
        max_depth=12,
        class_weight='balanced_subsample',
        min_samples_leaf=10,  # More conservative
        n_jobs=-1,       # Parallel processing
        random_state=42
    ),
    "XGBoost": XGBClassifier(
        n_estimators=250,
        max_depth=5,     # Shallower trees
        learning_rate=0.05,  # Slower learning
        tree_method='hist',
        scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]),
        eval_metric='logloss',
        random_state=42
    ),
    "Naive Bayes": ComplementNB(
        alpha=0.1        # Additive smoothing
    ),
    "MLP": MLPClassifier(
        hidden_layer_sizes=(128,64),  # Deeper architecture
        early_stopping=True,
        validation_fraction=0.15,
        alpha=0.001,     # Stronger regularization
        batch_size=64,
        random_state=42,
        max_iter=200
    )
}

In [10]:
# ==========================
# Enhanced Batch Classifier
# ==========================
def batch_classify(X_train, Y_train, X_test, Y_test, verbose=True):
    dict_models = {}

    for classifier_name, classifier in dict_classifiers.items():
        try:
            t_start = time.perf_counter()

            # Handle sparse matrices
            if scipy.sparse.issparse(X_train):
                X_train_ = X_train.toarray() if X_train.shape[1] <= 50000 else X_train
            else:
                X_train_ = X_train

            if scipy.sparse.issparse(X_test):
                X_test_ = X_test.toarray() if X_test.shape[1] <= 50000 else X_test
            else:
                X_test_ = X_test

            # Training
            if verbose:
                print(f"🔄 Training {classifier_name}...")

            classifier.fit(X_train_, Y_train)

            # Predictions
            y_pred = classifier.predict(X_test_)

            # Calculate metrics
            cm = confusion_matrix(Y_test, y_pred)
            metrics = {
                'train_score': classifier.score(X_train_, Y_train),
                'test_score': classifier.score(X_test_, Y_test),
                'precision': precision_score(Y_test, y_pred, zero_division=0),
                'recall': recall_score(Y_test, y_pred),
                'f1': f1_score(Y_test, y_pred),
                'roc_auc': roc_auc_score(Y_test, y_pred) if len(np.unique(Y_test)) > 1 else 0,
                'train_time': time.perf_counter() - t_start,
                'confusion_matrix': cm,
                'tp': cm[1,1],  # True positives
                'fp': cm[0,1],  # False positives
                'tn': cm[0,0],  # True negatives
                'fn': cm[1,0]   # False negatives
            }

            dict_models[classifier_name] = {
                'model': classifier,
                **{k: round(v, 4) if isinstance(v, (int, float)) else v
                   for k,v in metrics.items()}
            }

            if verbose:
                print(f"✅ {classifier_name:25s} | Test F1: {metrics['f1']:.3f} | AUC: {metrics['roc_auc']:.3f} | Time: {metrics['train_time']:.1f}s")

        except Exception as e:
            print(f"❌ {classifier_name:25s} | Failed: {str(e)}")
            continue

    return dict_models

In [11]:
# ========================
#  Robust Results Display
# ========================
def display_results(dict_models, sort_by='f1'):
    if not dict_models:
        print("⚠️ No models trained successfully!")
        return

    # Prepare dataframe with safe metric access
    results = []
    for name, model_info in dict_models.items():
        row = {
            'Classifier': name,
            'Train Acc': model_info.get('train_score', 0),
            'Test Acc': model_info.get('test_score', 0),
            'F1': model_info.get('f1', 0),
            'Precision': model_info.get('precision', 0),
            'Recall': model_info.get('recall', 0),
            'AUC': model_info.get('roc_auc', 0),
            'TP': model_info.get('tp', 0),
            'FP': model_info.get('fp', 0),
            'TN': model_info.get('tn', 0),
            'FN': model_info.get('fn', 0),
            'Time (s)': model_info.get('train_time', 0)
        }
        results.append(row)

    df = pd.DataFrame(results)

    # Ensure sort column exists
    if sort_by not in df.columns:
        sort_by = 'Test Acc'  # Fallback to accuracy

    df = df.sort_values(by=sort_by, ascending=False)

    # Style the dataframe
    def highlight_max(s):
        is_max = s == s.max()
        return ['background-color: lightgreen' if v else '' for v in is_max]

    styler = df.style
    numeric_cols = ['Train Acc', 'Test Acc', 'F1', 'Precision', 'Recall', 'AUC']

    # Apply styling only to existing columns
    for col in numeric_cols:
        if col in df.columns:
            styler = styler.apply(highlight_max, subset=[col])
            styler = styler.format("{:.3f}", subset=[col])

    if 'Time (s)' in df.columns:
        styler = styler.background_gradient(cmap='Blues', subset=['Time (s)'])

    # Hide index using compatible method
    if hasattr(styler, 'hide_index'):
        styler = styler.hide_index()
    else:
        styler = styler.set_table_styles([{
            'selector': 'th.row_heading, td.row_heading',
            'props': [('display', 'none')]
        }])

    display(styler)

    # Print best model safely
    if not df.empty:
        best_model = df.iloc[0]
        print(f"\n🏆 Best Model: {best_model['Classifier']} ({sort_by}: {best_model[sort_by]:.3f})")

In [12]:
# ========================
#     Robust Execution
# ========================
print("🔍 Vectorizing Marathi text...")
try:
    X_train_vec = vectorizer.fit_transform(X_train_processed)
    X_test_vec = vectorizer.transform(X_test_processed)

    print("\n🧪 Evaluating classifiers...")
    dict_models = batch_classify(X_train_vec, y_train, X_test_vec, y_test)

    if dict_models:
        print("\n📊 Results Summary:")
        display_results(dict_models)

        # Safely get best model
        valid_models = {k:v for k,v in dict_models.items() if 'f1' in v}
        if valid_models:
            best_model_name = max(valid_models, key=lambda x: valid_models[x]['f1'])
            model_path = f"best_marathi_model_{best_model_name.replace(' ', '_').lower()}.pkl"
            joblib.dump({
                'model': valid_models[best_model_name]['model'],
                'vectorizer': vectorizer,
                'metrics': {k:v for k,v in valid_models[best_model_name].items() if k != 'model'}
            }, model_path)
            print(f"\n💾 Saved best model to: {model_path}")
        else:
            print("\n⚠️ No models with valid F1 scores")
    else:
        print("\n⚠️ All classifiers failed! Check error messages above.")

except Exception as e:
    print(f"\n🔥 Critical error in pipeline: {str(e)}")

🔍 Vectorizing Marathi text...

🧪 Evaluating classifiers...
🔄 Training Logistic Regression...
✅ Logistic Regression       | Test F1: 0.981 | AUC: 0.978 | Time: 19.0s
🔄 Training SVM (Linear)...
✅ SVM (Linear)              | Test F1: 0.993 | AUC: 0.993 | Time: 0.6s
🔄 Training Random Forest...
✅ Random Forest             | Test F1: 0.990 | AUC: 0.990 | Time: 8.9s
🔄 Training XGBoost...
✅ XGBoost                   | Test F1: 0.995 | AUC: 0.995 | Time: 88.6s
🔄 Training Naive Bayes...
✅ Naive Bayes               | Test F1: 0.978 | AUC: 0.974 | Time: 1.2s
🔄 Training MLP...
✅ MLP                       | Test F1: 0.995 | AUC: 0.994 | Time: 29.4s

📊 Results Summary:


Unnamed: 0,Classifier,Train Acc,Test Acc,F1,Precision,Recall,AUC,TP,FP,TN,FN,Time (s)
3,XGBoost,1.0,0.994,0.995,1.0,0.989,0.995,454,0,403,5,88.5581
5,MLP,0.998,0.994,0.995,0.994,0.996,0.994,457,3,400,2,29.4177
1,SVM (Linear),1.0,0.993,0.994,0.991,0.996,0.993,457,4,399,2,0.6055
2,Random Forest,0.994,0.99,0.99,0.998,0.983,0.99,451,1,402,8,8.86
0,Logistic Regression,0.985,0.979,0.981,0.968,0.994,0.978,456,15,388,3,18.9784
4,Naive Bayes,0.991,0.976,0.978,0.956,1.0,0.974,459,21,382,0,1.2352



🏆 Best Model: XGBoost (Test Acc: 0.994)

💾 Saved best model to: best_marathi_model_mlp.pkl


In [13]:
# =======================================
#  TensorFlow Data Preparation (Updated)
# =======================================
def prepare_tensorflow_data(texts, labels, max_len=100, test_size=0.2):
    """Prepare data for TensorFlow models with stop word filtering"""
    # Custom stop words to exclude
    marathi_stop_words = ['वाचा सत्य', 'व्हायरल', 'नाही', 'स्क्रिप्टेड', 'नव्हता','नव', 'यरल', 'सत', 'हत']

    # Step 1: Filter stop words
    filtered_texts = [
        ' '.join([word for word in str(text).split()
        if word not in marathi_stop_words])
        for text in texts
    ]

    # Step 2: Tokenization
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(filtered_texts)
    sequences = tokenizer.texts_to_sequences(filtered_texts)

    # Step 3: Padding
    X = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    y = np.array(labels)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    return (X_train, y_train), (X_test, y_test), tokenizer  # Now returns tokenizer


In [14]:
# ======================================
#            Main Execution
# ======================================
if __name__ == "__main__":
    # Initialize text processor
    marathi_processor = MarathiTextProcessor()

    # Load and prepare data
    positive, negative = load_and_preview_examples()
    data_all = prepare_datasets(positive, negative)

    # TensorFlow Pipeline
    print("\n🧠 Preparing TensorFlow data...")
    (X_train_tf, y_train_tf), (X_test_tf, y_test_tf), tokenizer = prepare_tensorflow_data(
        data_all['text'],
        data_all['label']
    )

    # Verify tokenizer
    print("\n✅ Tokenizer Summary:")
    print("Vocabulary size:", len(tokenizer.word_index))
    print("Sample words:", list(tokenizer.word_index.items())[:5])

✅ Positive Samples: 2223 | Sources: ['Lokmat' 'NDTV']
❌ Negative Samples: 2086 | Sources: ['FactCrescendo']

🧠 Preparing TensorFlow data...

✅ Tokenizer Summary:
Vocabulary size: 89388
Sample words: [('आहे', 1), ('या', 2), ('आणि', 3), ('आहेत', 4), ('केली', 5)]


In [15]:
    # ======================================
    #     TensorFlow Model Construction
    # ======================================
    def build_marathi_text_model(vocab_size=10000, max_len=100, embedding_dim=128):
        """Build a BiLSTM model for Marathi text classification"""
        # Input layer
        input_layer = Input(shape=(max_len,))

        # Embedding layer with pretrained weights option
        embedding = Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=max_len,
            mask_zero=True
        )(input_layer)

        # BiLSTM layers
        bilstm1 = Bidirectional(LSTM(
            128,
            return_sequences=True,
            dropout=0.2,
            recurrent_dropout=0.2
        ))(embedding)

        bilstm2 = Bidirectional(LSTM(
            64,
            dropout=0.2,
            recurrent_dropout=0.2
        ))(bilstm1)

        # Dense layers
        dense1 = Dense(64, activation='relu')(bilstm2)
        dropout1 = Dropout(0.3)(dense1)
        dense2 = Dense(32, activation='relu')(dropout1)
        output = Dense(1, activation='sigmoid')(dense2)

        # Create model
        model = Model(inputs=input_layer, outputs=output)

        # Compile model
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
        )

        return model

    # Build and show model summary
    model = build_marathi_text_model(
        vocab_size=len(tokenizer.word_index) + 1,  # Now tokenizer is defined
        max_len=100
    )
    model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 128)          11441792  
                                                                 
 bidirectional (Bidirection  (None, 100, 256)          263168    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 128)               164352    
 onal)                                                           
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0     

In [16]:
    # ======================================
    #             Model Training
    # ======================================
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True
    )

    history = model.fit(
        X_train_tf, y_train_tf,
        validation_data=(X_test_tf, y_test_tf),
        epochs=20,
        batch_size=64,
        callbacks=[early_stopping],
        verbose=1
    )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [17]:
    # ======================================
    #           Model Evaluation
    # ======================================
    def evaluate_model(model, X_test, y_test):
        """Evaluate model performance"""
        results = model.evaluate(X_test, y_test, verbose=0)
        metrics = {
            'loss': results[0],
            'accuracy': results[1],
            'precision': results[2],
            'recall': results[3],
            'f1': 2 * (results[2] * results[3]) / (results[2] + results[3] + 1e-7)
        }
        return metrics

    metrics = evaluate_model(model, X_test_tf, y_test_tf)
    print("\n📊 Model Evaluation:")
    for name, value in metrics.items():
        print(f"{name.capitalize():<10}: {value:.4f}")


📊 Model Evaluation:
Loss      : 0.0034
Accuracy  : 0.9988
Precision : 1.0000
Recall    : 0.9978
F1        : 0.9989


In [24]:
# =============================================
#           PREDICTION FUNCTION (First!)
# =============================================
def predict_article(article_text, model, tokenizer, max_len=100):
    """
    Predict if a Marathi article is real (1) or fake (0)

    Args:
        article_text: Raw Marathi text to classify
        model: Your trained TensorFlow model
        tokenizer: The tokenizer used during training
        max_len: Same length used during training (default 100)
    """
    # Initialize processor (must match your training setup)
    processor = MarathiTextProcessor()

    # 1. Preprocess exactly like training data
    processed_text = processor.preprocess(article_text)

    # 2. Tokenize and pad
    sequence = tokenizer.texts_to_sequences([processed_text])
    padded = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

    # 3. Predict
    proba = model.predict(padded, verbose=0)[0][0]
    prediction = 1 if proba >= 0.5 else 0

    # 4. Return human-readable results
    return {
        'prediction': prediction,
        'probability': float(proba),
        'confidence': f"{max(proba, 1-proba)*100:.1f}%",
        'interpretation': "Real/Propaganda" if prediction == 1 else "Fake/Fact-checked",
        'processed_text': processed_text  # For debugging
    }

# =============================================
#          INTERACTIVE TEST INTERFACE
# =============================================
def test_articles_interactively(model, tokenizer):
    """Test your model with custom articles"""
    from IPython.display import clear_output

    print("✅ Model Testing Ready (Type 'exit' to quit)")
    while True:
        article = input("\n📝 Paste Marathi article:\n")
        if article.lower() == 'exit':
            break

        result = predict_article(article, model, tokenizer)

        # Display results
        clear_output(wait=True)
        print("🔍 Prediction Results:")
        print(f"• Classification: {result['interpretation']} ({result['prediction']})")
        print(f"• Confidence: {result['confidence']}")
        print(f"• Probability: {result['probability']:.4f}")
        print("\n🛠️ Processed Text (Debug):")
        print(result['processed_text'][:200] + "...")  # Show first 200 chars

# =============================================
#               RUN THE TESTER
# =============================================
# Make sure these variables exist (from your training)
print("Checking dependencies...")
assert 'model' in globals(), "Model not found - train your model first!"
assert 'tokenizer' in globals(), "Tokenizer not found - run data preparation first!"

# Start testing
test_articles_interactively(model, tokenizer)

🔍 Prediction Results:
• Classification: Fake/Fact-checked (0)
• Confidence: 99.2%
• Probability: 0.0076

🛠️ Processed Text (Debug):
कुंभमेळ्यात लष्कराच्या जवानांवर चप्पलफेक नेटकऱ्यांनी तुफान शेअर केला पण नेमकं घडलं काय वाचा सत्य...

📝 Paste Marathi article:
exit
