# Sentiment Analysis of Code-Switching Hindi and Marathi in Roman Script

The implementation includes:
1. Setup and Imports
2. Data Loading and Exploration
3. Text Preprocessing (Roman Script)
4. Transliteration from Roman to Devanagari
5. Preprocessing for Devanagari Text
6. MuRIL model Baseline Accuracy
7. Hyperparameter Tuning for Best Parameters
8. Fine-Tuning with Cross-Validation for MuRIL
9. Baseline Implementations for MuRIL, IndicBERTv1, XLM-RoBERTa, mBERT
10. Hyperparameter Tuning for All Models using Optuna
11. Model Testing: Intra-sentential vs Inter-sentential vs Tag-Switch (Testing Separately)
12. Cross-Script Evaluation: Train on Roman, Test on Devanagari
13. Cross-Script Evaluation: Train on Devanagari, Test on Roman




## 1. Setup and Imports

In [None]:
!pip install optuna

In [None]:
# Install required packages
!pip install indic-transliteration


In [None]:
#Necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
import re
import string
import nltk
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import optuna
import warnings
import os
import random
import time
import gc
warnings.filterwarnings("ignore")

#Set random seeds for reproducibility
def set_seed(seed_value=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    switches cuDNN to deterministic mode with autotuning off, and stores the
    seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed_value: Integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

    # Deterministic cuDNN kernels, no benchmark-driven kernel selection
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    os.environ["PYTHONHASHSEED"] = f"{seed_value}"

set_seed(42)

# Fetch the NLTK data packages (tokenizer model, stopword lists, WordNet)
for nltk_resource in ("punkt", "stopwords", "wordnet"):
    nltk.download(nltk_resource)

# Run on the GPU when PyTorch can see one, otherwise on the CPU
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")
print(f"Using device: {device}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Data Loading and Exploration

In [None]:
# Read the raw CSV and give its columns fixed names
# (the assignment below requires the file to have exactly two columns)
df = pd.read_csv("Dataset.csv")
df.columns = ["text", "sentiment"]

# Shape plus a preview of the first rows
print(f"Dataset shape: {df.shape}")
print("\nFirst few rows:")
display(df.head())

# Number of missing cells per column
print("\nMissing values:")
print(df.isnull().sum())

# Row count per sentiment label
print("\nSentiment distribution:")
sentiment_counts = df["sentiment"].value_counts()
print(sentiment_counts)

# Bar chart of the label counts
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x="sentiment", ax=ax)
ax.set_title("Sentiment Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
plt.show()

# Character count of every text, with summary statistics
df["text_length"] = df["text"].map(len)
print("\nText length statistics:")
print(df["text_length"].describe())

# Histogram (with KDE overlay) of text length, split by sentiment
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="text_length", hue="sentiment", bins=30, kde=True, ax=ax)
ax.set_title("Text Length Distribution by Sentiment")
ax.set_xlabel("Text Length")
ax.set_ylabel("Count")
plt.show()

## 3. Text Preprocessing (Roman Script)

In [None]:
def preprocess_text(text):
    """
    Clean one Roman-script Hindi-Marathi code-switched text.

    Steps, in order: lowercase; remove URLs; remove @mentions and #hashtags;
    remove every character that is not a word character, whitespace or one of
    ``! ? .``; collapse whitespace runs into a single space and trim the ends.

    Args:
        text: Raw input. ``None`` and NaN (what pandas yields for a missing
            cell) are treated as empty; any other non-string value is
            converted with ``str()`` first.

    Returns:
        The cleaned string, possibly empty.
    """
    # Missing cells arrive as None or float NaN, neither of which has .lower()
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Convert to lowercase
    text = str(text).lower()

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

    # Remove user mentions and hashtags
    text = re.sub(r"@\w+|#\w+", "", text)

    # Remove punctuation except ! ? . which may carry sentiment
    text = re.sub(r"[^\w\s!?.]", "", text)

    # Remove extra whitespace
    return re.sub(r"\s+", " ", text).strip()

# Apply preprocessing
df["cleaned_text"] = df["text"].apply(preprocess_text)

# Show up to five original/cleaned pairs; head(5) simply yields fewer rows
# when the dataset is smaller, where positional .iloc[i] would raise.
print("Original vs Cleaned Text Examples:")
for original, cleaned in zip(df["text"].head(5), df["cleaned_text"].head(5)):
    print(f"Original: {original}")
    print(f"Cleaned: {cleaned}")
    print("-" * 50)

## 4. Transliteration from Roman to Devanagari



In [None]:
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Hand-curated Roman -> Devanagari spellings that are tried before the
# library fallback. Defined once at module level instead of being rebuilt on
# every call. The original literal listed "hota" and "nahi" twice; a dict
# keeps only the last duplicate, so each key now appears once with the value
# that was actually in effect ("nahi" -> the Marathi spelling).
_WORD_MAPPINGS = {
    # Hindi common words
    "hai": "है", "hota": "होता", "mujhe": "मुझे", "tumhara": "तुम्हारा",
    "tumhari": "तुम्हारी", "tera": "तेरा", "mera": "मेरा", "achha": "अच्छा",
    "acha": "अच्छा", "bahut": "बहुत", "khush": "खुश", "nahin": "नहीं",
    "aur": "और", "par": "पर", "mein": "में",
    "kya": "क्या", "ko": "को", "se": "से", "ke": "के",
    "ki": "की", "ka": "का", "sab": "सब", "sabko": "सबको",
    "khatam": "खतम", "yaad": "याद", "roj": "रोज", "idea": "आइडिया",
    "project": "प्रोजेक्ट", "movie": "मूवी", "smile": "स्माइल", "tension": "टेंशन",
    "party": "पार्टी", "energy": "एनर्जी", "support": "सपोर्ट", "weather": "वेदर",
    "walk": "वॉक", "enjoy": "एन्जॉय", "gift": "गिफ्ट", "surprise": "सरप्राइज",
    "effort": "एफर्ट",

    # Marathi common words
    "ahe": "आहे", "mala": "मला", "tula": "तुला",
    "tumcha": "तुमचा", "tumchi": "तुमची", "tujha": "तुझा", "tujhi": "तुझी",
    "tujhya": "तुझ्या", "majha": "माझा", "mazya": "माझ्या", "chan": "छान",
    "khup": "खूप", "nahi": "नाही", "ani": "आणि", "pan": "पण",
    "madhe": "मध्ये", "var": "वर", "khar": "खर", "la": "ला",
    "chi": "ची", "cha": "चा", "che": "चे", "katha": "कथा",
    "baghun": "बघून", "vatla": "वाटलं", "gela": "गेला", "dur": "दूर",
    "kalachya": "कालच्या", "shakya": "शक्य", "navhati": "नव्हती", "aajchya": "आजच्या",
    "maja": "मजा", "thandak": "थंडक", "keli": "केली", "asanyane": "असण्याने",
    "sukun": "सुखून", "khana": "खाना", "chav": "चव", "unique": "युनिक",
    "kela": "केला", "avdela": "आवडला", "anand": "आनंद", "zala": "झाला",
    "jagi": "जगी", "konalach": "कोणालाच",

    # Common expressions
    "namaste": "नमस्ते", "namaskar": "नमस्कार", "dhanyavad": "धन्यवाद", "shukriya": "शुक्रिया",
    "kaise": "कैसे", "kasa": "कसा", "kashi": "कशी", "ekdam": "एकदम",
    "zabardast": "जबरदस्त", "mast": "मस्त", "tyamule": "त्यामुळे", "jhala": "झाला",
    "aayegi": "आएगी", "shivay": "शिवाय", "kamyabi": "कामयाबी", "karne": "करने",
    "ala": "आला", "dekh": "देख", "chinta": "चिंता", "geli": "गेली",
    "mom": "मॉम", "khas": "खास", "banavla": "बनवला", "dekhake": "देखके",
    "tumchy": "तुमच्या",
}

# Punctuation that can remain attached to the end of a token after splitting
_TRAILING_PUNCTUATION = ".,!?"


def transliterate_roman_to_devanagari(text):
    """
    Transliterate Roman-script Hindi/Marathi code-switched text to Devanagari.

    Each whitespace-separated token is looked up (case-insensitively, with any
    trailing ``. , ! ?`` set aside and re-attached afterwards) in the curated
    ``_WORD_MAPPINGS`` table. Tokens that are not in the table are passed
    unchanged to ``transliterate`` as if they were ITRANS input; if that call
    raises, the token is kept as-is. Finally every ``.`` in the result is
    replaced by ``|``.

    Args:
        text: Roman-script input string.

    Returns:
        The transliterated string. If anything outside the per-word fallback
        fails (for example ``text`` is not a string), the error is printed and
        ``text`` is returned unchanged.
    """
    try:
        # A space after each punctuation mark keeps it from gluing two words
        # together ("hi.there" -> "hi. there"); then normalize whitespace.
        text = re.sub(r"([.,!?])", r"\1 ", text)
        text = re.sub(r"\s+", " ", text).strip()

        processed_words = []
        for word in text.split():
            word_lower = word.lower()

            # Look up the bare word: previously "chan!" or "hai." missed the
            # table only because of the punctuation stuck to it.
            core = word_lower.rstrip(_TRAILING_PUNCTUATION)
            trailing = word_lower[len(core):]

            mapped = _WORD_MAPPINGS.get(core)
            if mapped is not None:
                processed_words.append(mapped + trailing)
                continue

            try:
                # The raw token is handed over as ITRANS; no conversion to
                # ITRANS conventions happens before this call.
                processed_words.append(
                    transliterate(word, sanscript.ITRANS, sanscript.DEVANAGARI)
                )
            except Exception:
                # Best effort: keep the untransliterated token rather than
                # lose the whole sentence.
                processed_words.append(word)

        result = " ".join(processed_words)

        # NOTE(review): this writes an ASCII "|"; the Devanagari danda is
        # "।" (U+0964) — confirm which one downstream code expects.
        result = result.replace(".", "|")

        return result
    except Exception as e:
        print(f"Error in transliteration: {e}")
        return text  # Return original text if transliteration fails

# Apply transliteration to the dataset
df["devanagari_text"] = df["cleaned_text"].apply(transliterate_roman_to_devanagari)

# Show up to five Roman/Devanagari pairs; head(5) simply yields fewer rows
# when the dataset is smaller, where positional .iloc[i] would raise.
print("Roman vs Devanagari Text Examples:")
for roman, devanagari in zip(df["cleaned_text"].head(5), df["devanagari_text"].head(5)):
    print(f"Roman: {roman}")
    print(f"Devanagari: {devanagari}")
    print("-" * 50)

## 5. Preprocessing for Devanagari Text

In [None]:
# Define Hindi and Marathi stopwords in Devanagari
# Hand-written Hindi stopword list. The literal contains "हो" twice; the
# duplicate is harmless because the lists are merged into a set below.
hindi_stopwords = [
    "और", "का", "की", "के", "को", "में", "से", "पर", "है", "हैं", "था", "थे", "थी", "थीं",
    "हो", "होता", "होती", "होते", "हुआ", "हुई", "हुए", "एक", "यह", "वह", "ये", "वे", "जो",
    "कि", "तो", "अगर", "या", "एवं", "तथा", "कर", "करे", "करें", "कहते", "कहा", "गया", "अब",
    "जब", "तब", "कब", "यदि", "यहाँ", "वहाँ", "कहाँ", "क्यों", "क्योंकि", "परंतु", "लेकिन",
    "अपना", "अपनी", "अपने", "स्वयं", "इस", "उस", "इन", "उन", "सकता", "सकते", "सकती",
    "रहा", "रहे", "रही", "रखें", "रखे", "रखता", "रखती", "रखते", "बहुत", "थोड़ा", "थोड़े",
    "थोड़ी", "साथ", "बाद", "पहले", "फिर", "वापस", "द्वारा", "प्रति", "अंदर", "बाहर", "आज",
    "कल", "अभी", "जल्दी", "शीघ्र", "धीरे", "अधिक", "कम", "ज्यादा", "सभी", "कुछ", "कौन",
    "क्या", "कैसे", "कहां", "किसे", "किसका", "किसकी", "किसके", "किस", "किसी", "कोई",
    "हूँ", "हूं", "हो", "हम", "तुम", "आप", "वो", "मैं", "मुझे", "मेरा", "मेरी", "मेरे",
    "तेरा", "तेरी", "तेरे", "उसका", "उसकी", "उसके", "हमारा", "हमारी", "हमारे", "आपका",
    "आपकी", "आपके", "उनका", "उनकी", "उनके", "इसका", "इसकी", "इसके", "उसको", "इसको"
]

# Hand-written Marathi stopword list. Several entries (for example "की",
# "का", "एक") also appear in the Hindi list above.
marathi_stopwords = [
    "आणि", "आहे", "आहेत", "होते", "होता", "होती", "होत्या", "असून", "असे", "असा", "अशी",
    "असलेल्या", "असलेला", "असलेली", "असतो", "असते", "असतात", "म्हणून", "परंतु", "पण",
    "तर", "मात्र", "की", "कारण", "म्हणजे", "हे", "ही", "हा", "ते", "ती", "तो", "त्या",
    "त्याचा", "त्याची", "त्याचे", "त्यांचा", "त्यांची", "त्यांचे", "त्यांना", "त्याला",
    "त्याने", "त्यांनी", "त्यामुळे", "या", "याचा", "याची", "याचे", "यांचा", "यांची",
    "यांचे", "यांना", "याला", "याने", "यांनी", "यामुळे", "जे", "जो", "जी", "ज्या",
    "ज्याचा", "ज्याची", "ज्याचे", "ज्यांचा", "ज्यांची", "ज्यांचे", "ज्यांना", "ज्याला",
    "ज्याने", "ज्यांनी", "ज्यामुळे", "मी", "माझा", "माझी", "माझे", "मला", "आम्ही",
    "आमचा", "आमची", "आमचे", "आम्हाला", "आपण", "आपला", "आपली", "आपले", "आपल्याला",
    "तू", "तुझा", "तुझी", "तुझे", "तुला", "तुम्ही", "तुमचा", "तुमची", "तुमचे", "तुम्हाला",
    "एक", "दोन", "तीन", "चार", "पाच", "सहा", "सात", "आठ", "नऊ", "दहा", "वर", "वरती",
    "खाली", "येथे", "तेथे", "कधी", "कसे", "का", "काय", "कोण", "कोणता", "कोणती", "कोणते",
    "कोणाचा", "कोणाची", "कोणाचे", "कोणाला", "कोणी", "कोणास", "काही", "कुठे", "कुठल्या",
    "कुठला", "कुठली", "कुठले", "कसा", "कशी", "कशाला", "कशामुळे", "कशासाठी", "कशात",
    "इतर", "इतका", "इतकी", "इतके", "इकडे", "तिकडे", "मध्ये", "मधील", "वरील", "खालील",
    "पुढील", "मागील", "आतील", "बाहेरील", "अंतर्गत", "बाहेर", "आत", "पुढे", "मागे", "वरून",
    "खालून", "पर्यंत", "पासून", "साठी", "करिता", "द्वारे", "विषयी", "बद्दल", "शिवाय",
    "प्रमाणे", "नुसार", "ऐवजी", "सोबत", "बरोबर", "विना", "नंतर", "पूर्वी", "आधी", "दरम्यान"
]

# Combine Hindi and Marathi stopwords
# (a set: removes the duplicates noted above and gives fast membership tests)
indic_stopwords = set(hindi_stopwords + marathi_stopwords)

def preprocess_devanagari_text(text):
    """
    Strip Hindi/Marathi stopwords from a Devanagari string.

    The text is split on whitespace, every token found in ``indic_stopwords``
    is discarded, and the survivors are joined back with single spaces.
    """
    kept_tokens = (tok for tok in text.split() if tok not in indic_stopwords)
    return " ".join(kept_tokens)

# Apply Devanagari preprocessing
df["processed_devanagari"] = df["devanagari_text"].apply(preprocess_devanagari_text)

# Show up to five before/after pairs; head(5) simply yields fewer rows when
# the dataset is smaller, where positional .iloc[i] would raise.
print("Original vs Processed Devanagari Text Examples:")
for before, after in zip(df["devanagari_text"].head(5), df["processed_devanagari"].head(5)):
    print(f"Original Devanagari: {before}")
    print(f"Processed Devanagari: {after}")
    print("-" * 50)

In [None]:
class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per access and pairs it with its label.

    Args:
        texts: pandas Series of input strings.
        labels: pandas Series of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")``; it must
            return a mapping whose values support ``.squeeze(0)``
            (presumably tensors with a leading batch dimension of 1).
        max_length: Length passed to the tokenizer for padding/truncation.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Re-index to 0..n-1 so positional access works on slices of a split
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for row ``idx`` plus a ``"label"`` tensor.

        Raises:
            ValueError: If the label is NaN (e.g. a sentiment value that the
                label mapping did not cover).
        """
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        encoding = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Drop the batch dimension so the DataLoader can stack examples
        item = {key: val.squeeze(0) for key, val in encoding.items()}

        # Force an integer class-id tensor. Without this the dtype follows the
        # pandas column: a float column (which pandas produces as soon as one
        # label is NaN) would yield float labels. int() also turns a NaN label
        # into an immediate, explicit error.
        item["label"] = torch.tensor(int(label), dtype=torch.long)
        return item


## 6. MuRIL Model Baseline Accuracy

In [None]:
# Evaluate base MuRIL model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader

# Full-corpus features and binary targets (Positive -> 1, Negative -> 0)
X_all = df["devanagari_text"]
y_all = df["sentiment"].map({"Positive": 1, "Negative": 0})

# Load the MuRIL checkpoint with a two-class head and switch to inference mode.
# NOTE(review): the checkpoint is the base model, so the classification head
# is presumably freshly initialized here — confirm how this "baseline" number
# should be interpreted.
tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)
model.to(device)
model.eval()

# Wrap the whole corpus in a dataset and iterate it in batches of 8
base_dataset = SentimentDataset(X_all, y_all, tokenizer)
base_loader = DataLoader(base_dataset, batch_size=8)

# Count correct predictions batch by batch, without tracking gradients
correct = 0
total = 0
with torch.no_grad():
    for batch in base_loader:
        labels = batch["label"].to(device)
        inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != "label"}
        preds = torch.argmax(model(**inputs).logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

base_accuracy = correct / total
print(f"\nAccuracy of base MuRIL model (without fine-tuning): {base_accuracy:.4f}")


# 7. Hyperparameter Tuning for Best Parameters






In [None]:


# Hold out a stratified 20% of the corpus as the validation set for tuning
X_train_tf, X_val_tf, y_train_tf, y_val_tf = train_test_split(
    X_all,
    y_all,
    stratify=y_all,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Optuna objective: fine-tune MuRIL once and return validation accuracy.

    Samples learning rate, batch size, epoch count and two freezing switches
    from ``trial``, trains on ``X_train_tf``/``y_train_tf`` and scores on
    ``X_val_tf``/``y_val_tf``.

    Args:
        trial: The ``optuna`` trial used to draw hyperparameters.

    Returns:
        Fraction of validation examples classified correctly.
    """
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    epochs = trial.suggest_int("epochs", 3, 15)
    freeze_layers = trial.suggest_categorical("freeze_layers", [True, False])
    freeze_embeddings = trial.suggest_categorical("freeze_embeddings", [True, False])

    tokenizer = AutoTokenizer.from_pretrained("google/muril-base-cased")
    model = AutoModelForSequenceClassification.from_pretrained("google/muril-base-cased", num_labels=2)

    if freeze_embeddings:
        for param in model.bert.embeddings.parameters():
            param.requires_grad = False
    if freeze_layers:
        for param in model.bert.encoder.parameters():
            param.requires_grad = False

    model.to(device)
    train_dataset = SentimentDataset(X_train_tf, y_train_tf, tokenizer)
    val_dataset = SentimentDataset(X_val_tf, y_val_tf, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    # Only parameters that still require gradients are handed to the optimizer
    optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=learning_rate)
    # Linear decay over all training steps, no warmup
    scheduler = get_linear_schedule_with_warmup(optimizer, 0, len(train_loader) * epochs)

    for _ in range(epochs):
        model.train()
        for batch in train_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)
            optimizer.zero_grad()
            loss = model(**inputs, labels=labels).loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

    # Validation
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total

    # Release GPU memory for the next trial. The references must go first and
    # the collector must run before the cache is emptied; emptying the cache
    # while the model is still alive frees nothing.
    del model, optimizer, scheduler
    gc.collect()
    torch.cuda.empty_cache()
    return accuracy

# Seed the sampler: Optuna draws hyperparameters from its own generator, so
# the set_seed() call at the top of the notebook does not make the search
# repeatable by itself.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=8)

best_params = study.best_params
print("Best Hyperparameters:", best_params)


# 8. Fine-Tuning with Cross-Validation

In [None]:


def _freeze_backbone_part(model, part_name, description):
    """Turn off gradients for ``model.base_model.<part_name>``; warn if it is absent."""
    part = getattr(model.base_model, part_name, None)
    if part is None:
        print(f"  [Warning] Could not freeze {description}: Unknown model structure.")
        return
    for param in part.parameters():
        param.requires_grad = False


def _evaluate_classifier(model, data_loader):
    """Score ``model`` on ``data_loader`` without tracking gradients.

    Returns:
        ``(mean_batch_loss, accuracy, predictions, labels)``; the last two are
        flat lists with one entry per example.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    predictions, references = [], []
    with torch.no_grad():
        for batch in data_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)

            outputs = model(**inputs, labels=labels)
            loss_sum += outputs.loss.item()

            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            references.extend(labels.cpu().numpy())

            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

    return loss_sum / len(data_loader), n_correct / n_seen, predictions, references


def lightweight_finetune_transformer(
    model_name,
    X_train,
    y_train,
    X_test,
    y_test,
    epochs,
    batch_size,
    learning_rate,
    freeze_layers,
    freeze_embeddings,
):
    """Fine-tune a sequence-classification checkpoint and report its best test accuracy.

    After every epoch the model is scored on the test split. The weights,
    predictions and labels of the best-scoring epoch are remembered; at the
    end those weights are restored, the classification report is printed for
    that epoch, and the model and tokenizer are written to
    ``<model_name with "/" replaced by "_">_model_final`` (overwriting any
    previous contents of that directory).

    NOTE(review): the best epoch is chosen on the same split whose accuracy is
    returned, so the figure is optimistic — confirm this is intended.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        X_train, y_train: Training texts and labels (pandas Series).
        X_test, y_test: Evaluation texts and labels (pandas Series).
        epochs: Number of passes over the training data.
        batch_size: Batch size for both loaders.
        learning_rate: AdamW learning rate.
        freeze_layers: If true, the backbone's ``encoder`` gets no gradients.
        freeze_embeddings: If true, the backbone's ``embeddings`` get no gradients.

    Returns:
        ``{"accuracy": best_accuracy}``, the highest per-epoch test accuracy
        (``0.0`` when ``epochs`` is 0).
    """
    print(f"\nLightweight fine-tuning of {model_name} with best hyperparameters...")
    print(f"  Epochs: {epochs}, Batch size: {batch_size}, Learning rate: {learning_rate:.2e}")
    print(f"  Freeze transformer layers: {freeze_layers}, Freeze embeddings: {freeze_embeddings}")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Both switches go through model.base_model, the same accessor the tuning
    # cell uses, instead of two differing per-architecture attribute chains.
    if freeze_embeddings:
        print("  Freezing embedding layers...")
        _freeze_backbone_part(model, "embeddings", "embeddings")

    if freeze_layers:
        print("  Freezing transformer encoder layers...")
        _freeze_backbone_part(model, "encoder", "encoder layers")

    model.to(device)

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"  Trainable parameters: {trainable_params:,} / {total_params:,} ({100 * trainable_params / total_params:.2f}%)")

    train_dataset = SentimentDataset(X_train, y_train, tokenizer)
    test_dataset = SentimentDataset(X_test, y_test, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=learning_rate)
    # Linear decay over all training steps, no warmup
    scheduler = get_linear_schedule_with_warmup(optimizer, 0, len(train_loader) * epochs)

    best_accuracy = 0.0
    best_state = None                  # CPU copy of the best epoch's weights
    best_preds, best_labels = [], []   # predictions/labels of that same epoch

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")

        # Training phase
        model.train()
        total_train_loss, correct_train, total_train = 0, 0, 0
        start_time = time.time()
        for batch in train_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_train_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct_train += (preds == labels).sum().item()
            total_train += labels.size(0)

        train_acc = correct_train / total_train
        avg_train_loss = total_train_loss / len(train_loader)
        train_time = time.time() - start_time

        print(f"  Training Loss: {avg_train_loss:.4f}")
        print(f"  Training Accuracy: {train_acc:.4f}")
        print(f"  Training Time: {train_time:.2f} seconds")

        # Evaluation phase
        start_time = time.time()
        avg_test_loss, test_acc, all_preds, all_labels = _evaluate_classifier(model, test_loader)
        test_time = time.time() - start_time

        print(f"  Test Loss: {avg_test_loss:.4f}")
        print(f"  Test Accuracy: {test_acc:.4f}")
        print(f"  Test Time: {test_time:.2f} seconds")

        # The first epoch is always recorded so there is a state to restore
        # even if accuracy never rises above 0.0.
        if best_state is None or test_acc > best_accuracy:
            best_accuracy = test_acc
            best_preds, best_labels = all_preds, all_labels
            # Snapshot on the CPU so the copy does not occupy GPU memory
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
            print(f"  New best model saved with test accuracy: {test_acc:.4f}")

    # Put the best epoch's weights back so the saved model matches the
    # reported accuracy (previously the last epoch's weights were saved).
    if best_state is not None:
        model.load_state_dict(best_state)

    # Final report, built from the same epoch as best_accuracy
    print(f"\n{model_name} Final Test Accuracy: {best_accuracy:.4f}")
    print("Classification Report:")
    if best_labels:
        # labels=[0, 1] keeps the report valid when a fold lacks one class
        print(classification_report(
            best_labels, best_preds, labels=[0, 1], target_names=["Negative", "Positive"]
        ))
    else:
        print("  (no evaluation was run)")

    output_dir = model_name.replace("/", "_") + "_model_final"
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Final model saved to {output_dir}")

    # Free GPU memory before the caller starts the next fold or model
    del model, optimizer, scheduler, best_state
    gc.collect()
    torch.cuda.empty_cache()

    return {"accuracy": best_accuracy}



In [None]:
from sklearn.model_selection import StratifiedKFold

# Features/labels with a clean 0..n-1 index so fold indices line up
X_all = df["devanagari_text"].reset_index(drop=True)
y_all = df["sentiment"].map({"Positive": 1, "Negative": 0}).reset_index(drop=True)

# Five stratified, shuffled folds with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []
fold = 1

for train_index, test_index in skf.split(X_all, y_all):
    print(f"\nFold {fold}")

    X_train_fold, X_test_fold = (
        X_all.iloc[rows].reset_index(drop=True) for rows in (train_index, test_index)
    )
    y_train_fold, y_test_fold = (
        y_all.iloc[rows].reset_index(drop=True) for rows in (train_index, test_index)
    )

    # Every hyperparameter comes from the tuned values in best_params
    result = lightweight_finetune_transformer(
        model_name="google/muril-base-cased",
        X_train=X_train_fold,
        y_train=y_train_fold,
        X_test=X_test_fold,
        y_test=y_test_fold,
        epochs=best_params["epochs"],
        batch_size=best_params["batch_size"],
        learning_rate=best_params["learning_rate"],
        freeze_layers=best_params["freeze_layers"],
        freeze_embeddings=best_params["freeze_embeddings"],
    )

    fold_accuracy = result["accuracy"]
    cv_results.append(fold_accuracy)
    print(f" Fold {fold} Accuracy: {fold_accuracy:.4f}")
    fold += 1

# Mean and (population) standard deviation over the folds
mean_acc = np.mean(cv_results)
std_acc = np.std(cv_results)
print(f"\nFinal Cross-Validation Accuracy: {mean_acc:.4f} ± {std_acc:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Numbers to plot: un-tuned baseline vs. cross-validated mean (with its spread)
muril_baseline = base_accuracy
muril_kfold = mean_acc
muril_kfold_std = std_acc

bar_width = 0.08  # Thin bars
fig, ax = plt.subplots(figsize=(5, 4))

# Baseline bar, left of centre
ax.bar(
    0 - bar_width/2,
    muril_baseline,
    width=bar_width,
    color='lightblue',
    edgecolor='black',
    label='Baseline Accuracy',
)

# Fine-tuned bar, right of centre, with the fold standard deviation as error bar
ax.bar(
    0 + bar_width/2,
    muril_kfold,
    width=bar_width,
    color='dodgerblue',
    edgecolor='black',
    yerr=muril_kfold_std,
    capsize=5,
    label='Fine-Tuned Accuracy (K-Fold)',
)

# Axis cosmetics
ax.set_xticks([0])
ax.set_xticklabels(["MuRIL"])
ax.set_ylim(0, 1.0)
ax.set_ylabel("Accuracy")
ax.set_title("MuRIL Accuracy: Baseline vs Fine-Tuned (K-Fold)")
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()


## 9. Baseline Implementations for MuRIL, IndicBERTv1, XLM-RoBERTa, mBERT

In [None]:


# Integer labels stored alongside the text
label_map = {"Positive": 1, "Negative": 0}
df["labels"] = df["sentiment"].map(label_map)

BATCH_SIZE = 16

# Display name -> checkpoint identifier
MODELS = {
    "MuRIL": "google/muril-base-cased",
    "IndicBERTv1": "ai4bharat/indic-bert",
    "XLM-RoBERTa": "xlm-roberta-base",
    "mBERT": "bert-base-multilingual-cased",
}

# Test accuracy per display name
baseline_results = {}

# 80/20 train/test, then 10% of train as validation. Both splits are seeded
# and do not depend on the model, so they are computed once.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df["labels"])
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=42, stratify=df_train["labels"])

print("--- Baseline Implementations ---")
for model_name, model_path in MODELS.items():
    print(f"\n--- Running Baseline for {model_name} ---")

    # Load tokenizer and model; the model is only used for inference here
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=len(label_map))
    model.to(device)
    model.eval()

    # Datasets for all three splits (only the test split is evaluated below)
    train_dataset = SentimentDataset(df_train["devanagari_text"], df_train["labels"], tokenizer)
    val_dataset = SentimentDataset(df_val["devanagari_text"], df_val["labels"], tokenizer)
    test_dataset = SentimentDataset(df_test["devanagari_text"], df_test["labels"], tokenizer)

    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

    # Score the checkpoint as loaded, with no fine-tuning
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            labels = batch["label"].to(device)
            inputs = {name: tensor.to(device) for name, tensor in batch.items() if name != "label"}
            preds = torch.argmax(model(**inputs).logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    test_accuracy = correct / total
    baseline_results[model_name] = test_accuracy
    print(f"{model_name} Test Accuracy: {test_accuracy:.4f}")

    del model
    torch.cuda.empty_cache()
    gc.collect()

# Summary
print("\n--- Baseline Results ---")
for name, acc in baseline_results.items():
    print(f"{name}: {acc:.4f}")


## 10. Hyperparameter Tuning for All Models using Optuna

In [None]:
best_hyperparameters = {}
print("--- Running Hyperparameter Tuning ---")

# Both splits are seeded and identical for every model and every trial, so
# compute them once instead of inside each objective call.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42, stratify=df["labels"])
df_train, df_val = train_test_split(df_train, test_size=0.1, random_state=42, stratify=df_train["labels"])

for model_name, model_path in MODELS.items():
    print(f"\n--- Running Hyperparameter Tuning for {model_name} ---")

    def objective(trial):
        """Fine-tune ``model_path`` once and return accuracy on the validation split."""
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
        batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
        epochs = trial.suggest_int("epochs", 3, 15)
        freeze_layers = trial.suggest_categorical("freeze_layers", [True, False])
        freeze_embeddings = trial.suggest_categorical("freeze_embeddings", [True, False])

        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=len(label_map))

        # base_model is the architecture-independent handle on the backbone
        base_model = model.base_model

        if freeze_embeddings:
            for param in base_model.embeddings.parameters():
                param.requires_grad = False
        if freeze_layers:
            for param in base_model.encoder.parameters():
                param.requires_grad = False

        model.to(device)

        train_dataset = SentimentDataset(df_train["devanagari_text"], df_train["labels"], tokenizer)
        val_dataset = SentimentDataset(df_val["devanagari_text"], df_val["labels"], tokenizer)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size)

        # Only parameters that still require gradients are optimized
        optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=learning_rate)
        # Linear decay over all training steps, no warmup
        scheduler = get_linear_schedule_with_warmup(optimizer, 0, len(train_loader) * epochs)

        # Training loop
        for _ in range(epochs):
            model.train()
            for batch in train_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
                labels = batch["label"].to(device)

                optimizer.zero_grad()
                loss = model(**inputs, labels=labels).loss
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

        # Validation loop
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for batch in val_loader:
                inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
                labels = batch["label"].to(device)
                outputs = model(**inputs)
                preds = torch.argmax(outputs.logits, dim=1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        accuracy = correct / total

        # Drop the references, collect, then empty the cache; emptying the
        # cache while the model is still referenced releases nothing.
        del model, base_model, optimizer, scheduler
        gc.collect()
        torch.cuda.empty_cache()

        return accuracy

    # Seeded sampler: Optuna draws from its own generator, which the
    # notebook-level set_seed() call does not control.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=42),
    )
    study.optimize(objective, n_trials=8)

    print(f"\nBest trial for {model_name}:")
    print(f"  Value: {study.best_trial.value:.4f}")
    print(f"  Params: {study.best_trial.params}")
    best_hyperparameters[model_name] = study.best_trial.params

print("\n--- Best Hyperparameters for Each Model ---")
# Loop names chosen so they do not shadow the notebook-level `model`
for tuned_name, tuned_params in best_hyperparameters.items():
    print(f"{tuned_name}: {tuned_params}")


In [None]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
import os

# Per-model cross-validation summary: {model_name: {"mean": ..., "std": ...}}.
kfold_results = {}
print("--- Running K-Fold Cross-Validation ---")

# Texts and labels are the same for every model, so build them once.
X_all = df["devanagari_text"].reset_index(drop=True)
y_all = df["sentiment"].map({"Positive": 1, "Negative": 0}).reset_index(drop=True)

for model_name, model_path in MODELS.items():
    print(f"--- Running K-Fold Cross-Validation for {model_name} ---")

    if model_name not in best_hyperparameters:
        # No tuned configuration: run the folds with the baseline settings.
        print(f"No best hyperparameters found for {model_name}. Using default baseline parameters.")
        epochs = EPOCHS
        batch_size = BATCH_SIZE
        learning_rate = LEARNING_RATE
        # These must be set here as well; otherwise the call below would hit an
        # undefined name (first model) or reuse the previous model's flags.
        freeze_layers = False
        freeze_embeddings = False
    else:
        params = best_hyperparameters[model_name]
        epochs = params["epochs"]
        batch_size = params["batch_size"]
        learning_rate = params["learning_rate"]
        freeze_layers = params.get("freeze_layers", False)
        freeze_embeddings = params.get("freeze_embeddings", False)

    # Fixed seed so every model is evaluated on the same five stratified splits.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_accuracies = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X_all, y_all), start=1):
        print(f"\nFold {fold}")

        # Re-index each split from 0 before handing it to the training helper.
        X_train = X_all.iloc[train_idx].reset_index(drop=True)
        y_train = y_all.iloc[train_idx].reset_index(drop=True)
        X_test = X_all.iloc[test_idx].reset_index(drop=True)
        y_test = y_all.iloc[test_idx].reset_index(drop=True)

        result = lightweight_finetune_transformer(
            model_name=model_path,
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            epochs=epochs,
            batch_size=batch_size,
            learning_rate=learning_rate,
            freeze_layers=freeze_layers,
            freeze_embeddings=freeze_embeddings
        )

        fold_accuracies.append(result["accuracy"])
        print(f" Fold {fold} Accuracy: {result['accuracy']:.4f}")

    # Aggregate the per-fold accuracies for this model.
    avg_acc = np.mean(fold_accuracies)
    std_acc = np.std(fold_accuracies)
    print(f"\n{model_name} Cross-Validation Accuracy: {avg_acc:.4f} ± {std_acc:.4f}")
    kfold_results[model_name] = {"mean": avg_acc, "std": std_acc}

print("\nK-Fold Cross-Validation Results:")
for model, acc_dict in kfold_results.items():
    print(f"{model}: {acc_dict['mean']:.4f} ± {acc_dict['std']:.4f}")


In [None]:
import matplotlib.pyplot as plt

# Models are plotted in the key order of baseline_results.
models = [*baseline_results]

# One value per model: baseline accuracy, K-fold mean and K-fold std.
baseline_acc = [baseline_results[name] for name in models]
kfold_acc = [kfold_results[name]["mean"] for name in models]
kfold_std = [kfold_results[name]["std"] for name in models]

x = range(len(models))
bar_width = 0.35
half_width = bar_width / 2

# Each model gets two side-by-side bars centred on its tick.
baseline_positions = [pos - half_width for pos in x]
kfold_positions = [pos + half_width for pos in x]

fig, ax = plt.subplots(figsize=(10, 6))

# Left bar of each pair: baseline accuracy
ax.bar(
    baseline_positions,
    baseline_acc,
    bar_width,
    label='Baseline Accuracy',
    color='lightblue',
    edgecolor='black'
)

# Right bar of each pair: K-fold mean, with the std drawn as an error bar
bars = ax.bar(
    kfold_positions,
    kfold_acc,
    bar_width,
    yerr=kfold_std,
    capsize=5,
    label='Fine-Tuned Accuracy (K-Fold)',
    color='steelblue',
    edgecolor='black'
)

# Write the std just above the tip of each error bar
for pos, acc, std in zip(kfold_positions, kfold_acc, kfold_std):
    ax.text(
        pos,
        acc + std + 0.01,
        f"±{std:.2f}",
        ha='center',
        va='bottom',
        fontsize=9
    )

# Axes labels, ticks and layout
ax.set_xlabel('Models')
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison: Baseline vs Fine-Tuned (K-Fold)')
ax.set_xticks(list(x))
ax.set_xticklabels(models)
ax.set_ylim(0, 1.0)
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


## 11. **Model Testing: Intra-sentential vs Inter-sentential vs Tag-Switch (Testing Separately)**


In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import matplotlib.pyplot as plt
import numpy as np

# Read the three code-switching test files. They are parsed without a header row
# and the two columns are named explicitly.
intra_df, inter_df, tag_df = (
    pd.read_csv(csv_path, header=None, names=["text", "sentiment"])
    for csv_path in ("Intra_sentential.csv", "Inter-sentential.csv", "Tag_Switch.csv")
)

# Display name -> DataFrame, in the order the sets are reported
test_sets = dict(zip(
    ("Intra-sentential", "Inter-sentential", "Tag-Switch"),
    (intra_df, inter_df, tag_df),
))

# Sentiment string -> integer class id
label_map = {"Positive": 1, "Negative": 0}

# Display name -> location passed to from_pretrained for the saved models
MODELS = {
    "MuRIL": "google_muril-base-cased_model_final",
    "IndicBERTv1": "ai4bharat_indic-bert_model_final",
    "XLM-RoBERTa": "xlm-roberta-base_model_final",
    "mBERT": "bert-base-multilingual-cased_model_final"
}

# One list of accuracies per model, filled in test-set order
accuracy_by_model = {name: [] for name in MODELS}

# Evaluation function
def evaluate_saved_model(model_name, model_path, df, device):
    """Compute the accuracy of a saved classification model on one test set.

    Each row is cleaned, transliterated to Devanagari and preprocessed with the
    notebook's helper functions before it is scored.

    Args:
        model_name: Display name of the model; used only in the error message.
        model_path: Location passed to ``from_pretrained`` for both the
            tokenizer and the model.
        df: DataFrame with ``text`` and ``sentiment`` columns. It is not modified.
        device: torch device the model and the batches are moved to.

    Returns:
        Fraction of evaluated rows whose predicted class equals the label.

    Raises:
        ValueError: If no row has both a text and a sentiment present in
            ``label_map``.
    """
    # Copy after filtering so the new columns are written to a private frame.
    df = df.dropna(subset=["text", "sentiment"]).copy()
    df["cleaned_text"] = df["text"].apply(preprocess_text)
    df["devanagari_text"] = df["cleaned_text"].apply(transliterate_roman_to_devanagari)
    df["processed_devanagari"] = df["devanagari_text"].apply(preprocess_devanagari_text)
    df["labels"] = df["sentiment"].map(label_map)

    # Sentiments missing from label_map become NaN and are discarded. The index
    # is then rebuilt as 0..n-1, as the other cells do before constructing a
    # SentimentDataset, so dropped rows leave no gaps.
    df = df.dropna(subset=["labels"]).reset_index(drop=True)

    if df.empty:
        # Fail with a clear message instead of a ZeroDivisionError at the end.
        raise ValueError(f"No usable rows to evaluate for {model_name}")

    X_test = df["processed_devanagari"]
    # The NaN placeholders may have turned this column into floats; restore ints.
    y_test = df["labels"].astype(int)

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    test_dataset = SentimentDataset(X_test, y_test, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=8)

    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.to(device)
    model.eval()

    total, correct = 0, 0
    with torch.no_grad():
        for batch in test_loader:
            # Everything except the label is forwarded to the model as input.
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    return correct / total

# Run evaluation and print results
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Fixed reporting order; the plotting code below indexes the results by it.
set_names = ["Intra-sentential", "Inter-sentential", "Tag-Switch"]

print("\n=== Model Accuracy Report ===\n")
for model_name, model_path in MODELS.items():
    print(f"--- {model_name} ---")
    for set_name in set_names:
        # Use a dedicated name: rebinding the notebook-wide `df` here would
        # replace the main dataset that other cells read.
        test_df = test_sets[set_name].copy()
        accuracy = evaluate_saved_model(model_name, model_path, test_df, device)
        accuracy_by_model[model_name].append(accuracy)
        print(f"{set_name:<20}: {accuracy:.4f}")

    # Unweighted mean over the three test sets
    avg_accuracy = sum(accuracy_by_model[model_name]) / len(accuracy_by_model[model_name])
    print(f"{'Average Accuracy':<20}: {avg_accuracy:.4f}\n")

# Prepare data for plotting: positions 0/1/2 follow set_names
models = list(accuracy_by_model.keys())
intra_acc = [accuracy_by_model[m][0] for m in models]
inter_acc = [accuracy_by_model[m][1] for m in models]
tag_acc   = [accuracy_by_model[m][2] for m in models]

x = np.arange(len(models))
bar_width = 0.2

# Grouped bar chart: three bars per model, centred on the model's tick
plt.figure(figsize=(10, 6))
plt.bar(x - bar_width, intra_acc, width=bar_width, label='Intra-sentential',
        color='#66c2a5', edgecolor='black')
plt.bar(x, inter_acc, width=bar_width, label='Inter-sentential',
        color='#fc8d62', edgecolor='black')
plt.bar(x + bar_width, tag_acc, width=bar_width, label='Tag-Switch',
        color='#8da0cb', edgecolor='black')

plt.xticks(x, models)
plt.xlabel("Models")
plt.ylabel("Accuracy")
plt.title("Model Accuracy Comparison Across Test Set Types")
plt.ylim(0, 1.0)
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()


## 12. **Cross-Script Evaluation: Train on Roman, Test on Devanagari**

In [None]:
# --- Cross-Script Evaluation: Train on Roman, Test on Devanagari (Fresh Models + No Data Leakage) ---

print("\n--- Train on Roman Script, Test on Devanagari Script ---")


def _freeze_backbone(model, freeze_embeddings, freeze_layers):
    """Turn off gradients for the backbone's embeddings and/or encoder.

    Args:
        model: Hugging Face sequence-classification model.
        freeze_embeddings: When true, freeze ``model.base_model.embeddings``.
        freeze_layers: When true, freeze ``model.base_model.encoder``.
    """
    # `base_model` resolves to the wrapped transformer whatever attribute name
    # the architecture uses. Probing only `bert` / `roberta` / `xlm_roberta`
    # silently froze nothing for any other backbone (e.g. ALBERT-style models,
    # which ai4bharat/indic-bert appears to be -- confirm).
    # NOTE(review): if the tuning code used that probing too, the tuned freeze
    # flags were effectively no-ops for such models -- confirm before comparing.
    backbone = model.base_model
    requested = []
    if freeze_embeddings:
        requested.append("embeddings")
    if freeze_layers:
        requested.append("encoder")

    for part_name in requested:
        part = getattr(backbone, part_name, None)
        if part is None:
            # Make an unsupported architecture visible instead of skipping silently.
            print(f"Warning: {type(model).__name__} has no '{part_name}' module; nothing frozen for it.")
            continue
        for param in part.parameters():
            param.requires_grad = False


# Defined here as well so the cell does not depend on another cell having run.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Reload dataset and preprocess Roman
df = pd.read_csv("Dataset.csv")
df.columns = ["text", "sentiment"]
df["cleaned_text"] = df["text"].apply(preprocess_text)

# Encode labels
label_map = {"Positive": 1, "Negative": 0}
df["labels"] = df["sentiment"].map(label_map)

# Step 2: Split dataset BEFORE transliteration to avoid leakage
from sklearn.model_selection import train_test_split

X_train_rom, X_test_rom, y_train, y_test = train_test_split(
    df["cleaned_text"], df["labels"], test_size=0.2, random_state=42, stratify=df["labels"]
)

# Step 3: Transliterate ONLY the test data to Devanagari
X_test_dev = X_test_rom.apply(transliterate_roman_to_devanagari).reset_index(drop=True)
X_train_rom = X_train_rom.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Step 4: Define fresh model names from Hugging Face
MODELS = {
    "MuRIL": "google/muril-base-cased",
    "IndicBERTv1": "ai4bharat/indic-bert",
    "XLM-RoBERTa": "xlm-roberta-base",
    "mBERT": "bert-base-multilingual-cased",
}

# Test accuracy per model for this train/test script direction
cross_script_results = {}

# Step 5: Fine-tune each model on Roman text and evaluate it on Devanagari text
for model_name, model_path in MODELS.items():
    print(f"\n--- Training on Roman | Testing on Devanagari — {model_name} ---")

    if model_name not in best_hyperparameters:
        print(f"Skipping {model_name}: No best hyperparameters found.")
        continue

    # Load best hyperparameters
    params = best_hyperparameters[model_name]
    epochs = params["epochs"]
    batch_size = params["batch_size"]
    learning_rate = params["learning_rate"]
    freeze_layers = params.get("freeze_layers", False)
    freeze_embeddings = params.get("freeze_embeddings", False)

    # Print hyperparameters used
    print("Best Hyperparameters:")
    print(f"  Epochs: {epochs}")
    print(f"  Batch Size: {batch_size}")
    print(f"  Learning Rate: {learning_rate}")
    print(f"  Freeze Layers: {freeze_layers}")
    print(f"  Freeze Embeddings: {freeze_embeddings}")

    # Load fresh tokenizer and model from Hugging Face
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

    # Freeze layers if required
    _freeze_backbone(model, freeze_embeddings, freeze_layers)

    model.to(device)

    # Prepare datasets and loaders
    train_dataset = SentimentDataset(X_train_rom, y_train, tokenizer)
    test_dataset = SentimentDataset(X_test_dev, y_test, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Only parameters that still require gradients are optimised; the learning
    # rate decays linearly to zero over all training steps, with no warmup.
    optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, 0, len(train_loader) * epochs)

    # Training loop
    print(f"\n[Training {model_name}]")
    for epoch in range(epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0
        for batch in train_loader:
            # Everything except the label is forwarded to the model as input.
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            # Cap the gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f} - Accuracy: {train_acc:.4f}")

    # Evaluation loop
    print(f"\n[Testing {model_name} on Devanagari]")
    model.eval()
    correct, total = 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)

            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    acc = correct / total
    cross_script_results[model_name] = acc

    print(f"\n{model_name} Test Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(classification_report(all_labels, all_preds, target_names=["Negative", "Positive"]))

    # Clean up: the optimizer and scheduler also reference the parameters, so
    # drop them together with the model before emptying the CUDA cache.
    del model, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()

# Step 6: Summary
print("\n--- Cross-Script Accuracy Summary (Roman → Devanagari) ---")
for model, acc in cross_script_results.items():
    print(f"{model}: {acc:.4f}")


In [None]:
# --- Bar chart of the cross-script accuracies (no error bars, custom colour) ---

import matplotlib.pyplot as plt

# Bar labels and heights, in the key order of cross_script_results
model_names = list(cross_script_results)
accuracies = list(cross_script_results.values())

# One bar per model
fig, ax = plt.subplots(figsize=(10, 5))
bars = ax.bar(model_names, accuracies, color='skyblue', edgecolor='black')

# Print each accuracy slightly above its bar, centred horizontally
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    center_x = bar.get_x() + bar.get_width() / 2
    ax.text(center_x, height + 0.015, f"{acc:.2f}", ha='center', fontsize=11, weight='bold')

# Title, axis range and grid
ax.set_title("Cross-Script Accuracy: Roman → Devanagari", fontsize=14)
ax.set_ylabel("Accuracy")
ax.set_ylim(0, 1.05)
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()


## 13. **Cross-Script Evaluation: Train on Devanagari, Test on Roman**

In [None]:

def _freeze_backbone(model, freeze_embeddings, freeze_layers):
    """Turn off gradients for the backbone's embeddings and/or encoder.

    Args:
        model: Hugging Face sequence-classification model.
        freeze_embeddings: When true, freeze ``model.base_model.embeddings``.
        freeze_layers: When true, freeze ``model.base_model.encoder``.
    """
    # `base_model` resolves to the wrapped transformer whatever attribute name
    # the architecture uses. Probing only `bert` / `roberta` / `xlm_roberta`
    # silently froze nothing for any other backbone (e.g. ALBERT-style models,
    # which ai4bharat/indic-bert appears to be -- confirm).
    # NOTE(review): if the tuning code used that probing too, the tuned freeze
    # flags were effectively no-ops for such models -- confirm before comparing.
    backbone = model.base_model
    requested = []
    if freeze_embeddings:
        requested.append("embeddings")
    if freeze_layers:
        requested.append("encoder")

    for part_name in requested:
        part = getattr(backbone, part_name, None)
        if part is None:
            # Make an unsupported architecture visible instead of skipping silently.
            print(f"Warning: {type(model).__name__} has no '{part_name}' module; nothing frozen for it.")
            continue
        for param in part.parameters():
            param.requires_grad = False


# Defined here as well so the cell does not depend on another cell having run.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 1: Reload dataset and preprocess Roman
df = pd.read_csv("Dataset.csv")
df.columns = ["text", "sentiment"]
df["cleaned_text"] = df["text"].apply(preprocess_text)

# Encode labels
label_map = {"Positive": 1, "Negative": 0}
df["labels"] = df["sentiment"].map(label_map)

# Step 2: Split dataset BEFORE transliteration to avoid leakage
from sklearn.model_selection import train_test_split

X_train_rom, X_test_rom, y_train, y_test = train_test_split(
    df["cleaned_text"], df["labels"], test_size=0.2, random_state=42, stratify=df["labels"]
)

# Step 3: Transliterate ONLY the train data to Devanagari
X_test_rom = X_test_rom.reset_index(drop=True)
X_train_dev = X_train_rom.apply(transliterate_roman_to_devanagari).reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Step 4: Define model names from Hugging Face
MODELS = {
    "MuRIL": "google/muril-base-cased",
    "IndicBERTv1": "ai4bharat/indic-bert",
    "XLM-RoBERTa": "xlm-roberta-base",
    "mBERT": "bert-base-multilingual-cased",
}

# Test accuracy per model for this train/test script direction
cross_script_results = {}

# Step 5: Fine-tune each model on Devanagari text and evaluate it on Roman text
for model_name, model_path in MODELS.items():
    print(f"\n--- Training on Devanagari | Testing on Roman — {model_name} ---")

    if model_name not in best_hyperparameters:
        print(f"Skipping {model_name}: No best hyperparameters found.")
        continue

    # Load best hyperparameters
    params = best_hyperparameters[model_name]
    epochs = params["epochs"]
    batch_size = params["batch_size"]
    learning_rate = params["learning_rate"]
    freeze_layers = params.get("freeze_layers", False)
    freeze_embeddings = params.get("freeze_embeddings", False)

    # Print hyperparameters used
    print("Best Hyperparameters:")
    print(f"  Epochs: {epochs}")
    print(f"  Batch Size: {batch_size}")
    print(f"  Learning Rate: {learning_rate}")
    print(f"  Freeze Layers: {freeze_layers}")
    print(f"  Freeze Embeddings: {freeze_embeddings}")

    # Load fresh tokenizer and model from Hugging Face
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=2)

    # Freeze layers if required
    _freeze_backbone(model, freeze_embeddings, freeze_layers)

    model.to(device)

    # Prepare datasets and loaders
    train_dataset = SentimentDataset(X_train_dev, y_train, tokenizer)
    test_dataset = SentimentDataset(X_test_rom, y_test, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # Only parameters that still require gradients are optimised; the learning
    # rate decays linearly to zero over all training steps, with no warmup.
    optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=learning_rate)
    scheduler = get_linear_schedule_with_warmup(optimizer, 0, len(train_loader) * epochs)

    # Training loop
    print(f"\n[Training {model_name}]")
    for epoch in range(epochs):
        model.train()
        total_loss, correct, total = 0, 0, 0
        for batch in train_loader:
            # Everything except the label is forwarded to the model as input.
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            loss.backward()
            # Cap the gradient norm at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        train_acc = correct / total
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f} - Accuracy: {train_acc:.4f}")

    # Evaluation loop (the test split in this direction is the Roman text)
    print(f"\n[Testing {model_name} on Roman]")
    model.eval()
    correct, total = 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in test_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != "label"}
            labels = batch["label"].to(device)

            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    acc = correct / total
    cross_script_results[model_name] = acc

    print(f"\n{model_name} Test Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(classification_report(all_labels, all_preds, target_names=["Negative", "Positive"]))

    # Clean up: the optimizer and scheduler also reference the parameters, so
    # drop them together with the model before emptying the CUDA cache.
    del model, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()

# Step 6: Summary
print("\n--- Cross-Script Accuracy Summary (Devanagari → Roman ) ---")
for model, acc in cross_script_results.items():
    print(f"{model}: {acc:.4f}")


In [None]:
# --- Bar chart of Devanagari → Roman accuracy (Train: Devanagari, Test: Roman) ---

import matplotlib.pyplot as plt

# Bar labels and heights, in the key order of cross_script_results
model_names = list(cross_script_results)
accuracies = list(cross_script_results.values())

# One bar per model
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(model_names, accuracies, color='steelblue', edgecolor='black')

# Print each accuracy slightly above its bar, centred horizontally
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    center_x = bar.get_x() + bar.get_width() / 2
    ax.text(center_x, height + 0.015, f"{acc:.2f}", ha='center', fontsize=11, weight='bold')

# Title, axis range and grid
ax.set_title("Cross-Script Accuracy: Devanagari → Roman ", fontsize=14)
ax.set_ylabel("Accuracy")
ax.set_ylim(0, 1.05)
ax.grid(axis='y', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()
