# In [1]:
import os
import json
import torch
import matplotlib.pyplot as plt
from datasets import load_dataset, DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np
from IPython.display import clear_output
import pandas as pd

# Per-run overrides. The training loop copies BASE_CONFIG and applies one of
# these dicts on top of it for every run.
CONFIGS = [
    dict(mode="baseline", use_scheduler=False, early_stopping=False),
    dict(mode="improved", use_scheduler=True, early_stopping=True),
    dict(mode="extra_tuned", use_scheduler=True, early_stopping=True, lr=3e-5, batch_size=32),
]

# Defaults shared by every run; any key may be overridden by a CONFIGS entry.
BASE_CONFIG = dict(
    model_name="bert-base-multilingual-cased",
    model_cache="./hf_models_cache",
    dataset_cache="./hf_datasets_cache",
    output_dir="./outputs",
    selected_languages=["en-US", "hi-IN", "es-ES", "fr-FR"],
    text_col="utt",
    label_col="intent",
    sample_frac=0.0001,
    batch_size=64,
    lr=5e-5,
    epochs=5,
    early_stopping_patience=2,
    log_interval=1,
)

# Make sure the output directory exists before anything tries to write to it.
os.makedirs(BASE_CONFIG["output_dir"], exist_ok=True)

def save_label_info(label_list, save_dir, prefix=None, also_save_to_model_dir=False):
    """
    Saves label list and index-to-label mapping as JSON files.

    Args:
        label_list (iterable): Ordered intent labels; position i is the label for index i.
        save_dir (str): Directory to write into; created if it does not exist.
        prefix (str or None): Prefix for filenames (e.g., "full_massive", "filtered", "baseline").
                              If None, files will be named just 'label_list.json' and 'label_map.json'.
        also_save_to_model_dir (bool): If True, also writes an un-prefixed 'label_list.json'
                              into save_dir for inference use.
    """
    # Materialise once: json.dump cannot serialise generators/iterators, and the
    # labels are needed several times below.
    label_list = list(label_list)
    os.makedirs(save_dir, exist_ok=True)

    # File name logic
    suffix = f"_{prefix}" if prefix else ""
    list_path = os.path.join(save_dir, f"label_list{suffix}.json")
    map_path = os.path.join(save_dir, f"label_map{suffix}.json")
    model_list_path = os.path.join(save_dir, "label_list.json")

    # JSON object keys must be strings, hence str(i).
    label_map = {str(i): label for i, label in enumerate(label_list)}

    payloads = {list_path: label_list, map_path: label_map}
    if also_save_to_model_dir:
        # Without a prefix this is the same path as list_path; setdefault keeps
        # the file from being written twice.
        payloads.setdefault(model_list_path, label_list)

    for path, payload in payloads.items():
        with open(path, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

    # ✅ Log first few mappings for sanity check
    print(f"\n📦 Saved {len(label_list)} labels to '{save_dir}':")
    print(f"  📄 Label list file → {list_path}")
    print(f"  📄 Index→label map → {map_path}")
    if also_save_to_model_dir:
        print(f"  🧭 Also saved 'label_list.json' for model loading → {model_list_path}")
    print("🔍 Sample mappings:")
    for i, label in enumerate(label_list[:5]):
        print(f"   {i}: {label}")


print("🔁 Loading and preprocessing dataset...")
tokenizer = BertTokenizer.from_pretrained(BASE_CONFIG["model_name"], cache_dir=BASE_CONFIG["model_cache"])
raw_dataset = load_dataset("AmazonScience/massive", "all_1.1", cache_dir=BASE_CONFIG["dataset_cache"])

# The label column exposes its class names through `.names` (the same attribute
# is read again after tokenization); position i is the label for index i.
full_label_list = raw_dataset["train"].features[BASE_CONFIG["label_col"]].names
save_label_info(full_label_list, BASE_CONFIG["output_dir"], prefix="full_massive")

# Build the lookup set once instead of scanning the list for every row.
selected_locales = set(BASE_CONFIG["selected_languages"])
dataset = DatasetDict({
    split: raw_dataset[split].filter(lambda x: x["locale"] in selected_locales)
    for split in ["train", "validation", "test"]
})

# Read the names from the filtered split's own feature schema so the saved
# index→label map stays aligned with the integer ids stored in the data.
# NOTE(review): filter() is expected to drop rows only and leave the schema
# unchanged, in which case this equals full_label_list — confirm.
filtered_label_list = dataset["train"].features[BASE_CONFIG["label_col"]].names
save_label_info(filtered_label_list, BASE_CONFIG["output_dir"], prefix="filtered")


# Optional down-sampling: keep a fixed-seed random fraction of every split,
# but never fewer than one example.
if BASE_CONFIG["sample_frac"] < 1.0:
    dataset = DatasetDict({
        k: v.shuffle(seed=42).select(range(max(1, int(len(v) * BASE_CONFIG["sample_frac"]))))
        for k, v in dataset.items()
    })

def tokenize(example):
    """Tokenize the text column of an example (or a batch of examples).

    Args:
        example: Mapping that contains the BASE_CONFIG["text_col"] field; with
            ``map(..., batched=True)`` that field is a list of strings.

    Returns:
        The tokenizer output for the text, truncated but not padded.
    """
    # No padding here: the DataLoaders use DataCollatorWithPadding, which pads
    # every mini-batch to its own longest sequence. Padding at map time would
    # only store and process extra pad tokens.
    return tokenizer(example[BASE_CONFIG["text_col"]], truncation=True)

# Tokenize every split, then rename the label column to "labels" — the key the
# training loop reads back from each batch.
tokenized_dataset = dataset.map(tokenize, batched=True).rename_column(
    BASE_CONFIG["label_col"], "labels"
)
# Expose only the three model inputs, as torch tensors.
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Class names in index order; their count sizes the classification head.
label_list = tokenized_dataset["train"].features["labels"].names
num_labels = len(label_list)

# Per-mode results, filled in by the training loop.
final_loss_logs, final_metrics = {}, {}

# Fine-tune one fresh model per entry in CONFIGS and record its training curve,
# per-epoch validation metrics and final weights.
for config in CONFIGS:
    # Shared defaults first, then this run's overrides.
    CONFIG = BASE_CONFIG.copy()
    CONFIG.update(config)
    print(f"\n\n🔄 Starting mode: {CONFIG['mode']}")

    data_collator = DataCollatorWithPadding(tokenizer)
    train_dataloader = DataLoader(tokenized_dataset["train"], shuffle=True, batch_size=CONFIG["batch_size"], collate_fn=data_collator)
    val_dataloader = DataLoader(tokenized_dataset["validation"], batch_size=CONFIG["batch_size"], collate_fn=data_collator)

    model = BertForSequenceClassification.from_pretrained(CONFIG["model_name"], cache_dir=CONFIG["model_cache"], num_labels=num_labels)
    optimizer = AdamW(model.parameters(), lr=CONFIG["lr"])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Honour "use_scheduler": decay the learning rate linearly to zero over the
    # planned number of optimizer steps. Without the flag the rate is constant.
    total_training_steps = CONFIG["epochs"] * len(train_dataloader)
    scheduler = None
    if CONFIG.get("use_scheduler", False) and total_training_steps > 0:
        scheduler = torch.optim.lr_scheduler.LinearLR(
            optimizer, start_factor=1.0, end_factor=0.0, total_iters=total_training_steps
        )

    # Honour "early_stopping": stop once validation loss has failed to improve
    # for `early_stopping_patience` consecutive epochs.
    use_early_stopping = bool(CONFIG.get("early_stopping", False))
    patience = CONFIG["early_stopping_patience"]

    steps, losses, cum_avgs = [], [], []
    global_step = 0
    running_loss = 0.0  # sum of all step losses so far, for the cumulative average
    best_val_loss = float("inf")
    patience_counter = 0
    loss_log = []

    for epoch in range(CONFIG["epochs"]):
        model.train()
        total_train_loss = 0

        for step, batch in enumerate(tqdm(train_dataloader, desc=f"Training [{CONFIG['mode']}]", disable=True)):
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            # Read the scalar once and reuse it below.
            loss_value = loss.item()
            total_train_loss += loss_value
            running_loss += loss_value
            global_step += 1
            steps.append(global_step)
            losses.append(loss_value)
            cum_avgs.append(running_loss / global_step)

            # Skip the live plot on the very last planned step; the final
            # per-mode plots are drawn after all runs finish.
            is_last = (epoch == CONFIG["epochs"] - 1 and step == len(train_dataloader) - 1)
            if (step + 1) % CONFIG["log_interval"] == 0 and not is_last:
                clear_output(wait=True)
                plt.figure(figsize=(10, 4))
                plt.plot(steps, losses, label='Step-wise Loss', color='blue')
                plt.plot(steps, cum_avgs, label='Cumulative Avg Loss', color='orange', linestyle='--')
                plt.title(f"Training Loss - {CONFIG['mode']}")
                plt.xlabel("Global Step")
                plt.ylabel("Loss")
                plt.grid(True)
                plt.legend()

                cumulative_loss = cum_avgs[-1]

                plt.text(0.01, 0.95,
                         f"Epoch {epoch+1}/{CONFIG['epochs']} | Step {step+1}/{len(train_dataloader)}\n"
                         f"Loss: {loss_value:.4f} | Cumulative Avg: {cumulative_loss:.4f}",
                         transform=plt.gca().transAxes, fontsize=10, verticalalignment='top',
                         bbox=dict(boxstyle='round,pad=0.3', facecolor='lightyellow'))

                plt.tight_layout()
                plt.show()

        avg_train_loss = total_train_loss / len(train_dataloader)

        # Validation pass: no gradients, collect predictions for the metrics.
        model.eval()
        total_val_loss, all_preds, all_labels = 0, [], []
        with torch.no_grad():
            for batch in val_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                total_val_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(batch["labels"].cpu().numpy())

        avg_val_loss = total_val_loss / len(val_dataloader)
        val_acc = accuracy_score(all_labels, all_preds)
        val_prec = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
        val_rec = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
        val_f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

        loss_log.append({
            "epoch": epoch + 1,
            "train_loss": avg_train_loss,
            "val_loss": avg_val_loss,
            "val_accuracy": val_acc,
            "val_precision": val_prec,
            "val_recall": val_rec,
            "val_f1": val_f1
        })

        if use_early_stopping:
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    # Weights are left as they are after this epoch, so the
                    # last loss_log entry describes the model saved below.
                    print(f"⏹️ Early stopping [{CONFIG['mode']}] after epoch {epoch + 1}: "
                          f"no val-loss improvement for {patience_counter} epoch(s)")
                    break

    # Save logs
    final_loss_logs[CONFIG["mode"]] = {"steps": steps, "losses": losses, "cumulative": cum_avgs}
    final_metrics[CONFIG["mode"]] = loss_log
    with open(os.path.join(CONFIG["output_dir"], f"training_log_{CONFIG['mode']}.json"), "w") as f:
        json.dump(loss_log, f, indent=2)

    # ✅ Save final model after training, with tokenizer and label list alongside it
    model_path = os.path.join(CONFIG["output_dir"], f"mbert_massive_finetuned_{CONFIG['mode']}")
    os.makedirs(model_path, exist_ok=True)
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)
    save_label_info(label_list, model_path, also_save_to_model_dir=True)


    print(f"\n✅ Final fine-tuned model saved for mode '{CONFIG['mode']}'")
    print("📁 Saved to:", model_path)
    print("📄 Files inside folder:", os.listdir(model_path))

# Replace the last live plot with one saved loss figure per mode.
clear_output(wait=True)
print("\n📈 Final Training Loss Graphs")
for run_name, curve in final_loss_logs.items():
    x_steps = curve["steps"]
    save_path = os.path.join(BASE_CONFIG["output_dir"], f"loss_plot_{run_name}.png")

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(x_steps, curve["losses"], label="Step-wise Loss")
    ax.plot(x_steps, curve["cumulative"], label="Cumulative Avg Loss", linestyle="--")
    ax.set_title(f"Final Training Loss - {run_name}")
    ax.set_xlabel("Global Step")
    ax.set_ylabel("Loss")
    ax.grid(True)
    ax.legend()
    fig.tight_layout()

    # Write the PNG before showing the figure.
    fig.savefig(save_path)
    print(f"📸 Saved: {save_path}")
    plt.show()

def extract_metrics(log):
    """Pivot per-epoch records into one list per metric.

    Args:
        log: Sequence of dicts, one per epoch, each holding the keys
            "epoch", "train_loss", "val_loss", "val_accuracy",
            "val_precision", "val_recall" and "val_f1".

    Returns:
        Dict mapping each of those keys to the list of its values in epoch
        order. Other keys in the records are ignored.

    Raises:
        KeyError: If a record is missing one of the keys.
    """
    metric_names = (
        "epoch",
        "train_loss",
        "val_loss",
        "val_accuracy",
        "val_precision",
        "val_recall",
        "val_f1",
    )
    return {name: [record[name] for record in log] for name in metric_names}

# Per-mode metric series, in the order the modes are declared in CONFIGS.
modes = [cfg["mode"] for cfg in CONFIGS]
logs = {mode: extract_metrics(final_metrics[mode]) for mode in modes}

# 3x2 grid: one panel per metric, one line per mode.
fig, axes = plt.subplots(3, 2, figsize=(14, 14))
fig.suptitle("Training Metrics Across All Configs", fontsize=18)

metric_keys = [
    ("train_loss", "Training Loss"),
    ("val_loss", "Validation Loss"),
    ("val_accuracy", "Validation Accuracy"),
    ("val_precision", "Validation Precision"),
    ("val_recall", "Validation Recall"),
    ("val_f1", "Validation F1-Score"),
]

for ax, (key, title) in zip(axes.flat, metric_keys):
    for mode in modes:
        ax.plot(logs[mode]["epoch"], logs[mode][key], label=mode, marker='o')
    ax.set_title(title)
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Score / Loss")
    ax.grid(True)
    ax.legend()

# Leave the top 4% of the figure free for the suptitle.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

# One row per mode, taken from the last logged epoch of each run.
summary_rows = []
for mode, log in final_metrics.items():
    last_epoch = log[-1]
    summary_rows.append({
        "Mode": mode,
        "Train Loss": f"{last_epoch['train_loss']:.4f}",
        "Val Loss": f"{last_epoch['val_loss']:.4f}",
        "Val Acc": f"{last_epoch['val_accuracy']:.4f}",
        "Val Prec": f"{last_epoch['val_precision']:.4f}",
        "Val Recall": f"{last_epoch['val_recall']:.4f}",
        "Val F1": f"{last_epoch['val_f1']:.4f}"
    })

# `display` is only predefined inside an IPython/Jupyter session; import it
# explicitly so this also runs as a plain script.
from IPython.display import display

summary_df = pd.DataFrame(summary_rows)
print("\n📊 Final Metrics Comparison Table:")
display(summary_df)

import shutil
import zipfile

# Paths
src_dir = BASE_CONFIG["output_dir"]
final_dir = "./final_package"
zip_name = "mbert_final_package.zip"
# Write the archive to /kaggle/working when that directory exists; otherwise
# fall back to the current working directory so the script also runs elsewhere.
zip_root = "/kaggle/working" if os.path.isdir("/kaggle/working") else os.getcwd()
zip_path = os.path.join(zip_root, zip_name)

# Start from a clean packaging folder; copytree creates final_dir itself.
if os.path.exists(final_dir):
    shutil.rmtree(final_dir)

# Copy rather than move: the inference section further down still loads the
# fine-tuned model from the outputs directory.
shutil.copytree(src_dir, final_dir)

print(f"\n📦 Copied all contents from {src_dir} → {final_dir}")

# Zip the folder. make_archive appends ".zip" itself, so strip only the
# trailing extension (str.replace would hit every ".zip" in the path).
archive_base = os.path.splitext(zip_path)[0]
shutil.make_archive(base_name=archive_base, format="zip", root_dir=final_dir)
print(f"\n✅ Zipped folder created at: {zip_path}")
print(f"📥 You can now download it from `{zip_root}`.")

#################################################################
#  
#################################################################

import os

# Where fine-tuned models were written and where the pretrained download is cached.
output_dir = BASE_CONFIG["output_dir"]
model_cache_dir = BASE_CONFIG["model_cache"]

print("\n📁 Checking saved fine-tuned models...")
for config in CONFIGS:
    mode = config["mode"]
    model_folder = os.path.join(output_dir, f"mbert_massive_finetuned_{mode}")

    # Report a missing folder and move on to the next mode.
    if not os.path.exists(model_folder):
        print(f"\n❌ Model folder for mode '{mode}' not found at:\n→ {model_folder}")
        continue

    print(f"\n✅ Fine-tuned model for mode '{mode}' is saved at:\n→ {model_folder}")
    print("📄 Files inside:")
    for entry in os.listdir(model_folder):
        print("  -", entry)

# ✅ Also show pretrained model cache if available
pretrained_model_folder = os.path.join(model_cache_dir, "models--bert-base-multilingual-cased")

if not os.path.exists(pretrained_model_folder):
    print(f"\n❌ Pretrained model not found in cache dir: {pretrained_model_folder}")
else:
    print(f"\n📦 Pretrained model is cached at:\n→ {pretrained_model_folder}")

    print("\n📄 Listing files inside pretrained model folder:")
    # Walk the whole cache tree and print each file relative to its root.
    for current_dir, _subdirs, filenames in os.walk(pretrained_model_folder):
        for filename in filenames:
            absolute = os.path.join(current_dir, filename)
            print("  -", os.path.relpath(absolute, pretrained_model_folder))

#################################################################
#  
#################################################################

import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset

# ✅ Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🖥️  Using device: {device}")

# ✅ Choose model to test
available_modes = ["baseline", "improved", "extra_tuned"]
mode_to_test = "improved"  # 👈 Change this as needed

if mode_to_test not in available_modes:
    raise ValueError(f"Invalid mode '{mode_to_test}'. Choose from: {available_modes}")

# ✅ Define paths
model_dir = f"./outputs/mbert_massive_finetuned_{mode_to_test}"
print(f"\n📁 Loading model and tokenizer from: {model_dir}")

if not os.path.exists(model_dir):
    raise FileNotFoundError(f"❌ Model directory not found: {model_dir}")

# ✅ Load tokenizer and model (local only)
tokenizer = BertTokenizer.from_pretrained(model_dir, local_files_only=True)
model = BertForSequenceClassification.from_pretrained(model_dir, local_files_only=True)
model.to(device)
print("✅ Model and tokenizer loaded successfully.")

# ✅ Load label names (intent classes)
# Training writes 'label_list.json' next to the model weights. Prefer it: it is
# the label order the model was trained with and needs no dataset download.
# Fall back to the dataset's own label names only when the file is missing.
import json

label_file = os.path.join(model_dir, "label_list.json")
if os.path.isfile(label_file):
    print(f"🔍 Loading intent label classes from: {label_file}")
    with open(label_file, encoding="utf-8") as f:
        label_names = json.load(f)
else:
    print("🔍 Loading intent label classes from MASSIVE dataset...")
    label_names = load_dataset("AmazonScience/massive", "all_1.1")["train"].features["intent"].names
class_names = label_names

# A label list of the wrong length would silently mislabel predictions.
if len(class_names) != model.config.num_labels:
    raise ValueError(
        f"Label list has {len(class_names)} entries but the model predicts "
        f"{model.config.num_labels} classes."
    )
print(f"✅ Loaded {len(class_names)} intent classes.")

# ✅ Define prediction function
def predict_intent(text, model, tokenizer, class_names):
    """Predict the intent label of a single utterance.

    Args:
        text: One input string. The result is reduced with ``.item()``, so a
            batch of several strings is not supported.
        model: Classifier whose output exposes ``.logits``; switched to eval mode.
        tokenizer: Callable returning model inputs that support ``.to(device)``.
        class_names: Sequence mapping a class index to its label.

    Returns:
        Tuple ``(label, confidence)``: the name of the highest-scoring class
        and its softmax probability as a float.
    """
    model.eval()
    # Send inputs to the device the model lives on instead of relying on a
    # module-level `device` variable that may be absent or out of sync.
    model_device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model_device)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probs, dim=1).item()
        confidence = probs[0, predicted_class].item()
    return class_names[predicted_class], confidence

# ✅ Test sentences: one group per scenario, the same request in four languages.
sentences = {
    "greeting": {
        "English": "How are you?",
        "Spanish": "¿Cómo estás?",
        "French": "Comment ça va?",
        "Hindi": "आप कैसे हैं?",
    },
    "weather_query": {
        "English": "What's the weather like today?",
        "Spanish": "¿Cómo está el clima hoy?",
        "French": "Quel temps fait-il aujourd'hui?",
        "Hindi": "आज मौसम कैसा है?",
    },
    "alarm_set": {
        "English": "Set an alarm for 7 AM.",
        "Spanish": "Configura una alarma para las 7 AM.",
        "French": "Mets une alarme à 7 heures.",
        "Hindi": "सुबह 7 बजे के लिए अलार्म सेट करें।",
    },
}

# ✅ Run predictions: print the predicted label and confidence per sentence.
print(f"\n🔮 Running predictions using model: '{mode_to_test}'")
for scenario, translations in sentences.items():
    heading = scenario.upper()
    print(f"\n=== 🌍 Intent: {heading} ===")
    for language, utterance in translations.items():
        prediction = predict_intent(utterance, model, tokenizer, class_names)
        predicted_label, score = prediction
        print(f"[{language}] → \"{utterance}\"")
        print(f"  ↪ Predicted: {predicted_label} (Confidence: {score:.2f})")


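# Notebook output, apparently from an earlier run (the CONFIGS literal above is balanced as written):
# SyntaxError: closing parenthesis ']' does not match opening parenthesis '{' on line 16 (2274657562.py, line 19)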