In [1]:
# Cell 1 — Load config and core libs
%run ./00_config.ipynb

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Echo the configured model name and max_len, then which backend torch reports.
print("Model:", cfg.train.model_name)
print("Max length:", cfg.train.max_len)
if torch.cuda.is_available():
    print("Device:", "CUDA")
else:
    print("Device:", "CPU")


Python version: 3.13.7 (tags/v3.13.7:bcee1c3, Aug 14 2025, 14:15:11) [MSC v.1944 64 bit (AMD64)]
CUDA available: False
Running on CPU
../../data/train_data.csv
microsoft/mdeberta-v3-base
‚úÖ Config loaded and random seed set to: 42
üìÇ Model directory: ../models/best
üìÇ Reports directory: ../reports
‚úÖ Folder setup complete.
‚úÖ Found: ..\..\data\train_data.csv
‚úÖ Found: ..\..\data\test_data.csv

All required data files are present and accessible.
‚úÖ Configuration snapshot saved at:
../reports\config_snapshot.json


  from .autonotebook import tqdm as notebook_tqdm


Model: microsoft/mdeberta-v3-base
Max length: 256
Device: CPU


In [2]:
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1) Load tokenizer
def get_tokenizer(cfg=None):
    """
    Load the tokenizer for the checkpoint named in ``cfg.train.model_name``.

    Args:
        cfg: Config object exposing ``train.model_name``. When falsy
            (e.g. ``None``), ``default_cfg()`` is called to obtain one
            (``default_cfg`` is defined outside this cell).

    Returns:
        The result of ``AutoTokenizer.from_pretrained`` for that checkpoint.
    """
    if not cfg:
        cfg = default_cfg()
    checkpoint = cfg.train.model_name
    return AutoTokenizer.from_pretrained(checkpoint)

# 2) Load model (multi-label classification)
def get_model(cfg=None, num_labels=6):
    """
    Load a sequence-classification model configured for multi-label output.

    Args:
        cfg: Config object exposing ``train.model_name``. When falsy
            (e.g. ``None``), ``default_cfg()`` is called to obtain one
            (``default_cfg`` is defined outside this cell).
        num_labels: Number of output labels passed to ``from_pretrained``.

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``
        with ``problem_type="multi_label_classification"``.
    """
    if not cfg:
        cfg = default_cfg()
    checkpoint = cfg.train.model_name
    return AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=num_labels,
        problem_type="multi_label_classification",
    )

# 3) Define loss function
def get_loss(cfg, label_freq: np.ndarray):
    """
    Build a BCE-with-logits loss callable, optionally weighted per label.

    If ``cfg.train.class_weighting == 'auto'``, each label's weight is
    ``1 / (freq + 1e-6)``, rescaled so the weights sum to the number of
    labels. Otherwise every label gets weight 1 (plain BCE with logits).

    Args:
        cfg: Config object; only ``cfg.train.class_weighting`` is read.
        label_freq: Per-label positive frequency, one entry per label.
            Any 1-D array-like is accepted (ndarray, list, pandas Series);
            it is converted with ``np.asarray`` before use.

    Returns:
        A function ``loss_fn(logits, targets)`` that returns
        ``binary_cross_entropy_with_logits`` with the per-label ``weight``
        (default mean reduction).

    Note:
        A label whose frequency is 0 gets a raw weight of about 1e6, which
        dominates the others after rescaling. Make sure every label actually
        occurs in the data the frequencies were computed from.
    """
    # Coerce up front so lists / Series behave like the ndarray the math expects.
    label_freq = np.asarray(label_freq, dtype=np.float64)

    if cfg.train.class_weighting == "auto":
        weights = 1.0 / (label_freq + 1e-6)
        weights = weights / weights.sum() * len(label_freq)
        weights = torch.tensor(weights, dtype=torch.float)
    else:
        weights = torch.ones(len(label_freq), dtype=torch.float)

    def loss_fn(logits, targets):
        # BCE-with-logits raises on integer targets; 0/1 labels often arrive as ints.
        targets = targets.to(dtype=logits.dtype)
        return nn.functional.binary_cross_entropy_with_logits(
            logits, targets, weight=weights.to(logits.device)
        )
    return loss_fn


In [5]:
# Cell 3 — Enable local Hugging Face cache + load model/tokenizer
import os, torch

# ------------------------------------------------------------------
# 1) Set a local cache path (once)
# This will be inside your project folder so you can reuse the model.
# NOTE(review): earlier cells already import `transformers`. Hugging Face
# libraries generally read these variables at import time, so setting them
# here may not change the cache this kernel actually uses — confirm, or move
# this block ahead of the first `transformers` import (e.g. into 00_config).
# The path is resolved relative to the kernel's current working directory.
# ------------------------------------------------------------------
os.environ["HF_HOME"] = os.path.abspath("../../../.cache/huggingface")
os.environ["TRANSFORMERS_CACHE"] = os.path.join(os.environ["HF_HOME"], "models")

print("Using Hugging Face cache at:", os.environ["TRANSFORMERS_CACHE"])

# ------------------------------------------------------------------
# 2) (Optional) install Xet optimization if you want faster downloads
# ------------------------------------------------------------------
# Uncomment if you want to actually install it. `%pip` (not `!pip`) installs
# into the environment of the running kernel:
# %pip install -q "huggingface_hub[hf_xet]"

# ------------------------------------------------------------------
# 3) Load tokenizer and model
# ------------------------------------------------------------------
tokenizer = get_tokenizer(cfg)
model = get_model(cfg, num_labels=len(cfg.labels))

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# ------------------------------------------------------------------
# 4) Show summary info
# ------------------------------------------------------------------
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\n‚úÖ Model initialized: {cfg.train.model_name}")
print(f"Device: {device}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")


Using Hugging Face cache at: c:\Users\alaud\OneDrive\Desktop\iitm-ds-lab-proj\CleanSpeech\.cache\huggingface\models



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



‚úÖ Model initialized: microsoft/mdeberta-v3-base
Device: cpu
Total parameters: 278,813,958
Trainable parameters: 278,813,958


In [4]:
# Cell 4 — Compute label frequencies and instantiate weighted loss

import pandas as pd

def _label_freq_from_train_df(df, labels):
    # frequency = (#positives per label) / (#rows)
    return (df[list(labels)].sum(axis=0) / len(df))

def _label_freq_fallback(cfg, labels):
    # Fallback: compute directly from raw CSV if train_df isn't in this kernel
    train_raw = pd.read_csv(cfg.paths.raw_train)
    # ensure the label columns exist
    present = [c for c in labels if c in train_raw.columns]
    if not present:
        raise ValueError(f"No label columns from {list(labels)} found in {cfg.paths.raw_train}.")
    return (train_raw[present].sum(axis=0) / len(train_raw)).reindex(labels).fillna(0.0)

# Try to use train_df from 01_data; else fallback
# Use train_df when it exists in this kernel; otherwise read the raw CSV.
# The namespace is checked explicitly instead of catching NameError around the
# whole computation, so a NameError raised inside the helper is not mistaken
# for "train_df is missing", and IPython's `_` is left untouched.
if "train_df" in globals():
    label_freq = _label_freq_from_train_df(train_df, cfg.labels)
    print("Using label frequencies from train_df (01_data).")
else:
    label_freq = _label_freq_fallback(cfg, cfg.labels)
    print("Using fallback: label frequencies computed from raw train CSV.")

display(label_freq)

# Build loss
loss_fn = get_loss(cfg, label_freq.values)
print("\n‚úÖ Loss function ready (weighted BCE with logits if class_weighting='auto').")


Using fallback: label frequencies computed from raw train CSV.


toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64


‚úÖ Loss function ready (weighted BCE with logits if class_weighting='auto').
