In [27]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import mlflow
import mlflow.pytorch
from tqdm import tqdm
from pathlib import Path

GPU

In [28]:
# Pick the compute device: CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


Using device: cuda


DATA LOAD

In [29]:
# Load data

# Resolve the project root: when the working directory is named "notebooks",
# the root is its parent; otherwise the working directory itself is used.
cwd = Path.cwd()
base_dir = cwd if cwd.name != "notebooks" else cwd.parent
data_dir = base_dir.joinpath("src", "data", "raw")

# Read the three raw CSV files (same order as the target names on the left).
train, test, test_labels = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in ("train.csv", "test.csv", "test_labels.csv")
)

# Label columns used as classification targets throughout the notebook.
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']



TRAIN TEST SPLIT

In [30]:
# Prepare test
def prepare_test_for_evaluation(test_df, test_labels_df, label_cols=None):
    """Keep only the test rows whose labels can be used for evaluation.

    A row is considered valid when none of its label columns equals -1.

    Args:
        test_df: DataFrame with the test comments.
        test_labels_df: DataFrame holding the label columns for the same rows
            (the boolean mask built from it is applied to both frames).
        label_cols: Label column names to check. Defaults to the notebook-level
            ``labels`` list, which keeps existing two-argument calls working.

    Returns:
        A tuple ``(test_df_valid, test_labels_df_valid)`` of independent copies
        restricted to the valid rows.
    """
    # Only fall back to the global when the caller did not supply the columns.
    cols = labels if label_cols is None else list(label_cols)
    valid_mask = (test_labels_df[cols] != -1).all(axis=1)
    print(f"Test samples: {len(test_df)} total, {valid_mask.sum()} valid for evaluation")
    return test_df[valid_mask].copy(), test_labels_df[valid_mask].copy()


In [31]:
def create_balanced_subset(df, labels, n_per_class=200, random_state=None):
    """Build a more class-balanced subset of ``df`` for fairer evaluation.

    For every label, up to ``n_per_class`` rows where that label equals 1 are
    drawn without replacement; up to ``n_per_class`` "neutral" rows (all label
    columns sum to 0) are then added. Rows are multi-label, so the same row can
    be drawn for several labels; it appears only once in the result, which means
    the final size can be smaller than ``n_per_class * (len(labels) + 1)``.

    Args:
        df: DataFrame containing one column per entry in ``labels``.
        labels: Iterable of label column names.
        n_per_class: Maximum number of rows sampled per label and for neutrals.
        random_state: Optional seed. When given, sampling uses a dedicated
            ``np.random.default_rng(random_state)`` and is reproducible. When
            ``None``, the global ``np.random`` state is used.

    Returns:
        A copy of the selected rows, ordered by their position in ``df``.
    """
    # Keep the legacy global-state behaviour unless a seed is requested.
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    label_cols = list(labels)
    selected = set()
    print(f"Creating balanced subset with {n_per_class} samples per class...")

    for label in label_cols:
        # Positional indices of the rows where this class is present
        class_idxs = np.where(df[label].values == 1)[0]
        available = len(class_idxs)

        if available == 0:
            print(f"  {label}: 0 samples available!")
            continue

        to_sample = min(n_per_class, available)
        chosen = rng.choice(class_idxs, to_sample, replace=False)
        selected.update(chosen.tolist())
        print(f"  {label}: {to_sample}/{available} samples")

    # Add some neutral rows (every label is zero)
    neutral_idxs = np.where(df[label_cols].sum(axis=1) == 0)[0]
    neutral_available = len(neutral_idxs)
    neutral_to_sample = min(n_per_class, neutral_available)

    if neutral_available > 0:
        chosen_neutral = rng.choice(neutral_idxs, neutral_to_sample, replace=False)
        selected.update(chosen_neutral.tolist())
        print(f"  neutral: {neutral_to_sample}/{neutral_available} samples")

    # Sort the positions so the row order does not depend on set iteration order.
    ordered_positions = sorted(selected)
    balanced_df = df.iloc[ordered_positions].copy()
    print(f"Total balanced samples: {len(balanced_df)}")
    return balanced_df

def calculate_class_weights(train_df, labels):
    """Compute one positive-class weight per label to counter class imbalance.

    For each label column the weight is
    ``(len(train_df) - column_sum) / (column_sum + 1e-6)``; the small constant
    keeps the division defined when a column sums to zero. The per-label count,
    share of rows and weight are printed.

    Args:
        train_df: DataFrame containing the label columns.
        labels: Label column names.

    Returns:
        The weights as a NumPy array, in the order of ``labels``.
    """
    n_rows = len(train_df)
    positives = train_df[labels].sum(axis=0)
    negatives = n_rows - positives
    pos_weights = negatives / (positives + 1e-6)

    print("Class distribution and weights:")
    report_rows = zip(labels, positives.tolist(), pos_weights.tolist())
    for label, count, weight in report_rows:
        percentage = (count / n_rows) * 100
        print(f"  {label}: {count} samples ({percentage:.2f}%) -> weight: {weight:.2f}")

    return pos_weights.values

In [32]:
def evaluate_neutral_performance(y_true, y_pred_probs, threshold=0.5):
    """Measure how well neutral comments are recognised as neutral.

    A sample is "neutral" when all of its true labels are 0. It is counted as
    correct when no predicted probability is strictly greater than ``threshold``.

    Args:
        y_true: 2-D array-like of true labels (rows = samples, columns = labels).
        y_pred_probs: 2-D array-like of predicted probabilities, same layout.
        threshold: Probability above which a label counts as predicted.

    Returns:
        A dict with the keys ``neutral_total``, ``neutral_correct``,
        ``neutral_fp``, ``neutral_accuracy`` and ``neutral_fp_rate``. The same
        keys are returned (all zero) when there are no neutral samples, so
        callers can log or tabulate the result without special-casing.
    """
    y_true = np.asarray(y_true)
    # Convert probabilities into binary predictions
    y_pred_bin = (np.asarray(y_pred_probs) > threshold).astype(int)

    # Identify neutral comments (every label == 0)
    neutral_mask = (y_true.sum(axis=1) == 0)
    neutral_total = int(neutral_mask.sum())

    if neutral_total == 0:
        return {
            'neutral_total': 0,
            'neutral_correct': 0,
            'neutral_fp': 0,
            'neutral_accuracy': 0.0,
            'neutral_fp_rate': 0.0
        }

    # How many neutral comments were also predicted as neutral?
    neutral_pred_mask = (y_pred_bin[neutral_mask].sum(axis=1) == 0)
    neutral_correct = int(neutral_pred_mask.sum())
    neutral_fp = neutral_total - neutral_correct

    return {
        'neutral_total': neutral_total,
        'neutral_correct': neutral_correct,
        'neutral_fp': neutral_fp,
        'neutral_accuracy': float(neutral_correct / neutral_total),
        'neutral_fp_rate': float(neutral_fp / neutral_total)
    }



In [None]:
# Seed shared by the train/validation split and the balanced-subset sampling.
SEED = 42

# Drop the test rows that cannot be scored (any label equal to -1).
test_eval, test_labels_eval = prepare_test_for_evaluation(test, test_labels)

# Split training data
X = train['comment_text'].values
y = train[labels].values

# Stratify on the first label column only (y[:, 0]).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=SEED, stratify=y[:, 0]
)

print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test evaluation size: {len(test_eval)}")

# Compute the class-imbalance weights from the training split only, so that the
# validation rows do not influence anything derived from the training data.
train_labels_df = pd.DataFrame(y_train, columns=labels)
class_weights = calculate_class_weights(train_labels_df, labels)
pos_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# create_balanced_subset draws from NumPy's global random state; seed it so the
# balanced subsets are the same on every "Restart & Run All".
np.random.seed(SEED)

# Build balanced datasets for evaluation
print("Creating balanced validation set")
val_df_temp = pd.DataFrame({'comment_text': X_val})
val_df_temp[labels] = y_val
balanced_val_df = create_balanced_subset(val_df_temp, labels, n_per_class=300)

print("Creating balanced test set")
test_df_temp = test_eval.copy()
test_df_temp[labels] = test_labels_eval[labels].values
balanced_test_df = create_balanced_subset(test_df_temp, labels, n_per_class=200)

SyntaxError: invalid syntax (4050372971.py, line 28)

MODELS TO TEST

In [None]:
# Define models to test
# Each entry pairs a Hugging Face checkpoint name with its batch size; every
# model shares the same maximum sequence length of 128 tokens.
MODELS_TO_TEST = [
    {'name': checkpoint, 'batch_size': per_batch, 'max_length': 128}
    for checkpoint, per_batch in (
        ('distilbert-base-uncased', 32),
        ('bert-base-uncased', 16),
        ('roberta-base', 16),
        ('microsoft/deberta-v3-base', 8),
        ('unitary/toxic-bert', 16),
    )
]

EVALUATIONS

In [None]:
def evaluate_model(model_name, eval_df, max_length=128, batch_size=16):
    """Run a Hugging Face sequence-classification checkpoint on ``eval_df`` and score it.

    The checkpoint is loaded with ``len(labels)`` outputs in multi-label mode,
    applied in batches without gradient tracking, and its sigmoid outputs are
    scored with a per-label ROC AUC plus the neutral-comment metrics.

    NOTE(review): ``ignore_mismatched_sizes=True`` lets checkpoints whose head
    does not match be loaded anyway; the captured run logs report such heads as
    "newly initialized", so scores for those models are not meaningful until
    they are fine-tuned. The columns of the logits are also assumed to follow
    the order of ``labels`` — confirm for fine-tuned checkpoints.

    Args:
        model_name: Hugging Face model identifier.
        eval_df: DataFrame with a ``comment_text`` column and the label columns.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of texts per forward pass (default 16, the value
            that used to be hard-coded).

    Returns:
        ``(mean_auc, label_aucs, neutral_metrics, predictions)``.
        ``label_aucs`` holds 0.0 for labels with a single class in ``eval_df``;
        ``mean_auc`` averages only the labels that could be scored (0.0 when
        none could). If anything raises, the error is printed and
        ``(0.0, [0.0] * len(labels), {}, None)`` is returned.
    """
    model = None
    try:
        print(f"Evaluating {model_name} on balanced validation set...")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # Padding needs a pad token; reuse the end-of-sequence token when missing.
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(labels),
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True
        ).to(device)

        model.eval()
        all_predictions = []

        X_eval = eval_df['comment_text'].values
        y_eval = eval_df[labels].values

        with torch.no_grad():
            for j in tqdm(range(0, len(X_eval), batch_size), desc=f"Evaluating {model_name}"):
                batch_texts = X_eval[j:j+batch_size]

                inputs = tokenizer(
                    batch_texts.tolist(),
                    padding=True,
                    truncation=True,
                    max_length=max_length,
                    return_tensors='pt'
                ).to(device)

                outputs = model(**inputs)
                probs = torch.sigmoid(outputs.logits)
                all_predictions.append(probs.cpu().numpy())

        predictions = np.vstack(all_predictions)

        # Compute the AUC per label
        label_aucs = []
        scored_aucs = []  # only labels for which an AUC could actually be computed
        for i, label in enumerate(labels):
            if len(np.unique(y_eval[:, i])) > 1:  # AUC needs both classes present
                auc = roc_auc_score(y_eval[:, i], predictions[:, i])
                label_aucs.append(auc)
                scored_aucs.append(auc)
                print(f"  {label}: {auc:.4f}")
            else:
                label_aucs.append(0.0)
                print(f"  {label}: No samples or single class")

        # Average over scored labels explicitly instead of filtering on "auc > 0",
        # which yields NaN when nothing is scorable and drops a genuine 0.0 AUC.
        mean_auc = float(np.mean(scored_aucs)) if scored_aucs else 0.0
        print(f"  Mean AUC: {mean_auc:.4f}")

        # Evaluate performance on neutral comments
        neutral_metrics = evaluate_neutral_performance(y_eval, predictions)
        print(f"  Neutral accuracy: {neutral_metrics['neutral_accuracy']:.4f}")
        print(f"  Neutral FP rate: {neutral_metrics['neutral_fp_rate']:.4f}")
        print(f"  Neutral samples: {neutral_metrics['neutral_total']}")

        return mean_auc, label_aucs, neutral_metrics, predictions

    except Exception as e:
        # Deliberate best-effort: report the failure and return placeholder values
        # so that a single broken checkpoint does not abort the whole comparison.
        print(f"❌ Error: {str(e)}")
        return 0.0, [0.0] * len(labels), {}, None

    finally:
        # Release the model and the cached GPU memory on every path, including failures.
        del model
        torch.cuda.empty_cache()

In [None]:
def evaluate_model_on_test(model_name, test_eval_df, max_length=128, batch_size=16):
    """Run a Hugging Face sequence-classification checkpoint on the test subset and score it.

    The checkpoint is loaded with ``len(labels)`` outputs in multi-label mode,
    applied in batches without gradient tracking, and its sigmoid outputs are
    scored with a per-label ROC AUC plus the neutral-comment metrics.

    NOTE(review): ``ignore_mismatched_sizes=True`` lets checkpoints whose head
    does not match be loaded anyway; the captured run logs report such heads as
    "newly initialized", so scores for those models are not meaningful until
    they are fine-tuned. The columns of the logits are also assumed to follow
    the order of ``labels`` — confirm for fine-tuned checkpoints.

    Args:
        model_name: Hugging Face model identifier.
        test_eval_df: DataFrame with a ``comment_text`` column and the label columns.
        max_length: Truncation length passed to the tokenizer.
        batch_size: Number of texts per forward pass (default 16, the value
            that used to be hard-coded).

    Returns:
        ``(mean_auc, label_aucs, neutral_metrics, predictions)``.
        ``label_aucs`` holds 0.0 for labels with a single class in
        ``test_eval_df``; ``mean_auc`` averages only the labels that could be
        scored (0.0 when none could). If anything raises, the error is printed
        and ``(0.0, [0.0] * len(labels), {}, None)`` is returned.
    """
    model = None
    try:
        print(f"Evaluating {model_name} on balanced test set...")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # Padding needs a pad token; reuse the end-of-sequence token when missing.
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(labels),
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True
        ).to(device)

        model.eval()
        all_predictions = []

        X_test = test_eval_df['comment_text'].values
        y_test = test_eval_df[labels].values

        with torch.no_grad():
            for j in tqdm(range(0, len(X_test), batch_size), desc=f"Test evaluation"):
                batch_texts = X_test[j:j+batch_size]

                inputs = tokenizer(
                    batch_texts.tolist(),
                    padding=True,
                    truncation=True,
                    max_length=max_length,
                    return_tensors='pt'
                ).to(device)

                outputs = model(**inputs)
                probs = torch.sigmoid(outputs.logits)
                all_predictions.append(probs.cpu().numpy())

        predictions = np.vstack(all_predictions)

        # Compute the AUC per label
        label_aucs = []
        scored_aucs = []  # only labels for which an AUC could actually be computed
        for i, label in enumerate(labels):
            if len(np.unique(y_test[:, i])) > 1:  # AUC needs both classes present
                auc = roc_auc_score(y_test[:, i], predictions[:, i])
                label_aucs.append(auc)
                scored_aucs.append(auc)
                print(f"  {label}: {auc:.4f}")
            else:
                label_aucs.append(0.0)
                print(f"  {label}: No samples or single class")

        # Average over scored labels explicitly instead of filtering on "auc > 0",
        # which yields NaN when nothing is scorable and drops a genuine 0.0 AUC.
        mean_auc = float(np.mean(scored_aucs)) if scored_aucs else 0.0
        print(f"  Test Mean AUC: {mean_auc:.4f}")

        # Evaluate performance on neutral comments
        neutral_metrics = evaluate_neutral_performance(y_test, predictions)
        print(f"  Test Neutral accuracy: {neutral_metrics['neutral_accuracy']:.4f}")
        print(f"  Test Neutral FP rate: {neutral_metrics['neutral_fp_rate']:.4f}")

        return mean_auc, label_aucs, neutral_metrics, predictions

    except Exception as e:
        # Deliberate best-effort: report the failure and return placeholder values
        # so that a single broken checkpoint does not abort the whole comparison.
        print(f"❌ Error: {str(e)}")
        return 0.0, [0.0] * len(labels), {}, None

    finally:
        # Release the model and the cached GPU memory on every path, including failures.
        del model
        torch.cuda.empty_cache()

TEST MODELS

In [None]:
# Start MLflow experiment
# Name of the experiment that the runs started later in the notebook are recorded under.
EXPERIMENT_NAME = "HuggingFace_Baselines_balanced"
mlflow.set_experiment(EXPERIMENT_NAME)


<Experiment: artifact_location='file:///c:/wd/wd_demos/toxic_comment_classification/notebooks/mlruns/167733686526390127', creation_time=1754472152381, experiment_id='167733686526390127', last_update_time=1754472152381, lifecycle_stage='active', name='HuggingFace_Baselines', tags={}>

In [None]:

# Test each model
# `results` maps each model name to its validation/test scores for the summary.
results = {}

for model_config in MODELS_TO_TEST:
    # Use the part after the last "/" so organisation prefixes stay out of the run name.
    run_name = f"balanced_{model_config['name'].split('/')[-1]}"
    with mlflow.start_run(run_name=run_name):
        # Log parameters
        # NOTE: model_config['batch_size'] is logged here but is not passed to the
        # evaluation helpers in the calls below.
        mlflow.log_params(model_config)
        # The evaluation below is inference-only and never applies the class
        # weights, so the run must not claim that they were used.
        mlflow.log_param('class_weights_used', False)
        mlflow.log_param('balanced_evaluation', True)

        # Evaluate on balanced validation set
        val_auc, val_label_aucs, val_neutral_metrics, val_predictions = evaluate_model(
            model_config['name'],
            balanced_val_df,
            model_config['max_length']
        )

        # Evaluate on balanced test set
        test_auc, test_label_aucs, test_neutral_metrics, test_predictions = evaluate_model_on_test(
            model_config['name'],
            balanced_test_df,
            model_config['max_length']
        )

        # The helpers return `None` predictions (with placeholder zeros) when they
        # fail. Tag the run and skip the placeholder metrics so a failure is not
        # mistaken for a genuine score of 0.0 when runs are compared in MLflow.
        val_ok = val_predictions is not None
        test_ok = test_predictions is not None
        mlflow.set_tag('evaluation_status', 'ok' if (val_ok and test_ok) else 'failed')

        if val_ok:
            mlflow.log_metric('val_mean_auc', val_auc)
            for label, val_auc_label in zip(labels, val_label_aucs):
                mlflow.log_metric(f'val_auc_{label}', val_auc_label)
            for metric_name, value in val_neutral_metrics.items():
                mlflow.log_metric(f'val_{metric_name}', value)

        if test_ok:
            mlflow.log_metric('test_mean_auc', test_auc)
            for label, test_auc_label in zip(labels, test_label_aucs):
                mlflow.log_metric(f'test_auc_{label}', test_auc_label)
            for metric_name, value in test_neutral_metrics.items():
                mlflow.log_metric(f'test_{metric_name}', value)

        # Always keep an entry (placeholders included) so every model is listed
        # in the printed summary.
        results[model_config['name']] = {
            'val_mean_auc': val_auc,
            'test_mean_auc': test_auc,
            'val_label_aucs': val_label_aucs,
            'test_label_aucs': test_label_aucs,
            'val_neutral_metrics': val_neutral_metrics,
            'test_neutral_metrics': test_neutral_metrics
        }

# Display results summary
# One table row per model, best validation mean AUC first, followed by the
# neutral-comment details per model and the class weights.
rule = "=" * 80
print("\n" + rule, "BALANCED EVALUATION RESULTS SUMMARY", rule, sep="\n")
print(f"{'Model':<30} {'Val AUC':<10} {'Test AUC':<10} {'Val Neutral Acc':<15} {'Test Neutral Acc':<15}")
print("-" * 80)

ranked_results = sorted(
    results.items(),
    key=lambda item: item[1]['val_mean_auc'],
    reverse=True,
)
for model_name, metrics in ranked_results:
    # Missing neutral metrics (empty dict) are shown as 0.0.
    val_neutral_acc = metrics['val_neutral_metrics'].get('neutral_accuracy', 0.0)
    test_neutral_acc = metrics['test_neutral_metrics'].get('neutral_accuracy', 0.0)

    auc_columns = f"{model_name:<30} {metrics['val_mean_auc']:.4f}     {metrics['test_mean_auc']:.4f}     "
    neutral_columns = f"{val_neutral_acc:.4f}          {test_neutral_acc:.4f}"
    print(auc_columns + neutral_columns)

print("\n" + rule, "NEUTRAL PERFORMANCE DETAILS", rule, sep="\n")

for model_name, metrics in results.items():
    val_neutral = metrics['val_neutral_metrics']
    test_neutral = metrics['test_neutral_metrics']
    print(f"\n{model_name}:")

    val_line = (
        f"  Validation - Total neutros: {val_neutral.get('neutral_total', 0)}, "
        f"Correctos: {val_neutral.get('neutral_correct', 0)}, "
        f"FP Rate: {val_neutral.get('neutral_fp_rate', 0.0):.4f}"
    )
    print(val_line)

    test_line = (
        f"  Test - Total neutros: {test_neutral.get('neutral_total', 0)}, "
        f"Correctos: {test_neutral.get('neutral_correct', 0)}, "
        f"FP Rate: {test_neutral.get('neutral_fp_rate', 0.0):.4f}"
    )
    print(test_line)

weights_by_label = dict(zip(labels, class_weights))
print(f"Class weights used: {weights_by_label}")


🔍 Evaluating distilbert-base-uncased on validation set...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating distilbert-base-uncased: 100%|██████████| 313/313 [00:29<00:00, 10.54it/s]


  toxic: 0.7482
  severe_toxic: 0.2788
  obscene: 0.5819
  threat: 0.6947
  insult: 0.2340
  identity_hate: 0.4773
  Mean AUC: 0.5025

🔍 Evaluating distilbert-base-uncased on test set...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test evaluation: 100%|██████████| 63/63 [00:05<00:00, 10.52it/s]


  toxic: 0.6151
  severe_toxic: 0.1958
  obscene: 0.3210
  threat: 0.8783
  insult: 0.4823
  identity_hate: 0.4376
  Test Mean AUC: 0.4883

🔍 Evaluating bert-base-uncased on validation set...


Downloading tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<?, ?B/s]
Downloading config.json: 100%|██████████| 570/570 [00:00<00:00, 556kB/s]
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 412kB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.38MB/s]
  _torch_pytree._register_pytree_node(
Downloading model.safetensors: 100%|██████████| 440M/440M [02:27<00:00, 2.99MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating bert-base-uncased: 100%|██████████| 313/313 [00:54<00:00,  5.72it/s]


  toxic: 0.4053
  severe_toxic: 0.2658
  obscene: 0.4578
  threat: 0.7185
  insult: 0.5386
  identity_hate: 0.4495
  Mean AUC: 0.4726

🔍 Evaluating bert-base-uncased on test set...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test evaluation: 100%|██████████| 63/63 [00:10<00:00,  5.73it/s]


  toxic: 0.5181
  severe_toxic: 0.4171
  obscene: 0.4007
  threat: 0.6169
  insult: 0.4938
  identity_hate: 0.4872
  Test Mean AUC: 0.4890

🔍 Evaluating roberta-base on validation set...


Downloading tokenizer_config.json: 100%|██████████| 25.0/25.0 [00:00<?, ?B/s]
Downloading config.json: 100%|██████████| 481/481 [00:00<?, ?B/s] 
Downloading vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 5.18MB/s]
Downloading merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 7.86MB/s]
Downloading tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 3.29MB/s]
Downloading model.safetensors: 100%|██████████| 499M/499M [01:37<00:00, 5.12MB/s] 
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating roberta-base: 100%|██████████| 313/313 [00:55<00:00,  5.60it/s]


  toxic: 0.4604
  severe_toxic: 0.4941
  obscene: 0.6331
  threat: 0.8391
  insult: 0.4797
  identity_hate: 0.3154
  Mean AUC: 0.5370

🔍 Evaluating roberta-base on test set...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test evaluation: 100%|██████████| 63/63 [00:11<00:00,  5.63it/s]


  toxic: 0.4827
  severe_toxic: 0.6430
  obscene: 0.5116
  threat: 0.3725
  insult: 0.5086
  identity_hate: 0.4466
  Test Mean AUC: 0.4942

🔍 Evaluating microsoft/deberta-v3-base on validation set...


Downloading tokenizer_config.json: 100%|██████████| 52.0/52.0 [00:00<00:00, 50.0kB/s]
Downloading config.json: 100%|██████████| 579/579 [00:00<?, ?B/s] 
Downloading spm.model: 100%|██████████| 2.46M/2.46M [00:00<00:00, 9.79MB/s]


❌ Error: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.

🔍 Evaluating microsoft/deberta-v3-base on test set...
❌ Error: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.

🔍 Evaluating unitary/toxic-bert on validation set...


Downloading tokenizer_config.json: 100%|██████████| 174/174 [00:00<?, ?B/s] 
Downloading config.json: 100%|██████████| 811/811 [00:00<?, ?B/s] 
Downloading vocab.txt: 232kB [00:00, 9.69MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 37.3kB/s]
Downloading model.safetensors: 100%|██████████| 438M/438M [01:25<00:00, 5.11MB/s] 
Evaluating unitary/toxic-bert: 100%|██████████| 313/313 [00:55<00:00,  5.61it/s]


  toxic: 0.9968
  severe_toxic: 0.9912
  obscene: 0.9957
  threat: 0.9980
  insult: 0.9962
  identity_hate: 0.9971
  Mean AUC: 0.9958

🔍 Evaluating unitary/toxic-bert on test set...


Test evaluation: 100%|██████████| 63/63 [00:11<00:00,  5.52it/s]


  toxic: 0.9715
  severe_toxic: 0.9741
  obscene: 0.9833
  threat: 1.0000
  insult: 0.9784
  identity_hate: 0.9922
  Test Mean AUC: 0.9832

📊 Baseline Results Summary:
----------------------------------------------------------------------
Model                          Val AUC    Test AUC  
----------------------------------------------------------------------
unitary/toxic-bert             0.9958     0.9922
roberta-base                   0.5370     0.4466
distilbert-base-uncased        0.5025     0.4376
bert-base-uncased              0.4726     0.4872
microsoft/deberta-v3-base      0.0000     0.0000
