In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import mlflow
import mlflow.pytorch
from tqdm import tqdm
from pathlib import Path

In [11]:
# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")


Using device: cuda


In [12]:
# Load data
# Label columns shared by train.csv and test_labels.csv; used below to index
# both frames.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Resolve the base directory whether the kernel's working directory is the
# `notebooks` folder or its parent.
if Path.cwd().name == "notebooks":
    base_dir = Path.cwd().parent
else:
    base_dir = Path.cwd()
data_dir = base_dir.joinpath("src", "data", "raw")

# Read the three raw CSV files in a fixed order.
train, test, test_labels = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in ("train.csv", "test.csv", "test_labels.csv")
)



In [13]:
# Prepare test
def prepare_test_for_evaluation(test_df, test_labels_df, label_cols=None):
    """Keep only the test rows whose labels can be used for scoring.

    A row is considered valid when none of its label columns equals -1
    (the code treats -1 as "no usable label").

    Args:
        test_df: DataFrame with the test comments.
        test_labels_df: DataFrame with one column per label. It must share its
            index with ``test_df`` because the same boolean mask is applied to
            both frames.
        label_cols: Optional list of label column names to check. Defaults to
            the notebook-level ``labels`` list, which preserves the original
            behaviour for existing callers.

    Returns:
        Tuple ``(filtered_test_df, filtered_test_labels_df)``; both are copies,
        so later edits do not touch the input frames.
    """
    cols = labels if label_cols is None else list(label_cols)
    valid_mask = (test_labels_df[cols] != -1).all(axis=1)
    print(f"Test samples: {len(test_df)} total, {valid_mask.sum()} valid for evaluation")
    return test_df[valid_mask].copy(), test_labels_df[valid_mask].copy()

# Filter the test comments and their labels down to the rows usable for
# evaluation (rows where no label column is -1).
test_eval, test_labels_eval = prepare_test_for_evaluation(test, test_labels)


Test samples: 153164 total, 63978 valid for evaluation


In [14]:
# Split training data
# Features are the raw comment strings; targets are the label columns as a
# 2-D array with one column per entry in `labels`.
X = train['comment_text'].values
y = train[labels].values

# Hold out 10% for validation. Stratification uses only the first label
# column (y[:, 0]), not the full multi-label combination.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y[:, 0],
    random_state=42,
)

print(
    f"Train size: {len(X_train)}",
    f"Validation size: {len(X_val)}",
    f"Test evaluation size: {len(test_eval)}",
    sep="\n",
)


Train size: 143613
Validation size: 15958
Test evaluation size: 63978


In [15]:
# Define models to test
# Each entry holds a model checkpoint name, a batch size and a maximum token
# length; all candidates share the same 128-token limit.
MODELS_TO_TEST = [
    {'name': checkpoint, 'batch_size': per_model_batch, 'max_length': 128}
    for checkpoint, per_model_batch in (
        ('distilbert-base-uncased', 32),
        ('bert-base-uncased', 16),
        ('roberta-base', 16),
        ('microsoft/deberta-v3-base', 8),
        ('unitary/toxic-bert', 16),
    )
]

In [None]:
def evaluate_model(model_name, X_val_sample, y_val_sample, max_length=128, batch_size=16):
    """Score a checkpoint on validation texts with per-label ROC AUC.

    The checkpoint is loaded as a multi-label sequence classifier with
    ``len(labels)`` outputs and run in inference mode; no training happens
    here. When the checkpoint has no matching classification head,
    transformers initialises one with new weights (it prints a warning), so
    the resulting scores describe an untrained head.

    Relies on the notebook-level ``labels`` list and ``device``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        X_val_sample: Array-like of comment strings.
        y_val_sample: Array-like of shape (n_samples, len(labels)) holding the
            true labels, with columns in ``labels`` order.
        max_length: Token limit used for truncation.
        batch_size: Number of texts per forward pass. Defaults to 16, the
            value that used to be hard-coded.

    Returns:
        Tuple ``(mean_auc, label_aucs)`` where ``label_aucs`` follows the
        order of ``labels``. If anything fails, the error is printed and
        ``(0.0, [0.0] * len(labels))`` is returned so that a loop over several
        models keeps going; treat an all-zero result as "failed", not as a
        real score.
    """
    model = None
    try:
        print(f"Evaluating {model_name} on validation set...")

        # Accept lists / Series as well as ndarrays; ndarrays pass through as-is.
        texts = np.asarray(X_val_sample)
        targets = np.asarray(y_val_sample)

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # Padding needs a pad token; reuse the end-of-sequence token.
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(labels),
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True
        ).to(device)
        model.eval()

        all_predictions = []
        for start in tqdm(range(0, len(texts), batch_size), desc=f"Evaluating {model_name}"):
            batch_texts = texts[start:start + batch_size]

            inputs = tokenizer(
                batch_texts.tolist(),
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            ).to(device)

            with torch.no_grad():
                outputs = model(**inputs)
                # Independent sigmoid per label (multi-label, not softmax).
                probs = torch.sigmoid(outputs.logits)
                all_predictions.append(probs.cpu().numpy())

        predictions = np.vstack(all_predictions)

        label_aucs = []
        for i, label in enumerate(labels):
            auc = roc_auc_score(targets[:, i], predictions[:, i])
            label_aucs.append(auc)
            print(f"  {label}: {auc:.4f}")

        mean_auc = np.mean(label_aucs)
        print(f"  Mean AUC: {mean_auc:.4f}")

        return mean_auc, label_aucs

    except Exception as e:
        # Deliberate best-effort: report and return the zero sentinel.
        print(f"❌ Error: {str(e)}")
        return 0.0, [0.0] * len(labels)

    finally:
        # Release the model on both the success and the failure path so a
        # failed checkpoint does not keep GPU memory cached for the next one.
        model = None
        torch.cuda.empty_cache()

In [None]:
def evaluate_model_on_test(model_name, test_eval_df, test_labels_eval_df, max_length=128, batch_size=16):
    """Score a checkpoint on test comments with per-label ROC AUC.

    Same procedure as ``evaluate_model`` (the two functions duplicate most of
    their logic and are candidates for consolidation), but the inputs are
    DataFrames and the predicted probabilities are returned as well.

    Relies on the notebook-level ``labels`` list and ``device``.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        test_eval_df: DataFrame with a ``comment_text`` column.
        test_labels_eval_df: DataFrame containing the ``labels`` columns. Rows
            are matched to ``test_eval_df`` by position, so both frames must
            be in the same row order.
        max_length: Token limit used for truncation.
        batch_size: Number of texts per forward pass. Defaults to 16, the
            value that used to be hard-coded.

    Returns:
        Tuple ``(mean_auc, label_aucs, predictions)`` where ``predictions`` is
        the stacked array of sigmoid probabilities, one column per label. If
        anything fails, the error is printed and
        ``(0.0, [0.0] * len(labels), None)`` is returned; treat that as
        "failed", not as a real score.
    """
    model = None
    try:
        print(f"Evaluating {model_name} on test set...")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # Padding needs a pad token; reuse the end-of-sequence token.
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(labels),
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True
        ).to(device)
        model.eval()

        X_test = test_eval_df['comment_text'].values

        all_predictions = []
        for start in tqdm(range(0, len(X_test), batch_size), desc="Test evaluation"):
            batch_texts = X_test[start:start + batch_size]

            inputs = tokenizer(
                batch_texts.tolist(),
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            ).to(device)

            with torch.no_grad():
                outputs = model(**inputs)
                # Independent sigmoid per label (multi-label, not softmax).
                probs = torch.sigmoid(outputs.logits)
                all_predictions.append(probs.cpu().numpy())

        predictions = np.vstack(all_predictions)

        # Use the true labels from the filtered test set.
        y_true = test_labels_eval_df[labels].values

        label_aucs = []
        for i, label in enumerate(labels):
            auc = roc_auc_score(y_true[:, i], predictions[:, i])
            label_aucs.append(auc)
            print(f"  {label}: {auc:.4f}")

        mean_auc = np.mean(label_aucs)
        print(f"  Test Mean AUC: {mean_auc:.4f}")

        return mean_auc, label_aucs, predictions

    except Exception as e:
        # Deliberate best-effort: report and return the zero sentinel.
        print(f"❌ Error: {str(e)}")
        return 0.0, [0.0] * len(labels), None

    finally:
        # Release the model on both the success and the failure path so a
        # failed checkpoint does not keep GPU memory cached for the next one.
        model = None
        torch.cuda.empty_cache()

In [18]:
# Start MLflow experiment
# Selects the experiment that the `mlflow.start_run` calls below log into.
# As the last expression of the cell, the returned Experiment object is
# displayed in the cell output.
mlflow.set_experiment("HuggingFace_Baselines")


<Experiment: artifact_location='file:///c:/wd/wd_demos/toxic_comment_classification/notebooks/mlruns/167733686526390127', creation_time=1754472152381, experiment_id='167733686526390127', last_update_time=1754472152381, lifecycle_stage='active', name='HuggingFace_Baselines', tags={}>

In [None]:

# Test each model
# Maps model name -> dict of validation/test AUC results, filled by the loop below.
results = {}

# Use a sample for quick baseline evaluation.
# The draw is seeded with 42 (the same value passed as random_state to
# train_test_split above) so reruns score the same validation rows, and the
# size is capped at the validation-set length so that sampling without
# replacement cannot fail on a small split.
sample_size = min(5000, len(X_val))
idx = np.random.default_rng(42).choice(len(X_val), size=sample_size, replace=False)
X_val_sample = X_val[idx]
y_val_sample = y_val[idx]

# Draw the test subsample once. It is seeded and identical for every model,
# so rebuilding it inside the loop was redundant work.
test_sample_size = min(1000, len(test_eval))
test_sample = test_eval.sample(test_sample_size, random_state=42)
# Look the labels up by index so they stay aligned with the sampled comments.
test_labels_sample = test_labels_eval.loc[test_sample.index]

for model_config in MODELS_TO_TEST:
    # Use only the part after the organisation prefix, e.g. "toxic-bert".
    run_name = f"baseline_{model_config['name'].split('/')[-1]}"
    with mlflow.start_run(run_name=run_name):
        # Log parameters.
        # NOTE: model_config['batch_size'] is recorded as a run parameter but
        # is not forwarded to the evaluation calls below.
        mlflow.log_params(model_config)

        # Evaluate on validation set
        mean_auc, label_aucs = evaluate_model(
            model_config['name'],
            X_val_sample,
            y_val_sample,
            model_config['max_length']
        )

        # Evaluate on test
        test_auc, test_label_aucs, _ = evaluate_model_on_test(
            model_config['name'],
            test_sample,
            test_labels_sample,
            model_config['max_length']
        )

        # Log metrics
        mlflow.log_metric('val_mean_auc', mean_auc)
        mlflow.log_metric('test_mean_auc', test_auc)

        # The per-label loop variables must not reuse the names `test_auc` or
        # `mean_auc`: doing so overwrote the mean test AUC with the last
        # label's AUC before it was stored in `results` below.
        for label, val_label_auc, test_label_auc in zip(labels, label_aucs, test_label_aucs):
            mlflow.log_metric(f'val_auc_{label}', val_label_auc)
            mlflow.log_metric(f'test_auc_{label}', test_label_auc)

        results[model_config['name']] = {
            'val_mean_auc': mean_auc,
            'test_mean_auc': test_auc,
            'val_label_aucs': label_aucs,
            'test_label_aucs': test_label_aucs
        }

# Display results summary
# One row per model, ordered from highest to lowest validation mean AUC.
separator = "-" * 70
print("Baseline Results Summary:")
print(separator)
print(f"{'Model':<30} {'Val AUC':<10} {'Test AUC':<10}")
print(separator)

ranked_results = sorted(
    results.items(),
    key=lambda entry: entry[1]['val_mean_auc'],
    reverse=True,
)
for model_name, metrics in ranked_results:
    print(f"{model_name:<30} {metrics['val_mean_auc']:.4f}     {metrics['test_mean_auc']:.4f}")


🔍 Evaluating distilbert-base-uncased on validation set...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating distilbert-base-uncased: 100%|██████████| 313/313 [00:29<00:00, 10.54it/s]


  toxic: 0.7482
  severe_toxic: 0.2788
  obscene: 0.5819
  threat: 0.6947
  insult: 0.2340
  identity_hate: 0.4773
  Mean AUC: 0.5025

🔍 Evaluating distilbert-base-uncased on test set...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test evaluation: 100%|██████████| 63/63 [00:05<00:00, 10.52it/s]


  toxic: 0.6151
  severe_toxic: 0.1958
  obscene: 0.3210
  threat: 0.8783
  insult: 0.4823
  identity_hate: 0.4376
  Test Mean AUC: 0.4883

🔍 Evaluating bert-base-uncased on validation set...


Downloading tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<?, ?B/s]
Downloading config.json: 100%|██████████| 570/570 [00:00<00:00, 556kB/s]
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 412kB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.38MB/s]
  _torch_pytree._register_pytree_node(
Downloading model.safetensors: 100%|██████████| 440M/440M [02:27<00:00, 2.99MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating bert-base-uncased: 100%|██████████| 313/313 [00:54<00:00,  5.72it/s]


  toxic: 0.4053
  severe_toxic: 0.2658
  obscene: 0.4578
  threat: 0.7185
  insult: 0.5386
  identity_hate: 0.4495
  Mean AUC: 0.4726

🔍 Evaluating bert-base-uncased on test set...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test evaluation: 100%|██████████| 63/63 [00:10<00:00,  5.73it/s]


  toxic: 0.5181
  severe_toxic: 0.4171
  obscene: 0.4007
  threat: 0.6169
  insult: 0.4938
  identity_hate: 0.4872
  Test Mean AUC: 0.4890

🔍 Evaluating roberta-base on validation set...


Downloading tokenizer_config.json: 100%|██████████| 25.0/25.0 [00:00<?, ?B/s]
Downloading config.json: 100%|██████████| 481/481 [00:00<?, ?B/s] 
Downloading vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 5.18MB/s]
Downloading merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 7.86MB/s]
Downloading tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 3.29MB/s]
Downloading model.safetensors:  23%|██▎       | 115M/499M [00:17<01:11, 5.34MB/s] 