In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import mlflow
import mlflow.pytorch
from tqdm import tqdm
from pathlib import Path

GPU

In [14]:
# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


DATA LOAD

In [15]:
# Load data

base_dir = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
data_dir = base_dir / "src" / "data" / "raw"

train = pd.read_csv(data_dir / "train.csv")
test = pd.read_csv(data_dir / "test.csv")
test_labels = pd.read_csv(data_dir / "test_labels.csv") 

labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']



TRAIN TEST SPLIT

In [16]:
# Prepare test
def prepare_test_for_evaluation(test_df, test_labels_df):
    valid_mask = (test_labels_df[labels] != -1).all(axis=1)
    print(f"Test samples: {len(test_df)} total, {valid_mask.sum()} valid for evaluation")
    return test_df[valid_mask].copy(), test_labels_df[valid_mask].copy()


In [17]:
def create_balanced_subset(df, labels, n_per_class=200):
    """
    Crea un subset balanceado para evaluación más justa
    """
    idxs = set()
    print(f"Creating balanced subset with {n_per_class} samples per class...")
    
    for i, label in enumerate(labels):
        # Encuentra índices donde esa clase está presente
        class_idxs = np.where(df[label].values == 1)[0]
        available = len(class_idxs)
        to_sample = min(n_per_class, available)
        
        if available > 0:
            chosen = np.random.choice(class_idxs, to_sample, replace=False)
            idxs.update(chosen)
            print(f"  {label}: {to_sample}/{available} samples")
        else:
            print(f"  {label}: 0 samples available!")
    
    # Añade algunos neutros (all zero)
    neutral_idxs = np.where(df[labels].sum(axis=1) == 0)[0]
    neutral_available = len(neutral_idxs)
    neutral_to_sample = min(n_per_class, neutral_available)
    
    if neutral_available > 0:
        chosen_neutral = np.random.choice(neutral_idxs, neutral_to_sample, replace=False)
        idxs.update(chosen_neutral)
        print(f"  neutral: {neutral_to_sample}/{neutral_available} samples")
    
    # Devuelve el subset balanceado
    idxs = list(idxs)
    balanced_df = df.iloc[idxs].copy()
    print(f"Total balanced samples: {len(balanced_df)}")
    return balanced_df

def calculate_class_weights(train_df, labels):
    """
    Calcula pesos para manejar el desbalanceo de clases
    """
    class_counts = train_df[labels].sum(axis=0)
    total = len(train_df)
    pos_weights = (total - class_counts) / (class_counts + 1e-6)
    
    print("Class distribution and weights:")
    for label, count, weight in zip(labels, class_counts, pos_weights):
        percentage = (count / total) * 100
        print(f"  {label}: {count} samples ({percentage:.2f}%) -> weight: {weight:.2f}")
    
    return pos_weights.values

In [18]:
def evaluate_neutral_performance(y_true, y_pred_probs, threshold=0.5):
    """
    Evalúa específicamente el rendimiento en comentarios neutros
    """
    # Convertir probabilidades a predicciones binarias
    y_pred_bin = (y_pred_probs > threshold).astype(int)
    
    # Identificar comentarios neutros (todas las etiquetas = 0)
    neutral_mask = (y_true.sum(axis=1) == 0)
    neutral_total = neutral_mask.sum()
    
    if neutral_total == 0:
        return {
            'neutral_total': 0,
            'neutral_correct': 0,
            'neutral_accuracy': 0.0,
            'neutral_fp_rate': 0.0
        }
    
    # ¿Cuántos neutros fueron predichos como neutros?
    neutral_pred_mask = (y_pred_bin[neutral_mask].sum(axis=1) == 0)
    neutral_correct = neutral_pred_mask.sum()
    neutral_fp = neutral_total - neutral_correct
    
    # Calcular métricas
    neutral_accuracy = neutral_correct / neutral_total
    neutral_fp_rate = neutral_fp / neutral_total
    
    return {
        'neutral_total': int(neutral_total),
        'neutral_correct': int(neutral_correct),
        'neutral_fp': int(neutral_fp),
        'neutral_accuracy': float(neutral_accuracy),
        'neutral_fp_rate': float(neutral_fp_rate)
    }



In [19]:
# Split training data
test_eval, test_labels_eval = prepare_test_for_evaluation(test, test_labels)


# Split training data
X = train['comment_text'].values
y = train[labels].values

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y[:, 0]
)

print(f"Train size: {len(X_train)}")
print(f"Validation size: {len(X_val)}")
print(f"Test evaluation size: {len(test_eval)}")

# Calcular pesos para el desbalanceo de clases
class_weights = calculate_class_weights(train, labels)
pos_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Crear datasets balanceados para evaluación
print("Creating balanced validation set")
val_df_temp = pd.DataFrame({'comment_text': X_val})
val_df_temp[labels] = y_val
balanced_val_df = create_balanced_subset(val_df_temp, labels, n_per_class=300)

print("Creating balanced test set")
test_df_temp = test_eval.copy()
test_df_temp[labels] = test_labels_eval[labels].values
balanced_test_df = create_balanced_subset(test_df_temp, labels, n_per_class=200)

Test samples: 153164 total, 63978 valid for evaluation
Train size: 143613
Validation size: 15958
Test evaluation size: 63978
Class distribution and weights:
  toxic: 15294 samples (9.58%) -> weight: 9.43
  severe_toxic: 1595 samples (1.00%) -> weight: 99.04
  obscene: 8449 samples (5.29%) -> weight: 17.89
  threat: 478 samples (0.30%) -> weight: 332.83
  insult: 7877 samples (4.94%) -> weight: 19.26
  identity_hate: 1405 samples (0.88%) -> weight: 112.57
Creating balanced validation set
Creating balanced subset with 300 samples per class...
  toxic: 300/1529 samples
  severe_toxic: 149/149 samples
  obscene: 300/847 samples
  threat: 50/50 samples
  insult: 300/800 samples
  identity_hate: 153/153 samples
  neutral: 300/14355 samples
Total balanced samples: 1153
Creating balanced test set
Creating balanced subset with 200 samples per class...
  toxic: 200/6090 samples
  severe_toxic: 200/367 samples
  obscene: 200/3691 samples
  threat: 200/211 samples
  insult: 200/3427 samples
  iden

MODELS TO TEST

In [20]:
# Define models to test
MODELS_TO_TEST = [
    {
        'name': 'distilbert-base-uncased',
        'batch_size': 32,
        'max_length': 128
    },
    {
        'name': 'bert-base-uncased',
        'batch_size': 16,
        'max_length': 128
    },
    {
        'name': 'roberta-base',
        'batch_size': 16,
        'max_length': 128
    },
    {
        'name': 'microsoft/deberta-v3-base',
        'batch_size': 8,
        'max_length': 128
    },
    {
        'name': 'unitary/toxic-bert',
        'batch_size': 16,
        'max_length': 128
    }
]

EVALUATIONS

In [21]:
def evaluate_model(model_name, eval_df, max_length=128):
    try:
        print(f"Evaluating {model_name} on balanced validation set...")
        
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(labels),
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True
        ).to(device)
        
        model.eval()
        all_predictions = []
        
        batch_size = 16
        X_eval = eval_df['comment_text'].values
        y_eval = eval_df[labels].values
        
        for j in tqdm(range(0, len(X_eval), batch_size), desc=f"Evaluating {model_name}"):
            batch_texts = X_eval[j:j+batch_size]
            
            inputs = tokenizer(
                batch_texts.tolist(),
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            ).to(device)
            
            with torch.no_grad():
                outputs = model(**inputs)
                probs = torch.sigmoid(outputs.logits)
                all_predictions.append(probs.cpu().numpy())
        
        predictions = np.vstack(all_predictions)
        
        # Calcular AUC por etiqueta
        label_aucs = []
        for i, label in enumerate(labels):
            if len(np.unique(y_eval[:, i])) > 1:  # Solo si hay ambas clases
                auc = roc_auc_score(y_eval[:, i], predictions[:, i])
                label_aucs.append(auc)
                print(f"  {label}: {auc:.4f}")
            else:
                label_aucs.append(0.0)
                print(f"  {label}: No samples or single class")
        
        mean_auc = np.mean([auc for auc in label_aucs if auc > 0])
        print(f"  Mean AUC: {mean_auc:.4f}")
        
        # Evaluar rendimiento en neutros
        neutral_metrics = evaluate_neutral_performance(y_eval, predictions)
        print(f"  Neutral accuracy: {neutral_metrics['neutral_accuracy']:.4f}")
        print(f"  Neutral FP rate: {neutral_metrics['neutral_fp_rate']:.4f}")
        print(f"  Neutral samples: {neutral_metrics['neutral_total']}")
        
        del model
        torch.cuda.empty_cache()
        
        return mean_auc, label_aucs, neutral_metrics, predictions
        
    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return 0.0, [0.0] * len(labels), {}, None

In [22]:
def evaluate_model_on_test(model_name, test_eval_df, max_length=128):
    try:
        print(f"Evaluating {model_name} on balanced test set...")
        
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(labels),
            problem_type="multi_label_classification",
            ignore_mismatched_sizes=True
        ).to(device)
        
        model.eval()
        all_predictions = []
        
        batch_size = 16
        X_test = test_eval_df['comment_text'].values
        y_test = test_eval_df[labels].values
        
        for j in tqdm(range(0, len(X_test), batch_size), desc=f"Test evaluation"):
            batch_texts = X_test[j:j+batch_size]
            
            inputs = tokenizer(
                batch_texts.tolist(),
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt'
            ).to(device)
            
            with torch.no_grad():
                outputs = model(**inputs)
                probs = torch.sigmoid(outputs.logits)
                all_predictions.append(probs.cpu().numpy())
        
        predictions = np.vstack(all_predictions)
        
        # Calcular AUC por etiqueta
        label_aucs = []
        for i, label in enumerate(labels):
            if len(np.unique(y_test[:, i])) > 1:
                auc = roc_auc_score(y_test[:, i], predictions[:, i])
                label_aucs.append(auc)
                print(f"  {label}: {auc:.4f}")
            else:
                label_aucs.append(0.0)
                print(f"  {label}: No samples or single class")
        
        mean_auc = np.mean([auc for auc in label_aucs if auc > 0])
        print(f"  Test Mean AUC: {mean_auc:.4f}")
        
        # Evaluar rendimiento en neutros
        neutral_metrics = evaluate_neutral_performance(y_test, predictions)
        print(f"  Test Neutral accuracy: {neutral_metrics['neutral_accuracy']:.4f}")
        print(f"  Test Neutral FP rate: {neutral_metrics['neutral_fp_rate']:.4f}")
        
        del model
        torch.cuda.empty_cache()
        
        return mean_auc, label_aucs, neutral_metrics, predictions
        
    except Exception as e:
        print(f"❌ Error: {str(e)}")
        return 0.0, [0.0] * len(labels), {}, None

TEST MODELS

In [23]:
# Start MLflow experiment
mlflow.set_experiment("HuggingFace_Baselines_balanced")


<Experiment: artifact_location='file:///c:/wd/wd_demos/toxic_comment_classification/notebooks/mlruns/179631418936248154', creation_time=1754475070403, experiment_id='179631418936248154', last_update_time=1754475070403, lifecycle_stage='active', name='HuggingFace_Baselines_balanced', tags={}>

In [24]:

# Test each model
results = {}

for model_config in MODELS_TO_TEST:
    with mlflow.start_run(run_name=f"balanced_{model_config['name'].split('/')[-1]}"):
        # Log parameters
        mlflow.log_params(model_config)
        mlflow.log_param('class_weights_used', True)
        mlflow.log_param('balanced_evaluation', True)
        
        # Evaluate on balanced validation set
        val_auc, val_label_aucs, val_neutral_metrics, _ = evaluate_model(
            model_config['name'],
            balanced_val_df,
            model_config['max_length']
        )
        
        # Evaluate on balanced test set
        test_auc, test_label_aucs, test_neutral_metrics, _ = evaluate_model_on_test(
            model_config['name'],
            balanced_test_df,
            model_config['max_length']
        )
        
        # Log metrics
        mlflow.log_metric('val_mean_auc', val_auc)
        mlflow.log_metric('test_mean_auc', test_auc)
        
        # Log per-label AUCs
        for label, val_auc_label, test_auc_label in zip(labels, val_label_aucs, test_label_aucs):
            mlflow.log_metric(f'val_auc_{label}', val_auc_label)
            mlflow.log_metric(f'test_auc_{label}', test_auc_label)
        
        # Log neutral performance metrics
        for metric_name, value in val_neutral_metrics.items():
            mlflow.log_metric(f'val_{metric_name}', value)
        
        for metric_name, value in test_neutral_metrics.items():
            mlflow.log_metric(f'test_{metric_name}', value)
        
        results[model_config['name']] = {
            'val_mean_auc': val_auc,
            'test_mean_auc': test_auc,
            'val_label_aucs': val_label_aucs,
            'test_label_aucs': test_label_aucs,
            'val_neutral_metrics': val_neutral_metrics,
            'test_neutral_metrics': test_neutral_metrics
        }

# Display results summary
print("\n" + "="*80)
print("BALANCED EVALUATION RESULTS SUMMARY")
print("="*80)
print(f"{'Model':<30} {'Val AUC':<10} {'Test AUC':<10} {'Val Neutral Acc':<15} {'Test Neutral Acc':<15}")
print("-" * 80)

for model_name, metrics in sorted(results.items(), key=lambda x: x[1]['val_mean_auc'], reverse=True):
    val_neutral_acc = metrics['val_neutral_metrics'].get('neutral_accuracy', 0.0)
    test_neutral_acc = metrics['test_neutral_metrics'].get('neutral_accuracy', 0.0)
    
    print(f"{model_name:<30} {metrics['val_mean_auc']:.4f}     {metrics['test_mean_auc']:.4f}     "
          f"{val_neutral_acc:.4f}          {test_neutral_acc:.4f}")

print("\n" + "="*80)
print("NEUTRAL PERFORMANCE DETAILS")
print("="*80)

for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    val_neutral = metrics['val_neutral_metrics']
    test_neutral = metrics['test_neutral_metrics']
    
    print(f"  Validation - Total neutral: {val_neutral.get('neutral_total', 0)}, "
          f"correct: {val_neutral.get('neutral_correct', 0)}, "
          f"FP Rate: {val_neutral.get('neutral_fp_rate', 0.0):.4f}")
    
    print(f"  Test - Total neutral: {test_neutral.get('neutral_total', 0)}, "
          f"correct: {test_neutral.get('neutral_correct', 0)}, "
          f"FP Rate: {test_neutral.get('neutral_fp_rate', 0.0):.4f}")

print(f"Class weights used: {dict(zip(labels, class_weights))}")

Evaluating distilbert-base-uncased on balanced validation set...


  _torch_pytree._register_pytree_node(
W0806 12:17:34.329000 14872 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
  _torch_pytree._register_pytree_node(
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating distilbert-base-uncased: 100%|██████████| 73/73 [00:07<00:00,  9.98it/s]


  toxic: 0.5924
  severe_toxic: 0.5704
  obscene: 0.6236
  threat: 0.5844
  insult: 0.5519
  identity_hate: 0.4113
  Mean AUC: 0.5557
  Neutral accuracy: 0.0733
  Neutral FP rate: 0.9267
  Neutral samples: 300
Evaluating distilbert-base-uncased on balanced test set...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test evaluation: 100%|██████████| 79/79 [00:06<00:00, 11.33it/s]


  toxic: 0.4377
  severe_toxic: 0.4613
  obscene: 0.4438
  threat: 0.5070
  insult: 0.4269
  identity_hate: 0.4768
  Test Mean AUC: 0.4589
  Test Neutral accuracy: 0.0000
  Test Neutral FP rate: 1.0000
Evaluating bert-base-uncased on balanced validation set...


  _torch_pytree._register_pytree_node(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating bert-base-uncased: 100%|██████████| 73/73 [00:12<00:00,  5.77it/s]


  toxic: 0.6282
  severe_toxic: 0.4156
  obscene: 0.5628
  threat: 0.5102
  insult: 0.3966
  identity_hate: 0.4030
  Mean AUC: 0.4861
  Neutral accuracy: 0.0000
  Neutral FP rate: 1.0000
  Neutral samples: 300
Evaluating bert-base-uncased on balanced test set...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test evaluation: 100%|██████████| 79/79 [00:13<00:00,  6.03it/s]


  toxic: 0.4013
  severe_toxic: 0.5330
  obscene: 0.3676
  threat: 0.4758
  insult: 0.3992
  identity_hate: 0.5768
  Test Mean AUC: 0.4590
  Test Neutral accuracy: 0.0000
  Test Neutral FP rate: 1.0000
Evaluating roberta-base on balanced validation set...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating roberta-base: 100%|██████████| 73/73 [00:12<00:00,  5.87it/s]


  toxic: 0.4107
  severe_toxic: 0.4396
  obscene: 0.5500
  threat: 0.4592
  insult: 0.5205
  identity_hate: 0.6310
  Mean AUC: 0.5018
  Neutral accuracy: 0.0000
  Neutral FP rate: 1.0000
  Neutral samples: 300
Evaluating roberta-base on balanced test set...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test evaluation: 100%|██████████| 79/79 [00:13<00:00,  5.92it/s]


  toxic: 0.6612
  severe_toxic: 0.5367
  obscene: 0.5758
  threat: 0.4825
  insult: 0.4510
  identity_hate: 0.5201
  Test Mean AUC: 0.5379
  Test Neutral accuracy: 0.0000
  Test Neutral FP rate: 1.0000
Evaluating microsoft/deberta-v3-base on balanced validation set...


Downloading pytorch_model.bin: 100%|██████████| 371M/371M [00:26<00:00, 14.2MB/s] 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating microsoft/deberta-v3-base: 100%|██████████| 73/73 [00:18<00:00,  4.03it/s]


  toxic: 0.5487
  severe_toxic: 0.5608
  obscene: 0.3897
  threat: 0.5005
  insult: 0.5794
  identity_hate: 0.4684
  Mean AUC: 0.5079
  Neutral accuracy: 0.0000
  Neutral FP rate: 1.0000
  Neutral samples: 300
Evaluating microsoft/deberta-v3-base on balanced test set...


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.weight', 'classifier.bias', 'pooler.dense.weight', 'pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Test evaluation: 100%|██████████| 79/79 [00:19<00:00,  4.03it/s]


  toxic: 0.4189
  severe_toxic: 0.5330
  obscene: 0.4278
  threat: 0.4793
  insult: 0.5496
  identity_hate: 0.5291
  Test Mean AUC: 0.4896
  Test Neutral accuracy: 0.0000
  Test Neutral FP rate: 1.0000
Evaluating unitary/toxic-bert on balanced validation set...


Evaluating unitary/toxic-bert: 100%|██████████| 73/73 [00:12<00:00,  5.97it/s]


  toxic: 0.9835
  severe_toxic: 0.9045
  obscene: 0.9819
  threat: 0.9728
  insult: 0.9558
  identity_hate: 0.9755
  Mean AUC: 0.9623
  Neutral accuracy: 1.0000
  Neutral FP rate: 0.0000
  Neutral samples: 300
Evaluating unitary/toxic-bert on balanced test set...


Test evaluation: 100%|██████████| 79/79 [00:13<00:00,  6.06it/s]


  toxic: 0.9669
  severe_toxic: 0.8725
  obscene: 0.9484
  threat: 0.9730
  insult: 0.9085
  identity_hate: 0.9570
  Test Mean AUC: 0.9377
  Test Neutral accuracy: 0.9050
  Test Neutral FP rate: 0.0950

BALANCED EVALUATION RESULTS SUMMARY
Model                          Val AUC    Test AUC   Val Neutral Acc Test Neutral Acc
--------------------------------------------------------------------------------
unitary/toxic-bert             0.9623     0.9377     1.0000          0.9050
distilbert-base-uncased        0.5557     0.4589     0.0733          0.0000
microsoft/deberta-v3-base      0.5079     0.4896     0.0000          0.0000
roberta-base                   0.5018     0.5379     0.0000          0.0000
bert-base-uncased              0.4861     0.4590     0.0000          0.0000

NEUTRAL PERFORMANCE DETAILS

distilbert-base-uncased:
  Validation - Total neutral: 300, correct: 22, FP Rate: 0.9267
  Test - Total neutral: 200, correct: 0, FP Rate: 1.0000

bert-base-uncased:
  Validation - Tot

Clearly, unitary/toxic-bert has the best results.


The model achieves very high AUCs across all toxic classes, indicating strong discrimination between toxic and non-toxic comments.

On the validation set, the model perfectly classifies all neutral comments (no false positives). On the test set, it maintains a very high neutral accuracy (90.5%), with only 9.5% of neutral comments incorrectly flagged as toxic.

The small drop in AUC and neutral accuracy from validation to test suggests the model generalizes well and is not overfitting to the validation data.

**CONCLUSION:** Select 'unitary/toxic-bert' as baseline model and try to improve it.


In [30]:
experiments = client.search_experiments()
for exp in experiments:
    print(f"ID: {exp.experiment_id} | Name: {exp.name} | Artifact Location: {exp.artifact_location}")

ID: 179631418936248154 | Name: HuggingFace_Baselines_balanced | Artifact Location: file:///c:/wd/wd_demos/toxic_comment_classification/notebooks/mlruns/179631418936248154
ID: 167733686526390127 | Name: HuggingFace_Baselines | Artifact Location: file:///c:/wd/wd_demos/toxic_comment_classification/notebooks/mlruns/167733686526390127
ID: 365461217584339427 | Name: MultiLabel_EDA | Artifact Location: file:///c:/wd/wd_demos/toxic_comment_classification/notebooks/mlruns/365461217584339427
ID: 0 | Name: Default | Artifact Location: file:///c:/wd/wd_demos/toxic_comment_classification/notebooks/mlruns/0
