In [None]:
!pip install -U transformers datasets huggingface_hub

Collecting transformers
  Using cached transformers-4.52.2-py3-none-any.whl.metadata (40 kB)
Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface_hub
  Using cached huggingface_hub-0.31.4-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Using cached transformers-4.52.2-py3-none-any.whl (10.5 MB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached huggingface_hub-0.31.4-py3-none-any.whl (489 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import pandas as pd
import torch
from datasets import Dataset

In [2]:
import numpy as np

In [None]:
try:
    df = pd.read_csv("data/train.csv", on_bad_lines='skip', engine='python')
    df = df.sample(frac=0.005, random_state=42).reset_index(drop=True)
    print("Loaded successfully:", df.shape)
except Exception as e:
    print("Error:", e)

Loaded successfully: (1117, 8)


In [8]:
label_columns = ['toxic', 'insult', 'threat', 'obscene', 'identity_hate', 'severe_toxic']
label_counts = df[label_columns].sum().sort_values(ascending=False)
print(label_counts)

toxic            110
obscene           60
insult            59
severe_toxic      10
identity_hate      9
threat             4
dtype: int64


In [None]:
!pip install transformers datasets scikit-learn pandas



🔢 Step 1: Compute Class Weights

In [9]:
import torch
import numpy as np

# Your label columns
label_cols = ['toxic', 'obscene', 'insult', 'severe_toxic', 'identity_hate', 'threat']

# Count positives and negatives for each class
pos_counts = np.array([df[label].sum() for label in label_cols])
neg_counts = np.array([len(df) - df[label].sum() for label in label_cols])

# Compute weights: more weight for underrepresented labels
class_weights = neg_counts / (pos_counts + 1e-5)  # Avoid divide by zero
pos_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

🧠 Step 2: Custom Trainer with Class Weights

In [13]:
!pip uninstall -y transformers tokenizers huggingface-hub

Found existing installation: transformers 4.37.2
Uninstalling transformers-4.37.2:
  Successfully uninstalled transformers-4.37.2
Found existing installation: tokenizers 0.15.2
Uninstalling tokenizers-0.15.2:
  Successfully uninstalled tokenizers-0.15.2
Found existing installation: huggingface-hub 0.31.2
Uninstalling huggingface-hub-0.31.2:
  Successfully uninstalled huggingface-hub-0.31.2


In [7]:
!python --version

Python 3.10.11


In [9]:
!rm -rf /usr/local/lib/python3.10.11/dist-packages/transformers

'rm' is not recognized as an internal or external command,
operable program or batch file.


In [14]:
!pip install -U transformers==4.37.2 peft==0.7.1

Collecting transformers==4.37.2
  Using cached transformers-4.37.2-py3-none-any.whl (8.4 MB)
Collecting huggingface-hub<1.0,>=0.19.3
  Downloading huggingface_hub-0.31.4-py3-none-any.whl (489 kB)
     ---------------------------------------- 0.0/489.3 kB ? eta -:--:--
      --------------------------------------- 10.2/489.3 kB ? eta -:--:--
     -- ---------------------------------- 30.7/489.3 kB 660.6 kB/s eta 0:00:01
     ----- ------------------------------- 71.7/489.3 kB 653.6 kB/s eta 0:00:01
     ------ ------------------------------ 92.2/489.3 kB 581.0 kB/s eta 0:00:01
     ------- ---------------------------- 102.4/489.3 kB 535.8 kB/s eta 0:00:01
     ----------- ------------------------ 153.6/489.3 kB 612.6 kB/s eta 0:00:01
     ---------------- ------------------- 225.3/489.3 kB 765.3 kB/s eta 0:00:01
     ----------------- ------------------ 235.5/489.3 kB 758.5 kB/s eta 0:00:01
     ---------------------- ------------- 307.2/489.3 kB 731.4 kB/s eta 0:00:01
     ------------


[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fct = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weights_tensor.to(logits.device))
        loss = loss_fct(logits, labels.float())

        return (loss, outputs) if return_outputs else loss

W0523 16:08:50.068000 13772 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [11]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_cols),
    problem_type="multi_label_classification"
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import transformers
print(transformers.__version__)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

4.37.2


In [None]:
!pip uninstall -y transformers peft

In [None]:
!pip uninstall -y transformers

Found existing installation: transformers 4.52.2
Uninstalling transformers-4.52.2:
  Successfully uninstalled transformers-4.52.2


In [17]:
!pip uninstall -y transformers

Found existing installation: transformers 4.37.2
Uninstalling transformers-4.37.2:
  Successfully uninstalled transformers-4.37.2


In [18]:
!pip install transformers==4.37.2

Collecting transformers==4.37.2
  Using cached transformers-4.37.2-py3-none-any.whl (8.4 MB)
Installing collected packages: transformers
Successfully installed transformers-4.37.2



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1_micro",
    greater_is_better=True
)

In [13]:
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

# Your multi-label columns
label_columns = ['toxic', 'obscene', 'insult', 'severe_toxic', 'identity_hate', 'threat']

# ✅ Load your dataset (make sure df is defined)
# df = pd.read_csv("your_file.csv")

# 1. Split the dataset
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# 2. Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# 3. Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# 4. Tokenization + format for multi-label
def tokenize_and_format(example):
    # Tokenize the text
    encoding = tokenizer(
        example["comment_text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    # Add the multi-labels
    labels = [float(example[col]) for col in label_columns]  # Keep as float
    encoding["labels"] = labels
    return encoding

# 5. Map the tokenization
train_dataset = train_dataset.map(tokenize_and_format, batched=False)
val_dataset = val_dataset.map(tokenize_and_format, batched=False)

# 6. Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map:   0%|          | 0/893 [00:00<?, ? examples/s]

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

🏋️ Step 3: Use This Trainer for Training

In [14]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import numpy as np

def compute_metrics(pred):
    logits, labels = pred
    # Apply sigmoid for multi-label classification
    probs = 1 / (1 + np.exp(-logits))

    # Convert probabilities to binary (threshold = 0.5)
    preds = (probs >= 0.5).astype(int)

    return {
        "eval_accuracy": accuracy_score(labels, preds),
        "eval_f1_micro": f1_score(labels, preds, average="micro"),
        "eval_f1_macro": f1_score(labels, preds, average="macro"),
        "eval_precision_micro": precision_score(labels, preds, average="micro"),
        "eval_recall_micro": recall_score(labels, preds, average="micro"),
    }

In [None]:
!pip install --upgrade accelerate

Collecting accelerate
  Downloading accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.7.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.1/362.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.6.0
    Uninstalling accelerate-1.6.0:
      Successfully uninstalled accelerate-1.6.0
Successfully installed accelerate-1.7.0


In [11]:
import transformers
import accelerate
print(transformers.__version__)
print(accelerate.__version__)

4.37.2
0.26.1


In [19]:
pip install accelerate==0.26.1

Collecting accelerate==0.26.1
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
     ---------------------------------------- 0.0/270.9 kB ? eta -:--:--
     ------------- ------------------------- 92.2/270.9 kB 1.7 MB/s eta 0:00:01
     -------------------- ----------------- 143.4/270.9 kB 1.4 MB/s eta 0:00:01
     --------------------------- ---------- 194.6/270.9 kB 1.3 MB/s eta 0:00:01
     -------------------------------------  266.2/270.9 kB 1.3 MB/s eta 0:00:01
     -------------------------------------  266.2/270.9 kB 1.3 MB/s eta 0:00:01
     -------------------------------------  266.2/270.9 kB 1.3 MB/s eta 0:00:01
     ------------------------------------ 270.9/270.9 kB 757.7 kB/s eta 0:00:00
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 1.7.0
    Uninstalling accelerate-1.7.0:
      Successfully uninstalled accelerate-1.7.0
Successfully installed accelerate-0.26.1
Note: you may need to restar


[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  # optional
)

In [16]:
trainer.train()

  0%|          | 0/168 [00:00<?, ?it/s]



  0%|          | 0/14 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_accuracy': 0.8973214285714286, 'eval_f1_micro': 0.6371681415929203, 'eval_f1_macro': 0.44428200129954515, 'eval_precision_micro': 0.5538461538461539, 'eval_recall_micro': 0.75, 'eval_loss': 0.472481906414032, 'eval_runtime': 39.5351, 'eval_samples_per_second': 5.666, 'eval_steps_per_second': 0.354, 'epoch': 1.0}




{'loss': 1.0885, 'learning_rate': 2.023809523809524e-05, 'epoch': 1.79}


  0%|          | 0/14 [00:00<?, ?it/s]

{'eval_accuracy': 0.875, 'eval_f1_micro': 0.5692307692307692, 'eval_f1_macro': 0.3958883493767214, 'eval_precision_micro': 0.45121951219512196, 'eval_recall_micro': 0.7708333333333334, 'eval_loss': 0.35562199354171753, 'eval_runtime': 42.2369, 'eval_samples_per_second': 5.303, 'eval_steps_per_second': 0.331, 'epoch': 2.0}


PermissionError: [WinError 5] Access is denied: './results\\tmp-checkpoint-112' -> './results\\checkpoint-112'