In [1]:
# !pip install transformers
# !pip install sentencepiece

# Import libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch# Import libraries
import torch.nn as nn
from transformers import AutoTokenizer, Trainer, AutoModelForSequenceClassification, TrainingArguments
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from torch import cuda
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report
from sklearn import metrics

from sklearn.model_selection import train_test_split

import re
from tqdm import tqdm

# Read data

In [3]:
path = '/content/drive/MyDrive/Colab Notebooks/data/toxic/'
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/toxic/toxic_cleaned.csv')
test_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/toxic/toxic_cleaned_test.csv')

In [5]:
label_cols = df.columns.tolist()[2:]
df["labels"] = df[label_cols].values.tolist()
test_df["labels"] = test_df[label_cols].values.tolist()

train_df, val_df = train_test_split(df, test_size=0.05)
train_df.shape, val_df.shape, test_df.shape #test hs only ids and comment_text

((151592, 9), (7979, 9), (63978, 9))

# Choose Model and Tokenize

In [9]:
model_ckpt = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [10]:
train_encodings = tokenizer(train_df["comment_text"].values.tolist(), truncation=True, max_length=200)
val_encodings = tokenizer(val_df["comment_text"].values.tolist(), truncation=True,max_length=200)
test_encodings = tokenizer(test_df["comment_text"].values.tolist(), truncation=True,max_length=200)

In [11]:
train_labels = train_df["labels"].values.tolist()
val_labels = val_df["labels"].values.tolist()
test_labels = test_df["labels"].values.tolist()

# Dataset 

In [12]:
# Dataset class JigsawDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [13]:
train_dataset = JigsawDataset(train_encodings, train_labels)
val_dataset = JigsawDataset(val_encodings, val_labels)
test_dataset = JigsawDataset(test_encodings, test_labels)

In [16]:
num_labels=6

# Addressing imbalance using class weights
y = train_df[label_cols].values
pos_weight = (y==0).sum(axis=0) / y.sum(axis=0)
pos_weight = torch.Tensor(pos_weight).to(device)

In [17]:
pos_weight

tensor([  9.4302,  98.7972,  17.8900, 339.6562,  19.2989, 112.7224],
       device='cuda:0')

# Custom MultilabelTrainer

In [28]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), 
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

In [29]:
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels).to('cuda')

loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attenti

# Training Arguments

In [30]:
batch_size = 8
# configure logging so we see training loss
logging_steps = len(train_dataset) // batch_size

args = TrainingArguments(
    output_dir="jigsaw",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=logging_steps,
    save_total_limit = 2,
    save_strategy = "no",
    load_best_model_at_end=False
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


# Evaluation Function

In [None]:
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True): 
    y_pred = torch.from_numpy(y_pred)
    y_true = torch.from_numpy(y_true)
    if sigmoid: 
      y_pred = y_pred.sigmoid()
    return ((y_pred>thresh)==y_true.bool()).float().mean().item()

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return {'accuracy_thresh': accuracy_thresh(predictions, labels)}

In [31]:
multi_trainer = MultilabelTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

# Train

In [32]:
multi_trainer.train()

***** Running training *****
  Num examples = 151592
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 18949


Epoch,Training Loss,Validation Loss,Accuracy Thresh
1,0.0478,0.040979,0.984814


***** Running Evaluation *****
  Num examples = 7979
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=18949, training_loss=0.047830364950615795, metrics={'train_runtime': 3114.5014, 'train_samples_per_second': 48.673, 'train_steps_per_second': 6.084, 'total_flos': 1.273260679171488e+16, 'train_loss': 0.047830364950615795, 'epoch': 1.0})

# Save Model

In [33]:
multi_trainer.save_model(output_dir=path+model_ckpt+'_hugging_face_all_data')

Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/data/toxic/roberta-base_hugging_face_all_data
Configuration saved in /content/drive/MyDrive/Colab Notebooks/data/toxic/roberta-base_hugging_face_all_data/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/data/toxic/roberta-base_hugging_face_all_data/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Colab Notebooks/data/toxic/roberta-base_hugging_face_all_data/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Colab Notebooks/data/toxic/roberta-base_hugging_face_all_data/special_tokens_map.json


# Predictions on Test

In [34]:
predicted_results = multi_trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 63978
  Batch size = 8


In [35]:
y_pred = predicted_results.predictions
y_pred = np.where(y_pred > 0.5, 1, 0)
y_true = test_labels

In [36]:
accuracy = metrics.accuracy_score(y_true, y_pred)
f1_score_micro = metrics.f1_score(y_true, y_pred, average='micro')
f1_score_macro = metrics.f1_score(y_true, y_pred, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.888852418018694
F1 Score (Micro) = 0.6866378102326162
F1 Score (Macro) = 0.536496557772714


In [37]:
print(classification_report(
  y_true, 
  y_pred, 
  target_names=label_cols,
  zero_division=0
))

               precision    recall  f1-score   support

        toxic       0.59      0.87      0.70      6090
 severe_toxic       0.42      0.28      0.34       367
      obscene       0.63      0.78      0.70      3691
       threat       0.63      0.15      0.24       211
       insult       0.70      0.71      0.71      3427
identity_hate       0.79      0.40      0.53       712

    micro avg       0.63      0.76      0.69     14498
    macro avg       0.63      0.53      0.54     14498
 weighted avg       0.63      0.76      0.68     14498
  samples avg       0.08      0.07      0.07     14498

