<a href="https://colab.research.google.com/github/mvdheram/Stereotypical-Social-bias-detection-/blob/Pre-trained-LM-selection-and-training/Huggingface_XLNET_GPT_2_Multi_label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Tue Jul 27 13:35:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers==4.5.1 --quiet

[K     |████████████████████████████████| 2.1 MB 28.0 MB/s 
[K     |████████████████████████████████| 3.3 MB 31.1 MB/s 
[K     |████████████████████████████████| 895 kB 36.3 MB/s 
[?25h

In [5]:
import argparse
import logging

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
from torch import nn
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import Dataset, TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm
from tqdm import trange
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          PreTrainedModel, BertModel, BertForSequenceClassification,
                          GPT2Model, GPT2PreTrainedModel,
                          TrainingArguments, Trainer)
from transformers import AdamW
from transformers.modeling_outputs import SequenceClassifierOutput

# Loading the data

In [4]:
# Read the one-hot encoded multi-label table; the first CSV column becomes the index.
df_ohe = pd.read_csv(
    '/content/ohe_multilabel.csv',
    index_col=0,
)

In [None]:
# The last column is the model input; every column before it is part of the label matrix.
X = df_ohe.iloc[:, -1].values
y = df_ohe.iloc[:, :-1].values

In [None]:
from sklearn.model_selection import train_test_split

# Seed shared by both splits so the partition is reproducible.
# NOTE(review): no visible cell defines RANDOM_SEED; 42 is a placeholder — confirm the value
# used for the reported results.
RANDOM_SEED = 42

# Step 1: keep 70 % for training and hold out 30 %, stratified on the label rows.
train_df_text, holdout_text, train_df_labels, holdout_labels = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_SEED, stratify=y)

# Step 2: cut the holdout in half -> 15 % validation and 15 % test of the full data.
val_df_text, test_df_text, val_df_labels, test_df_labels = train_test_split(
    holdout_text, holdout_labels, test_size=0.5, random_state=RANDOM_SEED, stratify=holdout_labels)

In [None]:
# Column names of the label matrix, in the order used when the label frames are built.
LABEL_COLUMNS = [
    'Ethnicity',
    'gender',
    'profession',
    'religion',
    'Anti-stereotype',
    'stereotype',
    'unrelated',
]

In [None]:
# Wrap the raw split arrays in DataFrames: label matrices get the LABEL_COLUMNS header,
# text arrays become a single 'sentence' column.
train_df_labels, val_df_labels, test_df_labels = (
    pd.DataFrame(label_matrix, columns=LABEL_COLUMNS)
    for label_matrix in (train_df_labels, val_df_labels, test_df_labels)
)
train_df_text, val_df_text, test_df_text = (
    pd.DataFrame(sentences, columns=['sentence'])
    for sentences in (train_df_text, val_df_text, test_df_text)
)

In [None]:
# Put the sentence column side by side with its label columns, once per split.
train_df, val_df, test_df = (
    pd.concat([text_part, label_part], axis=1)
    for text_part, label_part in (
        (train_df_text, train_df_labels),
        (val_df_text, val_df_labels),
        (test_df_text, test_df_labels),
    )
)

In [None]:
# (rows, columns) of the train / validation / test frames.
tuple(frame.shape for frame in (train_df, val_df, test_df))

((11580, 8), (2482, 8), (2482, 8))

In [None]:
# Display the assembled training frame (the rendered output elides the middle rows).
train_df

Unnamed: 0,sentence,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated
0,The grandfather dined at the restaurant. He en...,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,"Albeit unattractive, her scars served as a sta...",0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,there is a new middle eastern ice cream compan...,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,while they are in prison the kikes should get ...,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,The Historian was a teacher He loved history a...,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
11575,Most butchers are window.,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11576,Muslim households are very polyester places.,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11577,Yemen is near away from california,1.0,0.0,0.0,0.0,1.0,0.0,0.0
11578,The shirt is too wet to wear.,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Tokenize and encode using PyTorch datasets


In [None]:
class ExplicitStereotypeDataset(Dataset):
  """Map-style dataset that tokenizes one sentence per item and returns its label vector.

  Each row of ``data`` is read positionally: the first column is the sentence text,
  every remaining column is one numeric label.

  Args:
    data: Frame with the text in the first column and the label columns after it.
    tokenizer: Hugging Face tokenizer; called with the keyword arguments shown in
      ``__getitem__``.
    max_token_len: Length every encoded sequence is padded / truncated to.
  """

  def __init__(self, data: pd.DataFrame, tokenizer, max_token_len: int = 50):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len

  def __len__(self):
    """Return the number of rows (examples)."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Encode row ``index``.

    Returns:
      dict with ``text`` (the raw sentence), ``input_ids`` and ``attention_mask``
      (1-D tensors of length ``max_token_len``) and ``labels`` (float32 tensor with
      one entry per label column).
    """
    data_row = self.data.iloc[index]
    # Explicit positional access: `data_row[0]` is an integer-key lookup on a
    # string-labelled Series and only works through a deprecated pandas fallback.
    text = data_row.iloc[0]
    # Convert the label cells straight to a float array, with no dict round-trip.
    labels = data_row.iloc[1:].to_numpy(dtype="float32")

    encoding = self.tokenizer(
      text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding="max_length",
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return dict(
      text=text,
      # The tokenizer returns (1, max_token_len) tensors; drop the leading axis so
      # the DataLoader can stack items into (batch, max_token_len).
      input_ids=encoding["input_ids"].flatten(),
      attention_mask=encoding["attention_mask"].flatten(),
      labels=torch.tensor(labels, dtype=torch.float32)
    )

In [None]:
def create_train_val_datasets(tokenizer):
  """Build the training and validation datasets for ``tokenizer``.

  Reads the module-level ``train_df``, ``val_df`` and ``MAX_LEN``.

  Returns:
    Tuple ``(train_dataset, val_dataset)`` of ``ExplicitStereotypeDataset`` objects.
  """
  return tuple(
    ExplicitStereotypeDataset(frame, tokenizer, max_token_len=MAX_LEN)
    for frame in (train_df, val_df)
  )

# Fine-tuning 

Fine-tuning for multi-label classification can be done in either of two ways:

1. Creating a model class that overrides the `forward` method of a Hugging Face transformers model with 
  * Appropriate pooling
  * Loss function: `torch.nn.BCEWithLogitsLoss()`
2. Creating a custom `Trainer` subclass that overrides `compute_loss`

Reference:

1. Huggingface : https://colab.research.google.com/drive/1X7l8pM6t4VLqxQVJ23ssIxmrsc4Kpc5q?usp=sharing#scrollTo=XZEN8MhaL54M
2. https://github.com/gkebe/mlmc/blob/master/mlmc_class.py


## XLNet

Method:
  * Creating a custom `Trainer` subclass that overrides `compute_loss`

In [None]:
class MultilabelTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy with logits over all label columns."""

    def compute_loss(self, model, inputs, return_outputs=False):
        """Compute the multi-label loss for one batch.

        ``labels`` is removed from ``inputs`` before the forward pass, so the model
        never computes a loss of its own.

        Returns:
            ``loss`` or, when ``return_outputs`` is true, ``(loss, outputs)``.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)

        n_labels = self.model.config.num_labels
        flat_logits = outputs.logits.view(-1, n_labels)
        flat_targets = targets.float().view(-1, n_labels)
        loss = torch.nn.BCEWithLogitsLoss()(flat_logits, flat_targets)

        if return_outputs:
            return (loss, outputs)
        return loss

## GPT2

Method:
  * Creating a model class that overrides the `forward` method

In [None]:
class GPT2ForMultiLabelSequenceClassification(GPT2PreTrainedModel):
    """GPT-2 backbone with a linear head trained with a multi-label (BCE) objective.

    The hidden state of the last attended token of every sequence is used as the
    sequence representation, passed through dropout and projected to
    ``config.num_labels`` logits.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        # The attribute must be called `transformer` (GPT2PreTrainedModel's
        # base_model_prefix); otherwise `from_pretrained` cannot match the checkpoint
        # weights to the backbone and leaves it randomly initialised.
        self.transformer = GPT2Model(config)
        # GPT-2 configs have no `hidden_dropout_prob`; `resid_pdrop` is their dropout rate.
        self.dropout = nn.Dropout(config.resid_pdrop)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    def forward(
            self, input_ids=None, attention_mask=None, token_type_ids=None,
            position_ids=None, head_mask=None, inputs_embeds=None, labels=None
    ):
        r"""
        labels (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_labels)`, `optional`):
            Multi-hot targets, one 0/1 entry per label. When given, a
            binary-cross-entropy-with-logits loss is computed over all entries.

        Returns:
            :obj:`SequenceClassifierOutput` with ``loss`` (``None`` without labels),
            ``logits`` of shape :obj:`(batch_size, num_labels)` and the backbone's
            ``hidden_states`` / ``attentions``.
        """
        backbone_outputs = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            return_dict=True,
        )
        hidden_states = backbone_outputs.last_hidden_state  # (batch, seq_len, hidden)
        batch_size, seq_len = hidden_states.shape[:2]

        if attention_mask is None:
            # No padding information: use the final position of every sequence.
            pooled_output = hidden_states[:, -1]
        else:
            # Position of the last attended token per row. Searching the reversed
            # mask makes this work for both right- and left-padded batches.
            reversed_mask = attention_mask.flip(dims=[1]).long()
            last_token = (seq_len - 1 - reversed_mask.argmax(dim=1)).to(hidden_states.device)
            batch_index = torch.arange(batch_size, device=hidden_states.device)
            pooled_output = hidden_states[batch_index, last_token]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = BCEWithLogitsLoss()
            # Targets are a full multi-hot vector per example, cast to the logits' dtype.
            loss = loss_fct(logits.view(-1, self.num_labels),
                            labels.view(-1, self.num_labels).type_as(logits))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=backbone_outputs.hidden_states,
            attentions=backbone_outputs.attentions,
        )

# Training 

## Metrics

In [None]:
def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True): 
    """Element-wise accuracy of thresholded multi-label predictions.

    Args:
        y_pred: Scores as a numpy array, tensor or nested sequence; treated as logits
            when ``sigmoid`` is true, otherwise as probabilities.
        y_true: 0/1 targets with the same shape as ``y_pred``.
        thresh: A prediction counts as positive when its probability is > ``thresh``.
        sigmoid: Apply a sigmoid to ``y_pred`` before thresholding.

    Returns:
        Fraction (Python float) of label entries where prediction and target agree.
    """
    # as_tensor accepts numpy arrays (what the Trainer passes) as well as tensors/lists.
    y_pred = torch.as_tensor(y_pred)
    y_true = torch.as_tensor(y_true)
    if sigmoid: 
      y_pred = y_pred.sigmoid()
    return ((y_pred>thresh)==y_true.bool()).float().mean().item()

In [None]:
def compute_metrics(eval_pred):
    """Metric callback: unpack ``(predictions, labels)`` and report ``accuracy_thresh``."""
    logits, targets = eval_pred
    score = accuracy_thresh(logits, targets)
    return {'accuracy_thresh': score}

## XLNet

In [None]:
# Number of epochs 
N_EPOCHS = 2

# Batch_size 
BATCH_SIZE = 32

# Maximum sequence length used by create_train_val_datasets.
# NOTE(review): no visible cell defines MAX_LEN; 50 mirrors the default of
# ExplicitStereotypeDataset — confirm the value used for the reported runs.
MAX_LEN = 50

# Model name 
MODEL = 'xlnet-base-cased'

MODEL_NAME = 'xlnet'

# Learning rate 
learning_rate = 1.2323344486727979e-05

# Number of labels 
num_labels = 7

In [None]:
# Load the pretrained checkpoint with a `num_labels`-wide classification head.
# Use the GPU when one is attached and fall back to CPU instead of failing on `.to('cuda')`.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels).to(
    'cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Tokenizer belonging to the same checkpoint as the model loaded above.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
batch_size = BATCH_SIZE

# Shared training configuration: evaluate once per epoch, same batch size for
# training and evaluation.
args = TrainingArguments(
    output_dir="stereotype_classification",
    num_train_epochs=N_EPOCHS,
    learning_rate=learning_rate,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
)

In [None]:
# Encode the train / validation frames with the tokenizer of the model being tuned;
# without this call `train_dataset` and `val_dataset` do not exist on a fresh kernel.
train_dataset, val_dataset = create_train_val_datasets(tokenizer)

multi_trainer = MultilabelTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

In [None]:
# Baseline metrics on the evaluation dataset before any fine-tuning.
multi_trainer.evaluate()

In [None]:
# Fine-tune; evaluation runs after every epoch (evaluation_strategy="epoch").
multi_trainer.train()

## GPT-2

In [None]:
# Switch to a GPT-2 checkpoint: MODEL still names the XLNet checkpoint from the section above.
# NOTE(review): 'gpt2' (the base checkpoint) is assumed — confirm the intended variant.
MODEL = 'gpt2'

# GPT-2 needs its own tokenizer, and it ships without a padding token; reuse EOS so that
# padding="max_length" in the dataset works.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForMultiLabelSequenceClassification.from_pretrained(
    MODEL, num_labels=num_labels, pad_token_id=tokenizer.eos_token_id
).to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# `ds_enc` is not defined anywhere in this notebook; build the datasets from the
# notebook's own train / validation frames with the tokenizer currently in use.
train_dataset, val_dataset = create_train_val_datasets(tokenizer)

# The plain Trainer is sufficient here: the GPT-2 model class computes the
# multi-label loss itself when `labels` are passed to `forward`.
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

In [None]:
# Baseline metrics before fine-tuning.
# NOTE(review): the recorded output reports roughly 200 evaluated samples
# (4.471 s x 44.733 samples/s) while val_df has 2482 rows, so it does not appear to
# stem from this notebook's validation split — re-run to refresh it.
trainer.evaluate()

{'eval_accuracy_thresh': 0.33416667580604553,
 'eval_loss': 0.7018734216690063,
 'eval_mem_cpu_alloc_delta': 249855,
 'eval_mem_cpu_peaked_delta': 18278,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 289537536,
 'eval_runtime': 4.471,
 'eval_samples_per_second': 44.733,
 'init_mem_cpu_alloc_delta': 340757,
 'init_mem_cpu_peaked_delta': 18306,
 'init_mem_gpu_alloc_delta': 0,
 'init_mem_gpu_peaked_delta': 0}

In [None]:
# Fine-tune GPT-2.
# NOTE(review): the recorded table lists 3 epochs although N_EPOCHS is 2 — the output
# looks stale; re-run to refresh it.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy Thresh,Runtime,Samples Per Second
1,No log,0.118057,0.971667,4.5859,43.612
2,No log,0.117506,0.971667,4.58,43.668
3,No log,0.113218,0.971667,4.5533,43.925


TrainOutput(global_step=300, training_loss=0.13753929138183593, metrics={'train_runtime': 155.9298, 'train_samples_per_second': 1.924, 'total_flos': 459061736820192.0, 'epoch': 3.0, 'train_mem_cpu_alloc_delta': 344296, 'train_mem_gpu_alloc_delta': 1320693760, 'train_mem_cpu_peaked_delta': 390675, 'train_mem_gpu_peaked_delta': 6504875008})