<a href="https://colab.research.google.com/github/mvdheram/Stereotypical-Social-bias-detection-/blob/Pre-trained-LM-selection-and-training/Huggingface_XLNET_GPT_2_Multi_label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!nvidia-smi

Mon Aug  2 19:20:59 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   57C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install transformers==4.5.1 --quiet

[K     |████████████████████████████████| 2.1 MB 8.3 MB/s 
[K     |████████████████████████████████| 895 kB 57.5 MB/s 
[K     |████████████████████████████████| 3.3 MB 49.5 MB/s 
[?25h

In [4]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          PreTrainedModel,
                          TrainingArguments, Trainer)
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from transformers import GPT2Tokenizer, GPT2Model, GPT2ForSequenceClassification
from torch.utils.data import Dataset , DataLoader
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AdamW
from tqdm import trange
import pandas as pd
import numpy as np
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
import logging
import argparse
from tqdm import tqdm
from torch import nn

# Loading 

In [5]:
df_ohe = pd.read_csv('/content/ohe_multilabel.csv', index_col = 0)

In [6]:
y = df_ohe.iloc[:,:-1].values
X = df_ohe.iloc[:,-1].values

In [7]:
MAX_LEN = 50
RANDOM_SEED = 47

In [8]:
from sklearn.model_selection import train_test_split

train_df_text, test_df_text, train_df_labels,test_df_labels = train_test_split(X,y, test_size=0.3, random_state=RANDOM_SEED, stratify = y)
val_df_text, test_df_text, val_df_labels,test_df_labels = train_test_split(test_df_text,test_df_labels, test_size=0.5, random_state=RANDOM_SEED,stratify = test_df_labels)

In [9]:
LABEL_COLUMNS = ['Ethnicity',	'gender'	,'profession'	,'religion',	'Anti-stereotype',	'stereotype',	'unrelated']

In [10]:
train_df_labels = pd.DataFrame(train_df_labels, columns= LABEL_COLUMNS)
val_df_labels = pd.DataFrame(val_df_labels, columns= LABEL_COLUMNS)
test_df_labels = pd.DataFrame(test_df_labels, columns= LABEL_COLUMNS)
train_df_text = pd.DataFrame(train_df_text, columns = ['sentence'])
val_df_text = pd.DataFrame(val_df_text, columns = ['sentence'])
test_df_text = pd.DataFrame(test_df_text, columns = ['sentence'])

In [11]:
train_df = pd.concat([train_df_text,train_df_labels],axis=1)
val_df = pd.concat([val_df_text,val_df_labels], axis = 1)
test_df = pd.concat([test_df_text,test_df_labels], axis = 1)

In [12]:
train_df.shape, val_df.shape, test_df.shape

((11580, 8), (2482, 8), (2482, 8))

In [13]:
train_df

Unnamed: 0,sentence,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated
0,The grandfather dined at the restaurant. He en...,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,"Albeit unattractive, her scars served as a sta...",0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,there is a new middle eastern ice cream compan...,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,while they are in prison the kikes should get ...,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,The Historian was a teacher He loved history a...,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
11575,Most butchers are window.,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11576,Muslim households are very polyester places.,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11577,Yemen is near away from california,1.0,0.0,0.0,0.0,1.0,0.0,0.0
11578,The shirt is too wet to wear.,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Tokenize and encode using pytorch datasets


In [14]:
class ExplicitStereotypeDataset(Dataset):

  def __init__(self, data: pd.DataFrame, tokenizer,max_token_len: int = 50):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len
  
  def __len__(self):
    return len(self.data)
  
  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]
    text = data_row[0]
    # labels = data_row[2]
    labels = list(data_row.iloc[1:].to_dict().values()) # To handle one-hot encoded categorical values [0-8] 

    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding= True,
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return dict(
      # text=text,
      input_ids=encoding["input_ids"].flatten(),
      attention_mask=encoding["attention_mask"].flatten(),
      labels= torch.FloatTensor(labels)
    )

In [15]:
def create_train_val_datasets(tokenizer):

  train_dataset = ExplicitStereotypeDataset(train_df, tokenizer, max_token_len=MAX_LEN)
  val_dataset = ExplicitStereotypeDataset(val_df, tokenizer, max_token_len=MAX_LEN)
  test_dataset = ExplicitStereotypeDataset(test_df, tokenizer, max_token_len=MAX_LEN)

  return train_dataset, val_dataset, test_dataset

# Fine-tuning 

Fine-tuning for Multi-Label classification can be done by either 

1. Creating a model that overrides the `forward` method of huggingface transformers with 
  * Appropriate pooling
  * Loss function : `torch.nn.BCEWithLogitsLoss()`
2. Creating a custom `trainer` that overrides `compute_loss`

Reference:

1. Huggingface : https://colab.research.google.com/drive/1X7l8pM6t4VLqxQVJ23ssIxmrsc4Kpc5q?usp=sharing#scrollTo=XZEN8MhaL54M
2. https://github.com/gkebe/mlmc/blob/master/mlmc_class.py


## XLnet

Method:
  * Creating a custom `trainer` that overrides `compute_loss`

In [16]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), 
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

## GPT2

Method:
  * Creating method that overrides `forward` method 

  Look into : https://github.com/huggingface/transformers/issues/3168

In [16]:
class GPT2ForMultiLabelSequenceClassification(GPT2ForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)

        self.gpt2 = GPT2Model(config)
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.gpt2(input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict)

        pooled_output = outputs[0][:-1:] # Taking the last hidden state 
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = BCEWithLogitsLoss()
        #Changes: labels vector is extended to the number labels instead of 1
            loss = loss_fct(logits.view(-1, self.num_labels),
                            labels.view(-1, self.num_labels).type_as(logits.view(-1, self.num_labels)))
            outputs = (loss,) + outputs

        return SequenceClassifierOutput(loss = loss, logits = logits, hidden_states=outputs.hidden_states,
            attentions=outputs.attentions)

# Training 

## Metrics

In [19]:
LABELS = ['Ethnicity','gender','profession','religion','Anti-stereotype','stereotype','unrelated']

In [20]:
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report,hamming_loss, roc_auc_score, accuracy_score,multilabel_confusion_matrix
import numpy as np
import json

upper, lower = 1, 0

def classification_metrics(test_pred,labels,model_name,threshold,label_names = LABELS,write_to_file = True):

  print("Evaluation metrics for test set:")
  y_pred = np.where(test_pred > threshold, upper, lower)
  ROC_AUC_score = roc_auc_score(test_df_labels, test_pred)
  accuracy = accuracy_score(labels, y_pred)
  hloss = hamming_loss(labels, y_pred)
  cr = classification_report(labels, y_pred, labels=list(range(len(label_names))), target_names=label_names, output_dict=True)
  cf = multilabel_confusion_matrix(test_df_labels, 
  y_pred)

  recall_macro = recall_score(labels, y_pred, average="macro")
  precision_macro = precision_score(labels, y_pred, average="macro")
  f1_macro = f1_score(labels, y_pred, average="macro")
  
  model_metrics = {}
  model_metrics["AUC_ROC_score"] = ROC_AUC_score
  model_metrics["subset_accuracy"] = accuracy
  model_metrics["hamming_loss"]= hloss

  if write_to_file == True:
    model_metrics["Classification_report"] = cr

    for i,val in enumerate(LABEL_COLUMNS):
      model_metrics['confusion_matrix' + '_' + val] = str(cf[i].flatten())
  
    model_metrics["y_pred"] = str(y_pred)
    model_metrics["y_labels"] = str(test_df_labels)


    if threshold != 0.5:
      th = "calculated_threshold"
    else:
      th = threshold

    model_metrics["threshold"] = th
    output_file = "eval_results_" + model_name + "_"+str(th) +"_"+ ".json"
    
    with open(output_file, "w" ) as writer:
        json.dump(model_metrics,writer)
  
  return model_metrics
  # print("\n ROC-AUC score: %.6f \n" % (ROC_AUC_score))
  # print("\n Subset accuracy : %.6f \n" % (accuracy))
  # print("\n hamming_loss : %.6f \n" % (hloss))

  # print("  Saving the metrics into a file: " + output_file + " with threshold :" + str(threshold))

In [21]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return classification_metrics(predictions,labels,"XLNet",0.5)

## XLNet

In [108]:
# Number of epochs 
N_EPOCHS = 2

# Batch_size 
BATCH_SIZE = 32

# Model name 
MODEL = 'xlnet-base-cased'

MODEL_NAME = 'xlnet'

# Learning rate 
learning_rate = 1.2323344486727979e-05

# Number of labels 
num_labels = 7

In [109]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels).to('cuda')

KeyboardInterrupt: ignored

In [23]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1382015.0, style=ProgressStyle(descript…




In [37]:
batch_size = BATCH_SIZE

args = TrainingArguments(
    output_dir="stereotype_classification",
    evaluation_strategy = "epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=N_EPOCHS,
    weight_decay=0.01
)

In [38]:
train_dataset,val_dataset, test_dataset = create_train_val_datasets(tokenizer)

In [39]:
train_dataset

<__main__.ExplicitStereotypeDataset at 0x7f97643bb550>

In [40]:
sample = train_dataset[0]

In [41]:
sample['labels']

tensor([0., 1., 0., 0., 1., 0., 0.])

In [70]:
multi_trainer = MultilabelTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

In [71]:
multi_trainer.evaluate()

Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))


{'eval_AUC_ROC_score': 0.5039134209679138,
 'eval_hamming_loss': 0.28830436284102684,
 'eval_loss': 0.7035492062568665,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 161860096,
 'eval_runtime': 13.6433,
 'eval_samples_per_second': 181.921,
 'eval_subset_accuracy': 0.016116035455278,
 'init_mem_cpu_alloc_delta': 8192,
 'init_mem_cpu_peaked_delta': 0,
 'init_mem_gpu_alloc_delta': 0,
 'init_mem_gpu_peaked_delta': 0}

In [72]:
multi_trainer.train()

Epoch,Training Loss,Validation Loss,Auc Roc Score,Subset Accuracy,Hamming Loss,Runtime,Samples Per Second
1,No log,0.267488,0.50135,0.347703,0.134511,14.5579,170.492
2,0.349800,0.245065,0.504241,0.433924,0.117071,14.0773,176.313


Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))


Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=724, training_loss=0.3203590403604244, metrics={'train_runtime': 550.6046, 'train_samples_per_second': 1.315, 'total_flos': 622843262932224.0, 'epoch': 2.0, 'train_mem_cpu_alloc_delta': 3694592, 'train_mem_gpu_alloc_delta': 1451683840, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 1438682624})

In [76]:
multi_trainer.evaluate()

Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'Ethnicity': {'precision': 0.8815165876777251, 'recall': 0.9489795918367347, 'f1-score': 0.914004914004914, 'support': 784}, 'gender': {'precision': 0.7859922178988327, 'recall': 0.6666666666666666, 'f1-score': 0.7214285714285713, 'support': 303}, 'profession': {'precision': 0.7960954446854663, 'recall': 0.7858672376873662, 'f1-score': 0.790948275862069, 'support': 467}, 'religion': {'precision': 0.9926470588235294, 'recall': 0.9215017064846417, 'f1-score': 0.9557522123893807, 'support': 293}, 'Anti-stereotype': {'precision': 0.7383177570093458, 'recall': 0.2036082474226804, 'f1-score': 0.31919191919191925, 'support': 776}, 'stereotype': {'precision': 0.8325652841781874, 'recall': 0.5060690943043884, 'f1-score': 0.6295005807200929, 'support': 1071}, 'unrelated': {'precision': 0.9734513274336283, 'recall': 0.6929133858267716, 'f1-score': 0.8095676172953082, 'support': 635}, 'micro avg': {'pr

{'epoch': 2.0,
 'eval_AUC_ROC_score': 0.5042412277925267,
 'eval_Classification_report': {'Anti-stereotype': {'f1-score': 0.31919191919191925,
   'precision': 0.7383177570093458,
   'recall': 0.2036082474226804,
   'support': 776},
  'Ethnicity': {'f1-score': 0.914004914004914,
   'precision': 0.8815165876777251,
   'recall': 0.9489795918367347,
   'support': 784},
  'gender': {'f1-score': 0.7214285714285713,
   'precision': 0.7859922178988327,
   'recall': 0.6666666666666666,
   'support': 303},
  'macro avg': {'f1-score': 0.7343420129846079,
   'precision': 0.8572265253866737,
   'recall': 0.6750865614613213,
   'support': 4329},
  'micro avg': {'f1-score': 0.7280748663101604,
   'precision': 0.8641701047286575,
   'recall': 0.629013629013629,
   'support': 4329},
  'profession': {'f1-score': 0.790948275862069,
   'precision': 0.7960954446854663,
   'recall': 0.7858672376873662,
   'support': 467},
  'religion': {'f1-score': 0.9557522123893807,
   'precision': 0.9926470588235294,
   

In [77]:
multi_trainer.evaluate(test_dataset,metric_key_prefix="test")

Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'Ethnicity': {'precision': 0.896969696969697, 'recall': 0.9438775510204082, 'f1-score': 0.9198259788688627, 'support': 784}, 'gender': {'precision': 0.8464730290456431, 'recall': 0.6710526315789473, 'f1-score': 0.7486238532110091, 'support': 304}, 'profession': {'precision': 0.8272921108742004, 'recall': 0.8308351177730193, 'f1-score': 0.829059829059829, 'support': 467}, 'religion': {'precision': 0.982078853046595, 'recall': 0.9351535836177475, 'f1-score': 0.958041958041958, 'support': 293}, 'Anti-stereotype': {'precision': 0.7658536585365854, 'recall': 0.20179948586118251, 'f1-score': 0.31943031536113936, 'support': 778}, 'stereotype': {'precision': 0.8474320241691843, 'recall': 0.5242990654205607, 'f1-score': 0.6478060046189377, 'support': 1070}, 'unrelated': {'precision': 0.9726315789473684, 'recall': 0.7287066246056783, 'f1-score': 0.8331830477908025, 'support': 634}, 'micro avg': {'pre

{'epoch': 2.0,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 139906560,
 'test_AUC_ROC_score': 0.9408275925968369,
 'test_Classification_report': {'Anti-stereotype': {'f1-score': 0.31943031536113936,
   'precision': 0.7658536585365854,
   'recall': 0.20179948586118251,
   'support': 778},
  'Ethnicity': {'f1-score': 0.9198259788688627,
   'precision': 0.896969696969697,
   'recall': 0.9438775510204082,
   'support': 784},
  'gender': {'f1-score': 0.7486238532110091,
   'precision': 0.8464730290456431,
   'recall': 0.6710526315789473,
   'support': 304},
  'macro avg': {'f1-score': 0.750852998136077,
   'precision': 0.8769615645127534,
   'recall': 0.6908177228396492,
   'support': 4330},
  'micro avg': {'f1-score': 0.7443227357734438,
   'precision': 0.8827629911280102,
   'recall': 0.643418013856813,
   'support': 4330},
  'profession': {'f1-score': 0.829059829059829,
   'precision': 0.8272921108742004,
 

In [78]:
!zip -r /content/file.zip /content/stereotype_classification

  adding: content/stereotype_classification/ (stored 0%)
  adding: content/stereotype_classification/checkpoint-500/ (stored 0%)
  adding: content/stereotype_classification/checkpoint-500/optimizer.pt (deflated 21%)
  adding: content/stereotype_classification/checkpoint-500/special_tokens_map.json (deflated 48%)
  adding: content/stereotype_classification/checkpoint-500/pytorch_model.bin (deflated 7%)
  adding: content/stereotype_classification/checkpoint-500/config.json (deflated 56%)
  adding: content/stereotype_classification/checkpoint-500/training_args.bin (deflated 46%)
  adding: content/stereotype_classification/checkpoint-500/tokenizer_config.json (deflated 48%)
  adding: content/stereotype_classification/checkpoint-500/scheduler.pt (deflated 49%)
  adding: content/stereotype_classification/checkpoint-500/spiece.model (deflated 49%)
  adding: content/stereotype_classification/checkpoint-500/trainer_state.json (deflated 48%)


In [79]:
from google.colab import files
files.download("/content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## GPT-2

In [22]:
# Number of epochs 
N_EPOCHS = 4

# Batch_size 
BATCH_SIZE = 32

# Model name 
MODEL = 'gpt2'

MODEL_NAME = 'gpt-2'

# Learning rate 
learning_rate = 2e-05

# Number of labels 
num_labels = 7

In [23]:
# model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels).to('cuda')

In [24]:
model = GPT2ForMultiLabelSequenceClassification.from_pretrained(MODEL, num_labels=num_labels).to('cuda')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=665.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=548118077.0, style=ProgressStyle(descri…




Some weights of GPT2ForMultiLabelSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['gpt2.h.2.mlp.c_proj.bias', 'gpt2.h.11.attn.c_attn.bias', 'gpt2.h.3.mlp.c_proj.bias', 'gpt2.h.8.attn.c_attn.weight', 'gpt2.h.3.attn.c_proj.bias', 'gpt2.h.10.ln_2.bias', 'gpt2.h.1.mlp.c_proj.weight', 'gpt2.h.4.attn.c_proj.bias', 'gpt2.h.5.mlp.c_proj.bias', 'gpt2.h.6.ln_1.bias', 'gpt2.h.7.ln_2.bias', 'gpt2.h.1.attn.c_proj.weight', 'gpt2.h.3.mlp.c_fc.bias', 'gpt2.h.7.mlp.c_fc.bias', 'gpt2.wte.weight', 'gpt2.h.7.attn.c_attn.bias', 'gpt2.h.2.mlp.c_proj.weight', 'gpt2.h.8.attn.c_attn.bias', 'gpt2.h.7.attn.c_proj.weight', 'gpt2.h.5.attn.bias', 'gpt2.h.9.ln_2.weight', 'gpt2.h.4.ln_2.bias', 'gpt2.h.3.attn.c_proj.weight', 'gpt2.h.11.ln_2.weight', 'gpt2.h.9.attn.bias', 'gpt2.h.7.mlp.c_proj.weight', 'gpt2.h.0.mlp.c_fc.bias', 'gpt2.h.10.attn.c_proj.bias', 'gpt2.h.4.mlp.c_fc.bias', 'gpt2.h.8.mlp.c_fc.weight', 'gpt2.h.9.ln_1.bias', 'gpt2.h.9.attn.c_attn.bias', 'gp

In [25]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# special_tokens_dict = {'pad_token': '[PAD]'}

# default to left padding
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1355256.0, style=ProgressStyle(descript…




In [26]:
# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))

# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

In [27]:
batch_size = BATCH_SIZE

args = TrainingArguments(
    output_dir="stereotype_classification_gpt-2",
    evaluation_strategy = "epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=N_EPOCHS,
    weight_decay=0.01
)

In [28]:
train_dataset,val_dataset, test_dataset = create_train_val_datasets(tokenizer)

In [29]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

In [30]:
trainer.evaluate()

AttributeError: ignored

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy Thresh,Runtime,Samples Per Second
1,No log,0.118057,0.971667,4.5859,43.612
2,No log,0.117506,0.971667,4.58,43.668
3,No log,0.113218,0.971667,4.5533,43.925


TrainOutput(global_step=300, training_loss=0.13753929138183593, metrics={'train_runtime': 155.9298, 'train_samples_per_second': 1.924, 'total_flos': 459061736820192.0, 'epoch': 3.0, 'train_mem_cpu_alloc_delta': 344296, 'train_mem_gpu_alloc_delta': 1320693760, 'train_mem_cpu_peaked_delta': 390675, 'train_mem_gpu_peaked_delta': 6504875008})