<a href="https://colab.research.google.com/github/mvdheram/Stereotypical-Social-bias-detection-/blob/Pre-trained-LM-selection-and-training/Huggingface_XLNET_GPT_2_Multi_label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Wed Aug  4 13:01:52 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.42.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   55C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install transformers==4.5.1 --quiet

[K     |████████████████████████████████| 2.1 MB 4.1 MB/s 
[K     |████████████████████████████████| 895 kB 49.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 25.6 MB/s 
[?25h

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          PreTrainedModel,
                          TrainingArguments, Trainer)
from transformers import XLNetTokenizer, XLNetForSequenceClassification
from transformers import GPT2Tokenizer, GPT2Model, GPT2ForSequenceClassification
from torch.utils.data import Dataset , DataLoader
from transformers.modeling_outputs import SequenceClassifierOutput
from transformers import AdamW
from tqdm import trange
import pandas as pd
import numpy as np
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
import logging
import argparse
from tqdm import tqdm
from torch import nn

# Loading 

In [4]:
df_ohe = pd.read_csv('/content/ohe_multilabel.csv', index_col = 0)

In [5]:
y = df_ohe.iloc[:,:-1].values
X = df_ohe.iloc[:,-1].values

In [6]:
MAX_LEN = 50
RANDOM_SEED = 47

In [7]:
from sklearn.model_selection import train_test_split

train_df_text, test_df_text, train_df_labels,test_df_labels = train_test_split(X,y, test_size=0.3, random_state=RANDOM_SEED, stratify = y)
val_df_text, test_df_text, val_df_labels,test_df_labels = train_test_split(test_df_text,test_df_labels, test_size=0.5, random_state=RANDOM_SEED,stratify = test_df_labels)

In [8]:
LABEL_COLUMNS = ['Ethnicity',	'gender'	,'profession'	,'religion',	'Anti-stereotype',	'stereotype',	'unrelated']

In [9]:
train_df_labels = pd.DataFrame(train_df_labels, columns= LABEL_COLUMNS)
val_df_labels = pd.DataFrame(val_df_labels, columns= LABEL_COLUMNS)
test_df_labels = pd.DataFrame(test_df_labels, columns= LABEL_COLUMNS)
train_df_text = pd.DataFrame(train_df_text, columns = ['sentence'])
val_df_text = pd.DataFrame(val_df_text, columns = ['sentence'])
test_df_text = pd.DataFrame(test_df_text, columns = ['sentence'])

In [10]:
train_df = pd.concat([train_df_text,train_df_labels],axis=1)
val_df = pd.concat([val_df_text,val_df_labels], axis = 1)
test_df = pd.concat([test_df_text,test_df_labels], axis = 1)

In [11]:
train_df.shape, val_df.shape, test_df.shape

((11580, 8), (2482, 8), (2482, 8))

In [12]:
train_df

Unnamed: 0,sentence,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated
0,The grandfather dined at the restaurant. He en...,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,"Albeit unattractive, her scars served as a sta...",0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,there is a new middle eastern ice cream compan...,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,while they are in prison the kikes should get ...,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,The Historian was a teacher He loved history a...,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
11575,Most butchers are window.,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11576,Muslim households are very polyester places.,0.0,0.0,0.0,0.0,0.0,0.0,1.0
11577,Yemen is near away from california,1.0,0.0,0.0,0.0,1.0,0.0,0.0
11578,The shirt is too wet to wear.,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Tokenize and encode using pytorch datasets


In [13]:
class ExplicitStereotypeDataset(Dataset):

  def __init__(self, data: pd.DataFrame, tokenizer,max_token_len: int = 50):
    self.tokenizer = tokenizer
    self.data = data
    self.max_token_len = max_token_len
  
  def __len__(self):
    return len(self.data)
  
  def __getitem__(self, index: int):
    data_row = self.data.iloc[index]
    text = data_row[0]
    # labels = data_row[2]
    labels = list(data_row.iloc[1:].to_dict().values()) # To handle one-hot encoded categorical values [0-8] 

    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_token_len,
      return_token_type_ids=False,
      padding= True,
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return dict(
      # text=text,
      input_ids=encoding["input_ids"].flatten(),
      attention_mask=encoding["attention_mask"].flatten(),
      labels= torch.FloatTensor(labels)
    )

In [14]:
def create_train_val_datasets(tokenizer):

  train_dataset = ExplicitStereotypeDataset(train_df, tokenizer, max_token_len=MAX_LEN)
  val_dataset = ExplicitStereotypeDataset(val_df, tokenizer, max_token_len=MAX_LEN)
  test_dataset = ExplicitStereotypeDataset(test_df, tokenizer, max_token_len=MAX_LEN)

  return train_dataset, val_dataset, test_dataset

# Fine-tuning 

Fine-tuning for Multi-Label classification can be done by either 

1. Creating a model that overrides the `forward` method of huggingface transformers with 
  * Appropriate pooling
  * Loss function : `torch.nn.BCEWithLogitsLoss()`
2. Creating a custom `trainer` that overrides `compute_loss`

Reference:

1. Huggingface : https://colab.research.google.com/drive/1X7l8pM6t4VLqxQVJ23ssIxmrsc4Kpc5q?usp=sharing#scrollTo=XZEN8MhaL54M
2. https://github.com/gkebe/mlmc/blob/master/mlmc_class.py


Methodology :

* Using the pooling used in huggingface transformers for sequence classification 
* Overriding the `compute_loss` of `trainer` class. 


## XLnet, GPT-2

Method:
  * Creating a custom `trainer` that overrides `compute_loss`

In [17]:
class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), 
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

## GPT2

Method:
  * Creating method that overrides `forward` method 

  Look into : https://github.com/huggingface/transformers/issues/3168

In [None]:
class GPT2ForMultiLabelSequenceClassification(GPT2ForSequenceClassification):

    def __init__(self, config):
        super().__init__(config)

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.transformer(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size, sequence_length = input_ids.shape[:2]
        else:
            batch_size, sequence_length = inputs_embeds.shape[:2]

        assert (
            self.config.pad_token_id is not None or batch_size == 1
        ), "Cannot handle batch sizes > 1 if no padding token is defined."
        if self.config.pad_token_id is None:
            sequence_lengths = -1
        else:
            if input_ids is not None:
                sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1
            else:
                sequence_lengths = -1
                logger.warning(
                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                    f"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
                )

        pooled_logits = logits[range(batch_size), sequence_lengths]

        loss = None
        if labels is not None:
            loss_fct = BCEWithLogitsLoss()
        #Changes: labels vector is extended to the number labels instead of 1
            loss = loss_fct(pooled_logits.view(-1, self.num_labels),
                            pooled_logits.view(-1, self.num_labels).type_as(logits.view(-1, self.num_labels)))
            
        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )

# Training 

## Metrics

In [109]:
LABELS = ['Ethnicity','gender','profession','religion','Anti-stereotype','stereotype','unrelated']

In [119]:
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report,hamming_loss, roc_auc_score, accuracy_score,multilabel_confusion_matrix
import numpy as np
import json

upper, lower = 1, 0

def classification_metrics(test_pred,labels,model_name,threshold,label_names = LABELS):

  print("Evaluation metrics for test set:")
  y_pred = np.where(test_pred > threshold, upper, lower)
  ROC_AUC_score = roc_auc_score(test_df_labels, test_pred)
  accuracy = accuracy_score(labels, y_pred)
  hloss = hamming_loss(labels, y_pred)
  cr = classification_report(labels, y_pred, labels=list(range(len(label_names))), target_names=label_names, output_dict=True)
  cf = multilabel_confusion_matrix(test_df_labels, 
  y_pred)

  recall_macro = recall_score(labels, y_pred, average="macro")
  precision_macro = precision_score(labels, y_pred, average="macro")
  f1_macro = f1_score(labels, y_pred, average="macro")
  
  model_metrics = {}
  model_metrics["AUC_ROC_score"] = ROC_AUC_score
  model_metrics["subset_accuracy"] = accuracy
  model_metrics["hamming_loss"]= hloss

  if write_to_file:
    model_metrics["Classification_report"] = cr

    for i,val in enumerate(LABEL_COLUMNS):
      model_metrics['confusion_matrix' + '_' + val] = str(cf[i].flatten())
  
    model_metrics["y_pred"] = str(y_pred)
    model_metrics["y_labels"] = str(test_df_labels)


    if threshold != 0.5:
      th = "calculated_threshold"
    else:
      th = threshold

    model_metrics["threshold"] = th
    output_file = "eval_results_" + model_name + "_"+str(th) +"_"+ ".json"
    
    with open(output_file, "w" ) as writer:
        json.dump(model_metrics,writer)
  
  return model_metrics
  # print("\n ROC-AUC score: %.6f \n" % (ROC_AUC_score))
  # print("\n Subset accuracy : %.6f \n" % (accuracy))
  # print("\n hamming_loss : %.6f \n" % (hloss))

  # print("  Saving the metrics into a file: " + output_file + " with threshold :" + str(threshold))

In [120]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    return classification_metrics(predictions,labels,MODEL_NAME,0.5)

## XLNet

In [58]:
# Number of epochs 
N_EPOCHS = 2

# Batch_size 
BATCH_SIZE = 32

# Model name 
MODEL = 'xlnet-base-cased'

MODEL_NAME = 'xlnet'

# Learning rate 
learning_rate = 2.49816047538945e-05

# Number of labels 
num_labels = 7

In [59]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels).to('cuda')

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [60]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [61]:
batch_size = BATCH_SIZE

args = TrainingArguments(
    output_dir="stereotype_classification",
    evaluation_strategy = "epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=N_EPOCHS,
    weight_decay=0.01
)

In [62]:
train_dataset,val_dataset, test_dataset = create_train_val_datasets(tokenizer)

In [63]:
train_dataset

<__main__.ExplicitStereotypeDataset at 0x7fbf82cec790>

In [64]:
sample = train_dataset[0]

In [65]:
sample['labels']

tensor([0., 1., 0., 0., 1., 0., 0.])

In [66]:
multi_trainer = MultilabelTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

In [67]:
write_to_file = False # Disable logging the metrics to file 
multi_trainer.train()

Epoch,Training Loss,Validation Loss,Auc Roc Score,Subset Accuracy,Hamming Loss,Runtime,Samples Per Second
1,No log,0.234616,0.505025,0.514907,0.107862,15.1022,164.346
2,0.288100,0.207713,0.502472,0.569299,0.096581,15.0301,165.135


Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))


Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=724, training_loss=0.2600429123936437, metrics={'train_runtime': 637.3775, 'train_samples_per_second': 1.136, 'total_flos': 624664919553432.0, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 8192, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1441792, 'train_mem_gpu_alloc_delta': 1427509248, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 1445477888})

In [72]:
write_to_file = True # Enable logging the metrics to file 
multi_trainer.evaluate(test_dataset,metric_key_prefix="test")

Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'Ethnicity': {'precision': 0.9108433734939759, 'recall': 0.9642857142857143, 'f1-score': 0.9368029739776952, 'support': 784}, 'gender': {'precision': 0.850909090909091, 'recall': 0.7697368421052632, 'f1-score': 0.8082901554404146, 'support': 304}, 'profession': {'precision': 0.8705357142857143, 'recall': 0.8351177730192719, 'f1-score': 0.8524590163934426, 'support': 467}, 'religion': {'precision': 0.9862068965517241, 'recall': 0.9761092150170648, 'f1-score': 0.9811320754716981, 'support': 293}, 'Anti-stereotype': {'precision': 0.7899159663865546, 'recall': 0.36246786632390743, 'f1-score': 0.4969162995594713, 'support': 778}, 'stereotype': {'precision': 0.8040540540540541, 'recall': 0.6672897196261682, 'f1-score': 0.7293156281920328, 'support': 1070}, 'unrelated': {'precision': 0.983271375464684, 'recall': 0.8343848580441641, 'f1-score': 0.902730375426621, 'support': 634}, 'micro avg': {'pre

{'epoch': 2.0,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 115342848,
 'test_AUC_ROC_score': 0.9541083160855942,
 'test_Classification_report': {'Anti-stereotype': {'f1-score': 0.4969162995594713,
   'precision': 0.7899159663865546,
   'recall': 0.36246786632390743,
   'support': 778},
  'Ethnicity': {'f1-score': 0.9368029739776952,
   'precision': 0.9108433734939759,
   'recall': 0.9642857142857143,
   'support': 784},
  'gender': {'f1-score': 0.8082901554404146,
   'precision': 0.850909090909091,
   'recall': 0.7697368421052632,
   'support': 304},
  'macro avg': {'f1-score': 0.8153780749230537,
   'precision': 0.8851052101636855,
   'recall': 0.772770284060222,
   'support': 4330},
  'micro avg': {'f1-score': 0.8021618903971846,
   'precision': 0.8800330943188086,
   'recall': 0.7369515011547344,
   'support': 4330},
  'profession': {'f1-score': 0.8524590163934426,
   'precision': 0.8705357142857143,


In [74]:
!zip -r /content/file.zip /content/stereotype_classification

  adding: content/stereotype_classification/ (stored 0%)
  adding: content/stereotype_classification/checkpoint-500/ (stored 0%)
  adding: content/stereotype_classification/checkpoint-500/trainer_state.json (deflated 49%)
  adding: content/stereotype_classification/checkpoint-500/optimizer.pt (deflated 21%)
  adding: content/stereotype_classification/checkpoint-500/pytorch_model.bin (deflated 7%)
  adding: content/stereotype_classification/checkpoint-500/special_tokens_map.json (deflated 48%)
  adding: content/stereotype_classification/checkpoint-500/training_args.bin (deflated 46%)
  adding: content/stereotype_classification/checkpoint-500/config.json (deflated 56%)
  adding: content/stereotype_classification/checkpoint-500/spiece.model (deflated 49%)
  adding: content/stereotype_classification/checkpoint-500/scheduler.pt (deflated 49%)
  adding: content/stereotype_classification/checkpoint-500/tokenizer_config.json (deflated 48%)


In [121]:
from google.colab import files
files.download("/content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## GPT-2

In [101]:
# Number of epochs 
N_EPOCHS = 2

# Batch_size 
BATCH_SIZE = 32

# Model name 
MODEL = 'gpt2'

MODEL_NAME = 'gpt-2'

# Learning rate 
learning_rate = 2.49816047538945e-05

# Number of labels 
num_labels = 7

In [102]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=num_labels).to('cuda')

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [103]:
# model = GPT2ForMultiLabelSequenceClassification.from_pretrained(MODEL, num_labels=num_labels).to('cuda')

In [104]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

special_tokens_dict = {'pad_token': '[PAD]'}

# default to left padding
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token

In [105]:
# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))

# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

In [106]:
batch_size = BATCH_SIZE

args = TrainingArguments(
    output_dir="stereotype_classification_gpt-2",
    evaluation_strategy = "epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=N_EPOCHS,
    weight_decay=0.01
)

In [107]:
train_dataset,val_dataset, test_dataset = create_train_val_datasets(tokenizer)

In [108]:
multi_trainer = MultilabelTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer)

In [112]:
write_to_file = False # Disable logging the metrics to file 
multi_trainer.train()

Epoch,Training Loss,Validation Loss,Auc Roc Score,Subset Accuracy,Hamming Loss,Runtime,Samples Per Second
1,No log,0.341349,0.49561,0.247381,0.174571,18.1344,136.867
2,0.443000,0.308771,0.492717,0.278002,0.15857,18.4444,134.567


Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))


Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=724, training_loss=0.4092355696535901, metrics={'train_runtime': 622.1446, 'train_samples_per_second': 1.164, 'total_flos': 601079198773248.0, 'epoch': 2.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': -984981504, 'train_mem_gpu_alloc_delta': 1514442240, 'train_mem_cpu_peaked_delta': 985219072, 'train_mem_gpu_peaked_delta': 2092218880})

In [118]:
write_to_file = True # Enable logging the metrics to file 
multi_trainer.evaluate(test_dataset,metric_key_prefix="test")

Evaluation metrics for test set:


  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'Ethnicity': {'precision': 0.8746397694524496, 'recall': 0.7742346938775511, 'f1-score': 0.8213802435723951, 'support': 784}, 'gender': {'precision': 0.8524590163934426, 'recall': 0.17105263157894737, 'f1-score': 0.28493150684931506, 'support': 304}, 'profession': {'precision': 0.7324675324675325, 'recall': 0.6038543897216274, 'f1-score': 0.6619718309859154, 'support': 467}, 'religion': {'precision': 0.9249146757679181, 'recall': 0.9249146757679181, 'f1-score': 0.9249146757679181, 'support': 293}, 'Anti-stereotype': {'precision': 0.40476190476190477, 'recall': 0.021850899742930592, 'f1-score': 0.041463414634146344, 'support': 778}, 'stereotype': {'precision': 0.8901098901098901, 'recall': 0.30280373831775703, 'f1-score': 0.45188284518828453, 'support': 1070}, 'unrelated': {'precision': 0.8678861788617886, 'recall': 0.6735015772870663, 'f1-score': 0.7584369449378331, 'support': 634}, 'micro 

{'epoch': 2.0,
 'eval_mem_cpu_alloc_delta': 0,
 'eval_mem_cpu_peaked_delta': 0,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 283736064,
 'test_AUC_ROC_score': 0.8914286769035048,
 'test_Classification_report': {'Anti-stereotype': {'f1-score': 0.041463414634146344,
   'precision': 0.40476190476190477,
   'recall': 0.021850899742930592,
   'support': 778},
  'Ethnicity': {'f1-score': 0.8213802435723951,
   'precision': 0.8746397694524496,
   'recall': 0.7742346938775511,
   'support': 784},
  'gender': {'f1-score': 0.28493150684931506,
   'precision': 0.8524590163934426,
   'recall': 0.17105263157894737,
   'support': 304},
  'macro avg': {'f1-score': 0.563568780276544,
   'precision': 0.7924627096878466,
   'recall': 0.4960303723276853,
   'support': 4330},
  'micro avg': {'f1-score': 0.5945053295301005,
   'precision': 0.8494208494208494,
   'recall': 0.45727482678983833,
   'support': 4330},
  'profession': {'f1-score': 0.6619718309859154,
   'precision': 0.7324675324

In [122]:
!zip -r /content/gpt2.zip /content/stereotype_classification_gpt-2

  adding: content/stereotype_classification_gpt-2/ (stored 0%)
  adding: content/stereotype_classification_gpt-2/checkpoint-500/ (stored 0%)
  adding: content/stereotype_classification_gpt-2/checkpoint-500/trainer_state.json (deflated 48%)
  adding: content/stereotype_classification_gpt-2/checkpoint-500/optimizer.pt (deflated 31%)
  adding: content/stereotype_classification_gpt-2/checkpoint-500/pytorch_model.bin (deflated 9%)
  adding: content/stereotype_classification_gpt-2/checkpoint-500/special_tokens_map.json (deflated 60%)
  adding: content/stereotype_classification_gpt-2/checkpoint-500/training_args.bin (deflated 46%)
  adding: content/stereotype_classification_gpt-2/checkpoint-500/config.json (deflated 56%)
  adding: content/stereotype_classification_gpt-2/checkpoint-500/vocab.json (deflated 59%)
  adding: content/stereotype_classification_gpt-2/checkpoint-500/merges.txt (deflated 53%)
  adding: content/stereotype_classification_gpt-2/checkpoint-500/scheduler.pt (deflated 49%)
 

In [123]:
from google.colab import files
files.download("/content/file.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>