<br><br>

## **Import necessary Python libraries and modules**

In [None]:
pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     

In [None]:
import torch
# from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification
from transformers import Trainer, TrainingArguments

In [None]:
from collections import defaultdict
import gdown
import gzip
import json
import os
import random
import pickle

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
sns.set(style='ticks', font_scale=1.2)

<br><br>

## **Set parameters and file paths**

In [None]:
# Hugging Face checkpoint identifier.
model_name = 'roberta-base'
# Use the GPU when one is attached; otherwise fall back to CPU so that moving
# the model with `.to(device_name)` does not fail on a CPU-only runtime.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
# Maximum number of tokens per text handed to the tokenizer (longer texts are truncated).
max_length = 512
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
cached_model_directory_name = 'fine-tuned-models'

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project folder on the mounted Drive (the same path the notebook changes into next).
colab_directory_path = '/content/drive/My Drive/colab-output/2024-stories'

In [None]:
%cd /content/drive/My Drive/colab-output/2024-stories

/content/drive/My Drive/colab-output/2024-stories


<br><br>

## **Load and split stories data**

In [None]:
# Read the stories CSV; the file name is resolved relative to the current working directory.
story_df = pd.read_csv('gold.stories_and_events.all.formatted_CONSENSUS.clean.csv')
# Number of rows loaded.
story_df.shape[0]

502

In [None]:
# Partition the stories using the pre-assigned 'split' column.
# Rows whose split is neither 'train' nor 'test' are not used here.
_story_df_train = story_df.loc[story_df['split'] == 'train']
_story_df_test = story_df.loc[story_df['split'] == 'test']

# 'text' holds the model input and 'gold_consensus' the label.
# NOTE(review): the *_val variables are filled from the 'test' split.
X_train, y_train = (
    _story_df_train['text'].tolist(),
    _story_df_train['gold_consensus'].tolist(),
)
X_val, y_val = (
    _story_df_test['text'].tolist(),
    _story_df_test['gold_consensus'].tolist(),
)

# Sanity check: texts and labels should have matching lengths within each split.
len(X_train), len(X_val), len(y_train), len(y_val)

(301, 101, 301, 101)

<br><br>

## **Encode data for RoBERTa**


In [None]:
# Load the pretrained fast tokenizer for the checkpoint named by model_name.
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
# Build the label <-> integer-id mappings from the labels seen in training.
# Ids are assigned in sorted label order: iterating a raw set has no guaranteed
# order, so enumerating it directly could yield a different mapping (and hence
# different class indices in a saved model) from one session to the next.
unique_labels = set(y_train)
label2id = {label: idx for idx, label in enumerate(sorted(unique_labels))}
id2label = {idx: label for label, idx in label2id.items()}

In [None]:
# Tokenizer settings shared by both splits: truncate to max_length and pad each
# batch of texts to a common length.
_tokenize_options = dict(truncation=True, padding=True, max_length=max_length)
train_encodings = tokenizer(X_train, **_tokenize_options)
test_encodings = tokenizer(X_val, **_tokenize_options)

# Replace each gold label with its integer class id.
train_labels_encoded = [label2id[gold] for gold in y_train]
test_labels_encoded = [label2id[gold] for gold in y_val]

<br><br>

## **Make a custom Torch dataset**

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Torch dataset that pairs tokenizer encodings with class-id labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is a sequence of labels aligned with those examples.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for example ``idx``, followed by the
        # example's label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [None]:
# Wrap the tokenized texts and their encoded labels as torch datasets.
train_dataset = MyDataset(train_encodings, train_labels_encoded)
test_dataset = MyDataset(test_encodings, test_labels_encoded)

<br><br>

## **Load pre-trained model**

In [None]:
# Load the checkpoint named by model_name (the same name the tokenizer is loaded
# from) rather than a separate literal, so the two cannot drift apart. The
# classification head gets one output per label; the model is then moved to the
# configured device.
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=len(id2label)).to(device_name)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<br><br>

## **Set the RoBERTa fine-tuning parameters**

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./storyseeker',
    logging_dir='./logs',
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=20,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=20,
    # Log every 10 steps and evaluate on a step-based schedule so progress is
    # visible during fine-tuning.
    logging_steps=10,
    evaluation_strategy='steps',
)

<br><br>

## **Fine-tune the RoBERTa model**

In [None]:
def compute_metrics(pred):
    """Return the accuracy of a set of predictions as ``{'accuracy': value}``.

    ``pred`` must expose ``label_ids`` (the reference class ids) and
    ``predictions`` (per-class scores); the predicted class is the argmax over
    the last axis of the scores.
    """
    predicted_ids = pred.predictions.argmax(-1)
    return {'accuracy': accuracy_score(pred.label_ids, predicted_ids)}

In [None]:
# Assemble the Trainer from the model, the training arguments, both datasets and
# the accuracy metric defined above.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be fine-tuned
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset (usually a validation set; here the test set is used)
    compute_metrics=compute_metrics      # custom evaluation function reporting accuracy
)

In [None]:
# Run fine-tuning with the configuration assembled above.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy
10,0.6936,0.705089,0.415842
20,0.6797,0.611842,0.722772
30,0.5749,0.640896,0.772277
40,0.4985,0.422743,0.80198
50,0.2081,0.422383,0.821782


TrainOutput(global_step=57, training_loss=0.49783703946230706, metrics={'train_runtime': 98.9193, 'train_samples_per_second': 9.129, 'train_steps_per_second': 0.576, 'total_flos': 237589282990080.0, 'train_loss': 0.49783703946230706, 'epoch': 3.0})

<br><br>

## **Save fine-tuned model**

In [None]:
# Save the fine-tuned model to the relative directory 'model'.
# NOTE(review): cached_model_directory_name is defined in the parameters cell
# but is not used here — confirm which location is intended.
trainer.save_model('model')

<br><br>

## **Evaluate fine-tuned model**

In [None]:
# Evaluate on the dataset the Trainer was given as eval_dataset.
trainer.evaluate()

{'eval_loss': 0.4447569251060486,
 'eval_accuracy': 0.8316831683168316,
 'eval_runtime': 3.2861,
 'eval_samples_per_second': 30.735,
 'eval_steps_per_second': 1.826,
 'epoch': 3.0}

In [None]:
# Run inference over the test dataset; the per-class scores are read from `.predictions` below.
predicted_results = trainer.predict(test_dataset)

In [None]:
# Pick the highest-scoring class for every example, flatten the result to a
# plain list of class ids, then map each id back to its original label.
_predicted_ids = predicted_results.predictions.argmax(-1).flatten().tolist()
predicted_labels = [id2label[class_id] for class_id in _predicted_ids]

In [None]:
# Sanity check: there should be one prediction per evaluated example.
len(predicted_labels)

101

In [None]:
# Per-class precision, recall and F1 of the predictions against the gold labels.
report_text = classification_report(y_val, predicted_labels)
print(report_text)

              precision    recall  f1-score   support

           0       0.85      0.86      0.86        59
           1       0.80      0.79      0.80        42

    accuracy                           0.83       101
   macro avg       0.83      0.83      0.83       101
weighted avg       0.83      0.83      0.83       101



<br><br><br><br>

## **Bootstrap evaluation on the test set**

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# Bootstrap estimate of test-set performance, reported as mean +- std over
# resamples drawn with replacement from the 'test' split.
#
# Each test story is classified once and the resulting (gold, predicted) pairs
# are resampled. Re-tokenizing and re-running the model inside the loop only
# repeated the same inference on duplicated rows. Working on dedicated
# underscore-prefixed names also leaves `_story_df_test`, `test_dataset` and
# `predicted_results` from the earlier cells untouched.
# NOTE(review): a story's prediction could differ marginally from inference on
# a resampled batch because padding lengths differ — confirm if exact parity
# with earlier runs matters.
N_BOOTSTRAP = 100      # number of bootstrap resamples
BOOTSTRAP_SEED = 0     # fixed seed so the reported numbers are reproducible


def _print_summary(title, precisions, recalls, f1s):
    """Print ``mean +- std`` (rounded to 3 decimals) for one group of metrics."""
    print(title)
    for metric_name, values in (('precision', precisions), ('recall', recalls), ('f1', f1s)):
        print(metric_name + ':', round(np.mean(values), 3), '+-', round(np.std(values), 3))


# Macro-averaged metrics, then metrics for class 0 and class 1 individually.
p_list, r_list, f_list = [], [], []
p_list0, r_list0, f_list0 = [], [], []
p_list1, r_list1, f_list1 = [], [], []

# Classify every test story exactly once.
_bootstrap_df = story_df[story_df['split'] == 'test']
_gold_labels = _bootstrap_df['gold_consensus'].tolist()
assert len(_gold_labels) > 0, "no rows with split == 'test' to evaluate"

_bootstrap_encodings = tokenizer(
    _bootstrap_df['text'].tolist(), truncation=True, padding=True, max_length=max_length
)
_bootstrap_dataset = MyDataset(_bootstrap_encodings, [label2id[y] for y in _gold_labels])
_bootstrap_output = trainer.predict(_bootstrap_dataset)
_pred_ids = _bootstrap_output.predictions.argmax(-1).flatten().tolist()  # highest-scoring class per story

_gold_arr = np.array(_gold_labels)
_pred_arr = np.array([id2label[class_id] for class_id in _pred_ids])  # ids back to original labels
_rng = np.random.default_rng(BOOTSTRAP_SEED)

for _resample in range(N_BOOTSTRAP):
    # Draw as many stories as the test split contains, with replacement.
    _idx = _rng.integers(0, len(_gold_arr), size=len(_gold_arr))
    y_test = _gold_arr[_idx]
    y_pred = _pred_arr[_idx]

    _macro_p, _macro_r, _macro_f, _ignored = precision_recall_fscore_support(
        y_test, y_pred, average='macro'
    )
    p_list.append(float(_macro_p))
    r_list.append(float(_macro_r))
    f_list.append(float(_macro_f))

    # average=None with labels=[0, 1] returns one value per class, in that
    # order; each equals the binary metric with the matching pos_label.
    _class_p, _class_r, _class_f, _ignored = precision_recall_fscore_support(
        y_test, y_pred, average=None, labels=[0, 1]
    )
    p_list0.append(float(_class_p[0]))
    r_list0.append(float(_class_r[0]))
    f_list0.append(float(_class_f[0]))

    p_list1.append(float(_class_p[1]))
    r_list1.append(float(_class_r[1]))
    f_list1.append(float(_class_f[1]))

_print_summary('MACRO AVERAGE', p_list, r_list, f_list)
print()
_print_summary('BINARY (0)', p_list0, r_list0, f_list0)
print()
_print_summary('BINARY (1)', p_list1, r_list1, f_list1)