<br><br><br><br>

# **Imports and paths**

In [None]:
pip install transformers[torch]



In [None]:
import torch
from transformers import Trainer, TrainingArguments

In [None]:
import ast
from collections import defaultdict
import gdown
import gzip
import json
import random
import pickle

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import ticker
sns.set(style='ticks', font_scale=1.2)

In [None]:
# Hugging Face checkpoint used for both the tokenizer and the token classifier.
model_name = 'roberta-base'
# NOTE(review): device_name and max_length are not referenced anywhere else in
# the visible part of this notebook — confirm they are still needed.
device_name = 'cuda'
max_length = 512
# Passed to TrainingArguments(output_dir=...) and used as the save_model() path prefix.
cached_model_directory_name = 'fine-tuned-models'

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so files stored there can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Experiment folder on the mounted Drive; the %cd cell that follows repeats this same literal path.
colab_directory_path = '/content/drive/My Drive/colab-output/2023-12-10-stories-spans'

In [None]:
%cd /content/drive/My Drive/colab-output/2023-12-10-stories-spans

/content/drive/My Drive/colab-output/2023-12-10-stories-spans


<br><br><br><br>

# **Load dataset**

In [None]:
# Load the annotated dataset (path is relative to the current working directory).
# Later cells read the columns 'id', 'split', 'gold_consensus', 'tokens' and 'tokens_stories_union'.
data_df = pd.read_csv('gold.stories_and_events.all.formatted_CONSENSUS.clean.csv')
# Row count, shown as the cell output.
len(data_df.index)

502

In [None]:
# One list of {'id', 'tags', 'tokens'} records per split named in the 'split' column.
data_train = []
data_val = []
data_test = []
_split_buckets = {'train': data_train, 'val': data_val, 'test': data_test}

for _row_idx, r in data_df.iterrows():

  # 'tokens' is stored in the CSV as the string form of a Python list.
  _tokens = ast.literal_eval(r['tokens'])

  if r['gold_consensus'] == 1:
    # Only include the span if the consensus was that the text contains a story.
    _new_tags = ast.literal_eval(r['tokens_stories_union'])  # don't use beginning tag
  else:
    # No story span: one 0 tag per token. The length must come from the parsed
    # token list; len() of the raw 'tokens_stories_union' cell counts the
    # characters of its string form, not the tokens.
    _new_tags = [0] * len(_tokens)

  # Rows whose split is not train/val/test are dropped.
  _bucket = _split_buckets.get(r['split'])
  if _bucket is not None:
    _bucket.append({'id': r['id'],
                    'tags': _new_tags,
                    'tokens': _tokens})

len(data_train), len(data_val), len(data_test)

(301, 100, 101)

In [None]:
# Class names indexed by integer tag id (0 -> 'Non-Story', 1 -> 'Story'); the metric code looks names up by id.
label_list = ['Non-Story', 'Story']

<br><br><br><br>

# **Preprocess**

In [None]:
!pip install datasets



In [None]:
from datasets import Dataset, DatasetDict, load_dataset

In [None]:
# Wrap each split's list of {'id', 'tags', 'tokens'} records in a Hugging Face Dataset.
dataset_train, dataset_val, dataset_test = [
    Dataset.from_list(_records)
    for _records in (data_train, data_val, data_test)
]

In [None]:
from transformers import RobertaTokenizerFast

# Fast tokenizer for `model_name`. add_prefix_space=True is presumably needed because the
# inputs are already split into words (is_split_into_words=True in the tokenize step) — TODO confirm.
tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and align word tags to sub-tokens.

    `examples` is a batch with parallel 'tokens' (list of words per example)
    and 'tags' (one tag per word) columns. The batch is encoded with the
    module-level `tokenizer` (truncation and padding enabled), and a 'labels'
    entry is added with one value per encoded position:

    * the word's tag on the first sub-token of each word;
    * -100 everywhere else (positions with no word id, and the remaining
      sub-tokens of a word).

    Returns the tokenizer output with the extra 'labels' key.
    """
    encoded = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True, padding=True)

    aligned = []
    for row, word_tags in enumerate(examples['tags']):
        row_labels = []
        last_word = None
        # word_ids() gives, for every encoded position, the index of the source word (or None).
        for word in encoded.word_ids(batch_index=row):
            starts_new_word = word is not None and word != last_word
            row_labels.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [None]:
# Tokenize the training split in batches and attach the aligned 'labels' column.
tokenized_data_train = dataset_train.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/301 [00:00<?, ? examples/s]

In [None]:
# Tokenize the validation split in batches and attach the aligned 'labels' column.
tokenized_data_val = dataset_val.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [None]:
# Tokenize the test split in batches and attach the aligned 'labels' column.
tokenized_data_test = dataset_test.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [None]:
# Spot check: print every tokenized test example whose original word list has fewer than 300 tokens.
# NOTE(review): this prints whole records, so the cell output is very long.
for t in tokenized_data_test:
  if len(t['tokens']) < 300:
    print(t)

{'id': 'chaqss9', 'tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator built around the same tokenizer; it is handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

<br><br><br><br>

# **Evaluate**

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m936.7 kB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m41.0/43.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m966.2 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=199202017b4fac00998e5b7d04411b0f6e14859019f95b81a4c744757fac2b7a
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import evaluate

# Load the seqeval metric through `evaluate` (the recorded output shows a builder script being downloaded).
seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [None]:
import numpy as np

def compute_metrics(p):
    """Token-level metrics for the Trainer's evaluation loop.

    Parameters
    ----------
    p : pair ``(predictions, labels)``
        ``predictions`` holds per-position class scores with the classes on
        axis 2; ``labels`` holds the gold class ids, with -100 on every
        position that must not be scored.

    Returns
    -------
    dict
        ``precision``, ``recall`` and ``f1`` of the 'Story' class (id 1) and
        the overall ``accuracy``, all computed over the positions whose label
        is not -100. A ratio with a zero denominator is reported as 0.0.

    The scores are computed directly with numpy rather than with seqeval:
    seqeval scores entity chunks parsed from IOB-style tags, and the plain
    'Story' / 'Non-Story' names are not in that format, so its
    precision/recall/F1 are not token-level scores for the Story class.
    """
    predictions, labels = p
    predicted_ids = np.argmax(predictions, axis=2)
    gold_ids = np.asarray(labels)

    # Drop every position labelled -100 (ignored by the loss as well).
    scored = gold_ids != -100
    predicted_ids = predicted_ids[scored]
    gold_ids = gold_ids[scored]

    story_id = 1  # index of 'Story' in the label list
    true_pos = int(np.sum((predicted_ids == story_id) & (gold_ids == story_id)))
    predicted_pos = int(np.sum(predicted_ids == story_id))
    gold_pos = int(np.sum(gold_ids == story_id))

    precision = true_pos / predicted_pos if predicted_pos else 0.0
    recall = true_pos / gold_pos if gold_pos else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    accuracy = float(np.mean(predicted_ids == gold_ids)) if gold_ids.size else 0.0

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

<br><br><br><br>

# **Train**

In [None]:
# Class-id -> class-name lookup handed to the model config, and its inverse.
id2label = dict(enumerate(("Non-Story", "Story")))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
from transformers import RobertaForTokenClassification


# Pretrained `model_name` encoder with a 2-class token-classification head.
# The recorded load message reports that the classifier weights are newly initialized.
model = RobertaForTokenClassification.from_pretrained(model_name,
                                                      num_labels=2,
                                                      id2label=id2label,
                                                      label2id=label2id)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning hyper-parameters for the Trainer; outputs go under `cached_model_directory_name`.
training_args = TrainingArguments(
    output_dir=cached_model_directory_name,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    warmup_steps=20,
    save_strategy="epoch",
    logging_steps=10,
    # Evaluate on a step schedule; the recorded training log shows validation rows at steps 10, 20 and 30.
    evaluation_strategy='steps',
)

In [None]:
# Wire the model, tokenized splits, collator and metric callback into a Trainer.
# The validation split is evaluated during training; the test split is only scored afterwards.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data_train,
    eval_dataset=tokenized_data_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning (the recorded run finished at global step 38 after 2 epochs).
trainer.train()

Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
10,0.6834,0.643895,0.277092,0.020078,0.037442,0.634065
20,0.6165,0.578763,0.533333,0.033396,0.062857,0.642973
30,0.5074,0.403896,0.757882,0.70003,0.727808,0.840356




TrainOutput(global_step=38, training_loss=0.556824477095353, metrics={'train_runtime': 94.2626, 'train_samples_per_second': 6.386, 'train_steps_per_second': 0.403, 'total_flos': 157300647555072.0, 'train_loss': 0.556824477095353, 'epoch': 2.0})

In [None]:
# Save the fine-tuned model to '<cached_model_directory_name>/model'.
trainer.save_model(cached_model_directory_name + '/model')

<br><br><br><br>

# **Examine evaluation**

In [None]:
# Score the held-out test split with the metric callback that was given to the Trainer.
trainer.evaluate(eval_dataset=tokenized_data_test)



{'eval_loss': 0.29643887281417847,
 'eval_precision': 0.7646288781254091,
 'eval_recall': 0.7416201117318436,
 'eval_f1': 0.7529487592652272,
 'eval_accuracy': 0.8843197676477881,
 'eval_runtime': 4.5462,
 'eval_samples_per_second': 22.216,
 'eval_steps_per_second': 1.54,
 'epoch': 2.0}

In [None]:
# Raw model outputs for the test split; `.predictions` holds the per-position class scores used below.
predicted_results = trainer.predict(tokenized_data_test)

In [None]:
# Score array shape; the recorded run shows (101, 512, 2), with the two classes on the last axis.
predicted_results.predictions.shape

(101, 512, 2)

In [None]:
# NOTE: the flattened list keeps every position of every example, including
# the positions whose gold label is -100 and which the metrics ignore.
predicted_labels = predicted_results.predictions.argmax(-1) # Get the highest probability prediction
predicted_labels = predicted_labels.flatten().tolist()      # Flatten the predictions into a 1D list

In [None]:
# One entry per position: 101 examples x 512 positions = 51712 in the recorded run.
len(predicted_labels)

51712

In [None]:
# Peek at the first flattened prediction (an integer class id).
predicted_labels[0]

1

In [None]:
# Flatten the gold labels of the test split into one list, aligned with `predicted_labels`.
# The -100 entries (ignored positions) are still present at this point.
y_test = [t for d in tokenized_data_test for t in d['labels']]
len(y_test)

51712

In [None]:
def compute_metrics(label_lists, prediction_lists):
    """Print a token-level classification report for a set of documents.

    `prediction_lists` holds per-position class scores (classes on axis 2) and
    `label_lists` the matching gold class ids. Positions whose gold label is
    -100 are left out. The first ten predicted and gold class names are
    printed, followed by sklearn's classification report. Nothing is returned.

    NOTE: this reuses the name of the metric callback defined in the Evaluate
    section (which takes a single argument) and replaces it in the notebook
    namespace from this cell onwards.
    """
    predicted_ids = np.argmax(prediction_lists, axis=2)

    # Walk the documents in order, keeping only the positions that carry a real label.
    predicted_labels = []
    true_labels = []
    for doc_predictions, doc_labels in zip(predicted_ids, label_lists):
        for predicted_id, gold_id in zip(doc_predictions, doc_labels):
            if gold_id == -100:
                continue
            predicted_labels.append(label_list[predicted_id])
            true_labels.append(label_list[gold_id])

    print(predicted_labels[:10])
    print(true_labels[:10])

    report = classification_report(true_labels, predicted_labels)
    print(report)

In [None]:
# Token-level report on the test split, using the two-argument compute_metrics defined just above.
compute_metrics(tokenized_data_test['labels'], predicted_results.predictions)

['Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story']
['Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story', 'Non-Story']
              precision    recall  f1-score   support

   Non-Story       0.91      0.94      0.92     20477
       Story       0.82      0.75      0.78      7756

    accuracy                           0.88     28233
   macro avg       0.86      0.84      0.85     28233
weighted avg       0.88      0.88      0.88     28233



<br><br><br><br>

# **Bootstrap**

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
from random import choices

In [None]:
# Bootstrap estimate of the spread of the token-level test metrics.
# Scored (prediction, gold) token pairs are pooled over the whole test split and
# resampled with replacement; every replicate is scored three ways (macro
# average, 'Non-Story' as positive class, 'Story' as positive class).
N_BOOTSTRAP = 100
BOOTSTRAP_SEED = 0  # fixed so a re-run reproduces the same replicates
_rng = random.Random(BOOTSTRAP_SEED)

p_list, r_list, f_list = [], [], []      # macro average
p_list0, r_list0, f_list0 = [], [], []   # positive class = 'Non-Story'
p_list1, r_list1, f_list1 = [], [], []   # positive class = 'Story'

# Loop-invariant: build the pooled pairs once. Positions labelled -100 are not scored.
_labels = tokenized_data_test['labels']
_predictions = np.argmax(predicted_results.predictions, axis=2)
_pairs = [
    (label_list[p], label_list[l])
    for prediction, label in zip(_predictions, _labels)
    for p, l in zip(prediction, label)
    if l != -100
]

for i in range(N_BOOTSTRAP):

    # Draw as many pairs as there are scored tokens, with replacement.
    _sample = _rng.choices(_pairs, k=len(_pairs))

    y_pred = [p for p, t in _sample]
    y_true = [t for p, t in _sample]

    p_list.append(precision_score(y_true, y_pred, average='macro'))
    r_list.append(recall_score(y_true, y_pred, average='macro'))
    f_list.append(f1_score(y_true, y_pred, average='macro'))

    p_list0.append(precision_score(y_true, y_pred, average='binary', pos_label='Non-Story'))
    r_list0.append(recall_score(y_true, y_pred, average='binary', pos_label='Non-Story'))
    f_list0.append(f1_score(y_true, y_pred, average='binary', pos_label='Non-Story'))

    p_list1.append(precision_score(y_true, y_pred, average='binary', pos_label='Story'))
    r_list1.append(recall_score(y_true, y_pred, average='binary', pos_label='Story'))
    f_list1.append(f1_score(y_true, y_pred, average='binary', pos_label='Story'))


def _print_bootstrap_summary(title, precisions, recalls, f1s):
    """Print the mean and standard deviation of each metric over the replicates."""
    print(title)
    print('precision:', round(np.mean(precisions), 3), '+-', round(np.std(precisions), 3))
    print('recall:', round(np.mean(recalls), 3), '+-', round(np.std(recalls), 3))
    print('f1:', round(np.mean(f1s), 3), '+-', round(np.std(f1s), 3))


_print_bootstrap_summary('MACRO AVERAGE', p_list, r_list, f_list)
print()
_print_bootstrap_summary('BINARY (0)', p_list0, r_list0, f_list0)
print()
_print_bootstrap_summary('BINARY (1)', p_list1, r_list1, f_list1)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
MACRO AVERAGE
precision: 0.828 +- 0.003
recall: 0.788 +- 0.003
f1: 0.804 +- 0.003

BINARY (0)
precision: 0.873 +- 0.002
recall: 0.933 +- 0.002
f1: 0.902 +- 0.002

BINARY (1)
precision: 0.784 +- 0.006
recall: 0.643 +- 0.006
f1: 0.706 +- 0.005
