In [None]:
! pip install -q accelerate datasets evaluate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/279.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m276.5/279.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m279.7/279.7 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.3/38.3 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip

In [None]:
import os
from time import time

import evaluate
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
from transformers import DataCollatorWithPadding, EvalPrediction
from transformers import TrainingArguments, Trainer
from transformers import set_seed

# Single source of truth for the random seed used by this notebook.
SEED = 2023
# Seed through the constant instead of repeating the literal, so changing SEED
# actually changes the seed that is applied.
# NOTE(review): TrainingArguments has its own `seed` argument; confirm whether
# it should also receive SEED.
set_seed(SEED)

In [None]:
def replace_none_with_str(dataset):
  """Replace missing ``text`` entries with an empty string, in place.

  Args:
    dataset: An indexable, sized container whose items support
      ``item['text']`` lookup and assignment (e.g. a list of dicts).

  Returns:
    None. Rows are modified through ``dataset[i]``.

  NOTE(review): the write only sticks if ``dataset[i]`` returns a reference to
  the stored row; for containers that hand back a copy on indexing it has no
  effect -- verify against the caller. A function with the same name (taking a
  single example) is defined later in this notebook and shadows this one.
  """
  for i in range(len(dataset)):
    # Identity check: `== None` would defer to a custom __eq__ on the value.
    if dataset[i]['text'] is None:
      dataset[i]['text'] = ''

def sigmoid(X):
    """Element-wise logistic function ``1 / (1 + exp(-X))``.

    Args:
        X: A NumPy array or scalar of logits.

    Returns:
        The logistic transform of ``X``, computed with the same expression
        as before.
    """
    # For very negative inputs exp(-X) overflows to inf; the quotient is then
    # 0.0, which is the correct limit, so only the overflow warning is muted.
    with np.errstate(over='ignore'):
        return 1 / (1 + np.exp(-X))

def heaviside(X):
    """Binarise values with a step at 0.5.

    Values strictly greater than 0.5 map to 1.0; values equal to or below
    0.5 map to 0.0 (the step's value at the threshold itself is 0).
    """
    shifted = X - 0.5
    return np.heaviside(shifted, 0)

def compute_metrics(eval_preds: EvalPrediction):
    """Turn raw evaluation outputs into a flat dictionary of metrics.

    Logits are passed through ``sigmoid`` and thresholded with ``heaviside``
    to obtain hard predictions, which are then scored against the labels.

    Args:
        eval_preds: Object exposing ``predictions`` (logits) and ``label_ids``.

    Returns:
        A dict containing ``accuracy``, the macro-averaged precision, recall
        and F1, followed by the per-class F1, recall and precision entries
        keyed ``<metric>_C<index>``.
    """
    targets = eval_preds.label_ids
    hard_preds = heaviside(sigmoid(eval_preds.predictions))

    # Per-class scores (average=None returns one value per label column).
    f1_values = f1_score(targets, hard_preds, average=None, zero_division=0.0)
    recall_values = recall_score(targets, hard_preds, average=None, zero_division=0.0)
    precision_values = precision_score(targets, hard_preds, average=None, zero_division=0.0)

    f1_by_class = {f'f1_C{k}': value for k, value in enumerate(f1_values)}
    recall_by_class = {f'recall_C{k}': value for k, value in enumerate(recall_values)}
    precision_by_class = {f'precision_C{k}': value for k, value in enumerate(precision_values)}

    # Aggregate scores; key order here determines the column order in the logs.
    results = {
        'accuracy': accuracy_score(targets, hard_preds),
        'precision_macro': precision_score(targets, hard_preds, average='macro', zero_division=0.0),
        'recall_macro': recall_score(targets, hard_preds, average='macro', zero_division=0.0),
        'f1_macro': f1_score(targets, hard_preds, average='macro', zero_division=0.0),
    }
    results.update(f1_by_class)
    results.update(recall_by_class)
    results.update(precision_by_class)
    return results

In [None]:
# Directory that holds the TSV splits.
ds_url = '/content/'
ds_files = {
    'train': ds_url + 'train.tsv',
    'validation': ds_url + 'validation.tsv',
}

# Read both tab-separated splits and normalise the column names to lower case.
ds = (
    load_dataset('csv', data_files=ds_files, delimiter='\t')
    .rename_columns({'ID': 'id', 'Text': 'text', 'Label': 'label'})
)

ds

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['id', 'text', 'label'],
        num_rows: 500
    })
})

In [None]:
def convert_labels(example):
  """Parse the ``label`` field of one example into a list of floats.

  The label is expected to be a string with one leading and one trailing
  delimiter character around whitespace-separated numbers (e.g. ``"[0 1 0]"``).
  The first and last characters are dropped and the remainder is split on any
  run of whitespace, so repeated spaces, line breaks and an empty ``"[]"`` are
  all handled. A label that is not a string (for instance one already
  converted by a previous run of this cell) is treated as an iterable of
  numbers, which makes the function safe to apply twice.

  Args:
    example: Mapping with a ``label`` entry; it is updated in place.

  Returns:
    The same ``example`` with ``label`` replaced by a list of floats.

  Raises:
    ValueError: If a token cannot be parsed as a float.
  """
  raw = example['label']
  if isinstance(raw, str):
    # split() without an argument collapses consecutive whitespace, unlike split(' ').
    raw = raw[1:-1].split()
  example["label"] = [float(num) for num in raw]
  return example

def replace_none_with_str(example):
  """Return ``example`` with a missing ``text`` value replaced by ``''``.

  Args:
    example: Mapping with a ``text`` entry; it is updated in place.

  Returns:
    The same ``example`` object, so it can be used with ``Dataset.map``.
  """
  # Identity check: `== None` would defer to a custom __eq__ on the value.
  if example['text'] is None:
    example['text'] = ''
  return example

# Parse the label strings first, then blank out missing texts.
ds = ds.map(convert_labels).map(replace_none_with_str)

# Show the resulting schema of each split, one per line.
print(*(ds[split].features for split in ('train', 'validation')), sep='\n')

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

{'id': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), 'label': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}
{'id': Value(dtype='int64', id=None), 'text': Value(dtype='string', id=None), 'label': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}


In [None]:
# Number of training epochs; passed to TrainingArguments as num_train_epochs.
num_epochs = 5
# Name of the pretrained checkpoint used when loading the tokenizer.
checkpoint = 'gpt2-medium'

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Reuse the end-of-sequence token for padding so that batches can be padded
# (presumably this tokenizer defines no pad token of its own -- confirm).
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(example):
  """Tokenise the ``text`` field with the notebook-level ``tokenizer``.

  Truncation is enabled; padding is not applied here.
  """
  texts = example['text']
  return tokenizer(texts, truncation=True)

# Tokenise every split in batches; padding is deferred to the collator.
tokenized_datasets = ds.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer)

# The Trainer cell below needs `model`; creating it here keeps the notebook
# runnable from a fresh kernel (Restart & Run All) instead of relying on a
# variable left over from an earlier session.
# num_labels=20 matches the 20 per-class metric columns (C0..C19) in the logs.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=20, problem_type="multi_label_classification"
)
# Mirror the tokenizer setup: use the EOS id as the padding id in the model config.
model.config.pad_token_id = model.config.eos_token_id


Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, keep at most
# five checkpoints, and reload the best one when training finishes.
# SECURITY: the Hub token is read from the HF_TOKEN environment variable
# instead of being stored in the notebook. The token that used to be
# hard-coded here has been exposed and should be revoked.
training_args = TrainingArguments(
    run_name=f'First Run-{time()}',
    output_dir='outputs-gpt2', overwrite_output_dir=False,
    auto_find_batch_size=True,
    num_train_epochs=num_epochs,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5, load_best_model_at_end=True,
    save_safetensors=False,
    group_by_length=True,

    push_to_hub=False,
    hub_model_id='mohammad-osoolian/Semeval-task4-gpt2',
    hub_strategy='every_save',
    hub_private_repo=True,
    # None when the variable is unset.
    hub_token=os.environ.get('HF_TOKEN'),
)


In [None]:
# Wire the model, data, collator and metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training and keep the object returned by the trainer.
train_output = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,F1 C0,F1 C1,F1 C2,F1 C3,F1 C4,F1 C5,F1 C6,F1 C7,F1 C8,F1 C9,F1 C10,F1 C11,F1 C12,F1 C13,F1 C14,F1 C15,F1 C16,F1 C17,F1 C18,F1 C19,Recall C0,Recall C1,Recall C2,Recall C3,Recall C4,Recall C5,Recall C6,Recall C7,Recall C8,Recall C9,Recall C10,Recall C11,Recall C12,Recall C13,Recall C14,Recall C15,Recall C16,Recall C17,Recall C18,Recall C19,Precision C0,Precision C1,Precision C2,Precision C3,Precision C4,Precision C5,Precision C6,Precision C7,Precision C8,Precision C9,Precision C10,Precision C11,Precision C12,Precision C13,Precision C14,Precision C15,Precision C16,Precision C17,Precision C18,Precision C19
1,0.182,0.537237,0.106,0.065091,0.067407,0.063801,0.0,0.0,0.0,0.0,0.0,0.1875,0.0,0.242424,0.0,0.418182,0.0,0.427907,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.190476,0.0,0.511111,0.0,0.396552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.333333,0.0,0.353846,0.0,0.464646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.1241,0.623417,0.124,0.131031,0.092412,0.101116,0.0,0.0625,0.0,0.0,0.0,0.117647,0.431373,0.206897,0.0,0.416938,0.0,0.385965,0.0,0.0,0.0,0.181818,0.219178,0.0,0.0,0.0,0.0,0.037037,0.0,0.0,0.0,0.125,0.407407,0.142857,0.0,0.474074,0.0,0.284483,0.0,0.0,0.0,0.217391,0.16,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.111111,0.458333,0.375,0.0,0.372093,0.0,0.6,0.0,0.0,0.0,0.15625,0.347826,0.0,0.0,0.0
3,0.0891,0.844756,0.096,0.141355,0.108957,0.112347,0.0,0.117647,0.0,0.0,0.0,0.171429,0.5,0.181818,0.0,0.441926,0.0,0.42487,0.0,0.0,0.0,0.172414,0.236842,0.0,0.0,0.0,0.0,0.074074,0.0,0.0,0.0,0.25,0.407407,0.119048,0.0,0.577778,0.0,0.353448,0.0,0.0,0.0,0.217391,0.18,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.130435,0.647059,0.384615,0.0,0.357798,0.0,0.532468,0.0,0.0,0.0,0.142857,0.346154,0.0,0.0,0.0


In [None]:
# NOTE(review): this repeats the training call from the previous cell and
# overwrites `train_output`. The metrics logged below differ from the run
# above, which suggests it was executed against a different model or kernel
# state -- confirm whether this cell should be kept.
train_output = trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro,F1 C0,F1 C1,F1 C2,F1 C3,F1 C4,F1 C5,F1 C6,F1 C7,F1 C8,F1 C9,F1 C10,F1 C11,F1 C12,F1 C13,F1 C14,F1 C15,F1 C16,F1 C17,F1 C18,F1 C19,Recall C0,Recall C1,Recall C2,Recall C3,Recall C4,Recall C5,Recall C6,Recall C7,Recall C8,Recall C9,Recall C10,Recall C11,Recall C12,Recall C13,Recall C14,Recall C15,Recall C16,Recall C17,Recall C18,Recall C19,Precision C0,Precision C1,Precision C2,Precision C3,Precision C4,Precision C5,Precision C6,Precision C7,Precision C8,Precision C9,Precision C10,Precision C11,Precision C12,Precision C13,Precision C14,Precision C15,Precision C16,Precision C17,Precision C18,Precision C19
1,0.1959,0.202661,0.196,0.287126,0.148919,0.184074,0.770642,0.0,0.0,0.133333,0.0,0.0,0.444444,0.461538,0.296296,0.306818,0.0,0.557377,0.0,0.0,0.0,0.0,0.276923,0.434109,0.0,0.0,0.666667,0.0,0.0,0.075472,0.0,0.0,0.296296,0.357143,0.222222,0.2,0.0,0.586207,0.0,0.0,0.0,0.0,0.18,0.394366,0.0,0.0,0.913043,0.0,0.0,0.571429,0.0,0.0,0.888889,0.652174,0.444444,0.658537,0.0,0.53125,0.0,0.0,0.0,0.0,0.6,0.482759,0.0,0.0
