In [None]:
"""
goal: make the notebooks from kaggle look more like the standard huggingface approach and eliminate all the errors.
"""

In [None]:
import wandb
from wandb_creds import *

# Authenticate with Weights & Biases.
# NOTE(review): API_KEY and TAGS are not defined in this notebook; presumably both come
# from the star import of wandb_creds — confirm.
wandb.login(key=API_KEY)
# Start the run that later cells write to and eventually close with wandb.finish().
wandb.init(project="feedback_prize_pytorch", tags=TAGS, entity="feedback_prize_michael_and_wilson")

In [None]:
# CONFIG
import pandas as pd
import os

EXP_NUM = 1  # experiment counter; only used to build MODEL_PATH below
TASK = "ner"  # suffix of the TrainingArguments output directory name
MODEL_CHECKPOINT = "longformer-base-4096-hf"  # passed to from_pretrained() for both tokenizer and model
MAX_LENGTH = 1024  # presumably the maximum tokens per tokenized window — confirm against the tokenizer call
STRIDE = 128  # presumably the token overlap between consecutive windows — confirm against the tokenizer call
MIN_TOKENS = 6  # NOTE(review): not referenced anywhere else in the visible notebook
MODEL_PATH = f'{MODEL_CHECKPOINT.split("/")[-1]}-{EXP_NUM}'  # NOTE(review): not referenced anywhere else in the visible notebook
DATA_DIR = 'data'  # root folder of the competition data, relative to the notebook
TRAIN_DIR = os.path.join(DATA_DIR, 'train')  # folder with one <id>.txt file per essay; note this is a plain str, not a Path
TRAIN_DF = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))  # annotation table (one row per labelled discourse span)

# TRAINING HYPERPARAMS
BATCH_SIZE = 8  # per-device batch size for both training and evaluation
GRAD_ACC = 8  # gradient accumulation steps
LEARNING_RATE = 5e-5  # initial learning rate handed to TrainingArguments
WEIGHT_DECAY = 0.01  # weight decay handed to TrainingArguments
WARMUP = 0.1  # warmup_ratio: fraction of training steps used for warmup
N_EPOCHS = 5  # number of training epochs

In [None]:
# preview the first row of the annotation table loaded in the config cell
TRAIN_DF.head(1)

In [None]:
# collect the distinct discourse types, in order of first appearance, as a plain Python list
CLASSES = TRAIN_DF['discourse_type'].unique().tolist()
CLASSES

In [None]:
# setup label indices
from collections import defaultdict

# Label scheme (BIO tagging over the discourse classes):
#   B-<class> -> 0 .. len(CLASSES)-1
#   I-<class> -> len(CLASSES) .. 2*len(CLASSES)-1
#   O         -> 2*len(CLASSES)
#   Special   -> -100 (used further down for tokens whose label is set to -100)
# A plain dict is used: defaultdict() without a default factory behaves exactly like a dict.
tags = {}

for i, c in enumerate(CLASSES):
    tags[f'B-{c}'] = i
    tags[f'I-{c}'] = i + len(CLASSES)
tags['O'] = len(CLASSES) * 2
tags['Special'] = -100

# label name -> label id
l2i = dict(tags)

# label id -> label name (already contains -100 -> 'Special', no extra assignment needed)
i2l = {index: name for name, index in l2i.items()}

N_LABELS = len(i2l) - 1 # not accounting for -100

In [None]:
def get_raw_text(ids):
    """Return the full text of one training essay.

    Args:
        ids: identifier of the essay; the file read is ``<TRAIN_DIR>/<ids>.txt``.

    Returns:
        The complete file content as a single string.
    """
    # TRAIN_DIR is a plain string built with os.path.join, so the `/` operator
    # (which only works on pathlib.Path objects) would raise a TypeError here.
    path = os.path.join(TRAIN_DIR, f'{ids}.txt')
    with open(path, 'r') as file:
        data = file.read()
    return data

In [None]:
# group training labels by text file
# One groupby object is shared by the four per-essay aggregations; each of them
# collects a column of TRAIN_DF into a list per essay id.
by_essay = TRAIN_DF.groupby('id')
df1 = by_essay['discourse_type'].apply(list).reset_index(name='classlist')
df2 = by_essay['discourse_start'].apply(list).reset_index(name='starts')
df3 = by_essay['discourse_end'].apply(list).reset_index(name='ends')
df4 = by_essay['predictionstring'].apply(list).reset_index(name='predictionstrings')
# Join the four list columns back together on the essay id.
df = (
    df1
    .merge(df2, how='inner', on='id')
    .merge(df3, how='inner', on='id')
    .merge(df4, how='inner', on='id')
)
# Attach the raw essay text read from disk.
df = df.assign(text=df['id'].apply(get_raw_text))
df.head()

In [None]:
# we will use HuggingFace datasets
from datasets import Dataset, load_metric

# Wrap the per-essay frame in a Dataset and hold out 10% of the essays as the "test" split
# (shuffled with a fixed seed so the split is repeatable).
ds = Dataset.from_pandas(df)
datasets = ds.train_test_split(test_size=0.1, shuffle=True, seed=42)
datasets

In [1]:
# A span may end up with inside (I-) labels but no beginning (B-) label of the same class,
# e.g. when the first token of the span was not matched. In that case the first token of
# the span is converted into the B- label of that class.

e = [0, 7, 7, 7, 1, 1, 8, 8, 8, 9, 9, 9, 14, 4, 4, 4]

def set_beginning(labels, n_classes=7):
    """Make sure every run of I- labels starts with the matching B- label.

    Label ids are expected to follow the layout B-<class> = k and
    I-<class> = k + n_classes for k in 0 .. n_classes-1. Any other id
    (for example the O label or -100) is left untouched.

    Args:
        labels: list of integer label ids; it is modified in place.
        n_classes: number of discourse classes. The default of 7 is the value
            that used to be hard-coded here (presumably len(CLASSES) — confirm).

    Returns:
        The same list object that was passed in, after the in-place fix.
    """
    prev_lab = None  # no predecessor for the first element
    for i, curr_lab in enumerate(labels):
        if n_classes <= curr_lab < 2 * n_classes:  # current label is an I- label
            begin_lab = curr_lab - n_classes
            # An I- label is only valid right after the same I- label or its B- label.
            # This also covers index 0, where there is no predecessor at all.
            if prev_lab != curr_lab and prev_lab != begin_lab:
                labels[i] = begin_lab
        # read back the (possibly corrected) value so the next token sees the fix
        prev_lab = labels[i]
    return labels

set_beginning(labels=e)

[0, 7, 7, 7, 1, 1, 8, 8, 8, 2, 9, 9, 14, 4, 4, 4]

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

# Load the tokenizer that belongs to the checkpoint configured in MODEL_CHECKPOINT.
# NOTE(review): add_prefix_space=True is forwarded to the tokenizer; whether it is required
# for raw (not pre-split) text input should be confirmed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, add_prefix_space=True)

In [None]:
# tokenize and add labels
def tokenize_and_align_labels(examples):
    """Tokenize a batch of essays into windows and attach one label id per token.

    Long essays are split into several overlapping windows (MAX_LENGTH tokens
    each, overlapping by STRIDE tokens), so one input example can produce
    several output rows.

    Args:
        examples: batch in the "dict of lists" format, with the keys
            'text', 'starts', 'ends' and 'classlist'. The last three hold,
            per essay, the character start, character end and class name of
            every annotated span.

    Returns:
        The tokenizer output with an additional "labels" entry: one list of
        label ids per window, the same length as that window's input_ids.
    """
    o = tokenizer(
        examples['text'],
        truncation=True,
        padding=True,
        return_offsets_mapping=True,
        max_length=MAX_LENGTH,
        stride=STRIDE,
        return_overflowing_tokens=True,
    )
    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = o["overflow_to_sample_mapping"]
    # The offset mappings give us a map from token to character position in the original text. This is
    # what lets us compare token positions with the annotated span positions.
    offset_mapping = o["offset_mapping"]
    o["labels"] = []
    for i in range(len(offset_mapping)):
        sample_index = sample_mapping[i]
        # every token starts out as "outside any span"
        labels = [l2i['O']] * len(o['input_ids'][i])
        spans = zip(
            examples['starts'][sample_index],
            examples['ends'][sample_index],
            examples['classlist'][sample_index],
        )
        for label_start, label_end, label in spans:
            for j, (token_start, token_end) in enumerate(offset_mapping[i]):
                # token that starts exactly where the span starts -> beginning label
                if token_start == label_start:
                    labels[j] = l2i[f'B-{label}']
                # token that lies after the span start and ends inside the span -> inside label
                if token_start > label_start and token_end <= label_end:
                    labels[j] = l2i[f'I-{label}']
        for k, input_id in enumerate(o['input_ids'][i]):
            # presumably the ids of the <s>, <pad> and </s> tokens of this vocabulary — confirm;
            # -100 is the value the loss ignores
            if input_id in [0,1,2]:
                labels[k] = -100
        # repair spans whose first token did not receive a B- label
        # (e.g. a window that begins in the middle of a span)
        labels = set_beginning(labels)
        o["labels"].append(labels)

    return o

In [None]:
# Tokenize and label both splits in batches. All original columns are removed, so only
# what tokenize_and_align_labels returns is kept in the resulting datasets.
tokenized_datasets = datasets.map(tokenize_and_align_labels, batched=True, batch_size=20000, remove_columns=datasets["train"].column_names)
tokenized_datasets

In [None]:
# Load the checkpoint with a token-classification head that has one output per label
# (N_LABELS excludes the -100 "Special" id), then collect all training settings.
model = AutoModelForTokenClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=N_LABELS)
model_name = MODEL_CHECKPOINT.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-finetuned-{TASK}",  # output directory for checkpoints and logs
    evaluation_strategy = "epoch",  # evaluation is done at the end of each epoch
    logging_strategy = "epoch",  # logging is done at the end of each epoch. consider "steps".
    logging_first_step=True, # whether to log and evaluate the first global step or not
    save_strategy = "epoch",  # saving is done at the end of each epoch
    learning_rate=LEARNING_RATE,  # the initial learning rate for the AdamW optimizer
    per_device_train_batch_size=BATCH_SIZE,  # the training batch size per device
    per_device_eval_batch_size=BATCH_SIZE,  # the evaluation batch size per device
    num_train_epochs=N_EPOCHS,  # default=3. the number of training epochs
    weight_decay=WEIGHT_DECAY,  # default=0. the weight decay to apply to all layers except bias and LayerNorm weights in the AdamW optimizer
    report_to='wandb', # send training logs to Weights & Biases
    adam_beta1=0.9,  # default=0.9. the beta1 hyperparameter for the AdamW optimizer
    adam_beta2=0.999,  # default=0.999. the beta2 hyperparameter for the AdamW optimizer
    adam_epsilon=1e-8,  # default=1e-8. the epsilon hyperparameter for the AdamW optimizer.
    max_grad_norm=1,  # default=1. maximum gradient norm (for gradient clipping).
    lr_scheduler_type="linear", # default="linear". the scheduler type to use. consider "cosine".
    gradient_accumulation_steps=GRAD_ACC,  # default=1. the number of steps to accumulate gradients for before performing an optimizer update.
    warmup_ratio=WARMUP,  # ratio of total training steps used for a linear warmup from 0 to the learning rate
    seed=42,  # random seed set at the beginning of training. to ensure reproducibility, use the model_init() function to instantiate the model if it has randomly initialized parameters.
)
model

In [None]:
# Data collators form a batch from a list of dataset elements (elements of the same type as
# those in train_dataset / eval_dataset). They may also apply processing such as padding,
# or random data augmentation such as random masking.

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,  # the tokenizer used for encoding the data
    return_tensors='pt', # the type of tensor to return (PyTorch)
    label_pad_token_id=-100, # default. the id used when padding the labels
    padding=True  # padding strategy. True = pad to the longest sequence in the batch.
)
data_collator

In [None]:
# This is not the competition metric, but for now it is better than nothing:
# seqeval is loaded here and used by compute_metrics below.
metric = load_metric("seqeval")

import numpy as np

def compute_metrics(p):
    """Compute overall seqeval scores for one evaluation pass.

    Args:
        p: pair ``(predictions, labels)``; the predictions are reduced with
            argmax over axis 2 to obtain one predicted label id per token.

    Returns:
        Dict with the keys "precision", "recall", "f1" and "accuracy", taken
        from the corresponding "overall_*" entries of the metric result.
    """
    scores, label_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, label_id in zip(predicted_row, label_row):
            # positions labelled -100 are ignored (special tokens)
            if label_id == -100:
                continue
            kept_predictions.append(i2l[predicted_id])
            kept_labels.append(i2l[label_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        name: results[f"overall_{name}"]
        for name in ("precision", "recall", "f1", "accuracy")
    }

# The compute function needs to receive a tuple (with logits and labels) and has to return a dictionary with string keys (the name of the metric) and float values. It will be called at the end of each evaluation phase on the whole arrays of predictions/labels.

In [None]:
# Bundle model, training settings, data and metric function into a single Trainer.
trainer = Trainer(
    model=model,  # token-classification model loaded above
    args=args,  # TrainingArguments defined above
    train_dataset=tokenized_datasets["train"],  # 90% split, tokenized and labelled
    eval_dataset=tokenized_datasets["test"],  # 10% hold-out split, tokenized and labelled
    data_collator=data_collator,  # pads inputs and labels per batch
    tokenizer=tokenizer,  # same tokenizer that produced the datasets
    compute_metrics=compute_metrics,  # seqeval-based scores; not the competition metric
)

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")  # new addition
device

In [None]:
from tqdm.auto import tqdm

# Trainer.train() already runs the complete optimisation loop (batching through the data
# collator, forward/backward pass, optimizer and scheduler steps, progress display), so no
# hand-written epoch/batch loop is needed here. The previous manual loop could not run:
# a data collator is not an iterable of batches and TrainingArguments carries no
# optimizer or scheduler.
#
# Because the TrainingArguments use report_to='wandb', the Trainer reports its own losses
# and evaluation metrics to the active W&B run; no wandb.log(...) call with placeholder
# numbers is made, as that would record values that were never measured.
#
# NOTE(review): the explicit wandb.watch(model) call was dropped as well. The Trainer's W&B
# integration is presumably responsible for model watching (WANDB_WATCH setting) — confirm
# for the installed transformers version.
trainer.train()
wandb.finish()  # close the W&B run once training is done

In [None]:


# Manual single optimisation step, kept as a sanity check of the forward/backward path.
#
# What goes in: two rows of the already tokenized training split. They carry the model
# inputs and the token-level labels, i.e. one label id per token (-100 on positions the
# loss should ignore). A token-classification model needs one label per token, not one
# label per sequence.
# NOTE(review): stacking the rows into one tensor assumes both rows were padded to the
# same length by the tokenization step — confirm.
rows = tokenized_datasets["train"][:2]

batch = {
    key: torch.tensor(rows[key]).to(model.device)  # inputs must live on the model's device
    for key in ("input_ids", "attention_mask", "labels")
}

# parameters() has to be called; the learning rate from the config cell is reused
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

loss = model(**batch).loss
loss.backward()
optimizer.step()
optimizer.zero_grad()  # do not leave stale gradients on the model

In [None]:
import torch
from transformers import AdamW, AutoModelForTokenClassification

In [None]:
import torch

# Pick the GPU when one is available, otherwise the CPU, and move the model there.
# NOTE(review): this repeats the device selection made in an earlier cell, and it sits
# after the training cells — confirm whether it is still needed at this position.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device