# End-to-end

This notebook should form the core skeleton of the 'run' function

## Colab Setup

In [20]:
# -- env setup
import os
import gc

!python3.7 -m pip install git+https://github.com/namiyousef/colab-utils.git
from colabtools.utils import get_gpu_utilization, mount_drive, install_private_library

drive_path = mount_drive()
project_path = os.path.join(drive_path, 'COMP0087/data/core')
#development_dir = os.path.join(drive_path, 'argument-mining/argminer')

install_private_library(os.path.join(drive_path, 'nlp/github_config.json'), 'argument-mining')

Collecting git+https://github.com/namiyousef/colab-utils.git
  Cloning https://github.com/namiyousef/colab-utils.git to /tmp/pip-req-build-8ukhohse
  Running command git clone -q https://github.com/namiyousef/colab-utils.git /tmp/pip-req-build-8ukhohse
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mount successful.


### Imports

In [21]:
# -- public imports

from transformers import AutoTokenizer, AutoModelForTokenClassification
import pandas as pd
from torch.utils.data import DataLoader
import torch
from pandas.testing import assert_frame_equal
import time

# -- private imports
from colabtools.utils import move_to_device
from colabtools.config import DEVICE

# -- dev imports
%load_ext autoreload
%autoreload 2

from argminer.data import ArgumentMiningDataset, TUDarmstadtProcessor, PersuadeProcessor
from argminer.evaluation import inference
from argminer.utils import encode_model_name
from argminer.config import LABELS_MAP_DICT



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [22]:
# constants (these will be abstracted away by inputs that you give to run)

# -- model specific configurations
model_name = 'google/bigbird-roberta-base'
max_length = 1024

# -- training configurations
epochs = 5
batch_size = 2
verbose = 2
save_freq = 2

# -- dataset configurations
dataset_name = 'Persuade'

# -- experiment configurations
# A strategy is written '<strategy name>_<labelling scheme>', e.g. 'standard_bieo'.
strategy = 'standard_bieo'
strat_name, strat_label = strategy.split('_')

# -- inferred configurations
# Resolve the processor class through an explicit registry rather than eval():
# dataset_name will eventually be a user-supplied input, and eval() would execute
# arbitrary text and fail with an opaque NameError on a typo.
PROCESSOR_REGISTRY = {
    'TUDarmstadt': TUDarmstadtProcessor,
    'Persuade': PersuadeProcessor,
}
if dataset_name not in PROCESSOR_REGISTRY:
    raise ValueError(
        f'Unknown dataset_name {dataset_name!r}; expected one of {sorted(PROCESSOR_REGISTRY)}'
    )
Processor = PROCESSOR_REGISTRY[dataset_name]

# Label map for this dataset / labelling scheme; the number of distinct values in
# its `label` column sizes the token-classification head.
df_label_map = LABELS_MAP_DICT[dataset_name][strat_label]
num_labels = len(set(df_label_map.label))


### Tokenizer, Model and Optimizer

In [None]:
# Tokenizer for the chosen checkpoint; add_prefix_space=True is passed through
# to the tokenizer unchanged.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)

# Token-classification model with one output class per distinct label.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)
# TODO force_download
# TODO add option for optimizer
# NOTE(review): Adam is created with its default hyper-parameters; the default
# learning rate may be high for fine-tuning a pretrained transformer -- confirm.
optimizer = torch.optim.Adam(params=model.parameters())

Downloading:   0%|          | 0.00/0.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/826k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/775 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/489M [00:00<?, ?B/s]

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForTokenClassification were no

### Dataset 
Note: this will change as the Processor develops. On the cluster, you will need to use different options.

In [None]:
# Build the processor for the data directory of the selected labelling scheme
# and populate it via from_json().
data_dir = os.path.join(project_path, strat_label)
processor = Processor(data_dir).from_json()
df_total = processor.dataframe

# Train/test/validation split; only the text and label columns are kept.
model_columns = ['text', 'labels']
df_dict = processor.get_tts(test_size=0.3, val_size=0.1)
df_train = df_dict.get('train')[model_columns]
df_test = df_dict.get('test')[model_columns]
df_val = df_dict.get('val')[model_columns]

#df_train = df_total[['text', 'labels']].head(10) 
#df_test = df_total[['text', 'labels']].tail(201)


#assert_frame_equal(df_total[['text', 'labels']], pd.concat([df_train, df_test]))

# todo this changes NOTE FIXED BT STRATEGY!!
# todo this needs to get updated as well.....


In [None]:
# Wrap the train and test frames in datasets; the test set is built with
# is_train=False.
train_set = ArgumentMiningDataset(
    df_label_map, df_train, tokenizer, max_length, strategy
)
test_set = ArgumentMiningDataset(
    df_label_map, df_test, tokenizer, max_length, strategy,
    is_train=False,
)

# Batch both datasets with the configured batch size.
# NOTE(review): the training loader does not shuffle (DataLoader's default);
# confirm that get_tts already randomises the row order.
train_loader = DataLoader(train_set, batch_size=batch_size)
test_loader = DataLoader(test_set, batch_size=batch_size)

In [None]:
# Training loop: runs `epochs` passes over train_loader, prints timing and loss,
# and writes checkpoints under ./models.

# Create the checkpoint directory. exist_ok=True keeps this safe if the directory
# appears between the existence check and the creation.
if not os.path.exists('models'):
    os.makedirs('models', exist_ok=True)
    print('models directory created!')

model.to(DEVICE)
print(f'Model pushed to device: {DEVICE}')

for epoch in range(epochs):
    model.train()
    start_epoch_message = f'EPOCH {epoch + 1} STARTED'
    print(start_epoch_message)
    print(f'{"-" * len(start_epoch_message)}')
    start_epoch = time.time()

    start_load = time.time()
    training_loss = 0
    # Counted explicitly so the mean loss is well defined even if the loader
    # yields no batches (the loop index would otherwise be undefined).
    num_batches = 0
    for i, (inputs, targets) in enumerate(train_loader):
        start_train = time.time()
        inputs = move_to_device(inputs, DEVICE)
        targets = move_to_device(targets, DEVICE)
        if DEVICE != 'cpu':
            print(f'GPU Utilisation at batch {i+1} after data loading: {get_gpu_utilization()}')

        # Clear gradients left over from the previous step (once per step is enough).
        optimizer.zero_grad()

        # With return_dict=False and labels supplied, the model call is unpacked
        # as (loss, outputs).
        loss, outputs = model(
            labels=targets,
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            return_dict=False
        )
        if DEVICE != 'cpu':
            print(f'GPU Utilisation at batch {i+1} after training: {get_gpu_utilization()}')

        training_loss += loss.item()
        num_batches += 1

        # backward pass
        loss.backward()
        optimizer.step()

        # Drop references to this batch before the next one is loaded, then
        # release cached GPU memory.
        del targets, inputs, loss, outputs
        gc.collect()
        torch.cuda.empty_cache()

        end_train = time.time()

        if verbose > 1:
            print(
                f'Batch {i + 1} complete. Time taken: load({start_train - start_load:.3g}), '
                f'train({end_train - start_train:.3g}), total({end_train - start_load:.3g}). '
            )
        start_load = time.time()

    # Mean loss over the batches seen this epoch; NaN when there were none.
    mean_loss = training_loss / num_batches if num_batches else float('nan')
    print_message = f'Epoch {epoch + 1}/{epochs} complete. ' \
                    f'Time taken: {start_load - start_epoch:.3g}. ' \
                    f'Loss: {mean_loss: .3g}'

    if verbose:
        print(f'{"-" * len(print_message)}')
        print(print_message)
        print(f'{"-" * len(print_message)}')

    # NOTE(review): `epoch` is zero-based, so this saves after epochs 1, 1+save_freq, ...
    # If the intent is "every save_freq epochs", the test should be
    # (epoch + 1) % save_freq == 0 -- confirm before changing.
    if epoch % save_freq == 0:
        encoded_model_name = encode_model_name(model_name, epoch+1)
        save_path = f'models/{encoded_model_name}'
        model.save_pretrained(save_path)
        print(f'Model saved at epoch {epoch+1} at: {save_path}')

# Always save the final weights. `epochs` equals the last epoch number after a
# completed loop and, unlike `epoch`, is defined even when epochs == 0.
encoded_model_name = encode_model_name(model_name, 'final')
save_path = f'models/{encoded_model_name}'
model.save_pretrained(save_path)
print(f'Model saved at epoch {epochs} at: {save_path}')

In [None]:
# load trained model
# Default to the final checkpoint written by the training cell; an empty path
# makes from_pretrained fail. Override `path` to evaluate another checkpoint.
path = f'models/{encode_model_name(model_name, "final")}'
trained_model = AutoModelForTokenClassification.from_pretrained(path)

In [None]:
# Evaluate the reloaded model on the test loader; inference() returns two
# objects, bound here as df_metrics and df_scores.
# NOTE(review): trained_model is not moved to DEVICE in this notebook -- confirm
# whether inference() handles device placement itself.
df_metrics, df_scores = inference(trained_model, test_loader)

In [None]:
# Display the scores returned by inference() as the cell output.
df_scores