# End-to-end

This notebook should form the core skeleton of the `run` function.

## Colab Setup

In [1]:
# -- env setup
import os
import gc

!python3.7 -m pip install git+https://github.com/namiyousef/colab-utils.git
from colabtools.utils import get_gpu_utilization, mount_drive, install_private_library

drive_path = mount_drive()
project_path = os.path.join(drive_path, 'argument-mining')
development_dir = os.path.join(drive_path, 'argument-mining/argminer')

install_private_library(os.path.join(project_path, 'data/github_config.json'), 'argument-mining')

Collecting git+https://github.com/namiyousef/colab-utils.git
  Cloning https://github.com/namiyousef/colab-utils.git to /tmp/pip-req-build-8yzr10gp
  Running command git clone -q https://github.com/namiyousef/colab-utils.git /tmp/pip-req-build-8yzr10gp
Building wheels for collected packages: colabtools
  Building wheel for colabtools (setup.py) ... [?25l[?25hdone
  Created wheel for colabtools: filename=colabtools-0.0.5-py3-none-any.whl size=3585 sha256=0f5eff45790806532dcbae2f6ec205035ff211552dcd2805e22ecf6cd4f0d162
  Stored in directory: /tmp/pip-ephem-wheel-cache-eberl06s/wheels/1c/35/c0/364531e4ff0f0fe0f3296c80f1ee668b03ae6c6c378c5a44bf
Successfully built colabtools
Installing collected packages: colabtools
Successfully installed colabtools-0.0.5
Google Drive import successful.
CUDA device detected. Using GPU...
Mounted at /content/drive
Google Drive mount successful.


### Imports

In [26]:
# -- public imports

from transformers import AutoTokenizer, AutoModelForTokenClassification
import pandas as pd
from torch.utils.data import DataLoader
import torch
from pandas.testing import assert_frame_equal
import time

# -- private imports
from colabtools.utils import move_to_device
from colabtools.config import DEVICE

# -- dev imports
%load_ext autoreload
%autoreload 2

from argminer.data import ArgumentMiningDataset, TUDarmstadtProcessor
from argminer.evaluation import inference
from argminer.utils import encode_model_name


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
# Run configuration. These literals stand in for the inputs that will
# eventually be passed to `run`.

# -- model checkpoint and maximum sequence length handed to the dataset
model_name = 'google/bigbird-roberta-base'
max_length = 1024

# -- labelling strategy; the part after the underscore is what the processor receives
strategy = 'standard_bio'
strat_name = strategy.split('_')[1]

# -- training loop
epochs = 5
batch_size = 2
# a checkpoint is written whenever `epoch % save_freq == 0` (epoch is 0-based)
save_freq = 2
# any truthy value prints the per-epoch summary; values > 1 also print per-batch timings
verbose = 2


### Tokenizer, Model and Optimizer

In [13]:
# Tokenizer and token-classification head are both built from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    add_prefix_space=True,
)
# NOTE(review): num_labels is hard-coded to 15, while the label map built in the
# dataset cell lists 7 labels — confirm this is intended.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=15,
)
# TODO force_download
# TODO add option for optimizer
# Adam with its default hyper-parameters over every model parameter.
optimizer = torch.optim.Adam(model.parameters())

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForTokenClassification were no

### Dataset 
Note: this will change as the Processor develops. On the cluster you will need to use different options.

In [23]:
# Run the whole processor pipeline in one chain; `processor` ends up bound to
# whatever `postprocess()` returns, and its `dataframe` attribute is read below.
processor = (
    TUDarmstadtProcessor(
        os.path.join(project_path, 'data/UCL/dataset2/ArgumentAnnotatedEssays-2.0/brat-project-final')
    )
    .preprocess()
    .process(strat_name)
    .postprocess()
)
df_total = processor.dataframe

# Development split: first 10 rows for training, last 201 rows for testing.
# The two slices are not checked to cover df_total (that check is disabled):
# assert_frame_equal(df_total[['text', 'labels']], pd.concat([df_train, df_test]))
df_train = df_total[['text', 'labels']].iloc[:10]
df_test = df_total[['text', 'labels']].iloc[-201:]

# TODO: the label set changes with the labelling strategy; this map is the one
# used for the 'bio' strategy.
bio_labels = ['O', 'B-MajorClaim', 'I-MajorClaim', 'B-Claim', 'I-Claim', 'B-Premise', 'I-Premise']
df_label_map = pd.DataFrame({
    'label_id': list(range(len(bio_labels))),
    'label': bio_labels,
})

# Sanity check: the first training row must contain exactly the labels in the map.
assert set(df_train['labels'].iloc[0]) == set(df_label_map['label'])

Found non-matching segments:--------------------------------------------------

murdering criminals is therefore immoral and hard to accept

"murdering" criminals is therefore immoral and hard to accept

Found non-matching segments:--------------------------------------------------

Click is a very interesting comedy, with a serious approach about the importance of having a balanced life between family and work businesses

"Click" is a very interesting comedy, with a serious approach about the importance of having a balanced life between family and work businesses

Found non-matching segments:--------------------------------------------------

Blood diamond, an adaptation of a real story in South Africa, focuses on the link between diamonds and conflict

"Blood diamond", an adaptation of a real story in South Africa, focuses on the link between diamonds and conflict

Found non-matching segments:--------------------------------------------------

rush hours are usually not the direct co

In [24]:
# Wrap both splits in the project dataset; only the test split is built with is_train=False.
train_set = ArgumentMiningDataset(
    df_label_map, df_train, tokenizer, max_length, strategy
)
test_set = ArgumentMiningDataset(
    df_label_map, df_test, tokenizer, max_length, strategy,
    is_train=False,
)

# Batches are drawn in dataset order (no shuffling is requested).
train_loader = DataLoader(dataset=train_set, batch_size=batch_size)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size)

In [31]:
# Training loop: fine-tunes `model` on `train_loader` for `epochs` epochs,
# printing timing / memory diagnostics and writing checkpoints under ./models.
if not os.path.exists('models'):
  os.makedirs('models')
  print('models directory created!')
model.to(DEVICE)
print(f'Model pushed to device: {DEVICE}')

# DEVICE may be a plain string or a torch.device; normalise it once so the
# GPU diagnostics are skipped reliably when running on CPU.
on_gpu = torch.device(DEVICE).type != 'cpu'

for epoch in range(epochs):
    model.train()
    start_epoch_message = f'EPOCH {epoch + 1} STARTED'
    print(start_epoch_message)
    print(f'{"-" * len(start_epoch_message)}')
    start_epoch = time.time()

    start_load = time.time()
    training_loss = 0
    num_batches = 0  # tracked explicitly so an empty loader cannot leave `i` unbound
    for i, (inputs, targets) in enumerate(train_loader):
        # everything between start_load and start_train is time spent fetching the batch
        start_train = time.time()
        inputs = move_to_device(inputs, DEVICE)
        targets = move_to_device(targets, DEVICE)
        if on_gpu:
            print(f'GPU Utilisation at batch {i+1} after data loading: {get_gpu_utilization()}')

        # clear gradients left over from the previous step (once per step is enough)
        optimizer.zero_grad()

        # forward pass; with return_dict=False and labels supplied, the model
        # returns a tuple that is unpacked here as (loss, outputs)
        loss, outputs = model(
            labels=targets,
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            return_dict=False
        )
        if on_gpu:
            print(f'GPU Utilisation at batch {i+1} after training: {get_gpu_utilization()}')

        training_loss += loss.item()

        # backward pass
        loss.backward()
        optimizer.step()

        # drop references to this batch and release cached GPU memory before the next one
        del targets, inputs, loss, outputs
        gc.collect()
        torch.cuda.empty_cache()

        end_train = time.time()
        num_batches = i + 1

        if verbose > 1:
            print(
                f'Batch {i + 1} complete. Time taken: load({start_train - start_load:.3g}), '
                f'train({end_train - start_train:.3g}), total({end_train - start_load:.3g}). '
            )
        start_load = time.time()

    # mean loss over the batches seen this epoch; NaN when the loader yielded nothing
    mean_loss = training_loss / num_batches if num_batches else float('nan')
    print_message = f'Epoch {epoch + 1}/{epochs} complete. ' \
                    f'Time taken: {start_load - start_epoch:.3g}. ' \
                    f'Loss: {mean_loss: .3g}'

    if verbose:
        print(f'{"-" * len(print_message)}')
        print(print_message)
        print(f'{"-" * len(print_message)}')

    # `epoch` is 0-based, so checkpoints land on epochs 1, 1 + save_freq, ...
    if epoch % save_freq == 0:
        encoded_model_name = encode_model_name(model_name, epoch+1)
        save_path = f'models/{encoded_model_name}'
        model.save_pretrained(save_path)
        print(f'Model saved at epoch {epoch+1} at: {save_path}')

# Always keep the final weights under a dedicated 'final' name.
encoded_model_name = encode_model_name(model_name, 'final')
save_path = f'models/{encoded_model_name}'
model.save_pretrained(save_path)
# `epochs` equals the last `epoch + 1`, and stays defined even when epochs == 0
print(f'Model saved at epoch {epochs} at: {save_path}')

Model pushed to device: cuda
EPOCH 1 STARTED
---------------
GPU Utilisation at batch 1 after data loading: 2739


  * num_indices_to_pick_from


GPU Utilisation at batch 1 after training: 7297
Batch 1 complete. Time taken: load(0.0163), train(2.09), total(2.1). 
GPU Utilisation at batch 2 after data loading: 2739
GPU Utilisation at batch 2 after training: 7297
Batch 2 complete. Time taken: load(0.0179), train(1.95), total(1.97). 
GPU Utilisation at batch 3 after data loading: 2739
GPU Utilisation at batch 3 after training: 7297
Batch 3 complete. Time taken: load(0.0136), train(1.97), total(1.98). 
GPU Utilisation at batch 4 after data loading: 2739
GPU Utilisation at batch 4 after training: 7297
Batch 4 complete. Time taken: load(0.0173), train(2.04), total(2.06). 
GPU Utilisation at batch 5 after data loading: 2739
GPU Utilisation at batch 5 after training: 7297
Batch 5 complete. Time taken: load(0.0277), train(2.07), total(2.1). 
--------------------------------------------------
Epoch 1/5 complete. Time taken: 10.2. Loss:  0.482
--------------------------------------------------
Model saved at epoch 1 at: models/Z29vZ2xlL2Jp

In [32]:
# load trained model
path = ''
trained_model = AutoModelForTokenClassification.from_pretrained(path)

In [33]:
# Evaluate the reloaded model on the test loader. `inference` returns two
# objects, unpacked here as a metrics table and a scores table.
# NOTE(review): presumably both are DataFrames (df_scores is displayed as one
# in the next cell) — confirm against argminer.evaluation.
df_metrics, df_scores = inference(trained_model, test_loader)

  * num_indices_to_pick_from


In [34]:
# Show the per-class score table (tp / fn / fp / f1 columns) produced by `inference`.
df_scores

Unnamed: 0,tp,class,fn,fp,f1
0,0.0,0,47,0,0.0
1,0.0,1,3,0,0.0
2,0.0,2,9,0,0.0
3,0.0,3,33,2,0.0
0,0.0,0,37,0,0.0
...,...,...,...,...,...
3,0.0,3,12,2,0.0
0,0.0,0,16,0,0.0
1,0.0,1,2,0,0.0
2,0.0,2,2,0,0.0
