# Setup
- run `zip_for_colab.py` locally in the NLP-project directory
- upload the resulting zip file `nlp_proj_colab.zip`
- upload your data CSV

In [1]:
!unzip /content/nlp_proj_colab.zip

Archive:  /content/nlp_proj_colab.zip
   creating: configs/
   creating: environment/
   creating: src/
  inflating: pyproject.toml          
  inflating: setup.cfg               
   creating: configs/multitask/
   creating: configs/singletask/
  inflating: configs/singletask/bert_classifier_hv.yml  
  inflating: configs/singletask/bilstm_regressor_svo_dist.yml  
  inflating: configs/singletask/bert_classifier_scv.yml  
  inflating: configs/singletask/bert_regressor_svo_dist_norm.yml  
  inflating: configs/singletask/bilstm_classifier_svo_dist_norm_disc10.yml  
  inflating: configs/singletask/bilstm_classifier_scv.yml  
  inflating: configs/singletask/bilstm_classifier_hv.yml  
  inflating: configs/singletask/bert_classifier_freeze_apv.yml  
  inflating: configs/singletask/bert_regressor_svo_dist.yml  
  inflating: configs/singletask/bilstm_classifier_apv.yml  
  inflating: configs/singletask/bilstm_regressor_svo_dist_norm.yml  
  inflating: configs/singletask/bilstm_classifier_svo_dis

In [2]:
!pip install -r /content/environment/requirements-colab-train.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting black
  Downloading black-22.10.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 29.7 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 82.1 MB/s 
[?25hCollecting wandb
  Downloading wandb-0.13.6-py2.py3-none-any.whl (1.9 MB)
[K     |████████████████████████████████| 1.9 MB 56.6 MB/s 
[?25hCollecting torchmetrics
  Downloading torchmetrics-0.11.0-py3-none-any.whl (512 kB)
[K     |████████████████████████████████| 512 kB 79.6 MB/s 
Collecting mypy-extensions>=0.4.3
  Downloading mypy_extensions-0.4.3-py2.py3-none-any.whl (4.5 kB)
Collecting click>=8.0.0
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 6.1 MB/s 
[?25hCollecting pathspec>=0.9.0
  Downloading paths

# Imports

In [1]:
import sys
sys.path.append('/content/')
sys.path.append('/content/src/')
sys.path.append('/content/src/nlp_proj/')

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import wandb
import torch
import torch.nn as nn
import logging 
from pprint import pformat
import pathlib
logging.getLogger().setLevel(logging.INFO)

from nlp_proj.model_optim_utils import make_model, make_optimizer, make_criterion
from nlp_proj.config_utils import load_config
from nlp_proj.dataset_utils import make_dataloader, make_tokenizer, make_datasets
from nlp_proj.train_utils_singletask import train_model_singletask, test_model_singletask
from nlp_proj.train_utils_multitask import train_model_multitask, test_model_multitask
from nlp_proj.train import model_pipeline

%load_ext autoreload
%autoreload 2

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


# Configuration

In [12]:
# CONFIGURATION
config_filepath = "/content/configs/multitask/bert_multitask_2.yml"  
data_filepath = "/content/train_auto_annotations_UPDATED_cleaned.csv" 
results_dir = "/content/results/bert_multitask_2/"
test_run = False
test_run_n_samples = 30
batch_size = 32

In [13]:
# Load config
config = load_config(config_filepath)
config["data_filepath"] = data_filepath
config["results_dir"] = results_dir

# Alter config
config["batch_size"] = batch_size

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
config["device"] = device
logging.info("Loaded config:")
logging.info(pformat(config))

INFO:root:Loaded config:
INFO:root:{'architecture': 'BERTMultitask',
 'base_lr': 0.001,
 'batch_size': 32,
 'data_filepath': '/content/train_auto_annotations_UPDATED_cleaned.csv',
 'device': device(type='cuda', index=0),
 'dim_hid': 64,
 'early_stopping': True,
 'early_stopping_label_col': 'scv',
 'early_stopping_metric': 'f1',
 'freeze_pretrained': False,
 'grad_clip': 5,
 'label_cols': ['apv', 'scv'],
 'label_criterion': ['CrossEntropyLoss', 'CrossEntropyLoss'],
 'logging_freq': 20,
 'max_epochs': 3,
 'multitask': True,
 'num_classes_list': [3, 3],
 'optimizer': 'ADAM',
 'project_name': 'bert-multitask',
 'random_seed': 42,
 'results_dir': '/content/results/bert_multitask_2/',
 'shuffle': False,
 'start_epoch': 0,
 'tasks': ['classification', 'classification'],
 'test_run': False,
 'test_run_n_samples': 10,
 'weight_decay': 0.0001}


In [14]:
# Create results_dir direectory
results_dir = config["results_dir"]
if not pathlib.Path(results_dir).is_dir():
    pathlib.Path(results_dir).mkdir(parents=True)
    logging.info(f"Created results directory {results_dir}")
else:
    logging.warning(f"Results directory {results_dir} already exists, and running may overwrite files")

INFO:root:Created results directory /content/results/bert_multitask_2/


In [15]:
# Wandb setup
wandb.login()

True

# Run Training

In [16]:
model_pipeline(config)

INFO:root:Loaded tokenizer
INFO:root:Loaded all data of length 31799
INFO:root:Made train (length 22260), validation (length 3180), and test (length 6359) data split
INFO:root:Made data loaders
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
INFO:root:Loaded model with 66461702 trainable parameters
INFO:root:Created

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
batch_ct,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅█████████████
model_n_params,▁
test_apv_acc,▁
test_apv_f1,▁
test_samples,▁
test_scv_acc,▁
test_scv_f1,▁
train_apv_acc,▁▇█
train_apv_f1,▁▇█

0,1
batch_ct,2088.0
epoch,2.0
model_n_params,66461702.0
test_apv_acc,0.98852
test_apv_f1,0.98443
test_samples,6359.0
test_scv_acc,0.9838
test_scv_f1,0.9807
train_apv_acc,0.99481
train_apv_f1,0.97567


BERTMultitask(
  (pretrained_layers): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Li

# Cleanup
Delete source code files and data to upload newer versions

In [None]:
# !rm -rf configs/ environment/ src/ results/ wandb/ nlp_proj_colab.zip pyproject.toml setup.cfg

In [None]:
# !rm -rf rule_based_annotations_new_svo_dist.csv

# Test a Trained Model

In [None]:
from nlp_proj.model_optim_utils import make_model, make_optimizer, make_criterion
from nlp_proj.config_utils import load_config
from nlp_proj.dataset_utils import make_dataloader, make_tokenizer, make_datasets
from nlp_proj.train_utils_singletask import train_model_singletask, test_model_singletask
from nlp_proj.train_utils_multitask import train_model_multitask, test_model_multitask
from types import SimpleNamespace
from torchmetrics.functional import confusion_matrix

In [None]:
# CONFIGURATION
model_filepath = "/content/results/bilstm_classifier_hv/bilstm-hv.pt"

In [None]:
# Load model
model = torch.load(model_filepath)

In [None]:
# Setup config
config_ns = SimpleNamespace(**config)

# Make tokenizer
tokenizer = make_tokenizer()
logging.info("Loaded tokenizer")
config_ns.vocab_size = tokenizer.vocab_size

# Make the data loaders
train, val, test = make_datasets(config_ns)
train_loader = make_dataloader(train, tokenizer, config_ns)
val_loader = make_dataloader(val, tokenizer, config_ns)
test_loader = make_dataloader(test, tokenizer, config_ns)
logging.info("Made data loaders")

INFO:root:Loaded tokenizer
INFO:root:Loaded all data of length 31906
INFO:root:Made train (length 22335), validation (length 3190), and test (length 6381) data split
INFO:root:Made data loaders


In [None]:
# Test data Evaluation
evaluation, total = test_model_singletask(model, test_loader, config_ns)
evaluation

In [None]:
# Device
device = config_ns.device
model = model.to(device)

model.eval()

all_preds = []
all_labels = []

# Run the model on some test examples
with torch.no_grad():
    total = 0
    for batch_x, batch_y in test_loader:
        # Push batch_x to device
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        # Forward
        input_ids = batch_x.input_ids
        attention_mask = batch_x.attention_mask
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Predictions
        if config_ns.task == "classification":
            batch_preds = torch.argmax(outputs, dim=-1)
        elif config_ns.task == "regression":
            batch_preds = outputs.squeeze()

        all_preds.append(batch_preds)
        all_labels.append(batch_y)
        total += batch_y.size(0)

all_preds = torch.concat(all_preds).to(device)
all_labels = torch.concat(all_labels).to(device)

In [None]:
# Confusion matrix 
conf = confusion_matrix(
    all_preds, 
    all_labels, 
    task="multiclass", 
    num_classes=config_ns.num_classes, 
)
conf

tensor([[   0,  104,    0],
        [   0, 5932,    0],
        [   0,  345,    0]], device='cuda:0')