In [7]:
%load_ext autoreload
%autoreload 2

import argparse
import os
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import yaml
from src import BertClassifier
from src import datasets as data_utils
from src import train_utils, utils
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm

import wandb

# Ask the project helper which compute device to use.
device = utils.get_device()

# Read the YAML hyper-parameters and pass this run's overrides as keyword
# arguments (the cell output below shows them reflected in the result).
# NOTE(review): no RNG seed is set in this cell — confirm whether the project
# helpers seed internally before relying on run-to-run reproducibility.
run_overrides = {"epochs": 9, "learning_rate": 2e-4, "batch_size": 32}
config = utils.load_config("model_params/bert_classifier.yaml", **run_overrides)

# Tokenizer arguments that are identical for the train and test splits.
tokenizer_settings = {
    "tokenizer_name": config["bert_model_name"],
    "max_seq_len": config["max_sequence_length"],
}

# Create datasets
train_dataset = data_utils.create_train_sst2(
    device=device,
    num_samples=config["num_training_examples"],
    **tokenizer_settings,
)
test_dataset = data_utils.create_test_sst2(device=device, **tokenizer_settings)

# One example per batch, in dataset order (batch_size=1, shuffle=False).
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=1)

# Echo the resolved configuration as the cell's output.
config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


100%|████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 14614.69it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 872/872 [00:00<00:00, 10776.01it/s]


{'epochs': 9,
 'bert_model_name': 'distilbert-base-uncased',
 'max_sequence_length': 64,
 'learning_rate': 0.0002,
 'batch_size': 32,
 'classifier_init_state_path': 'model_params/init_classifier_params.pt',
 'classifier_type': 'single-fc',
 'classifier_hidden_size': 0,
 'classifier_drop_out': 0,
 'optimizer_weight_decay': 0.001,
 'num_training_examples': 10000}

In [8]:
# Train a classifier with the hyper-parameters held in `config`.
# NOTE(review): the test split is also passed as `validation_dataset`, so any
# validation metric reported for this run is computed on the test data.
full_model = train_utils.train_bert_model(
    train_dataset,
    test_dataset,
    config,
    validation_dataset=test_dataset,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initial 0.7111523025968206, 46.674311926605505


100%|████████████████████████████████████████████████████████████████████████████████████████████████| 313/313 [00:19<00:00, 16.23batch/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 313/313 [00:19<00:00, 16.09batch/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 313/313 [00:19<00:00, 16.29batch/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 313/313 [00:19<00:00, 16.16batch/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 313/313 [00:19<00:00, 16.35batch/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 313/313 [00:18<00:00, 16.53batch/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 313/313 [00:19<00:00, 16.36batch/s]
100%|██████████████████████

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▂▃▄▅▅▆▇█
train/accuracy,▁▆▇▇█████
train/batch_loss,█▇▅▅▆▆▆▅▆▃▃▅▃▃▅▄▄▃▂▄▃▄▃▂▃▅▃▂▁▆▃▃▅▂▃▃▂▂▂▂
train/loss,█▅▃▂▂▂▁▁▁
val/accuracy,▁▅▆▆▇▇█▇█
val/loss,█▅▃▂▂▁▁▁▁

0,1
epoch,9.0
train/accuracy,82.45807
train/batch_loss,0.1421
train/loss,0.39815
val/accuracy,83.1422
val/loss,0.39864


In [10]:
# Score the trained model on the test loader. The helper's result is unpacked
# into three names; the last two (loss, accuracy) are shown as the cell output.
evaluation = train_utils.evaluate_loss(full_model, test_dataloader)
ldf, loss, acc = evaluation
(loss, acc)

(0.39863892992381156, 83.14220183486239)

In [11]:
# Persist the trained model, passing the "model_params" directory name and the
# current `config` mapping to the model's own save routine.
full_model.save_model("model_params", config)

In [22]:
# Display the current `config` mapping. `config` is assigned in the setup cell
# and is rebound by the BertClassifier.load_model(...) cells of this notebook,
# so the value shown depends on which of those cells ran most recently.
config

{'epochs': 30,
 'bert_model_name': 'distilbert-base-uncased',
 'max_sequence_length': 64,
 'learning_rate': 0.005,
 'lr_warmup_pct': 0.2,
 'batch_size': 16,
 'classifier_init_state_path': 'model_params/bert-epoch30-reg0.001-10000.pt',
 'classifier_type': 'single-fc',
 'classifier_hidden_size': 0,
 'classifier_drop_out': 0,
 'optimizer_weight_decay': 0.01,
 'num_training_examples': 10000}

In [4]:
# Load the model described by bert-best.yaml and score it on the test loader.
# NOTE(review): this rebinds the notebook-level `config` to whatever
# load_model returns alongside the model.
og_model, config = BertClassifier.load_model("model_params/bert-best.yaml")
ldf2, l, a = train_utils.evaluate_loss(og_model, test_dataloader)
(l, a)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(0.37703973579917965, 83.14220183486239)

In [12]:
# Overrides applied on top of the YAML file for the second training run.
config2_overrides = {
    "epochs": 30,
    "num_training_examples": 10000,
    "optimizer_weight_decay": 0.001,
}
config2 = utils.load_config("model_params/bert_classifier.yaml", **config2_overrides)

# Train without a validation split and unpack the result into four names.
# NOTE(review): the earlier training cell binds the result of
# train_bert_model to a single name — confirm what the helper returns when
# `validation_dataset` is omitted.
training_result = train_utils.train_bert_model(train_dataset, test_dataset, config2)
model2, fdf2, full_test_loss2, full_test_acc2 = training_result

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Initial 0.7111523025968206, 46.674311926605505


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [00:21<00:00, 29.52batch/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [00:20<00:00, 29.98batch/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [00:20<00:00, 29.82batch/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [00:20<00:00, 29.88batch/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [00:20<00:00, 29.84batch/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 625/625 [00:20<00:00, 29.81batch/s]
100%|█████████████████████████████████████████████████████

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
train/accuracy,▁▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇████████
train/batch_loss,▆▅▆▄▅▆▁▄▄▁▄▆▃▄▂▃▄▅▃█▂▃▃▅▅▄▂▃▃▇▄▄▇▅▂▇▅▃▂▃
train/loss,█▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁

0,1
epoch,30.0
test/accuracy,83.1422
test/loss,0.37704
train/accuracy,85.12
train/batch_loss,0.06538
train/loss,0.35042


Final 0.37703973579917965, 83.14220183486239


In [20]:
# Persist model2 together with its configuration, keys in sorted order.
# The sorted mapping is built here from `config2` instead of being read from
# `config3`: `config3` is only assigned in a cell further down the notebook,
# so on a top-to-bottom run it would not exist yet and this cell would raise
# NameError. dict(sorted(config2.items())) is the same expression that cell
# uses to build `config3`.
model2.save_model("model_params", dict(sorted(config2.items())))

In [13]:
# Display the configuration mapping used for the second training run
# (`config2`, produced by utils.load_config with that run's overrides).
config2

{'epochs': 30,
 'bert_model_name': 'distilbert-base-uncased',
 'max_sequence_length': 64,
 'learning_rate': 0.005,
 'lr_warmup_pct': 0.2,
 'batch_size': 16,
 'classifier_init_state_path': 'model_params/init_classifier_params.pt',
 'classifier_type': 'single-fc',
 'classifier_hidden_size': 0,
 'classifier_drop_out': 0,
 'optimizer_weight_decay': 0.001,
 'num_training_examples': 10000}

In [15]:
# Copy config2 into a new dict whose keys are inserted in sorted order, then
# show it as the cell output.
config3 = {key: config2[key] for key in sorted(config2)}
config3

{'batch_size': 16,
 'bert_model_name': 'distilbert-base-uncased',
 'classifier_drop_out': 0,
 'classifier_hidden_size': 0,
 'classifier_init_state_path': 'model_params/init_classifier_params.pt',
 'classifier_type': 'single-fc',
 'epochs': 30,
 'learning_rate': 0.005,
 'lr_warmup_pct': 0.2,
 'max_sequence_length': 64,
 'num_training_examples': 10000,
 'optimizer_weight_decay': 0.001}

In [None]:
# -- separator: kept as a comment because a bare "--" is not valid Python and
# would stop a Restart & Run All at this cell.

In [21]:
# Load a previously saved classifier from its YAML description and move it to
# the active device.
# NOTE(review): this rebinds the notebook-level `config`.
checkpoint_yaml = "results_10k_strong_l2/bert-classifier-epoch5-10000-strong-l2.yaml"
model, config = BertClassifier.load_model(checkpoint_yaml)
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# Text rendering of the loaded model's classifier-head state dict.
str(model.classifier.state_dict())

"OrderedDict([('1.weight', tensor([[ 0.0633,  0.1250, -0.1110,  ...,  0.0867,  0.0103,  0.0697],\n        [-0.0633, -0.1250,  0.1110,  ..., -0.0867, -0.0094, -0.0699]],\n       device='cuda:0')), ('1.bias', tensor([-0.0212,  0.0175], device='cuda:0'))])"

In [23]:
# Text rendering of full_model's classifier-head state dict, for side-by-side
# comparison with the same rendering of `model`.
str(full_model.classifier.state_dict())

"OrderedDict([('1.weight', tensor([[ 0.0633,  0.1250, -0.1110,  ...,  0.0867,  0.0103,  0.0697],\n        [-0.0633, -0.1250,  0.1110,  ..., -0.0867, -0.0094, -0.0699]],\n       device='cuda:0')), ('1.bias', tensor([-0.0212,  0.0175], device='cuda:0'))])"

In [4]:
# Show the test loss and accuracy as a tuple.
# NOTE(review): neither name is assigned in the cells visible here — confirm
# which cell (or earlier kernel session) defines them.
full_test_loss, full_test_acc

(0.3953259670319262, 83.02752293577981)

In [5]:
# Persist full_model, passing the "model_params" directory name and the
# current `config` mapping to the model's own save routine.
# NOTE(review): `config` is rebound by the BertClassifier.load_model(...) cells
# in this notebook — confirm it still describes full_model when this runs.
full_model.save_model("model_params", config)

In [6]:
# Display the results frame; the later cells read its `loss` column.
# NOTE(review): `fdf` is not assigned in the cells visible here — presumably it
# comes from a train_bert_model call like the one that binds `fdf2`; confirm.
fdf

Unnamed: 0,test_guid,logits,pred,label,loss
0,0,"[-1.9787033, 1.5581679]",1,1,0.028689
1,1,"[0.64768565, -1.0567914]",0,0,0.167096
2,2,"[-1.8197987, 1.4446558]",1,1,0.037506
3,3,"[-1.4292126, 1.0838774]",1,1,0.077903
4,4,"[0.46477368, -0.8791866]",0,0,0.231755
...,...,...,...,...,...
867,867,"[-0.7241081, 0.34439066]",1,0,1.363795
868,868,"[-0.32096845, -0.083922565]",1,1,0.581632
869,869,"[-0.671131, 0.29042196]",1,0,1.285301
870,870,"[-0.11103187, -0.28170618]",0,0,0.611447


In [7]:
# Quartile cut-offs of the `loss` column, used by the cells below to pick
# example rows from each loss band.
loss_column = fdf["loss"]
firstq_loss = loss_column.quantile(q=0.25)
median_loss = loss_column.quantile(q=0.5)
thirdq_loss = loss_column.quantile(q=0.75)

In [8]:
# First two rows whose loss lies in [first quartile, median).
in_q1_to_median = fdf["loss"].ge(firstq_loss) & fdf["loss"].lt(median_loss)
fdf.loc[in_q1_to_median].head(2)

Unnamed: 0,test_guid,logits,pred,label,loss
1,1,"[0.64768565, -1.0567914]",0,0,0.167096
4,4,"[0.46477368, -0.8791866]",0,0,0.231755


In [9]:
# First two rows whose loss lies in [median, third quartile).
in_median_to_q3 = fdf["loss"].ge(median_loss) & fdf["loss"].lt(thirdq_loss)
fdf.loc[in_median_to_q3].head(2)

Unnamed: 0,test_guid,logits,pred,label,loss
12,12,"[0.35501534, -0.73896444]",0,0,0.288842
21,21,"[0.30371025, -0.6972296]",0,0,0.313009


In [10]:
# First two rows whose loss is at or above the third quartile.
at_or_above_q3 = fdf["loss"].ge(thirdq_loss)
fdf.loc[at_or_above_q3].head(2)

Unnamed: 0,test_guid,logits,pred,label,loss
11,11,"[-0.21338409, -0.11888009]",1,0,0.741515
13,13,"[0.39137152, -0.77856445]",0,1,1.440258
