## Install packages

In [8]:
# !pip install transformers

In [2]:
%load_ext autoreload
%autoreload 2

import os
import re

# from transformers import get_linear_schedule_with_warmup
# from transformers.optimization import AdamW
import sys
from datetime import datetime
from pathlib import Path
from sys import platform

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import src.utils as utils
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

%matplotlib inline

In [10]:
import wandb

wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mpatcao[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Set up GPU

In [3]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


## Load Dataset

In [12]:
data_path = Path("data")

all_train_df = pd.read_csv(data_path / "train.csv")
all_test_df = pd.read_csv(data_path / "test.csv")
all_val_df = pd.read_csv(data_path / "val.csv")

print(f"train: {len(all_train_df)}")
print(f"test: {len(all_test_df)}")
print(f"val: {len(all_val_df)}")

train: 67349
test: 1821
val: 872


In [13]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(all_train_df, test_size=0.1, random_state=23)
test_df = all_val_df

# Reset all indices
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"train: {len(train_df)}")
print(f"val: {len(val_df)}")
print(f"test: {len(test_df)}")

train: 60614
val: 6735
test: 872


In [14]:
train_df.label.value_counts()

1    33803
0    26811
Name: label, dtype: int64

In [15]:
val_df.label.value_counts()

1    3766
0    2969
Name: label, dtype: int64

In [16]:
test_df.label.value_counts()

1    444
0    428
Name: label, dtype: int64

## PyTorch DataLoader

In [42]:
%%time
from src.datasets import SST2Dataset
from torch.utils.data import (
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset,
    random_split,
)


def create_dataloaders(
    bert_tokenizer_name: str, max_sequence_length: int, batch_size: int
):
    device = utils.get_device()

    data_path = Path("data")

    all_train_df = pd.read_csv(data_path / "train.csv")
    # all_test_df = pd.read_csv(data_path / "test.csv")
    all_val_df = pd.read_csv(data_path / "val.csv")

    # train_df, val_df = train_test_split(all_train_df, test_size=0.1, random_state=23)
    train_df = all_train_df
    test_df = all_val_df

    # Reset all indices
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Train data
    train_data = SST2Dataset.create_dataset(
        name="train",
        device=device,
        df=train_df,
        tokenizer_name=bert_tokenizer_name,
        max_seq_len=max_sequence_length,
    )

    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

    # Test data
    test_data = SST2Dataset.create_dataset(
        name="test",
        device=device,
        df=test_df,
        tokenizer_name=bert_tokenizer_name,
        max_seq_len=max_sequence_length,
    )
    test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data))

    return (train_dataloader, None, test_dataloader)

CPU times: user 17 µs, sys: 0 ns, total: 17 µs
Wall time: 20.5 µs


# Fine-Tuned Bert Classifier

## Train

In [30]:
df = pd.read_csv('loo/run_0/test_loss.csv')
df2 = pd.read_csv('loo/run_1/test_loss.csv')

In [37]:
df.drop('Unnamed: 0', axis=1)

Unnamed: 0,guid,label,loss
0,0,1,0.013113
1,1,0,0.080885
2,2,1,0.011847
3,3,1,0.079333
4,4,0,0.338142
...,...,...,...
867,867,0,2.273263
868,868,1,0.316634
869,869,0,1.533939
870,870,0,0.568069


In [39]:
df.drop([c for c in df.columns if 'Unnamed' in c], axis=1)

Unnamed: 0,guid,label,loss
0,0,1,0.013113
1,1,0,0.080885
2,2,1,0.011847
3,3,1,0.079333
4,4,0,0.338142
...,...,...,...
867,867,0,2.273263
868,868,1,0.316634
869,869,0,1.533939
870,870,0,0.568069


In [27]:
df2

Unnamed: 0.1,Unnamed: 0,guid,label,loss
0,0,0,1,0.016650
1,1,1,0,0.072907
2,2,2,1,0.015513
3,3,3,1,0.076530
4,4,4,0,0.198417
...,...,...,...,...
867,867,867,0,1.617648
868,868,868,1,0.362112
869,869,869,0,1.560394
870,870,870,0,0.641366


In [23]:
import time

for i in tqdm(range(2000)):
    df = pd.read_csv(f"loo/run_{i}/test_loss.csv")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:01<00:00, 1029.58it/s]


In [17]:
df2

Unnamed: 0.1,Unnamed: 0,guid,label,loss
0,0,0,1,0.016650
1,1,1,0,0.072907
2,2,2,1,0.015513
3,3,3,1,0.076530
4,4,4,0,0.198417
...,...,...,...,...
867,867,867,0,1.617648
868,868,868,1,0.362112
869,869,869,0,1.560394
870,870,870,0,0.641366


In [43]:
%%time
import src.BertClassifier as BertClassifier


def main_train_loop():
    config = wandb.config
    train_dataloader, val_dataloader, test_dataloader = create_dataloaders(
        config.bert_model_name, config.max_sequence_length, config.batch_size
    )
    print(f"Train: {len(train_dataloader)*config.batch_size}")
    print(f"Test: {len(test_dataloader)}")

    model = BertClassifier.create_bert_classifier(
        config.bert_model_name,
        classifier_hidden_size=config.classifier_hidden_size,
        classifier_drop_out=config.classifier_drop_out,
        freeze_bert=True,
        random_state=42,
    )
    optimizer = utils.create_optimizer(
        model.classifier.parameters(), config.learning_rate
    )

    loss_fn = torch.nn.CrossEntropyLoss()

    utils.train(
        config=config,
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
    )

    test_loss, test_acc = utils.evaluate(model, test_dataloader)
    wandb.summary["test/loss"] = test_loss
    wandb.summary["test/accuracy"] = test_acc

    wandb.finish()
    return model

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 9.54 µs


In [None]:
# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.

run = wandb.init(
    project="BertClassifier",
    group="fine-tuning",
    config={
        "epochs": 3,
        "batch_size": 16,
        "learning_rate": 5e-5,
        "classifier_hidden_size": 40,
        "classifier_drop_out": 0,
        "max_sequence_length": 64,
        "bert_model_name": "distilbert-base-uncased",
    },
)

model = main_train_loop()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 67349/67349 [00:05<00:00, 13064.55it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 872/872 [00:00<00:00, 10360.88it/s]


Train: 67360
Test: 872


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 65%|████████████████████████████████████████████████████████████████████████▎                                       | 2718/4210 [10:47<05:53,  4.23batch/s]

In [None]:
# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.

run = wandb.init(
    project="BertClassifier",
    group="fine_tune_model_dropout",
    config={
        "epochs": 3,
        "batch_size": 16,
        "learning_rate": 3e-5,
        "classifier_hidden_size": 20,
        "classifier_drop_out": 0,
        "max_sequence_length": 64,
        "bert_model_name": "distilbert-base-uncased",
    },
)

model = main_train_loop()

In [7]:
import src.BertClassifier as BertClassifier


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


config = AttrDict()
config.update(
    {
        "epochs": 3,
        "batch_size": 16,
        "learning_rate": 3e-5,
        "classifier_hidden_size": 20,
        "classifier_drop_out": 0,
        "max_sequence_length": 64,
        "bert_model_name": "distilbert-base-uncased",
    }
)
model = BertClassifier.create_bert_classifier(
    config.bert_model_name,
    classifier_hidden_size=config.classifier_hidden_size,
    classifier_drop_out=config.classifier_drop_out,
    freeze_bert=True,
    random_state=42,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [9]:
model.classifier

Sequential(
  (0): Linear(in_features=768, out_features=20, bias=True)
  (1): Tanh()
  (2): Dropout(p=0, inplace=False)
  (3): Linear(in_features=20, out_features=2, bias=True)
)

In [None]:
model.classifier

## Sweep Parameters

In [None]:
# sweep_configuration = {
#     "method": "bayes",
#     "metric": {"goal": "minimize", "name": "test/loss"},
#     "parameters": {
#         "epochs": {"value": 4},
#         "bert_model_name": {"value": "distilbert-base-uncased"},
#         "max_sequence_length": {"value": 64},
#         "batch_size": {"values": [8, 16, 32, 64]},
#         "learning_rate": {"values": [3e-5, 5e-5]},
#         "classifier_hidden_size": {"values": [10, 100, 200, 400]},
#     },
# }

# sweep_id = wandb.sweep(sweep=sweep_configuration, project="BertClassifier")
# wandb.agent(sweep_id, function=main_train_loop)

In [None]:
epoc_length = len(history["train_loss"])
x = np.arange(1, epoc_length + 1)


plt.plot(x, history["train_loss"], color="b")
plt.plot(x, history["val_loss"], color="g")
plt.title("Loss")
plt.xlabel("Epochs")
plt.ylabel("Loss")

In [None]:
plt.plot(x, history["train_acc"], color="b")
plt.plot(x, history["val_acc"], color="g")
plt.title("Accuracy")
plt.xlabel("Epochs")
plt.ylabel("Acc")

In [None]:
# Save model parameters

model_params_path = Path("model_params")

now = datetime.now().strftime("%m-%d-%Y_%H%M%S")

torch.save(
    bert_classifier.classifier.state_dict(),
    model_params_path / f"{now}-BertClassifier-epoch{epochs}-batch{batch_size}",
)

## Evaluate

In [None]:
from src.BertClassifier import bert_predict
from src.utils import evaluate_roc

# Compute predicted probabilities on the test set
probs = bert_predict(bert_classifier, test_dataloader, device)

# Evaluate the Bert classifier
# evaluate_roc(probs, y_val[: len(probs)])
evaluate_roc(probs, test_data.labels.cpu())

In [None]:
# Compute predicted probabilities on the validation set
probs = bert_predict(bert_classifier, val_dataloader, device)

# Evaluate the Bert classifier
evaluate_roc(probs, val_data.labels.cpu())

In [None]:
len(val_data)

In [None]:
len(test_data)

# LOO

In [None]:
# guids = val_data.guids
# labels = val_data.labels

In [None]:
%%time
loo = val_data.leave_one_out(3018)

In [None]:
# Compute predicted probabilities on the test set
probs = bert_predict(bert_classifier, test_dataloader, device)

In [None]:
probs