## Install packages

In [13]:
# !pip install transformers

In [14]:
%load_ext autoreload
%autoreload 2

import os
import re

# from transformers import get_linear_schedule_with_warmup
# from transformers.optimization import AdamW
import sys
from datetime import datetime
from pathlib import Path
from sys import platform

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import src.utils as utils
import torch
from torch.utils.data import DataLoader, Dataset

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Set up GPU

In [15]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"There are {torch.cuda.device_count()} GPU(s) available.")
    print("Device name:", torch.cuda.get_device_name(0))
else:
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: NVIDIA GeForce GTX 1070


## Load Dataset

In [16]:
data_path = Path("data")

all_train_df = pd.read_csv(data_path / "train.csv")
all_test_df = pd.read_csv(data_path / "test.csv")
all_val_df = pd.read_csv(data_path / "val.csv")

print(f"train: {len(all_train_df)}")
print(f"test: {len(all_test_df)}")
print(f"val: {len(all_val_df)}")

train: 67349
test: 1821
val: 872


In [17]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(all_train_df, test_size=0.1, random_state=23)
test_df = all_val_df

# Reset all indices
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"train: {len(train_df)}")
print(f"val: {len(val_df)}")
print(f"test: {len(test_df)}")

train: 60614
val: 6735
test: 872


In [18]:
train_df.sentence.apply(lambda d: len(d.split())).max()

52

In [19]:
val_df.sentence.apply(lambda d: len(d.split())).max()

51

In [20]:
test_df.sentence.apply(lambda d: len(d.split())).max()

47

# Fine-Tuned Bert Classifier

## PyTorch DataLoader

In [21]:
%%time
from src.datasets import SST2Dataset
from torch.utils.data import (
    DataLoader,
    RandomSampler,
    SequentialSampler,
    TensorDataset,
    random_split,
)
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 16
MAX_LEN = 64
FRAC = 1

device = utils.get_device()

# Train data
print("Train")
train_data = SST2Dataset.create_dataset(
    "train", device, train_df, max_seq_len=MAX_LEN, frac=FRAC
)
train_dataloader = DataLoader(
    train_data, sampler=RandomSampler(train_data), batch_size=batch_size
)

# Val data
print("Validation")
val_data = SST2Dataset.create_dataset(
    "val", device, val_df, max_seq_len=MAX_LEN, frac=FRAC
)
val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data))

# Test data
print("Test")
test_data = SST2Dataset.create_dataset("test", device, test_df, max_seq_len=MAX_LEN)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data))

Train


100%|████████████████████████████████████████████████████████████████████████████████████████████| 60614/60614 [00:04<00:00, 12466.66it/s]


Validation


100%|██████████████████████████████████████████████████████████████████████████████████████████████| 6735/6735 [00:00<00:00, 12533.24it/s]


Test


100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 872/872 [00:00<00:00, 9959.46it/s]

CPU times: user 6.16 s, sys: 17.5 ms, total: 6.18 s
Wall time: 9.78 s





## Train

In [22]:
from src.BertClassifier import BertClassifier
from transformers import (
    AdamW,
    BertForSequenceClassification,
    get_linear_schedule_with_warmup,
)


def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler."""
    # Instantiate Bert Classifier
    device = utils.get_device()
    bert_classifier = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2
    )
    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(
        bert_classifier.classifier.parameters(),
        lr=5e-5,  # Default learning rate
        eps=1e-8,  # Default epsilon value
    )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps  # Default value
    )
    return bert_classifier, optimizer, scheduler

In [23]:
utils.set_seed(42)  # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=epochs)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [24]:
bert_classifier

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [11]:
%%time
epochs = 2

utils.set_seed(42)  # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=epochs)
# history = utils.train(
#     bert_classifier,
#     optimizer,
#     scheduler,
#     train_dataloader,
#     val_dataloader,
#     epochs=epochs,
#     evaluation=True,
# )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

NameError: name 'train_dataloader' is not defined

In [12]:
# Save model parameters

model_params_path = Path("model_params")

now = datetime.now().strftime("%m-%d-%Y_%H%M%S")

torch.save(
    bert_classifier.state_dict(), model_params_path / f"{now}-bert_sequence_baseline"
)

NameError: name 'bert_classifier' is not defined

## Evaluate

In [None]:
from src.BertClassifier import bert_predict
from src.utils import evaluate_roc

# Compute predicted probabilities on the test set
probs = bert_predict(bert_classifier, test_dataloader, device)

# Evaluate the Bert classifier
# evaluate_roc(probs, y_val[: len(probs)])
evaluate_roc(probs, test_data.labels.cpu())

In [None]:
# Compute predicted probabilities on the validation set
probs = bert_predict(bert_classifier, val_dataloader, device)

# Evaluate the Bert classifier
evaluate_roc(probs, val_data.labels.cpu())

In [None]:
len(val_data)

In [None]:
len(test_data)

# LOO

In [None]:
# guids = val_data.guids
# labels = val_data.labels

In [None]:
%%time
loo = val_data.leave_one_out(3018)

In [None]:
# Compute predicted probabilities on the test set
probs = bert_predict(bert_classifier, test_dataloader, device)

In [None]:
probs