## Install packages

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [2]:
%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
# from transformers import get_linear_schedule_with_warmup
# from transformers.optimization import AdamW
import sys
from sys import platform
import re
from pathlib import Path
from datetime import datetime
import src.utils as utils

import matplotlib.pyplot as plt
%matplotlib inline

## Set up GPU

In [3]:
if torch.cuda.is_available():
  device = torch.device("cuda")
  print(f'There are {torch.cuda.device_count()} GPU(s) available.')
  print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

No GPU available, using the CPU instead.


## Load Dataset

In [4]:
data_path = Path('data')

all_train_df = pd.read_csv(data_path / 'train.csv')
all_test_df = pd.read_csv(data_path / 'test.csv')
all_val_df = pd.read_csv(data_path / 'val.csv')

print(f"train: {len(all_train_df)}")
print(f"test: {len(all_test_df)}")
print(f"val: {len(all_val_df)}")

train: 67349
test: 1821
val: 872


In [5]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(all_train_df, test_size=0.1, random_state=23)
test_df = all_val_df

# Reset all indices
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

print(f"train: {len(train_df)}")
print(f"val: {len(val_df)}")
print(f"test: {len(test_df)}")

train: 60614
val: 6735
test: 872


# Fine-Tuned Bert Classifier

## PyTorch DataLoader

In [12]:
%%time
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from src.datasets import SST2Dataset

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32
MAX_LEN = 64
FRAC = 0.01

# Train data
print('Train')
train_data = SST2Dataset.create_dataset('train', train_df, max_seq_len = MAX_LEN, frac=FRAC)
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)

# Val data
print('Validation')
val_data = SST2Dataset.create_dataset('val', val_df, max_seq_len = MAX_LEN, frac=FRAC)
val_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data))

# Test data
print('Test')
test_data = SST2Dataset.create_dataset('test', test_df, max_seq_len = MAX_LEN)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data))

Train


100%|██████████| 606/606 [00:00<00:00, 2098.61it/s]


Validation


100%|██████████| 67/67 [00:00<00:00, 2398.73it/s]


Test


100%|██████████| 872/872 [00:00<00:00, 1261.82it/s]

CPU times: user 852 ms, sys: 3.46 ms, total: 855 ms
Wall time: 1.27 s





## Train

In [8]:
from transformers import AdamW, get_linear_schedule_with_warmup
from src.BertClassifier import BertClassifier

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier(freeze_bert=True)

    # Tell PyTorch to run the model on GPU
    bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [9]:
%%time
epochs = 2

utils.set_seed(42)    # Set seed for reproducibility
bert_classifier, optimizer, scheduler = initialize_model(epochs=epochs)
utils.train(bert_classifier, optimizer, scheduler, train_dataloader, val_dataloader, epochs=epochs, evaluation=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------


100%|██████████| 19/19 [01:16<00:00,  4.04s/batch]

   1    |   18    |   0.683986   |     -      |     -     |   76.82  
----------------------------------------------------------------------





   1    |    -    |   0.683986   |  0.677833  |   53.73   |   87.86  
----------------------------------------------------------------------


 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------


100%|██████████| 19/19 [01:11<00:00,  3.76s/batch]

   2    |   18    |   0.679186   |     -      |     -     |   71.41  
----------------------------------------------------------------------





   2    |    -    |   0.679186   |  0.676503  |   53.73   |   82.32  
----------------------------------------------------------------------


Training complete!
CPU times: user 2min 35s, sys: 12.5 s, total: 2min 47s
Wall time: 2min 51s


In [10]:
# Save model parameters

model_params_path = Path('model_params')

now = datetime.now().strftime("%m-%d-%Y_%H%M%S")

# torch.save(bert_classifier.classifier.state_dict(), model_params_path / f"{now}-classifier")

## Evaluate

In [11]:
from src.BertClassifier import bert_predict
from src.utils import evaluate_roc

# Compute predicted probabilities on the test set
probs = bert_predict(bert_classifier, test_dataloader, device)

# Evaluate the Bert classifier
# evaluate_roc(probs, y_val[: len(probs)])
evaluate_roc(probs, test_data.labels)

KeyboardInterrupt: 

In [None]:
# Compute predicted probabilities on the validation set
probs = bert_predict(bert_classifier, val_dataloader, device)

# Evaluate the Bert classifier
evaluate_roc(probs, val_data.labels)

In [None]:
len(val_data)

In [None]:
len(test_data)

# LOO

In [None]:
# guids = val_data.guids
# labels = val_data.labels

In [None]:
%%time
loo = val_data.leave_one_out(3018)

In [None]:
# loo.guids