In [7]:
!pip install pytorch-pretrained-bert --quiet
!pip install mlflow --quiet
!pip install pyngrok --quiet
!pip install importlib-metadata --quiet

The pycodestyle_magic extension is already loaded. To reload it, use:
  %reload_ext pycodestyle_magic


In [21]:
import sys
import os
import time
import numpy as np
import random as rn
import pandas as pd
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn

from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
import matplotlib.pyplot as plt

from sklearn.metrics import f1_score

# mlflow
import mlflow
from mlflow import log_metric, log_param, log_artifact
from pyngrok import ngrok
from getpass import getpass

# Set random seed for reproducible experiments
SEED = 123
rn.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True

INFO:pycodestyle:36:1: W391 blank line at end of file


In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
!tar -xzf /content/gdrive/MyDrive/TAED2/aclImdb_v1.tar.gz

In [None]:
raw_data_train = []
raw_data_test = []

for folder in ['train', 'test']:
    path = 'aclImdb/'+folder
    for sentiment in ['pos', 'neg']:
        directory = path+'/'+sentiment
        for filename in os.listdir(directory):
            with open(os.path.join(directory, filename)) as f:
                lines = f.readlines()
                assert len(lines) == 1
                if folder == 'train':
                    raw_data_train.append({'text':lines[0], 'sentiment':sentiment})
                else:
                    raw_data_test.append({'text':lines[0], 'sentiment':sentiment})

In [None]:
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), raw_data_train)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), raw_data_test)))

len(train_texts), len(train_labels), len(test_texts), len(test_labels)

(25000, 25000, 25000, 25000)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

100%|██████████| 231508/231508 [00:00<00:00, 13423489.09B/s]


In [None]:
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], test_texts))

len(train_tokens), len(test_tokens)

(25000, 25000)

In [None]:
train_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, test_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")

train_tokens_ids.shape, test_tokens_ids.shape

((25000, 512), (25000, 512))

In [8]:
train_y = np.array(train_labels) == 'pos'
test_y = np.array(test_labels) == 'pos'
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

NameError: ignored

In [None]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

In [None]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [None]:
# ensuring that the model runs on GPU, not on CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
bert_clf = BertBinaryClassifier()
bert_clf = bert_clf.cuda()     # running BERT on CUDA_GPU

100%|██████████| 407873900/407873900 [00:06<00:00, 58293660.02B/s]


In [11]:
# Setting hyper-parameters

BATCH_SIZE = 4
EPOCHS = 3

INFO:pycodestyle:7:1: W391 blank line at end of file


In [None]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [None]:
optimizer = Adam(bert_clf.parameters(), lr=3e-6)

torch.cuda.empty_cache()   # Clearing Cache space for a fresh Model run

In [None]:
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

# computes accuracy
def binary_accuracy(preds, y):
  #rounded_preds = torch.round(torch.sigmoid(preds))
  correct = (preds == y).float()
  acc = correct.sum() / len(correct)
  return acc

# training step
def train(model, iterator, optimizer, criterion):
    # stats
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0
    # train mode
    model.train()

    for i, batch_data in enumerate(iterator):
        
        if i % 50 == 1:
            print(i / len(iterator) * 100)
            print(f'Loss: {epoch_loss / i}, Acc: {epoch_acc / i}, F1-Score: {epoch_f1 / i}')

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = model(token_ids, masks)
        rounded_preds = torch.round(logits)

        loss = criterion(logits, labels)
        acc = binary_accuracy(rounded_preds, labels)
        f1 = f1_score(labels.detach().cpu().numpy(), rounded_preds.detach().cpu().numpy(), zero_division=0)

        # stats
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        epoch_f1 += f1

        log_metric("Train Accuracy", acc)
        log_metric("Train Loss", loss)
        log_metric("F1 Score", f1)
        
        model.zero_grad()
        loss.backward()
        clip_grad_norm_(parameters=model.parameters(), max_norm=1.0)
        optimizer.step()
        clear_output(wait=True)

    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1 / len(iterator)

# evaluates the model on given iterator (either 
# train_iter, valid_iter, or test_iter)
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    epoch_f1 = 0
    # evaluation mode
    model.eval()

    with torch.no_grad():
        
        for i, batch_data in enumerate(iterator):
        
            token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

            logits = bert_clf(token_ids, masks)

            rounded_preds = torch.round(logits)
            
            loss = criterion(logits, labels)
            acc = binary_accuracy(rounded_preds, labels)
            f1 = f1_score(labels.detach().cpu().numpy(), rounded_preds.detach().cpu().numpy(), zero_division=0)

            # stats
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            epoch_f1 += f1

            log_metric("Val Accuracy", acc)
            log_metric("Val Loss", loss)
            log_metric("F1 Score", f1)

    return epoch_loss / len(iterator), epoch_acc / len(iterator), epoch_f1 / len(iterator)

In [None]:
mlflow.start_run(run_name = 'Experiment 1') 

# run tracking UI in the background
get_ipython().system_raw("mlflow ui --port 5000 &") # run tracking UI in the background

# Terminate open tunnels if exist
ngrok.kill()

# Setting the authtoken (optional)
# Get your authtoken from https://dashboard.ngrok.com/auth
NGROK_AUTH_TOKEN = getpass('Enter the ngrok authtoken: ')  #2F7eonHHvXFgOXU0vnRJllaq0Kc_qEmuN78R6Be1v9SiSwuv
ngrok.set_auth_token(NGROK_AUTH_TOKEN)

# Open an HTTPs tunnel on port 5000 for http://localhost:5000
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

Enter the ngrok authtoken: ··········
MLflow Tracking UI: https://d10d-35-221-10-160.ngrok.io


In [None]:
criterion =  nn.BCELoss().to(device)

best_val_loss = float('inf')

for epoch in range(1, EPOCHS+1):
 
    # start time
    start_time = time.time()

    # train for an epoch
    train_loss, train_acc, train_f1 = train(bert_clf, train_dataloader, optimizer, criterion)
    valid_loss, valid_acc, valid_f1 = evaluate(bert_clf, test_dataloader, criterion)
    # end time
    end_time = time.time()
    # stats
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    # save model if has validation loss
    # better than last one
    if valid_loss < best_val_loss:
        best_val_loss = valid_loss
        torch.save(bert_clf.state_dict(), 'model.pt')
        log_artifact('model.pt')
    # stats
    print(f'Epoch: {epoch:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}% | Train F1-Score: {train_f1:.2f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}% | Val. F1-Score: {valid_f1:.2f}')

# Test
# model.load_state_dict(torch.load('model.pt'))
# test_loss, test_acc = evaluate(model, test_iter, criterion)
# print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

99.21600000000001
Loss: 0.2590061287888689, Acc: 0.9405337848734076, F1-Score: 0.8681718002472804


In [None]:
mlflow.end_run()