## Day 6: Using BERT for sentence classification.

Based on the original notebook available at [github](https://github.com/sugi-chan/custom_bert_pipeline) and corresponding TowardsDataScience [blogpost](https://towardsdatascience.com/bert-classifier-just-another-pytorch-model-881b3cf05784) by Michael Sugimura.

Our task for today is to use the pre-trained BERT model implemented in PyTorch for a sentence classification problem.

We will use the [IMDB 50k dataset](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) of movie reviews. There are only two classes: positive and negative reviews.

To download the data, uncomment the following cell. The data should also be available in the repository mentioned above.

In [None]:
# ! wget https://raw.githubusercontent.com/sugi-chan/custom_bert_pipeline/master/IMDB%20Dataset.csv

In [None]:
import pandas as pd

# Load the IMDB reviews CSV (path is relative to the working directory) into a DataFrame.
# Later cells read its `review` and `sentiment` columns.
dat = pd.read_csv('IMDB Dataset.csv')

In [None]:
# Preview the first rows of the dataset.
dat.head()

On Colab, uncomment the following cell. On a local machine, install the `pytorch-pretrained-bert` library into the selected environment.

In [None]:
# ! pip install pytorch-pretrained-bert

In [None]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

# Load pre-trained model tokenizer (vocabulary)
# It may take a while (depending on your internet connection)
# The resulting module-level `tokenizer` is reused by the cells below.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
import time
import os
import copy


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision import datasets, models, transforms

import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from random import randrange

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
# class BertLayerNorm(nn.Module):
#         def __init__(self, hidden_size, eps=1e-12):
#             """Construct a layernorm module in the TF style (epsilon inside the square root).
#             """
#             super(BertLayerNorm, self).__init__()
#             self.weight = nn.Parameter(torch.ones(hidden_size))
#             self.bias = nn.Parameter(torch.zeros(hidden_size))
#             self.variance_epsilon = eps

#         def forward(self, x):
#             u = x.mean(-1, keepdim=True)
#             s = (x - u).pow(2).mean(-1, keepdim=True)
#             x = (x - u) / torch.sqrt(s + self.variance_epsilon)
#             return self.weight * x + self.bias
        

class BertForSequenceClassification(nn.Module):
    """BERT model for classification.

    This module is composed of the pre-trained `bert-base-uncased` BERT model
    with dropout and a linear layer on top of the pooled output.

    Params:
        `num_labels`: the number of classes for the classifier. Default = 2.
    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
            with the word token indices in the vocabulary.
        `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
            types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
            a `sentence B` token (see BERT paper for more details).
        `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
            selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
            input sequence length in the current batch. It's the mask that we typically use for attention when
            a batch has varying length sentences.
        `labels`: accepted for interface compatibility only; it is ignored and no loss
            is computed inside the module.
    Outputs:
        The classification logits of shape [batch_size, num_labels].
    Example usage:
    ```python
    # Already been converted into WordPiece token ids
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
    model = BertForSequenceClassification(num_labels=2)
    logits = model(input_ids, token_type_ids, input_mask)
    ```
    """
    def __init__(self, num_labels=2):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Take the layer sizes from the loaded encoder's own configuration rather
        # than from a module-level `config` variable, so the class can be
        # instantiated regardless of which other cells have been run.
        bert_config = self.bert.config
        self.dropout = nn.Dropout(bert_config.hidden_dropout_prob)
        self.classifier = nn.Linear(bert_config.hidden_size, num_labels)
        nn.init.xavier_normal_(self.classifier.weight)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Return classification logits of shape [batch_size, num_labels]."""
        # Only the pooled output is needed; the per-token encoder output is discarded.
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        return logits

    def freeze_bert_encoder(self):
        """Exclude every parameter of the BERT encoder from gradient updates."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Make every parameter of the BERT encoder trainable again."""
        for param in self.bert.parameters():
            param.requires_grad = True

In [None]:
# Split a toy string into tokens with the pre-trained tokenizer.
text = 'example text'
tokenized_text = tokenizer.tokenize(text)


In [None]:
from pytorch_pretrained_bert import BertConfig

# Build a BERT configuration object (hidden size 768, 12 layers, 12 attention heads).
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

# Instantiate the two-class classifier; this loads the pre-trained BERT weights.
num_labels = 2
model = BertForSequenceClassification(num_labels)

# Convert inputs to PyTorch tensors
# The outer list adds a leading batch dimension of size 1.
tokens_tensor = torch.tensor([tokenizer.convert_tokens_to_ids(tokenized_text)])

# NOTE(review): the model has not been switched to eval mode here, so the
# classifier's dropout is presumably active and repeated calls may give
# different logits — confirm if deterministic output is expected.
logits = model(tokens_tensor)

In [None]:
# Show the raw (unnormalised) class scores for the toy input.
logits

In [None]:
# Tokenize two reviews from the dataset.
example_1 = tokenizer.tokenize(dat.review[1])
example_2 = tokenizer.tokenize(dat.review[3])

# Show each token sequence as one space-joined string for easy reading.
[' '.join(example_1), ' '.join(example_2)]

In [None]:
# Map the tokens of both reviews to their vocabulary ids.
tokenized_example_1 = tokenizer.convert_tokens_to_ids(example_1)
tokenized_example_2 = tokenizer.convert_tokens_to_ids(example_2)

In [None]:
# Peek at the first five token ids of the first review.
tokenized_example_1[:5]

In [None]:
# Wrap the ids in an outer list so the tensor gets a leading batch dimension of size 1.
tokens_tensor = torch.tensor([tokenized_example_1])

In [None]:
# Display the resulting tensor of token ids.
tokens_tensor

In [None]:
# Run the (still untrained) classifier head on the single-review batch and show its logits.
logits = model(tokens_tensor)
logits

In [None]:
import torch.nn.functional as F

# Normalise the logits over dim 1 (the class dimension) so each row sums to 1.
F.softmax(logits,dim=1)

In [None]:
from sklearn.model_selection import train_test_split
# Features are the review texts, targets are the sentiment labels.
X = dat['review']
y = dat['sentiment']


# Hold out 10% of the reviews for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)


In [None]:
# Turn the pandas Series of reviews into plain Python lists.
X_train = X_train.values.tolist()
X_test = X_test.values.tolist()

# One-hot encode the sentiment labels: one row per review, one column per class.
# NOTE(review): presumably column 0 is `negative` and column 1 is `positive`
# (columns ordered by sorted label) — confirm.
y_train = pd.get_dummies(y_train).values.tolist()
y_test = pd.get_dummies(y_test).values.tolist()


Today we will use the PyTorch [Dataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset) to handle the data. Full tutorial is available at [this link](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html).

In [None]:
max_seq_length = 256

class TextDataset(Dataset):
    """Dataset of (token-id tensor, one-hot label tensor) pairs.

    Params:
        `x_y_pair`: a two-element sequence `[X, y]` where `X` holds the raw
            review strings and `y` the matching one-hot sentiment rows.
        `transform`: stored on the instance but not applied by this class.

    Each item is tokenized with the module-level `tokenizer`, cropped or
    padded to exactly `max_seq_length` ids, and returned together with its
    label as a tensor.
    """
    def __init__(self, x_y_pair, transform=None):
        # x_y_pair should contain the pair of X and y parts of the data (object matrix and labels vector)
        self.x_y_pair = x_y_pair
        self.transform = transform

    def __getitem__(self, index):
        """Return `(ids_review, label)` for the sample at `index`.

        `ids_review` is a LongTensor of shape [max_seq_length];
        `label` is an int64 tensor holding the one-hot sentiment row.
        """
        # Tokenize x part at the selected index
        tokenized_review = tokenizer.tokenize(self.x_y_pair[0][index])

        # Crop the review if it exceeds the max length
        if len(tokenized_review) > max_seq_length:
            tokenized_review = tokenized_review[:max_seq_length]

        # Convert tokens to ids
        ids_review = tokenizer.convert_tokens_to_ids(tokenized_review)

        # Pad the `ids_review` if its length is less than `max_seq_length`.
        # 0 is the id of the [PAD] token in the `bert-base-uncased` vocabulary.
        # NOTE(review): no [CLS]/[SEP] tokens are added and no attention mask is
        # produced, so padding positions are attended to by the model.
        ids_review = ids_review + [0] * (max_seq_length - len(ids_review))

        assert len(ids_review) == max_seq_length

        # Convert `ids_review` to torch.tensor
        ids_review = torch.tensor(ids_review)

        # Get the sentiment for selected index
        sentiment = self.x_y_pair[1][index]

        # Return the label as a plain tensor (not wrapped in a list): the default
        # collate function then yields a [batch_size, num_classes] tensor that
        # the training loop can move to the device. The explicit int64 dtype
        # also covers one-hot rows stored as booleans.
        label = torch.from_numpy(np.asarray(sentiment, dtype=np.int64))

        return ids_review, label

    def __len__(self):
        """Return the number of samples (length of the X part)."""
        return len(self.x_y_pair[0])

In [None]:
# Batch size shared by the training and validation loaders.
batch_size = 16

# Each pair is [X, y], the layout TextDataset expects for `x_y_pair`.
train_lists = [X_train, y_train]
test_lists = [X_test, y_test]

training_dataset = TextDataset(x_y_pair=train_lists)
test_dataset = TextDataset(x_y_pair=test_lists)

# One shuffled, single-process loader per phase, keyed by phase name.
dataloaders_dict = {
    phase_name: torch.utils.data.DataLoader(
        dataset=phase_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
    )
    for phase_name, phase_dataset in (
        ('train', training_dataset),
        ('val', test_dataset),
    )
}

# Number of samples per phase, taken from the X part of each pair.
dataset_sizes = dict(train=len(train_lists[0]), val=len(test_lists[0]))

In [None]:
# Select the cuda device if it's available. Use cpu otherwise.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
import tqdm

In [None]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train `model`, validate after every epoch and return it with the best weights.

    Relies on the module-level `dataloaders_dict` (with 'train' and 'val'
    loaders yielding `(inputs, one_hot_labels)` batches) and `device`.

    Params:
        `model`: module mapping a batch of token ids to logits of shape
            [batch_size, num_classes].
        `criterion`: loss taking `(logits, class_indices)`. It receives the raw
            logits, as `nn.CrossEntropyLoss` applies log-softmax itself.
        `optimizer`: optimizer updating the trainable parameters.
        `scheduler`: learning-rate scheduler, stepped once per epoch after the
            training phase.
        `num_epochs`: number of passes over the training data.
    Returns:
        `model` with the weights of the epoch that had the lowest validation
        loss loaded. Those weights are also saved to 'bert_model_test.pth'.
    """
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    # Infinity guarantees that the first validation epoch is always recorded.
    best_loss = float('inf')

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                # Set model to training mode (enables dropout)
                model.train()
            else:
                # Set model to evaluate mode (disables dropout)
                model.eval()

            running_loss = 0.0
            sentiment_corrects = 0
            samples_seen = 0

            # Iterate over data.
            for inputs, sentiment in tqdm.tqdm(dataloaders_dict[phase]):
                inputs = inputs.to(device)
                sentiment = sentiment.to(device)

                # One-hot rows -> class indices, the target format the criterion expects.
                targets = torch.max(sentiment.float(), 1)[1]

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(is_train):
                    logits = model(inputs)

                    # The loss is computed on raw logits: applying softmax first
                    # would normalise twice and flatten the gradients.
                    loss = criterion(logits, targets)

                    # backward + optimize only if in training phase
                    if is_train:
                        loss.backward()
                        optimizer.step()

                # statistics
                # argmax over logits equals argmax over softmax probabilities.
                predictions = torch.max(logits, 1)[1]
                batch_len = inputs.size(0)
                running_loss += loss.item() * batch_len
                sentiment_corrects += torch.sum(predictions == targets).item()
                samples_seen += batch_len

            if is_train:
                # Step the schedule only after the optimizer has been stepped,
                # so the initial learning rate is not skipped.
                scheduler.step()

            # Average over the samples actually processed; guard against an empty loader.
            denominator = max(samples_seen, 1)
            epoch_loss = running_loss / denominator
            sentiment_acc = sentiment_corrects / denominator

            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} sentiment_acc: {:.4f}'.format(
                phase, sentiment_acc))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), 'bert_model_test.pth')

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # The tracked quantity is the validation loss, not an accuracy.
    print('Best val loss: {:4f}'.format(float(best_loss)))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model

Let's look at our model:

In [None]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of `model`."""
    trainable_total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad=False) are not counted.
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

# Report the trainable parameter count (with thousands separators) before freezing.
print(f'The model has {count_parameters(model):,} trainable parameters')


Let's freeze everything but the last layer (our classifier):

In [None]:
# Freeze the BERT encoder so that only the classifier on top is trained.
model.freeze_bert_encoder()
# Explicitly keep the classifier weight trainable.
# NOTE(review): only `weight` is set here, not `bias` — confirm both are meant to stay trainable.
model.classifier.weight.requires_grad = True

In [None]:
# Report the trainable parameter count again, after freezing the encoder.
print(f'Now the model has {count_parameters(model):,} trainable parameters')


And move our model to the available device:

In [None]:
# Move the model's parameters and buffers to the selected device.
model.to(device)

Because the BERT part is frozen, we need to tune only the final layer. If you want to fine-tune BERT as well, uncomment the corresponding lines below.

In [None]:
# Learning rate for the classifier layer (`lrmain` below would be the rate for BERT itself).
lrlast = .001
# lrmain = .00001
optim1 = optim.Adam(
    [
#         {"params":model.bert.parameters(),"lr": lrmain},
        {"params":model.classifier.parameters(), "lr": lrlast},
       
   ])

#optim1 = optim.Adam(model.parameters(), lr=0.001)#,momentum=.9)
# Only the classifier parameters are handed to the optimizer above
optimizer_ft = optim1
criterion = nn.CrossEntropyLoss()

# Decay LR by a factor of 0.1 every 3 scheduler steps
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=3, gamma=0.1)


In [None]:
# Train for 10 epochs; `train_model` returns the model with the weights of the
# epoch that had the lowest validation loss loaded.
model_ft1 = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=10)


That's all for today.