In [None]:
import os
import re
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

#!pip install transformers

In [62]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# %cd /content/gdrive/My Drive/Colab Notebooks/NLP-Seminar

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/Colab Notebooks/NLP-Seminar


In [63]:
data = pd.read_csv('esnli_dev.csv')

In [64]:
data_sentences = data[['Sentence1', 'Sentence2', 'gold_label']]

In [65]:
labelencoder = LabelEncoder()

data_sentences['gold_label_cat'] = labelencoder.fit_transform(data_sentences['gold_label'])
data_sentences.drop('gold_label', inplace = True, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [66]:
if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
Device name: Tesla T4


In [67]:
def text_preprocessing(text):
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [68]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def preprocessing_for_bert(data):

    input_ids_1 = []
    attention_masks_1 = []

    input_ids_2 = []
    attention_masks_2 = []

    sentence1 = data.Sentence1.values
    sentence2 = data.Sentence2.values

    for sent in sentence1:
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  
            add_special_tokens=True,        
            max_length=MAX_LEN,                 
            pad_to_max_length=True,         
            #return_tensors='pt',           
            return_attention_mask=True      
            )
        
        input_ids_1.append(encoded_sent.get('input_ids'))
        attention_masks_1.append(encoded_sent.get('attention_mask'))


    for sent in sentence2:
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),  
            add_special_tokens=True,        
            max_length=MAX_LEN,                 
            pad_to_max_length=True,         
            #return_tensors='pt',           
            return_attention_mask=True      
            )
        
        input_ids_2.append(encoded_sent.get('input_ids'))
        attention_masks_2.append(encoded_sent.get('attention_mask'))        

    input_ids_1 = torch.tensor(input_ids_1)
    attention_masks_1 = torch.tensor(attention_masks_1)

    input_ids_2 = torch.tensor(input_ids_2)
    attention_masks_2 = torch.tensor(attention_masks_2)

    return input_ids_1, attention_masks_1, input_ids_2, attention_masks_2

In [69]:
Sentence1 = data_sentences['Sentence1']
Sentence2 = data_sentences['Sentence2']

res = pd.concat([Sentence1, Sentence2])
res.reset_index(drop = True, inplace = True)

In [70]:
encoded_sents = [tokenizer.encode(sent, add_special_tokens=True) for sent in res]

max_len = max([len(sent) for sent in encoded_sents])
# print('Max length: ', max_len)

In [71]:
from sklearn.model_selection import train_test_split

train, val = train_test_split(data_sentences, test_size=0.2)

X_train, y_train = train[['Sentence1', 'Sentence2']], train['gold_label_cat']
X_val, y_val = val[['Sentence1', 'Sentence2']], val['gold_label_cat']

In [72]:
MAX_LEN = 63

train_inputs_1, train_masks_1, train_inputs_2, train_masks_2 = preprocessing_for_bert(X_train)
val_inputs_1, val_masks_1, val_inputs_2, val_masks_2 = preprocessing_for_bert(X_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [73]:
combined_train_inputs = torch.cat([train_inputs_1, train_inputs_2], dim = 1)
combined_train_masks = torch.cat([train_masks_1, train_masks_2], dim = 1)

combined_val_inputs = torch.cat([val_inputs_1, val_inputs_2], dim = 1)
combined_val_masks = torch.cat([val_masks_1, val_masks_2], dim = 1)

In [74]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Convert other data types to torch.Tensor
train_labels = torch.tensor(y_train.values)
val_labels = torch.tensor(y_val.values)

# For fine-tuning BERT, the authors recommend a batch size of 16 or 32.
batch_size = 32

# Create the DataLoader for our training set
train_data = TensorDataset(combined_train_inputs, combined_train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
val_data = TensorDataset(combined_val_inputs, combined_val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)


In [75]:
%%time
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        D_in, H, D_out = 768, 50, 3

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids, attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        
        last_hidden_state_cls = outputs[0][:, 0, :]

        logits = self.classifier(last_hidden_state_cls)

        return logits

CPU times: user 39 µs, sys: 0 ns, total: 39 µs
Wall time: 42.4 µs


In [76]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    bert_classifier = BertClassifier(freeze_bert=False)

    bert_classifier.to(device)

    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,   
                      eps=1e-8   
                      )

    total_steps = len(train_dataloader) * epochs

    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, 
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [77]:
import random
import time

loss_fn = nn.CrossEntropyLoss()

def set_seed(seed_value=42):
    """Set seed for reproducibility.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):

    print("Start training...\n")
    for epoch_i in range(epochs):

        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()

        total_loss, batch_loss, batch_counts = 0, 0, 0

        model.train()

        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            model.zero_grad()

            logits = model(b_input_ids, b_attn_mask)

            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch

                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)

        if evaluation == True:

            val_loss, val_accuracy = evaluate(model, val_dataloader)

            time_elapsed = time.time() - t0_epoch
            
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.
    """

    model.eval()

    val_accuracy = []
    val_loss = []

    for batch in val_dataloader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)

        loss = loss_fn(logits, b_labels)
        val_loss.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()

        accuracy = (preds == b_labels).cpu().numpy().mean() * 100
        val_accuracy.append(accuracy)

    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)

    return val_loss, val_accuracy

In [78]:
set_seed(42)  
bert_classifier, optimizer, scheduler = initialize_model(epochs=4)
train(bert_classifier, train_dataloader, val_dataloader, epochs=4, evaluation=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  |  Val Acc  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   1.098948   |     -      |     -     |   14.44  
   1    |   40    |   1.024430   |     -      |     -     |   14.14  
   1    |   60    |   0.862769   |     -      |     -     |   13.85  
   1    |   80    |   0.868596   |     -      |     -     |   13.60  
   1    |   100   |   0.773322   |     -      |     -     |   13.40  
   1    |   120   |   0.754628   |     -      |     -     |   13.42  
   1    |   140   |   0.745995   |     -      |     -     |   13.58  
   1    |   160   |   0.750236   |     -      |     -     |   13.76  
   1    |   180   |   0.697930   |     -      |     -     |   13.81  
   1    |   200   |   0.623260   |     -      |     -     |   13.73  
   1    |   220   |   0.640764   |     -      |     -     |   13.60  
   1    |   240   |   0.627276   |     -      |     -     |   13.55  
