In [1]:
#@title Install some libraries
# A dependency of the preprocessing for BERT inputs
!pip install -q transformers 

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m48.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [45]:
#@title Import Libraries 
%matplotlib inline
import os
import time
import pandas as pd
import numpy as np

import transformers 
#from transformers import BertModel, BertTokenizer
from transformers import DistilBertModel, DistilBertTokenizer
import torch 
from torch import nn 
from torch.utils.data import Dataset, DataLoader

#import itertools
#from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from tqdm import tqdm 
import warnings
warnings.filterwarnings('ignore') 

In [3]:
!npx degit https://github.com/mequanent/Social-Networks -f

[K[?25hnpx: installed 1 in 2.828s
[36m> destination directory is not empty. Using --force, continuing[39m
[36m> cloned [1mmequanent/Social-Networks[22m#[1mHEAD[22m[39m


# SemEval Dataset

In [46]:
#@title Dataset Folder
# show the folder 
#data_folder = "/content/drive/MyDrive/text_lab" # this is where you upload your zip file at 
data_folder = "/content/text_lab" # this is where you upload your zip file at 
#os.listdir(data_folder) 

In [None]:
#@title Read training data
columns = ["id", "text", "emotion", "intensity"]
anger_train = pd.read_csv(f"{data_folder}/data/semeval/train/anger-ratings-0to1.train.txt", 
                          sep="\t", header=None, names=columns)
sadness_train = pd.read_csv(f"{data_folder}/data/semeval/train/sadness-ratings-0to1.train.txt",
                         sep="\t", header=None, names=columns)
fear_train = pd.read_csv(f"{data_folder}/data/semeval/train/fear-ratings-0to1.train.txt",
                         sep="\t", header=None, names=columns)
joy_train = pd.read_csv(f"{data_folder}/data/semeval/train/joy-ratings-0to1.train.txt",
                         sep="\t", header=None, names=columns)

# combine 4 sub-dataset, 'se_' prefix to identify from imdb
se_train_df = pd.concat([anger_train, fear_train, joy_train, sadness_train], ignore_index=True)

In [None]:
#@title Read testing data
anger_test = pd.read_csv(f"{data_folder}/data/semeval/dev/anger-ratings-0to1.dev.gold.txt",
                         sep="\t", header=None, names=columns)
sadness_test = pd.read_csv(f"{data_folder}/data/semeval/dev/sadness-ratings-0to1.dev.gold.txt",
                         sep="\t", header=None, names=columns)
fear_test = pd.read_csv(f"{data_folder}/data/semeval/dev/fear-ratings-0to1.dev.gold.txt",
                         sep="\t", header=None, names=columns)
joy_test = pd.read_csv(f"{data_folder}/data/semeval/dev/joy-ratings-0to1.dev.gold.txt",
                         sep="\t", header=None, names=columns)

# combine 4 sub-dataset
se_test_df = pd.concat([anger_test, fear_test, joy_test, sadness_test], ignore_index=True)
se_test_df.head(2)

Unnamed: 0,id,text,emotion,intensity
0,10857,@ZubairSabirPTI pls dont insult the word 'Molna',anger,0.479
1,10858,@ArcticFantasy I would have almost took offens...,anger,0.458


In [None]:
# shuffle dataset
se_train_df = se_train_df.sample(frac=1)
se_test_df = se_test_df.sample(frac=1)

In [None]:
## save to pickle file
se_train_df.to_pickle(os.path.join(data_folder, "se_train_df.pkl")) 
se_test_df.to_pickle(os.path.join(data_folder, "se_test_df.pkl"))

In [5]:
#@title Load a pickle file
se_train_df = pd.read_pickle(os.path.join(data_folder, "se_train_df.pkl"))
se_test_df = pd.read_pickle(os.path.join(data_folder, "se_test_df.pkl"))

In [6]:
# get some validation data from the training set
se_train_df, se_val_df = train_test_split(se_train_df, test_size=0.15, random_state=42)
print("Shape of Training df: ", se_train_df.shape)
print("Shape of Validating df: ", se_val_df.shape)
print("Shape of Testing df: ", se_test_df.shape)

Shape of Training df:  (3071, 4)
Shape of Validating df:  (542, 4)
Shape of Testing df:  (347, 4)


In [16]:
labels = list(se_train_df.emotion.unique())
labels

['joy', 'fear', 'sadness', 'anger']

# **Label encoder decoder for both**

In [17]:
#@title Label encoder decoder function for both imdb and semeval
def encode_decode_labels(df): 
  '''
  +) for SemEval: encoded_labels = {'fear':0, 'sadness':1, 'anger':2, 'joy':3}
  +) for   IMDB:  encoded_labels = {'pos':0, 'neg':1}

  +) for SemEval: decoded_labels = {0:'fear', 1:'sadness', 2:'anger', 3:'joy'}
  +) for   IMDB:  decoded_labels = {0:'pos', 1:'neg'}
  '''
  labels = df.emotion.unique() 
  encoded_labels = {labels[i]: i for i in range(len(labels))}  
  decoded_labels = {v:k for k,v in encoded_labels.items()} # decode the labels 
  return encoded_labels, decoded_labels   

In [18]:
# showcase example for encode_decode_labels(df)
se_labels, se_decoded_labels = encode_decode_labels(se_test_df)
print('encoded labels: ', se_labels)
print('decoded labels: ', se_decoded_labels) 

encoded labels:  {'joy': 0, 'anger': 1, 'fear': 2, 'sadness': 3}
decoded labels:  {0: 'joy', 1: 'anger', 2: 'fear', 3: 'sadness'}


# **Common BERT for BOTH SemEval and IMDB Dataset**

## Learning Part

For multi-label text classification learned from [this kaggle notebook](https://www.kaggle.com/sainijagjit/bbc-dataset) on bbc news classification.

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
sample = 'Hi, this is a sample sentence to learn from BERT.'
# truncation : if it is True then we allow bert to truncated every sequence it's length is higher then max_length
# return_tensors : the type of tensors that will be returned (as we are using pytorch then we set "pt")
bert_input  = tokenizer(sample,padding="max_length",max_length=15,truncation=True,return_tensors="pt")

In [None]:
bert_input

{'input_ids': tensor([[ 101, 8790,  117, 1142, 1110,  170, 6876, 5650, 1106, 3858, 1121,  139,
         9637, 1942,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
print(bert_input["input_ids"])
# input_ids are the id representation of each token 
# we can decode these inputs to get the original sequence 
print(tokenizer.decode(bert_input["input_ids"][0] ))
# the code 102 is for the [SEP] token and the 0 is for [PAD] token 

tensor([[ 101, 8790,  117, 1142, 1110,  170, 6876, 5650, 1106, 3858, 1121,  139,
         9637, 1942,  102]])
[CLS] Hi, this is a sample sentence to learn from BERT [SEP]


In [None]:
print(bert_input["token_type_ids"])
# the token_type_ids identified to which sequence a token belongs, when having just one sequence so it's always 0 

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])


In [None]:
print(bert_input["attention_mask"]) 
# the attention_mask identified whether the token is a real word or just a token padding
# it's 1 for the real words, the CLS and the SEP tokens, and for the pad token is 0

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


## Dataset Class
Now that we know which are the ouputs of our bert tokenizer we are going to build a Dataset Class for our emotion Dataset

In [47]:
#@title Dataset Class
#tokenizer= BertTokenizer.from_pretrained("bert-base-cased")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

class Dataset(Dataset): 
    def __init__(self, df): 
        '''get the encoded labels from the dataframe 
            +) for SemEval: labels = {'fear':0, 'sadness':1, 'anger':2, 'joy':3}
            +) for   IMDB:  labels = {'pos':0, 'neg':1} 
         +) encode_decode_labels() is a custom function defined above.
         '''
        encoded_labels, _ = encode_decode_labels(df) 
        #extract our labels from the df 
        self.labels = [encoded_labels[label] for label in df["emotion"]]
        
        #extract our labels from the df 
        #self.labels = [sem_eval_labels[label] for label in df["emotion"]]
        #tokenize our texts to the format that BERT expects to get as input 
        self.texts = [tokenizer(text, padding='max_length', max_length=512, 
                                truncation=True, return_tensors="pt")  
                     for text in df["text"]] 
    def classes(self):
        return self.labels
    
    def __len__(self): 
        return len(self.labels)
    
    #fetch a batch of labels
    def get_batch_labels(self,indx): 
        return np.array(self.labels[indx])
    # fetch a batch of texts 
    def get_batch_texts(self,indx): 
        return self.texts[indx]

    #get an item with the texts and the label
    def __getitem__(self, indx): 
        batch_texts = self.get_batch_texts(indx)
        batch_y = self.get_batch_labels(indx)
        
        return batch_texts, batch_y

## Building the model
Now after preparing our data to the Learning process, let's create our model using the pre-trained BERT base model which contains 12 layers of Transformers encoder.

In [76]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 
#@title DistilBertSentimentClassifier Model
class DistilBertSentimentClassifier(nn.Module): 
    def __init__(self, n_classes, dropout=0.5): 
        super(DistilBertSentimentClassifier, self).__init__()
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(768, n_classes)

    def forward(self, input_id, mask):
        output_1 = self.bert(input_ids=input_id, attention_mask=mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [54]:
#@title BertSentimentClassifier Model
class BertSentimentClassifier(nn.Module): 
    def __init__(self, n_classes, dropout=0.5): 
        super(BertSentimentClassifier, self).__init__()
        
        #self.bert = BertModel.from_pretrained("bert-base-cased")
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.dropout = nn.Dropout(dropout)
        # bert output a vector of size 768 
        #self.lin = nn.Linear(768, 4)
        self.lin = nn.Linear(self.bert.config.hidden_size, n_classes)
        #self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
    def forward(self, input_id, mask): 
        # as output, the bert model give us first the embedding vector of all the tokens of the sequence 
        # second we get the embedding vector of the CLS token.
        # fot a classification task it's enough to use this embedding for our classifier

        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output
        
        _, pooled_output = self.bert(input_ids=input_id, 
                                    attention_mask = mask, return_dict = False)
        dropout_output = self.dropout(pooled_output)
        linear_output  = self.lin(dropout_output)
        final_layer = self.softmax(linear_output)
        
        return final_layer  

In [99]:
#@title Training
# we are creating a standard pytorch training loop 

def train(model, train_data, val_data, learning_rate, epochs=5):
    start = time.time()
    #creating a custom Dataset objects using the training and validation data
    train, val = Dataset(train_data), Dataset(val_data)
    #creating dataloaders
    train_dataloader = DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                # get the predictions 
                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc
                # updating the Gradient Descent and Backpropagation operation
                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            # now we evaluate on the validation data
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
    
    print(f'\nTraining time is: {(time.time()-start)/3600: .3f} hrs.')            

In [100]:
#@title EValuate the model
'''Now that we trained the model on the training set, we are going to use 
the test data to evaluate the performance of the model on unseen data'''
def evaluate(model, test_df):
    test = Dataset(test_df)
    test_dl = DataLoader(test,batch_size=2)
    
    cuda_available = torch.cuda.is_available()
    
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    if cuda_available:
        model = model.cuda()
    
    total_acc = 0
    for test_input , test_label in tqdm(test_dl):
        test_label = test_label.to(device)
        mask = test_input["attention_mask"].to(device)
        input_id = test_input["input_ids"].squeeze(1).to(device)
        output = model(input_id, mask)
        
        acc = (output.argmax(dim=1) == test_label).sum().item()
        total_acc +=acc 
        
    print(f"\nTest Accuracy : {total_acc / len(test_df): .3f}")
    

In [101]:
#@title Prediction Function
def predict(device, model, decoded_labels, sentence):
    #sentence = sentence
    sentence_input = tokenizer(sentence, padding='max_length', max_length=512, 
                               truncation=True,return_tensors="pt").to(device)
    input_id = sentence_input["input_ids"]
    mask = sentence_input["attention_mask"]
    output = model(input_id, mask)
    predicted_class_label = output.argmax(dim=1)
    predicted_class = decoded_labels[predicted_class_label.item()]
    print(f"The predicted emotion is : {predicted_class}")

In [102]:
#@title Predict on df samples function

# This is to test with sample texts from the test set 
def predict_on_samples_from_df(model, df, n_samples):
    # get the labels from the dataframe  
  _, decoded_labels = encode_decode_labels(df)
  # get n_samples of text from the df/test_df
  textset = df.sample(n_samples).reset_index(drop=True)
  
  for i in range(len(textset)):
      text = textset.loc[i, 'text']
      emotion = textset.loc[i, 'emotion']
      print('Predict the following text:\n')
      print(text)
      print('='*50)
      predict(device, model, decoded_labels, text)
      print('The exact emotion is: ', emotion)
      print('+'*50, '\n')      

In [67]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

## SemEval model

In [85]:
EPOCHS = 20 # 9:54 am - 10:56 am
#sem_eval_model = BertSentimentClassifier(n_classes=4)
sem_eval_model = DistilBertSentimentClassifier(n_classes=4)

learning_rate = 1e-6 
train(sem_eval_model, se_train_df, se_val_df, learning_rate, EPOCHS)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 1536/1536 [02:53<00:00,  8.85it/s]


Epochs: 1 | Train Loss:  0.689                 | Train Accuracy:  0.293                 | Val Loss:  0.711                 | Val Accuracy:  0.214


100%|██████████| 1536/1536 [02:52<00:00,  8.88it/s]


Epochs: 2 | Train Loss:  0.682                 | Train Accuracy:  0.322                 | Val Loss:  0.716                 | Val Accuracy:  0.205


100%|██████████| 1536/1536 [02:53<00:00,  8.88it/s]


Epochs: 3 | Train Loss:  0.668                 | Train Accuracy:  0.355                 | Val Loss:  0.752                 | Val Accuracy:  0.199


100%|██████████| 1536/1536 [02:53<00:00,  8.87it/s]


Epochs: 4 | Train Loss:  0.634                 | Train Accuracy:  0.416                 | Val Loss:  0.784                 | Val Accuracy:  0.181


100%|██████████| 1536/1536 [02:52<00:00,  8.88it/s]


Epochs: 5 | Train Loss:  0.589                 | Train Accuracy:  0.495                 | Val Loss:  0.811                 | Val Accuracy:  0.181


100%|██████████| 1536/1536 [02:52<00:00,  8.89it/s]


Epochs: 6 | Train Loss:  0.534                 | Train Accuracy:  0.568                 | Val Loss:  0.890                 | Val Accuracy:  0.166


100%|██████████| 1536/1536 [02:52<00:00,  8.89it/s]


Epochs: 7 | Train Loss:  0.481                 | Train Accuracy:  0.627                 | Val Loss:  0.984                 | Val Accuracy:  0.175


100%|██████████| 1536/1536 [02:52<00:00,  8.89it/s]


Epochs: 8 | Train Loss:  0.425                 | Train Accuracy:  0.677                 | Val Loss:  1.080                 | Val Accuracy:  0.151


100%|██████████| 1536/1536 [02:52<00:00,  8.89it/s]


Epochs: 9 | Train Loss:  0.376                 | Train Accuracy:  0.718                 | Val Loss:  1.168                 | Val Accuracy:  0.148


100%|██████████| 1536/1536 [02:52<00:00,  8.89it/s]


Epochs: 10 | Train Loss:  0.321                 | Train Accuracy:  0.771                 | Val Loss:  1.281                 | Val Accuracy:  0.151


100%|██████████| 1536/1536 [02:52<00:00,  8.89it/s]


Epochs: 11 | Train Loss:  0.277                 | Train Accuracy:  0.806                 | Val Loss:  1.346                 | Val Accuracy:  0.140


100%|██████████| 1536/1536 [02:53<00:00,  8.86it/s]


Epochs: 12 | Train Loss:  0.237                 | Train Accuracy:  0.838                 | Val Loss:  1.427                 | Val Accuracy:  0.133


100%|██████████| 1536/1536 [02:52<00:00,  8.88it/s]


Epochs: 13 | Train Loss:  0.201                 | Train Accuracy:  0.874                 | Val Loss:  1.570                 | Val Accuracy:  0.114


100%|██████████| 1536/1536 [02:52<00:00,  8.88it/s]


Epochs: 14 | Train Loss:  0.170                 | Train Accuracy:  0.891                 | Val Loss:  1.616                 | Val Accuracy:  0.137


100%|██████████| 1536/1536 [02:52<00:00,  8.88it/s]


Epochs: 15 | Train Loss:  0.142                 | Train Accuracy:  0.918                 | Val Loss:  1.731                 | Val Accuracy:  0.124


100%|██████████| 1536/1536 [02:52<00:00,  8.89it/s]


Epochs: 16 | Train Loss:  0.117                 | Train Accuracy:  0.931                 | Val Loss:  1.807                 | Val Accuracy:  0.120


100%|██████████| 1536/1536 [02:52<00:00,  8.89it/s]


Epochs: 17 | Train Loss:  0.098                 | Train Accuracy:  0.944                 | Val Loss:  1.860                 | Val Accuracy:  0.125


100%|██████████| 1536/1536 [02:52<00:00,  8.88it/s]


Epochs: 18 | Train Loss:  0.085                 | Train Accuracy:  0.952                 | Val Loss:  1.983                 | Val Accuracy:  0.105


100%|██████████| 1536/1536 [02:52<00:00,  8.88it/s]


Epochs: 19 | Train Loss:  0.075                 | Train Accuracy:  0.962                 | Val Loss:  1.986                 | Val Accuracy:  0.127


100%|██████████| 1536/1536 [02:52<00:00,  8.89it/s]


Epochs: 20 | Train Loss:  0.064                 | Train Accuracy:  0.968                 | Val Loss:  2.091                 | Val Accuracy:  0.111


In [86]:
# see the model attributes
sem_eval_model

DistilBertSentimentClassifier(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1):

## Test the model

In [87]:
evaluate(sem_eval_model, se_test_df) 

100%|██████████| 174/174 [00:06<00:00, 28.20it/s]


Test Accuracy :  0.305





In [88]:
# predict for 5 random sentences from test dataset
predict_on_samples_from_df(sem_eval_model, se_test_df, n_samples=5) 

Predict the following text:

@FluDino Event started! everyone is getting ready to travel to the lake of rage, where everything glows
The predicted emotion is : anger
The exact emotion is:  anger
++++++++++++++++++++++++++++++++++++++++++++++++++ 

Predict the following text:

@abra @abra @abra what're you doing to my ears? To my soul?! #GoodMusic #melancholy
The predicted emotion is : sadness
The exact emotion is:  sadness
++++++++++++++++++++++++++++++++++++++++++++++++++ 

Predict the following text:

I feel like an appendix. I don't have a purpose.  #depressed  #alone #lonely #broken  #cry #hurt #crying #life
The predicted emotion is : anger
The exact emotion is:  sadness
++++++++++++++++++++++++++++++++++++++++++++++++++ 

Predict the following text:

Bout ta get my @dontbreathe on up in here! @WarrenTheaters #nervous #icantholdmybreaththatlong
The predicted emotion is : sadness
The exact emotion is:  fear
++++++++++++++++++++++++++++++++++++++++++++++++++ 

Predict the following t

## Predict on Single inputs

In [72]:
_, se_decoded_labels = encode_decode_labels(se_train_df)
se_decoded_labels 

{0: 'joy', 1: 'fear', 2: 'sadness', 3: 'anger'}

In [89]:
# We can also test with our own statemnts 
predict(device, sem_eval_model, se_decoded_labels,
        "Your really did great to me. I am proud of you.")

The predicted emotion is : anger


In [90]:
# We can also test with our own statemnts 
predict(device, sem_eval_model, se_decoded_labels,
        "Your really did bad to me. I feel sorry.")

The predicted emotion is : anger


In [91]:
# what if a little modification of the above sentence
predict(device, sem_eval_model, se_decoded_labels,
        "Your really did bad. I feel sad.")

The predicted emotion is : sadness


# **BERT for IMDB Dataset** 

## Download the dataset

In [92]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

--2023-01-08 08:19:37--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2023-01-08 08:19:38 (71.9 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



The texts in the IMDB dataset are in separate files with in train/test folders with pos/neg subfolders. The following method will read every line in pos and neg subfolders and assign the subfolder name as a label. The positive and negative texts from 

In [93]:
#@title IMDB -\> DF

# read the imdb dataset into pandas data frame format 
from pathlib import Path 
# 
def read_imdb_data(split_dir): 
    df = pd.DataFrame(columns = ['text', 'emotion']) 
    # while the labels in imdb are called 'label', 'emotion' is used instead 
    # of 'label' to make more similar with SemEval 
    split_dir = Path(split_dir) 

    for label_dir in ["pos", "neg"]: 
        for text_file in (split_dir/label_dir).iterdir(): 
            df = df.append({'text' : text_file.read_text(), 
                            'emotion' : label_dir}, ignore_index = True) 

    return df 

imdb_train_df = read_imdb_data('aclImdb/train') 
imdb_test_df = read_imdb_data('aclImdb/test') 
display(imdb_train_df.head(2)) 
display(imdb_test_df.head(2)) 

Unnamed: 0,text,emotion
0,***SPOILERS*** ***SPOILERS*** Packed with memo...,pos
1,"""Heartland"" is a wonderful depiction of what i...",pos


Unnamed: 0,text,emotion
0,This has got to be one of the best episodes of...,pos
1,A sparkling movie. BB is a marvel. She's sultr...,pos


In [94]:
# shuffle the datasets 
imdb_train_df = imdb_train_df.sample(frac=1)
imdb_test_df = imdb_test_df.sample(frac=1)

In [None]:
#@title Save to pickle file 
# to reduce downloading every time, saving to drive may be faster 
imdb_train_df.to_pickle(os.path.join(data_folder, "imdb_train_df.pkl")) 
imdb_test_df.to_pickle(os.path.join(data_folder, "imdb_test_df.pkl")) 

In [None]:
#@title Load a pickle file 
imdb_train_df = pd.read_pickle(os.path.join(data_folder, "imdb_train_df.pkl")) 
imdb_test_df = pd.read_pickle(os.path.join(data_folder, "imdb_test_df.pkl")) 

In [95]:
# get some validation data from the testing set which is 25K
imdb_test_df, imdb_val_df = train_test_split(imdb_test_df, test_size=0.2,
                                              random_state=42)
print("Shape of Training df: ", imdb_train_df.shape)
print("Shape of Validating df: ", imdb_val_df.shape)
print("Shape of Testing df: ", imdb_test_df.shape)

Shape of Training df:  (25000, 2)
Shape of Validating df:  (5000, 2)
Shape of Testing df:  (20000, 2)


In [None]:
EPOCHS = 10 
imdb_model = DistilBertSentimentClassifier(n_classes=2)
learning_rate = 1e-6
train(imdb_model, imdb_train_df, imdb_val_df, learning_rate, EPOCHS) 

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 11%|█         | 1376/12500 [02:39<21:24,  8.66it/s]

In [None]:
evaluate(imdb_model, imdb_test_df)

100%|██████████| 10000/10000 [11:32<00:00, 14.43it/s]



Test Accuracy :  0.081


## Test on testing df

In [None]:
# predict for 5 random sentences from test dataset
predict_on_samples_from_df(imdb_model, imdb_test_df, n_samples=5)  

Predict the following text:

Man, is it great just to see Young and The Restless star Melody Thomas Scott as something other than flighty Nikki Newman! A doctor with a brain no less! And super nice to see her with the likes of the gorgeous Lorenzo Lamas instead of Victor Newman!<br /><br />Mel plays a college professor of micro-biology who goes to the islands with her son for spring break, only to find herself a prisoner of the island infested with a rapidly spreading virus. Handy for her there is the hunky character played by Lorenzo, who has a daughter just her son's age.<br /><br />Mel shines, as does Lorenzo with a bit of the overacting from the younger couple. Interesting premise in these times of chemical and biological terrorism talk. Worthwhile seeing, especially for Y&R fans.
The predicted emotion is : neg
The exact emotion is:  pos
++++++++++++++++++++++++++++++++++++++++++++++++++ 

Predict the following text:

I have to say I was pleasantly surprised by this movie. Other th

## Test on Single Sentences

In [None]:
_, imdb_decoded_labels = encode_decode_labels(imdb_train_df)
imdb_decoded_labels 

{0: 'pos', 1: 'neg'}

In [None]:
predict(device, imdb_model, imdb_decoded_labels, 
        "Yesterday's movie was fantastic.")

The predicted emotion is : pos


In [None]:
predict(device, imdb_model, imdb_decoded_labels, 
        "The sound quality was not good.")

The predicted emotion is : neg


# BERT Transferl Learning:  IMDB -\> SemEval

In [None]:
imdb_model

BertSentimentClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [None]:
''' We can change the out_features from 2 to 4 to do transfer learning on 
 a bert model first trained by the imdb dataset. After changing the out_features
 attribute of the model, we re-train, validate and test with the SemEval dataset.
'''
imdb_model.lin, nn.Linear(768, 4)

(Linear(in_features=768, out_features=2, bias=True),
 Linear(in_features=768, out_features=4, bias=True))

In [None]:

train(imdb_model, imdb_train_df, imdb_val_df, learning_rate, EPOCHS) 


In [None]:
EPOCHS = 20

imdb_model.lin = nn.Linear(768, 4) # modify the number of output neurons
learning_rate = 1e-6 
start = time.time()
# train the modified imdb_model with the semeval data set
train(imdb_model, se_train_df, se_val_df, learning_rate, EPOCHS)
print(f'Training time is: {(time.time()-start)/3600} hrs.')

100%|██████████| 1536/1536 [05:24<00:00,  4.73it/s]


Epochs: 1 | Train Loss:  0.670                 | Train Accuracy:  0.376                 | Val Loss:  0.740                 | Val Accuracy:  0.113


100%|██████████| 1536/1536 [05:27<00:00,  4.69it/s]


Epochs: 2 | Train Loss:  0.593                 | Train Accuracy:  0.595                 | Val Loss:  0.764                 | Val Accuracy:  0.133


In [None]:
#evaluate(sem_eval_model, se_test_df) 
evaluate(imdb_model, se_test_df) 

100%|██████████| 174/174 [00:10<00:00, 16.06it/s]


Test Accuracy :  0.280





In [None]:
# predict for 5 random sentences from test dataset
#predict_on_samples_from_df(sem_eval_model, se_test_df, n_samples=5) 
predict_on_samples_from_df(imdb_model, se_test_df, n_samples=5) 

Predict the following text:

All Brian does is sleep and aggravate me
The predicted emotion is : sadness
The exact emotion is:  anger
++++++++++++++++++++++++++++++++++++++++++++++++++ 

Predict the following text:

Sorry guys I have absolutely no idea what time i'll be on cam tomorrow but will keep you posted. #fuming
The predicted emotion is : sadness
The exact emotion is:  anger
++++++++++++++++++++++++++++++++++++++++++++++++++ 

Predict the following text:

Whatt a trailerrrr !!! @karanjohar @AnushkaSharma #RanbirKapoor #AishwaryaRaiBachchan i am COMPLETELY BLOWN !! #awestruck #longingformore
The predicted emotion is : joy
The exact emotion is:  fear
++++++++++++++++++++++++++++++++++++++++++++++++++ 

Predict the following text:

@CNNPolitics I can't wait to hear what he had to say about the brilliant Dr. Hawking... it should be rich... In the poorest of taste! 
The predicted emotion is : sadness
The exact emotion is:  fear
++++++++++++++++++++++++++++++++++++++++++++++++++ 

Pre

In [None]:
_, se_decoded_labels = encode_decode_labels(se_train_df)
se_decoded_labels 
# We can also test with our own statemnts 
#predict(device, sem_eval_model, se_decoded_labels,
predict(device, imdb_model, se_decoded_labels,
        "Your really did great to me. I am proud of you.")

The predicted emotion is : joy


In [None]:
# We can also test with our own statemnts 
predict(device, imdb_model, se_decoded_labels,
        "Your really did bad to me. I feel sorry.")

The predicted emotion is : fear


In [None]:
# what if a little modification of the above sentence
predict(device, imdb_model, se_decoded_labels,
        "Your really did bad. I feel sad.")

The predicted emotion is : fear


# possible duplicates

In [None]:
#@title IMDB_Dataset Class
tokenizer= BertTokenizer.from_pretrained("bert-base-cased")
# encode the labels 
#imdb_labels = {'pos':0, 'neg':1}

class IMDB_Dataset(torch.utils.data.Dataset): 
    def __init__(self, df): 
        # get the encoded labels from the dataframe {'pos':0, 'neg':1}
        encoded_labels, _ = encode_decode_labels(df) # labels = {'pos':0, 'neg':1}
        #extract our labels from the df 
        self.labels = [encoded_labels[label] for label in df["emotion"]]
        #tokenize our texts to the format that BERT expects to get as input 
        self.texts = [tokenizer(text, padding='max_length', max_length=512, 
                                truncation=True, return_tensors="pt")  
                     for text in df["text"]] 
    def classes(self):
        return self.labels
    
    def __len__(self): 
        return len(self.labels)
    
    #fetch a batch of labels
    def get_batch_labels(self,indx): 
        return np.array(self.labels[indx])
    # fetch a batch of texts 
    def get_batch_texts(self,indx): 
        return self.texts[indx]

    #get an item with the texts and the label
    def __getitem__(self,indx): 
        batch_texts = self.get_batch_texts(indx)
        batch_y = self.get_batch_labels(indx)
        
        return batch_texts, batch_y

In [None]:
#@title BertTextClassifier Model IMDB
class BertTextClassifier(nn.Module): 
    #def __init__(self, dropout=0.5):
    def __init__(self, n_classes, dropout=0.5): 
        super(BertTextClassifier, self).__init__()
        
        self.bert=BertModel.from_pretrained("bert-base-cased")
        self.dropout = nn.Dropout(dropout) 
        # bert output a vector of size 768 
        # self.lin = torch.nn.Linear(768, 2) 
        self.lin = nn.Linear(self.bert.config.hidden_size, n_classes)
        #self.relu = nn.ReLU()
        self.softmax = nn.Softmax()
    def forward(self,input_id,mask): 
        # as output, the bert model give us first the embedding vector of all the tokens of the sequence 
        # second we get the embedding vector of the CLS token.
        # fot a classification task it's enough to use this embedding for our classifier
        _,pooled_output = self.bert(input_ids= input_id, attention_mask = mask, return_dict = False)
        dropout_output = self.dropout(pooled_output)
        linear_output  = self.lin(dropout_output)
        final_layer = self.softmax(linear_output)
        
        return final_layer  

In [None]:
#@title Training IMDB
# we are creating a standard pytorch training loop 

def train(model, train_data, val_data, learning_rate, epochs=5):
    #creating a custom Dataset objects using the training and validation data
    train, val = IMDB_Dataset(train_data), IMDB_Dataset(val_data)
    #creating dataloaders
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr= learning_rate)

    if use_cuda:
            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                # get the predictions 
                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc
                # updating the Gradient Descent and Backpropagation operation
                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            # now we evaluate on the validation data
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label)
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')

In [None]:
#@title EValuate the model
'''Now that we trained the model on the training set, we are going to use 
the test data to evaluate the performance of the model on unseen data'''
def evaluate(model, test_df):
    test = IMDB_Dataset(test_df)
    test_data_loader = torch.utils.data.DataLoader(test,batch_size=2)
    
    cuda_available = torch.cuda.is_available()
    
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    if cuda_available:
        model = model.cuda()
    
    total_acc = 0
    for test_input , test_label in tqdm(test_data_loader):
        test_label = test_label.to(device)
        mask = test_input["attention_mask"].to(device)
        input_id = test_input["input_ids"].squeeze(1).to(device)
        output = model(input_id,mask)
        
        acc = (output.argmax(dim=1) == test_label).sum().item()
        total_acc +=acc 
        
    print(f"\nTest Accuracy : {total_acc / len(test_df): .3f}")
    

In [None]:
#@title Predict on df samples function
def predict_on_samples_from_df(model, df, n_samples):
    
  # Test with sample texts from the test set first
  textset = df.sample(n_samples).reset_index(drop=True)
  textset
  for i in range(len(textset)):
      text = textset.loc[i, 'text']
      emotion = textset.loc[i, 'emotion']
      print('Predict the following text:\n')
      print(text)
      print('='*50)
      predict(device, model, text)
      print('The exact emotion is: ', emotion)
      print('+'*50, '\n')     

In [None]:
device= torch.device("cuda" if torch.cuda.is_available() else "cpu")
# labels = {'pos':0, 'neg':1} decode the predicted
# labels as {0: 'pos', 1: 'neg'} 
inverse_imdb_labels = {v:k for k,v in imdb_labels.items()} # decode the labels 

In [None]:
#@title Prediction Fuction 
def predict(device, model, sentence): 
    #sentence = sentence 
    sentence_input = tokenizer(sentence, padding='max_length', max_length=512, 
                               truncation=True, return_tensors="pt").to(device) 
    input_id = sentence_input["input_ids"] 
    mask = sentence_input["attention_mask"] 
    output = model(input_id,mask) 
    predicted_class_label = output.argmax(dim=1) 
    predicted_class = inverse_imdb_labels[predicted_class_label.item()] 
    print(f"The predicted emotion is : {predicted_class}") 

## Option two

In [None]:
from pathlib import Path

def read_imdb_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir == "neg" else 1)

    return texts, labels

train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')

  labels.append(0 if label_dir is "neg" else 1)


In [None]:
train_labels

In [None]:
list(train_df['emotion'])

In [None]:
#@title train test
# text format is same but not label
train_text, train_y = list(train_df['text']), list(train_df['emotion'])
test_text, test_y = list(test_df['text']), list(test_df['emotion'])

In [None]:
test_texts[:3]

In [None]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

In [None]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [None]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [None]:
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

# Next to Kaggle

In [None]:
#@title Label encoding

def label_encode(le, labels):
    enc = le.transform(labels)
    return tf.keras.utils.to_categorical(enc)

def label_decode(le, one_hot_label):
    dec = np.argmax(one_hot_label, axis=1)
    return le.inverse_transform(dec)

label_encoder = LabelEncoder()
y_train = train_df['emotion']
y_test = test_df['emotion']
label_encoder.fit(y_train)

y_train = label_encode(label_encoder, y_train)
y_test = label_encode(label_encoder, y_test)

In [None]:
list(y_train)

In [None]:
train_df['target'] = list(y_train) # store encoded labels as a column 
test_df['target'] = list(y_test)

In [None]:
class_names = train_df['emotion'].unique()
class_names

array(['fear', 'sadness', 'anger', 'joy'], dtype=object)

In [None]:
train_df = train_df[['text', 'target']]
test_df = test_df[['text', 'target']]
test_df.head(3)

Unnamed: 0,text,target
76,(Sam) Brown's Law: Never offend people with st...,"[1.0, 0.0, 0.0, 0.0]"
18,@TrussElise Obama must be fuming.. lol,"[1.0, 0.0, 0.0, 0.0]"
146,That's an awful miss from Rooney.,"[0.0, 1.0, 0.0, 0.0]"


In [None]:
x_train = train_df['text']
x_test = test_df['text']

In [None]:
PRE_TRAINED_MODEL_NAME = "bert-base-cased"

In [None]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
x_train[0]

'How the fu*k! Who the heck! moved my fridge!... should I knock the landlord door. #angry #mad ##'

In [None]:
tokens = tokenizer.tokenize(x_train[0])
ids = tokenizer.convert_tokens_to_ids(tokens)
print('Sentence: ', x_train[0])
print('Tokens: ', tokens)
print('Ids: ', ids)

Sentence:  How the fu*k! Who the heck! moved my fridge!... should I knock the landlord door. #angry #mad ##
Tokens:  ['How', 'the', 'f', '##u', '*', 'k', '!', 'Who', 'the', 'heck', '!', 'moved', 'my', 'fridge', '!', '.', '.', '.', 'should', 'I', 'knock', 'the', 'landlord', 'door', '.', '#', 'angry', '#', 'mad', '#', '#']
Ids:  [1731, 1103, 175, 1358, 115, 180, 106, 2627, 1103, 26913, 106, 1427, 1139, 18243, 106, 119, 119, 119, 1431, 146, 7466, 1103, 21406, 1442, 119, 108, 4259, 108, 6340, 108, 108]


In [None]:
encoding = tokenizer.encode_plus(
    x_train[0],
    max_length=12,
    add_special_tokens=True,  # Add '[CLS]' marker for BERT to know the task is 
                  # classification task and '[SEP]' for making end of a sentence
    return_token_type_ids=False,
    padding="max_length",
    return_attention_mask=True,
    return_tensors="pt",  # Return PyTorch tensors
)
encoding.keys()

dict_keys(['input_ids', 'attention_mask'])

In [None]:
encoding["input_ids"]

tensor([[  101,  1731,  1103,   175,  1358,   115,   180,   106,  2627,  1103,
         26913,   106,  1427,  1139, 18243,   106,   119,   119,   119,  1431,
           146,  7466,  1103, 21406,  1442,   119,   108,  4259,   108,  6340,
           108,   108,   102]])

In [None]:
# token ids are now stored in a Tensor and padded to a length of 32
print(len(encoding["input_ids"][0]))
encoding["input_ids"][0]

33


tensor([  101,  1731,  1103,   175,  1358,   115,   180,   106,  2627,  1103,
        26913,   106,  1427,  1139, 18243,   106,   119,   119,   119,  1431,
          146,  7466,  1103, 21406,  1442,   119,   108,  4259,   108,  6340,
          108,   108,   102])

In [None]:
#The attention mask has the same length:

print(len(encoding["attention_mask"][0]))
encoding["attention_mask"]

33


tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [None]:
tokenizer.convert_ids_to_tokens(encoding["input_ids"][0])

In [None]:
#@title more libraries
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch

In [None]:
#@title Dataset Class 1
class SemEvalDataset(Dataset):
    def __init__(self, df, target, tokenizer, max_len):
        self.df = df
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        text = str(self.df[item])
        target = self.target[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt", # pytorch tensor
        )

        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "targets": torch.tensor(target, dtype=torch.float),
        }

In [None]:
MAX_LEN = 72

In [None]:
#@title Dataset Class
class SemEvalDataset(Dataset):
    def __init__(self, df, targets, tokenizer, max_len):
        self.df = df
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index: int):
        data_row = self.df.iloc[index]

        text = data_row.text
        target = data_row.target

        #text = str(self.df[item])
        #target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt", # pytorch tensor
        )

        return {
            "text": text,
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "targets": torch.tensor(target, dtype=torch.float),
        }

In [None]:
# get some validation data from the training set
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = SemEvalDataset(
        df=df.text.to_numpy(),
        target=df.target.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(ds, batch_size=batch_size, num_workers=4)

In [None]:
BATCH_SIZE = 16 

train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_df, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
data = next(iter(train_data_loader))
data.keys()

dict_keys(['text', 'input_ids', 'attention_mask', 'targets'])

In [None]:
print(data["input_ids"].shape)
print(data["attention_mask"].shape)
print(data["targets"].shape)

torch.Size([16, 72])
torch.Size([16, 72])
torch.Size([16, 4])


In [None]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=True)

Downloading:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
bert_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
outputs = bert_model(input_ids=encoding['input_ids'],
                     attention_mask=encoding['attention_mask'])

In [None]:
outputs[0].shape

torch.Size([1, 33, 768])

In [None]:
outputs[0]

tensor([[[ 0.3614,  0.3904,  0.0350,  ..., -0.0743,  0.1762,  0.2355],
         [ 0.2447, -0.7109,  0.5312,  ..., -0.1326,  0.0855,  0.0017],
         [ 0.0591, -0.5710, -0.0998,  ...,  1.1275, -0.2706,  0.2877],
         ...,
         [ 0.1560,  0.5956, -0.1008,  ...,  0.5559,  0.4003,  0.5600],
         [ 0.0478,  0.2991,  0.0666,  ...,  0.5912,  0.2901,  0.8633],
         [ 1.0316,  0.2229,  0.1790,  ...,  0.4998,  0.2759,  0.1014]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
class SentimentPredictor(nn.Module):
    def __init__(self, n_classes):
        super(SentimentPredictor, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.drop(pooled_output)
        return self.out(output)

In [None]:
# @title Setup & Config
import transformers

import torch
import os
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
import zipfile

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style="whitegrid", palette="muted", font_scale=1.2)

HAPPY_COLORS_PALETTE = [
    "#01BEFE",
    "#FFDD00",
    "#FF7D00",
    "#FF006D",
    "#ADFF02",
    "#8F00FF",
]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams["figure.figsize"] = 12, 8

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [None]:
model = SentimentPredictor(len(class_names))
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
input_ids = data["input_ids"].to(device)
attention_mask = data["attention_mask"].to(device)

print(input_ids.shape)  # batch size x seq length
print(attention_mask.shape)  # batch size x seq length

torch.Size([16, 72])
torch.Size([16, 72])


In [None]:
F.softmax(model(input_ids, attention_mask), dim=1)

tensor([[0.4375, 0.2170, 0.2072, 0.1383],
        [0.3137, 0.2447, 0.2712, 0.1703],
        [0.3055, 0.2890, 0.1976, 0.2079],
        [0.4489, 0.1205, 0.2141, 0.2165],
        [0.2339, 0.2845, 0.2936, 0.1880],
        [0.3945, 0.1443, 0.3223, 0.1390],
        [0.4079, 0.1616, 0.3062, 0.1243],
        [0.3033, 0.2330, 0.2438, 0.2199],
        [0.2869, 0.1961, 0.3266, 0.1904],
        [0.3574, 0.2025, 0.2493, 0.1909],
        [0.3019, 0.1899, 0.2962, 0.2120],
        [0.3323, 0.3090, 0.2115, 0.1472],
        [0.2903, 0.2640, 0.2848, 0.1610],
        [0.3421, 0.2281, 0.2576, 0.1722],
        [0.5545, 0.1738, 0.1569, 0.1148],
        [0.3704, 0.2034, 0.2514, 0.1748]], grad_fn=<SoftmaxBackward0>)

In [None]:
EPOCHS = 5

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, 
                                            num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
# Let's continue with writing a helper function for training our model for one epoch:

def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()

    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["targets"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
#Let's write another one that helps us evaluate the model on a given data loader:

def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            print(outputs)
            _, preds = torch.max(outputs, dim=4)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

In [None]:
train_acc = []
valid_acc = []
#Using those two, we can write our training loop. We'll also store the training history:


In [None]:
%%time
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, 
                                      optimizer, device, scheduler, len(train_df))

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(model, test_data_loader, loss_fn, 
                                 device, len(test_df))
  train_acc=train_acc.cpu()
  val_acc=val_acc.cpu()  
  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)                             
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)
#   history=torch.tensor(history)
#   history=history.detach()
    
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'data/saved_models_bert_multi_class/best_model_state.bin')
    best_accuracy = val_acc

Epoch 1/5
----------


RuntimeError: ignored