# Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
import matplotlib.pyplot as plt
# import torch
# from torch import nn
# from transformers import BertModel
# from torch.optim import Adam
# from tqdm import tqdm

# Data Reading

In [2]:
dataset = pd.read_csv("IMDB Dataset.csv", sep=',')
# dataset = pd.read_csv("fruits.csv", sep=',')
# display(dataset)

# Text Pre-processing

### Remove punctuation & lowercase all characters

In [3]:
import string
string.punctuation

def remove_punctuation(text):
    return "".join([i.lower() for i in text if i not in string.punctuation])

# storing the puntuation free and lowercased text
dataset['free_punc_review']= dataset['review'].apply(lambda x:remove_punctuation(x))

# display(dataset)

### Tokenization

In [4]:
import re
def tokenization(text):
    tokens = word_tokenize(text)
    return tokens
dataset['review_tokenied'] = dataset['free_punc_review'].apply(lambda x: tokenization(x))

# display(dataset)

### Remove stop words and lemmatize

In [5]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
stop_words = nltk.corpus.stopwords.words('english')
stop_words[0:10]
['i', 'the', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

def remove_stopwords_lemmatize(text):
    output = []
    if type(text) is float : 
        return []
    for i in text:
        if i not in stop_words:
            output.append(wordnet_lemmatizer.lemmatize(i))
    return output

dataset['lemmatized_no_stopwords'] = dataset['review_tokenied'].apply(lambda x:remove_stopwords_lemmatize(x))

display(dataset)

Unnamed: 0,review,sentiment,free_punc_review,review_tokenied,lemmatized_no_stopwords
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,"[one, of, the, other, reviewers, has, mentione...","[one, reviewer, mentioned, watching, 1, oz, ep..."
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production br br the filmin...,"[a, wonderful, little, production, br, br, the...","[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,basically theres a family where a little boy j...,"[basically, theres, a, family, where, a, littl...","[basically, there, family, little, boy, jake, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love in the time of money is a ...,"[petter, matteis, love, in, the, time, of, mon...","[petter, matteis, love, time, money, visually,..."
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,positive,i thought this movie did a down right good job...,"[i, thought, this, movie, did, a, down, right,...","[thought, movie, right, good, job, wasnt, crea..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...,"[bad, plot, bad, dialogue, bad, acting, idioti...","[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,negative,i am a catholic taught in parochial elementary...,"[i, am, a, catholic, taught, in, parochial, el...","[catholic, taught, parochial, elementary, scho..."
49998,I'm going to have to disagree with the previou...,negative,im going to have to disagree with the previous...,"[im, going, to, have, to, disagree, with, the,...","[im, going, disagree, previous, comment, side,..."


# Data Preparation

In [6]:
dataset['review'] = dataset['lemmatized_no_stopwords'].apply(lambda x:' '.join(x))
dataset.drop('free_punc_review', inplace=True, axis=1)
dataset.drop('review_tokenied', inplace=True, axis=1)
dataset.drop('lemmatized_no_stopwords', inplace=True, axis=1)

display(dataset)

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive
...,...,...
49995,thought movie right good job wasnt creative or...,positive
49996,bad plot bad dialogue bad acting idiotic direc...,negative
49997,catholic taught parochial elementary school nu...,negative
49998,im going disagree previous comment side maltin...,negative


# Data Split

In [7]:
# split data to positive and negative
p_samples = dataset[dataset['sentiment'] == 'positive']
n_samples = dataset[dataset['sentiment'] == 'negative']
# display(p_samples)
# display(n_samples)

# split the positive class samples to (70%, 10%, 20%) for (training, validation, testing) respectively 
p_testing = p_samples.sample(frac = 0.2)
p_validation = p_samples.drop(p_testing.index).sample(frac = 0.125)
p_training = p_samples.drop(p_validation.index).drop(p_testing.index)

# split the negative class samples to (70%, 10%, 20%) for (training, validation, testing) respectively 
n_testing = n_samples.sample(frac = 0.2)
n_validation = n_samples.drop(n_testing.index).sample(frac = 0.125)
n_training = n_samples.drop(n_validation.index).drop(n_testing.index)

# concatenating the 70% of p-class and n-class to form the training set
training_set = pd.concat([p_training, n_training], axis=0, ignore_index=True)

# concatenating the 10% of p-class and n-class to form the validation set
validation_set = pd.concat([p_validation, n_validation], axis=0, ignore_index=True)

# concatenating the 20% of p-class and n-class to form the testing set
testing_set = pd.concat([p_testing, n_testing], axis=0, ignore_index=True)

# display(training_set)
# display(validation_set)
# display(testing_set)

# Classification using BERT

### Bert Tokenization

In [8]:
import torch
import numpy as np
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

labels = {'negative': 0,
          'positive': 1}

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['sentiment']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 512, truncation=True,
                                return_tensors="pt") for text in df['review']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [9]:
# training_for_BERT = Dataset(training_set)
# validation_for_BERT = Dataset(validation_set)
testing_for_BERT = Dataset(testing_set)

### Model Building

In [10]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear_layer_0 = nn.Linear(768, 512)
        self.linear_layer_1 = nn.Linear(512, 256)
        self.linear_layer_2 = nn.Linear(256, 128)
        self.linear_layer_3 = nn.Linear(128, 64)
        self.linear_layer_4 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

        
    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        relu_0 = self.relu(self.linear_layer_0(dropout_output))
        relu_1 = self.relu(self.linear_layer_1(relu_0))
        relu_2 = self.relu(self.linear_layer_2(relu_1))
        relu_3 = self.relu(self.linear_layer_3(relu_2))
        final_output = self.relu(self.linear_layer_4(relu_3))

        return final_output

### Training Loop

In [11]:
from torch.optim import Adam
from tqdm import tqdm

epoch_no = []
training_acc = []
validation_acc = []

def train(model, train_data, val_data, learning_rate, epochs):
    global epoch_no, training_acc, validation_acc
    epoch_no = []
    training_acc = []
    validation_acc = []
    
    train, val = Dataset(train_data), Dataset(val_data)

    # load data to main memory
    train_dataloader = torch.utils.data.DataLoader(train, batch_size=8, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=8)

    # load data to GPU if possible (if cuda exists)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # define criteria (loss function) & used optimizer
    criterion = nn.BCELoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)
    
    if use_cuda:
        print('cuda exists')
        model = model.cuda()
        criterion = criterion.cuda()
    else:
        print('No cuda')
        
    for epoch_num in range(epochs):
        epoch_no.append(epoch_num + 1)
        total_acc_train = 0
        total_loss_train = 0
        
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.reshape((8,1)).float().to(device)
            print(train_label) ###################
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            print(output) ###################

            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        training_acc.append(total_acc_train / len(train_data))
        
        total_acc_val = 0
        total_loss_val = 0
        with torch.no_grad():

            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask).float()

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

            validation_acc.append(total_acc_val / len(val_data))
#         print(
#             f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
#             | Train Accuracy: {total_acc_train / len(train_data): .3f} \
#             | Val Loss: {total_loss_val / len(val_data): .3f} \
#             | Val Accuracy: {total_acc_val / len(val_data): .3f}')

In [None]:
no_of_epochs = 5
model = BertClassifier()
learning_rate = 1e-6
              
train(model, training_set, validation_set, learning_rate, no_of_epochs)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|                                                                                         | 0/4375 [00:00<?, ?it/s]

No cuda
tensor([[1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.]])
tensor([[0.0157],
        [0.0337],
        [0.0421],
        [0.0236],
        [0.0419],
        [0.0183],
        [0.0290],
        [0.0267]], grad_fn=<ReluBackward0>)


  0%|                                                                            | 1/4375 [02:45<201:04:54, 165.50s/it]

tensor([[1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [1.]])
tensor([[0.0253],
        [0.0317],
        [0.0420],
        [0.0206],
        [0.0501],
        [0.0090],
        [0.0205],
        [0.0449]], grad_fn=<ReluBackward0>)


### Plotting Accuracy vs No of Epochs in Training & Validation Sets

In [None]:
plt.plot(epoch_no, training_acc)

In [None]:
plt.plot(epoch_no, validation_acc)

### Plotting Acuraccy vs Learning Rate in Validation Set

In [None]:
no_of_epochs = 5 # to be modified
model_ = BertClassifier()
LRs = [1e-6, 1e-5] # to be modified
accuracy = []             
for i in LRs:
    train(model, training_set, validation_set, i, no_of_epochs)
    accuracy.append(validation_acc[-1])
plt.plot(LRs, accuracy)

### Plotting Accuracy with\out Text Preprocessing

In [None]:
def evaluate(model, test_data):

    test = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():
        for test_input, test_label in test_dataloader:
          test_label = test_label.to(device)
          mask = test_input['attention_mask'].to(device)
          input_id = test_input['input_ids'].squeeze(1).to(device)

          output = model(input_id, mask)

          acc = (output.argmax(dim=1) == test_label).sum().item()
          total_acc_test += acc
    
#     print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [None]:
def conf_matrix_calculations(actual, predicted):
    TP = TN = FP = FN = 0
    training_dataset_size = 10000
    for i in range (training_dataset_size):
        if actual[i] == 'positive' and predicted[i] == 'positive':
            TP += 1
        elif actual[i] == 'positive' and predicted[i] == 'negative':
            FN += 1
        elif actual[i] == 'negative' and predicted[i] == 'negative':
            TN += 1
        elif actual[i] == 'negative' and predicted[i] == 'positive':
            FP += 1
    print('--- Confusion Matrix ---')
    print('TP: ', TP, '\tFP: ', FP)
    print('FN: ', FN, '\tTN: ', TN)
    print('Accuracy = ', accuracy(TP, FP, FN, TN))
    p = precision(TP, FP)
    print('Precision = ', p)
    r = recall(TP, FN)
    print('Recall = ', r)
    print('F-score = ', f_score(p, r))
    
def accuracy(TP, FP, FN, TN):
    return (TP + TN) / (TP + FP + FN + TN)

def precision(TP, FP):
    return (TP) / (TP + FP)

def recall(TP, FN):
    return (TP) / (TP + FN)

def f_score(p, r):
    return (2 * p * r) / (p + r)

In [None]:
evaluate(model, df_test)