<a href="https://colab.research.google.com/github/markonium/AI_Pattern_Recognition/blob/master/Movies_Reviews_Classification_Using_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, precision_recall_fscore_support
import random
import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
!pip install transformers
import nltk
from matplotlib import pyplot as plt
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from torch import nn, utils
from torch.optim import Adam
from tqdm import tqdm

# Fetch the NLTK resources used by the pre-processing steps further down
# (stop-word list, tokenizer models, WordNet data).
for _resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++=
# Shared BERT tokenizer and the mapping from sentiment name to class id.
tokenizer2 = BertTokenizer.from_pretrained('bert-base-cased')
labels = {'negative': 0, 'positive': 1}

def calc_metrics(y_test, y_pred):
    """Print accuracy, precision, recall, F-score and the confusion matrix.

    Only binary problems are supported, because the confusion matrix is
    unpacked as ``tn, fp, fn, tp``.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. The metrics are printed, not returned.

    Raises:
        ValueError: If the confusion matrix is not 2x2, i.e. the labels seen
            in ``y_test`` and ``y_pred`` do not form exactly two classes.
    """
    c_mat = confusion_matrix(y_test, y_pred)
    if c_mat.shape != (2, 2):
        raise ValueError(
            'calc_metrics needs exactly two classes, got a confusion matrix '
            f'of shape {c_mat.shape}')
    tn, fp, fn, tp = (int(count) for count in c_mat.ravel())

    def _ratio(numerator, denominator):
        # A zero denominator means the metric is undefined (e.g. precision
        # when nothing was predicted positive). Report 0.0 instead of
        # producing nan together with a NumPy RuntimeWarning.
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    fscore = _ratio(2 * precision * recall, precision + recall)

    print('accuracy: ', accuracy)
    print('precision: ', precision)
    print('recall: ', recall)
    print('fscore: ', fscore)
    print('confusion matrix: ')
    print(c_mat, "\n")
    return

class Dataset(utils.data.Dataset):
    """Torch dataset pairing tokenized reviews with integer sentiment labels.

    Every entry of ``df['review']`` is encoded once, at construction time,
    with the module-level ``tokenizer2`` (padded / truncated to 512 tokens,
    returned as PyTorch tensors). Every entry of ``df['sentiment']`` is
    translated through the module-level ``labels`` dict.
    """

    def __init__(self, df):
        """Encode all reviews and labels of ``df`` up front."""
        encoded_labels = []
        for sentiment in df['sentiment']:
            encoded_labels.append(labels[sentiment])
        self.labels = encoded_labels

        encoded_texts = []
        for review in df['review']:
            encoding = tokenizer2(
                review,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            encoded_texts.append(encoding)
        self.texts = encoded_texts

    def classes(self):
        """Return the list of integer labels."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` as a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(encoded_text, label)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)


class BertClassifier(nn.Module):
    """Pretrained ``bert-base-cased`` encoder with an MLP head and sigmoid.

    The head narrows 768 -> 512 -> 256 -> 128 -> 64 -> 1. Every hidden layer
    is followed by ReLU and the shared dropout layer; the final unit goes
    through a sigmoid, so ``forward`` returns values in [0, 1].
    """

    def __init__(self, dropout=0.1):
        """Build the encoder and head; ``dropout`` is the drop probability."""
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)

        # Attribute names and creation order are kept as-is so state-dict
        # keys and seeded initialisation stay the same.
        self.fn1, self.r1 = nn.Linear(768, 512), nn.ReLU()
        self.fn2, self.r2 = nn.Linear(512, 256), nn.ReLU()
        self.fn3, self.r3 = nn.Linear(256, 128), nn.ReLU()
        self.fn4, self.r4 = nn.Linear(128, 64), nn.ReLU()

        self.fn5 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        """Return the sigmoid output for the given token ids and mask."""
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used here.
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        hidden = self.dropout(pooled_output)

        hidden_stages = (
            (self.fn1, self.r1),
            (self.fn2, self.r2),
            (self.fn3, self.r3),
            (self.fn4, self.r4),
        )
        for linear, activation in hidden_stages:
            hidden = self.dropout(activation(linear(hidden)))

        return self.sigmoid(self.fn5(hidden))


def _clone_weights(model):
    """Return a CPU copy of the model's state dict for later restoring."""
    return {name: tensor.detach().cpu().clone()
            for name, tensor in model.state_dict().items()}


def _run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one pass over ``dataloader`` and return ``(loss_sum, n_correct)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch. Without one the model is put in eval mode and gradients are
    disabled.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum = 0.0
    n_correct = 0
    batches = tqdm(dataloader) if training else dataloader
    with torch.set_grad_enabled(training):
        for batch_input, batch_label in batches:
            # BCELoss needs float targets shaped like the (batch, 1) output.
            # Comparing against the same shape also keeps the accuracy count
            # correct for batch sizes other than 1.
            target = batch_label.to(device).to(torch.float32).unsqueeze(1)
            mask = batch_input['attention_mask'].to(device)
            input_id = batch_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            batch_loss = criterion(output, target)
            loss_sum += batch_loss.item()
            n_correct += (torch.round(output) == target).sum().item()

            if training:
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
    return loss_sum, n_correct


def _plot_with_max(xs, ys, best_x, best_y):
    """Plot ``ys`` against ``xs`` and label the best point, if there is one."""
    plt.plot(xs, ys)
    if xs:
        plt.annotate("Maximum" + "(" + str(best_x) + "," + str(best_y) + ")", (best_x, best_y))
    plt.show()


def _search_epochs(model, train_dataloader, val_dataloader, criterion, device,
                   learning_rate, epochs):
    """Train for ``epochs`` epochs and return the 1-based best validation epoch.

    The model's weights are restored afterwards so later calls start from the
    same initial model.
    """
    initial_weights = _clone_weights(model)
    optimizer = Adam(model.parameters(), lr=learning_rate)
    n_train = len(train_dataloader.dataset)
    n_val = len(val_dataloader.dataset)

    epoch_axis, train_arr, val_arr = [], [], []
    # Start below any reachable accuracy so the first epoch always counts and
    # the returned epoch is never 0 when at least one epoch ran.
    best_epoch_train, best_train_acc = 0, -1.0
    best_epoch_val, best_val_acc = 0, -1.0

    for epoch_num in range(epochs):
        _, train_correct = _run_epoch(model, train_dataloader, criterion, device, optimizer)
        _, val_correct = _run_epoch(model, val_dataloader, criterion, device)
        train_acc = train_correct / n_train
        val_acc = val_correct / n_val

        if train_acc > best_train_acc:
            best_train_acc = train_acc
            best_epoch_train = epoch_num + 1
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_epoch_val = epoch_num + 1
            print("\n\nbestEpochValidate = ", best_epoch_val)

        # 1-based x axis, matching the 1-based epoch used in the annotation.
        epoch_axis.append(epoch_num + 1)
        train_arr.append(train_acc)
        val_arr.append(val_acc)

    model.load_state_dict(initial_weights)
    model.eval()

    _plot_with_max(epoch_axis, train_arr, best_epoch_train, best_train_acc)
    _plot_with_max(epoch_axis, val_arr, best_epoch_val, best_val_acc)
    return best_epoch_val


def _search_learning_rate(model, train_dataloader, val_dataloader, criterion,
                          device, epochs):
    """Try rates 1e-1, 5e-2, ... down to 1e-5 and return the best one.

    Each candidate starts from the same initial weights, is trained for
    ``epochs`` epochs and scored on validation accuracy. The initial weights
    are restored before returning.
    """
    initial_weights = _clone_weights(model)
    n_val = len(val_dataloader.dataset)

    rates, accuracies = [], []
    best_rate, best_acc = 0, -1.0
    rate = 1e-1
    while rate >= 1e-5:
        model.load_state_dict(initial_weights)
        optimizer = Adam(model.parameters(), lr=rate)
        for _ in range(epochs):
            _run_epoch(model, train_dataloader, criterion, device, optimizer)
        _, val_correct = _run_epoch(model, val_dataloader, criterion, device)
        val_acc = val_correct / n_val

        if val_acc > best_acc:
            best_acc = val_acc
            best_rate = rate
        rates.append(rate)
        accuracies.append(val_acc)
        rate = rate / 2

    model.load_state_dict(initial_weights)
    model.eval()

    _plot_with_max(rates, accuracies, best_rate, best_acc)
    return best_rate


def _fit(model, train_dataloader, val_dataloader, criterion, device,
         learning_rate, epochs):
    """Train the model in place, printing loss and accuracy after each epoch."""
    optimizer = Adam(model.parameters(), lr=learning_rate)
    n_train = len(train_dataloader.dataset)
    n_val = len(val_dataloader.dataset)

    for epoch_num in range(epochs):
        total_loss_train, total_acc_train = _run_epoch(
            model, train_dataloader, criterion, device, optimizer)
        total_loss_val, total_acc_val = _run_epoch(
            model, val_dataloader, criterion, device)

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / n_train: .3f} \
                | Train Accuracy: {total_acc_train / n_train: .3f} \
                | Val Loss: {total_loss_val / n_val: .3f} \
                | Val Accuracy: {total_acc_val / n_val: .3f}')
    model.eval()


def train(model, train_data, val_data, learning_rate, epochs, check):
    """Search for hyper-parameters or train ``model``, depending on ``check``.

    Both frames are wrapped in the file's ``Dataset`` class and served with a
    batch size of 1. Training batches are shuffled, because the frames passed
    in by this file hold all positive rows followed by all negative rows.

    Args:
        model: Classifier whose ``forward(input_id, mask)`` returns
            probabilities of shape (batch, 1). It is moved to CUDA when
            available.
        train_data: DataFrame with ``review`` and ``sentiment`` columns.
        val_data: DataFrame with the same columns, used for validation.
        learning_rate: Adam learning rate (ignored when ``check == 1``).
        epochs: Number of training epochs (per candidate rate when
            ``check == 1``).
        check: ``0`` searches for the best number of epochs, ``1`` searches
            for the best learning rate, anything else trains the model.

    Returns:
        The 1-based epoch with the highest validation accuracy for
        ``check == 0``, the learning rate with the highest validation
        accuracy for ``check == 1``, otherwise ``None``.

    Note:
        The two search modes really train the model (the rate search does up
        to 14 candidates x ``epochs`` passes), then restore the weights the
        model had on entry. Only the final mode leaves the model trained.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)
    train_dataloader = utils.data.DataLoader(train_set, batch_size=1, shuffle=True)
    val_dataloader = utils.data.DataLoader(val_set, batch_size=1)
    print(len(train_set))
    print(len(val_set))
    print(len(train_dataloader))
    print(len(val_dataloader))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    criterion = nn.BCELoss()

    if check == 0:  # tune the number of epochs
        return _search_epochs(model, train_dataloader, val_dataloader,
                              criterion, device, learning_rate, epochs)
    if check == 1:  # tune the learning rate
        return _search_learning_rate(model, train_dataloader, val_dataloader,
                                     criterion, device, epochs)
    _fit(model, train_dataloader, val_dataloader, criterion, device,
         learning_rate, epochs)
    return None
        
        
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Turn the Google Drive sharing link into a direct-download link: the file id
# is the second-to-last path segment of the sharing URL.
url = 'https://drive.google.com/file/d/17KCPXOPTYXx9zDqk1GtouAL-mgQalGiY/view?usp=sharing'
url = ''.join(['https://drive.google.com/uc?id=', url.split('/')[-2]])

# ++++++++++++++++++++++
# Load the CSV and keep a fixed, reproducible sample of 300 rows.
unproc_df = pd.read_csv(url).sample(n=300, replace=False, random_state=42)
# ++++++++++++++++++++++

# Pre-processing the data
# ======================================================================================================================================
# Work on a deep copy so the raw reviews in unproc_df stay untouched.
proc_df = unproc_df.copy(deep=True)

# Lowercase all characters
proc_df['review'] = proc_df['review'].map(lambda text: text.lower())

# Remove punctuation: keep only runs of word characters, re-joined by spaces
tokenizer = RegexpTokenizer(r"\w+")
proc_df['review'] = proc_df['review'].map(
    lambda text: ' '.join(tokenizer.tokenize(text)))

# Remove stop words
stop_words = set(stopwords.words('english'))
proc_df['review'] = proc_df['review'].map(
    lambda text: ' '.join(w for w in word_tokenize(text) if w not in stop_words))

# Lemmatization of words
lemmatizer = WordNetLemmatizer()
proc_df['review'] = proc_df['review'].map(
    lambda text: ' '.join(lemmatizer.lemmatize(w) for w in word_tokenize(text)))

# print(unproc_df.head())
# print(proc_df.head())

# End of pre-processing # ==============================================================================================================

def _split_70_10_20(frame):
    """Shuffle ``frame`` with seed 42 and cut it into 70% / 10% / 20% parts.

    Plain positional slicing is used instead of ``np.split`` on a DataFrame,
    which goes through the deprecated ``DataFrame.swapaxes``. The resulting
    partitions are the same.
    """
    shuffled = frame.sample(frac=1, random_state=42)
    train_end = int(.7 * len(frame))
    val_end = int(.8 * len(frame))
    return (shuffled.iloc[:train_end],
            shuffled.iloc[train_end:val_end],
            shuffled.iloc[val_end:])


# Separating the data based on sentiment i.e positive or negative
# (the label is read from the last column of each frame)
unproc_positive = unproc_df[unproc_df.iloc[:, -1] == 'positive']
unproc_negative = unproc_df[unproc_df.iloc[:, -1] == 'negative']

proc_positive = proc_df[proc_df.iloc[:, -1] == 'positive']
proc_negative = proc_df[proc_df.iloc[:, -1] == 'negative']

# Sampling 70%, 10% and 20% for training, validation and testing respectively from each class, positive and negative
# The shuffles below carry their own random_state, so these seeds do not
# change the split; they are kept because they reset the global NumPy RNG.
np.random.seed(100)
df_train_p1, df_val_p1, df_test_p1 = _split_70_10_20(unproc_positive)
df_train_p2, df_val_p2, df_test_p2 = _split_70_10_20(unproc_negative)

np.random.seed(100)
df_train_p3, df_val_p3, df_test_p3 = _split_70_10_20(proc_positive)
df_train_p4, df_val_p4, df_test_p4 = _split_70_10_20(proc_negative)
# Concatenating the positive and negative datasets, so each of training, validation and testing takes the same fraction of both classes, for both pre-processed and unprocessed data
df_train_unproc = pd.concat([df_train_p1, df_train_p2])
df_val_unproc = pd.concat([df_val_p1, df_val_p2])
df_test_unproc = pd.concat([df_test_p1, df_test_p2])

df_train_proc = pd.concat([df_train_p3, df_train_p4])
df_val_proc = pd.concat([df_val_p3, df_val_p4])
df_test_proc = pd.concat([df_test_p3, df_test_p4])

print("\n\n",len(df_train_unproc[df_train_unproc['sentiment'] == 'positive']))
print("\n\n",len(df_train_unproc[df_train_unproc['sentiment'] == 'negative']))

# Starting hyper-parameters and a fresh classifier.
EPOCHS = 5
LR = 1e-3
model = BertClassifier()

# check=0 asks train() for the best epoch count, check=1 for the best learning
# rate; any other value runs the final training with the chosen settings.
bestEpoch = train(model=model, train_data=df_train_proc, val_data=df_val_proc,
                  learning_rate=LR, epochs=EPOCHS, check=0)
bestRate = train(model=model, train_data=df_train_proc, val_data=df_val_proc,
                 learning_rate=LR, epochs=bestEpoch, check=1)
print(f"\nbest epoch =  {bestEpoch}")
print(f"\nbest rate =  {bestRate}")
train(model=model, train_data=df_train_proc, val_data=df_val_proc,
      learning_rate=bestRate, epochs=bestEpoch, check=2)
# '''
# bestEpoch2 = train(model, df_train_unproc, df_val_unproc, LR , EPOCHS, 0)
# bestRate2 = train(model, df_train_unproc, df_val_unproc , LR , bestEpoch2 , 1)
# train(model, df_train_unproc,df_val_unproc, bestRate2 , bestEpoch2 , 2)
# '''
# print(len(df_train_unproc))
# print(len(df_val_unproc))
# print(len(df_test_unproc))

# print(len(df_train_proc))
# print(len(df_val_proc))
# print(len(df_test_proc))

# print(df_train_unproc.head())
# print(df_val_unproc.head())
# print(df_test_unproc.head())

# print(df_train_proc.head())
# print(df_val_proc.head())
# print(df_test_proc.head())

Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 34.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 31.7 MB/s 
Collecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 30.8 MB/s 
[31mERROR: Operation cancelled by user[0m


RuntimeError: ignored