# Imports

In [None]:
import nltk
nltk.download('punkt')
!pip install transformers
nltk.download('wordnet')
nltk.download('stopwords')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import torch
from torch import nn
from transformers import BertModel
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Path to the IMDB 50k movie-reviews CSV that is read as the input dataset
input_path = "../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"

# Utility Functions

## 1. Read Data

In [None]:
def read_data(path):
    """Load the CSV at ``path`` (a path or file-like object) into a DataFrame."""
    return pd.read_csv(path)

## 2. Split Data

In [None]:
def train_test_validate(x):
    """Split ``x`` into stratified train, validation and test frames.

    First 20% of the rows are held out as the test set. Then 0.1 / 0.8
    (12.5%) of the remaining rows become the validation set, which works out
    to roughly 70% / 10% / 20% of ``x``. Both splits are stratified on the
    ``"sentiment"`` column and use ``random_state=1`` for reproducibility.

    Returns:
        The tuple ``(train, validation, test)``.
    """
    remainder, held_out = train_test_split(
        x,
        test_size=0.2,
        random_state=1,
        stratify=x["sentiment"],
    )
    fit_part, val_part = train_test_split(
        remainder,
        test_size=0.1 / 0.8,
        random_state=1,
        stratify=remainder["sentiment"],
    )
    return fit_part, val_part, held_out

## 3. Text pre-processing

In [None]:
# Lazily-built NLTK helpers shared by every pre_process() call.
_PRE_PROCESS_RESOURCES = {}


def _pre_process_resources():
    """Return the shared ``(lemmatizer, stop_words)`` pair, building it on first use."""
    if not _PRE_PROCESS_RESOURCES:
        # Build both objects before publishing them so a failure while reading
        # the stop-word corpus cannot leave a half-filled cache behind.
        stop_words = frozenset(stopwords.words('english'))
        _PRE_PROCESS_RESOURCES.update(
            lemmatizer=WordNetLemmatizer(),
            stop_words=stop_words,
        )
    return (
        _PRE_PROCESS_RESOURCES['lemmatizer'],
        _PRE_PROCESS_RESOURCES['stop_words'],
    )


def pre_process(review):
    """Normalise one review string before it is handed to the tokenizer.

    Steps, in order:
      1. replace every character that is neither a word character nor
         whitespace with a space (removes punctuation);
      2. split into tokens with ``word_tokenize``;
      3. lowercase each token;
      4. lemmatize each token with ``WordNetLemmatizer``;
      5. drop tokens found in NLTK's English stop-word list.

    The lemmatizer and the stop-word set are created once and reused, because
    this function is called once per review and re-reading the stop-word
    corpus every time is wasted work.

    Args:
        review: Raw review text.

    Returns:
        The surviving tokens joined by single spaces.
    """
    lemmatizer, stop_words = _pre_process_resources()

    # remove punctuation
    review = re.sub(r'[^\w\s]', ' ', review)

    # lowercase + lemmatize in a single pass over the tokens
    lemmas = (lemmatizer.lemmatize(word.lower()) for word in word_tokenize(review))

    # remove stop words (checked after lemmatization, as before)
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

# Classification

## Dataset Class

In [None]:
# Tokenizer matching the pretrained 'bert-base-cased' checkpoint.
# NOTE(review): pre_process() lowercases the text before it reaches this
# *cased* tokenizer -- confirm that combination is intended.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Maps the string values of the "sentiment" column to integer class ids.
labels = {'negative': 0, 'positive': 1}

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset of pre-tokenized reviews and their integer labels.

    Every review is cleaned with ``pre_process`` and tokenized when the
    dataset is constructed, so indexing afterwards is a plain list lookup.
    """

    def __init__(self, df):
        """Encode all rows of ``df``.

        Args:
            df: Frame with a ``"sentiment"`` column whose values are keys of
                the module-level ``labels`` mapping, and a ``"review"`` column
                of raw text.
        """
        self.labels = [labels[sentiment] for sentiment in df["sentiment"]]

        encoded_reviews = []
        for raw_review in df["review"]:
            # Pad / truncate every review to exactly 512 tokens.
            encoding = tokenizer(
                pre_process(raw_review),
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            encoded_reviews.append(encoding)
        self.texts = encoded_reviews

    def classes(self):
        """Return the list of integer labels."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the tokenizer output(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(encoded_text, label)`` pair for ``idx``."""
        return self.get_batch_texts(idx), self.get_batch_labels(idx)


## Model Building

In [None]:
class BertClassifier(nn.Module):
    """Pretrained 'bert-base-cased' encoder followed by a five-layer MLP head.

    The head narrows 768 -> 512 -> 256 -> 128 -> 64 -> 1 with ReLU between the
    linear layers, and the same dropout module is applied before every linear
    layer. ``forward`` returns the raw output of the last linear layer; the
    ``sigmoid`` attribute is created but not applied in ``forward``.
    """

    def __init__(self, dropout=0.1):
        """Create the encoder and the head.

        Args:
            dropout: Drop probability of the shared dropout layer.
        """
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(p=dropout, inplace=False)

        # Attribute names and creation order are kept as-is so that existing
        # state_dict checkpoints stay loadable.
        self.linear1 = nn.Linear(768, 512)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(512, 256)
        self.relu2 = nn.ReLU()
        self.linear3 = nn.Linear(256, 128)
        self.relu3 = nn.ReLU()
        self.linear4 = nn.Linear(128, 64)
        self.relu4 = nn.ReLU()
        self.linear5 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_id, mask):
        """Return one raw score per sample.

        Args:
            input_id: Token ids passed to the encoder as ``input_ids``.
            mask: Attention mask passed to the encoder as ``attention_mask``.

        Returns:
            Output of ``linear5`` (no sigmoid applied).
        """
        _, pooled = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)

        hidden_stages = (
            (self.linear1, self.relu1),
            (self.linear2, self.relu2),
            (self.linear3, self.relu3),
            (self.linear4, self.relu4),
        )
        features = self.dropout(pooled)
        for linear, activation in hidden_stages:
            features = self.dropout(activation(linear(features)))

        return self.linear5(features)

In [None]:
def binary_acc(y_pred, y_test):
    """Score a batch of raw model outputs against binary targets.

    Each output is passed through a sigmoid and rounded to 0 or 1, then
    compared element-wise with ``y_test``.

    Returns:
        A float tensor holding 100 x the number of matching predictions. The
        value is NOT divided by the batch size; callers divide the running sum
        by the number of samples to obtain a percentage.
    """
    predicted_class = torch.sigmoid(y_pred).round()
    n_correct = predicted_class.eq(y_test).sum().float()
    return torch.round(n_correct * 100)

## Training

In [None]:
def _batch_loss_and_acc(model, criterion, batch_input, batch_label, device):
    """Run one collated batch through ``model`` and return ``(loss, acc_score)``."""
    target = batch_label.to(device).to(torch.float32).unsqueeze(1)
    # Dataset tokenizes one review at a time with return_tensors="pt", so the
    # collated tensors carry an extra singleton dimension at position 1. Drop
    # it from BOTH tensors so ids and mask reach the encoder with equal shapes.
    mask = batch_input['attention_mask'].squeeze(1).to(device)
    input_id = batch_input['input_ids'].squeeze(1).to(device)

    output = model(input_id, mask)
    return criterion(output, target), binary_acc(output, target)


def train(model, train_data, val_data, learning_rate, epochs, batch_size=1):
    """Fine-tune ``model`` and reload the weights of the best epoch.

    Each epoch makes one optimisation pass over ``train_data`` followed by one
    gradient-free pass over ``val_data``. The model is put in training mode
    for the first pass and in evaluation mode for the second, so dropout does
    not perturb the validation metrics that drive best-epoch selection and the
    learning-rate scheduler. After every epoch the weights are written to
    ``./model-epoch-<n>.pth``; when training ends, the checkpoint with the
    lowest validation loss is loaded back into ``model``.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)`` and
            returning one logit per sample.
        train_data: Frame with ``"review"`` and ``"sentiment"`` columns.
        val_data: Frame with the same columns, used for validation.
        learning_rate: Initial Adam learning rate.
        epochs: Number of epochs; with ``0`` nothing is trained or loaded.
        batch_size: Samples per batch (default 1, the previous fixed value).

    Returns:
        A list with one tuple per epoch:
        ``(epoch, train_loss, train_acc, val_loss, val_acc)``, where losses
        are per-sample averages and accuracies are percentages.
    """
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # Move the model before building the optimizer, as torch.optim recommends.
    model = model.to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)
    # 'max' mode: the scheduler is stepped with validation accuracy below.
    scheduler = ReduceLROnPlateau(optimizer, 'max', patience=2)

    min_loss = float('inf')
    best_epoch = 0
    results = []
    for epoch_num in range(epochs):

        model.train()
        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in tqdm(train_dataloader):
            optimizer.zero_grad()
            batch_loss, acc = _batch_loss_and_acc(
                model, criterion, train_input, train_label, device)

            # The criterion returns a batch mean; weight it by the batch size
            # so dividing by the sample count gives a per-sample average.
            total_loss_train += batch_loss.item() * len(train_label)
            total_acc_train += acc.item()

            batch_loss.backward()
            optimizer.step()

        print(f'Acc: {total_acc_train/len(train_data):.3f} ')

        model.eval()
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                batch_loss, acc = _batch_loss_and_acc(
                    model, criterion, val_input, val_label, device)
                total_loss_val += batch_loss.item() * len(val_label)
                total_acc_val += acc.item()

        save_path = f'./model-epoch-{epoch_num}.pth'
        torch.save(model.state_dict(), save_path)
        if min_loss > total_loss_val:
            min_loss = total_loss_val
            best_epoch = epoch_num
        scheduler.step(total_acc_val)

        # The whitespace after each line continuation is part of the printed text.
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
        print('Learning Rate: ', optimizer.param_groups[0]['lr'])

        results.append((epoch_num + 1, total_loss_train / len(train_data),
                        total_acc_train / len(train_data), total_loss_val / len(val_data),
                        total_acc_val / len(val_data)))

    # Only reload when at least one checkpoint was written by this call.
    if results:
        model.load_state_dict(torch.load(f'./model-epoch-{best_epoch}.pth'))
    return results


## Testing

In [None]:
def evaluate(model, test_data, batch_size=1):
    """Print the accuracy of ``model`` on ``test_data``.

    The model is switched to evaluation mode first so that dropout is
    disabled and the reported accuracy is deterministic. It is left in
    evaluation mode when the function returns.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask)`` and
            returning one logit per sample.
        test_data: Frame with ``"review"`` and ``"sentiment"`` columns.
        batch_size: Samples per batch (default 1, the previous fixed value).
    """
    test_set = Dataset(test_data)

    test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    model.eval()

    total_acc_test = 0
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            # Drop the singleton dimension from the mask as well as the ids so
            # both reach the encoder with the same shape.
            mask = test_input['attention_mask'].squeeze(1).to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            # binary_acc returns 100 x the number of correct predictions.
            acc = binary_acc(output, test_label.unsqueeze(1))
            total_acc_test += acc.item()

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    

# Main

In [None]:
# Hyper-parameters of the fine-tuning run.
EPOCHS = 5
LR = 1e-5

# Load the reviews and carve out the train / validation / test frames.
x = read_data(input_path)
X_train, X_val, X_test = train_test_validate(x)

# Build the classifier and fine-tune it; `results` collects what train() returns.
model = BertClassifier()
results = train(model, X_train, X_val, LR, EPOCHS)

In [None]:
# Report accuracy on the held-out test frame, then print the value returned by
# train() in the previous cell.
evaluate(model, X_test)
print(results)