### Importing required packages

In [None]:
import re
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
_device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

In [None]:
# Number of samples per mini-batch; passed to every DataLoader built in this notebook.
BATCH_SIZE = 16

### Importing data

In [None]:
!pip install gdown

In [None]:
!gdown 1ZDFmDZDOi_hrfXrKYHjt9RorHBEcT1mq

In [None]:
# The CSV is expected in the current working directory (where the gdown cell saves it).
csv_path = Path.cwd() / 'imdb_data.csv'
df = pd.read_csv(csv_path)
df.head()

### Text PreProcessing

In [None]:
def text_preprocessing(text):
    """Normalize one raw review string before tokenization.

    Steps, in order: expand "n't" to " not ", strip URLs, strip HTML tags,
    collect emoticons, lowercase, collapse every run of non-word characters
    into a single space, then append the collected emoticons (with any
    hyphen "nose" removed, so ":-)" becomes ":)").

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The cleaned, lowercased text, followed by one space and the
        space-separated emoticons found in the text (nothing after the
        space when no emoticon was found).
    """
    # Replacing n't with not since it could be really important in sentiment analysis
    text = re.sub(r"n't", ' not ', text)
    # Removing URLs. The match stops at whitespace or '<' so only the URL itself
    # is dropped; a greedy '.*/' would swallow everything up to the last '/' in
    # the review (typically the one inside a later '<br />' tag).
    text = re.sub(r'https?://[^\s<]+', ' ', text)
    # Removing HTML tags; substitute a space so the words on either side of a
    # tag are not fused together.
    text = re.sub(r'<[^>]*>', ' ', text)
    # Extracting emoticons (before lowercasing, since ':P' / ':D' are uppercase)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|P|D|]|})', text)
    # Removing punctuations
    text = re.sub(r'\W+', ' ', text.lower())
    # Adding emoticons at end and converting :-) to :)
    text = text + ' ' + ' '.join(emoticons).replace('-', '')
    return text

In [None]:
# Clean every review in place with text_preprocessing, then preview the result.
cleaned_reviews = df['Review'].apply(text_preprocessing)
df['Review'] = cleaned_reviews
df.head()

### Splitting data into Train, Test, Valid

In [None]:
# Features are the cleaned review texts; targets are the sentiment labels.
X, y = (df[column].values for column in ('Review', 'Sentiment'))

# First split: hold out half of the data as the test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5, random_state=42, stratify=y,
)

# Second split: carve 20% of the remaining half out as the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train,
    test_size=0.2, random_state=42, stratify=y_train,
)

### Tokenizer

In [None]:
# Load the fast tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    'distilbert-base-uncased',
)

In [None]:
# Tokenize each split with the same settings: truncation and padding both enabled.
tokenizer_options = dict(truncation=True, padding=True)
train_tokenized = tokenizer(list(X_train), **tokenizer_options)
valid_tokenized = tokenizer(list(X_valid), **tokenizer_options)
test_tokenized = tokenizer(list(X_test), **tokenizer_options)

### Dataset

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer output with labels.

    ``tokenized`` is any mapping exposing ``.items()`` whose values can be
    indexed by sample position; ``labels`` is an indexable sequence with one
    entry per sample.
    """

    def __init__(self, tokenized, labels):
        self.labels = labels
        self.tokenized = tokenized

    def __len__(self):
        # The number of samples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, index):
        """Return a dict of tensors for sample ``index``.

        The dict has one tensor per key of ``tokenized`` plus a ``'labels'``
        tensor built from ``labels[index]``.
        """
        sample = {}
        for field_name, column in self.tokenized.items():
            sample[field_name] = torch.tensor(column[index])
        sample['labels'] = torch.tensor(self.labels[index])
        return sample

### DataLoader

In [None]:
# Wrap each tokenized split in a CustomDataset and a shuffling DataLoader
# that yields mini-batches of BATCH_SIZE samples.
train_data = CustomDataset(tokenized=train_tokenized, labels=y_train)
valid_data = CustomDataset(tokenized=valid_tokenized, labels=y_valid)
test_data = CustomDataset(tokenized=test_tokenized, labels=y_test)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=True)

### Accuracy

In [None]:
def get_accuracy(model, data_loader):
    """Evaluate ``model`` over every batch of ``data_loader``.

    The model is switched to eval mode (and left in it) and gradients are
    disabled for the whole pass. Each batch must provide ``'input_ids'``,
    ``'attention_mask'`` and ``'labels'`` tensors, which are moved to the
    module-level ``device``.

    Returns:
        ``(accuracy, loss)``: the fraction of correct predictions (0-1) and
        the sample-weighted mean loss, both divided by
        ``len(data_loader.dataset)``.
    """
    num_correct = 0.0
    loss_sum = 0.0
    with torch.no_grad():
        model.eval()
        for batch in data_loader:
            labels = batch['labels'].to(device)
            outputs = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                labels=labels,
            )
            # outputs['loss'] is a per-batch value; weight it by the batch
            # size so a smaller final batch does not skew the average.
            loss_sum += outputs['loss'].item() * len(labels)
            predictions = outputs['logits'].argmax(dim=1)
            num_correct += (predictions == labels).float().sum().item()

    num_samples = len(data_loader.dataset)
    return num_correct / num_samples, loss_sum / num_samples

### Training

In [None]:
def training(model, model_name, optimizer, train_data_loader, valid_data_loader, num_epochs = 10):
    """Fine-tune ``model`` and record train/validation metrics after each epoch.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``'loss'`` and ``'logits'``.
        model_name: Accepted for interface compatibility; not used in the body.
        optimizer: Optimizer already bound to the model's parameters.
        train_data_loader: Loader yielding batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        valid_data_loader: Loader with the same batch layout, used only for
            evaluation.
        num_epochs: Number of full passes over ``train_data_loader``.

    Returns:
        ``(model, history)`` where ``history`` holds one
        ``[train_accuracy, train_loss, valid_accuracy, valid_loss]`` entry per
        epoch. Accuracies are fractions in [0, 1], as returned by
        ``get_accuracy``.
    """
    history = []
    num_batches = len(train_data_loader)
    for epoch in tqdm(range(num_epochs)):
        # get_accuracy() switches the model to eval mode at the end of every
        # epoch, so training mode has to be re-enabled here each time.
        model.train()
        for batch_idx, batch in enumerate(train_data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs['loss']
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Periodic progress line; .item() formats a plain Python float.
            if batch_idx % 200 == 0:
                print(f'Epoch No. {epoch+1}/{num_epochs} | Batch No. {batch_idx}/{num_batches} | Loss = {loss.item():.5f}')

        training_accuracy, training_loss = get_accuracy(model, train_data_loader)
        valid_accuracy, valid_loss = get_accuracy(model, valid_data_loader)
        # Accuracies are fractions; the '%' format type scales them by 100 so
        # the printed percentage is correct (e.g. 0.93 -> '93.00%').
        print(f'Training Accuracy = {training_accuracy:.2%}, Loss = {training_loss:.4f}')
        print(f'Valid Accuracy = {valid_accuracy:.2%}, Loss = {valid_loss:.4f}')
        history.append([training_accuracy, training_loss, valid_accuracy, valid_loss])
    return model, history

### Post Training

In [None]:
def post_training(trained_model, test_loader, history, model_name):
    """Evaluate on the test loader and plot the per-epoch training curves.

    Args:
        trained_model: Model to evaluate with ``get_accuracy``.
        test_loader: Loader for the held-out test split.
        history: Per-epoch rows of
            ``[train_accuracy, train_loss, valid_accuracy, valid_loss]``.
        model_name: Name shown in the figure title.

    Returns:
        ``(test_loss, test_acc)`` -- note the order: loss first, accuracy second.
    """
    test_acc, test_loss = get_accuracy(trained_model, test_loader)

    history_np = np.array(history)
    epochs_list = np.arange(1, len(history_np) + 1)
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # (axis, y-label, train column, train legend, validation column, validation legend)
    panels = (
        (axes[0], 'Accuracy', 0, "Training Accuracy", 2, "Validation Accuracy"),
        (axes[1], 'Loss', 1, "Training Loss", 3, "Validation Loss"),
    )
    for ax, y_label, train_col, train_label, valid_col, valid_label in panels:
        ax.plot(epochs_list, history_np[:, train_col], label=train_label, marker='.')
        ax.plot(epochs_list, history_np[:, valid_col], label=valid_label, marker='.')
        ax.set_xlabel('Epochs')
        ax.set_ylabel(y_label)
        ax.legend()

    fig.suptitle(f'Training for {model_name}')
    plt.show()
    return test_loss, test_acc

### Training using DistilBert Model

In [None]:
# Load the pretrained DistilBERT classifier, move it to the selected device,
# and fine-tune it for 5 epochs with Adam (learning rate 5e-5).
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
trained_model, history = training(
    model,
    'DistilBERT',
    optimizer,
    train_loader,
    valid_loader,
    num_epochs=5,
)

### Post Processing

In [None]:
# Plot the training curves and evaluate the fine-tuned model on the test split.
# post_training returns the loss first and the accuracy second.
test_loss, test_acc = post_training(
    trained_model,
    test_loader,
    history,
    'DistilBERT',
)