### Importing required packages

In [56]:
import re
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
import transformers
from transformers import DistilBertTokenizerFast
from transformers import DistilBertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [25]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [22]:
# Number of examples per mini-batch; passed to every DataLoader built in this notebook.
BATCH_SIZE = 16

### Importing data

In [47]:
!pip install gdown

Collecting gdown
  Downloading gdown-5.1.0-py3-none-any.whl.metadata (5.7 kB)
Collecting beautifulsoup4 (from gdown)
  Downloading beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->gdown)
  Using cached soupsieve-2.5-py3-none-any.whl.metadata (4.7 kB)
Collecting PySocks!=1.5.7,>=1.5.6 (from requests[socks]->gdown)
  Downloading PySocks-1.7.1-py3-none-any.whl.metadata (13 kB)
Downloading gdown-5.1.0-py3-none-any.whl (17 kB)
Downloading beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
   ---------------------------------------- 0.0/147.9 kB ? eta -:--:--
   -- ------------------------------------- 10.2/147.9 kB ? eta -:--:--
   ---------- ---------------------------- 41.0/147.9 kB 495.5 kB/s eta 0:00:01
   ------------------------ -------------- 92.2/147.9 kB 751.6 kB/s eta 0:00:01
   ------------------------------- ------ 122.9/147.9 kB 804.6 kB/s eta 0:00:01
   -------------------------------------- 147.9/147.9 kB 800.5 kB/s eta 0:00

In [48]:
!gdown 1ZDFmDZDOi_hrfXrKYHjt9RorHBEcT1mq

Downloading...
From: https://drive.google.com/uc?id=1ZDFmDZDOi_hrfXrKYHjt9RorHBEcT1mq
To: c:\Users\mndpp\Desktop\Github_Projects\01_movie_review_sentiment\notebook\imdb_data.csv

  0%|          | 0.00/66.0M [00:00<?, ?B/s]
  1%|          | 524k/66.0M [00:00<00:12, 5.07MB/s]
  2%|▏         | 1.05M/66.0M [00:00<00:15, 4.28MB/s]
  2%|▏         | 1.57M/66.0M [00:00<00:14, 4.47MB/s]
  3%|▎         | 2.10M/66.0M [00:00<00:14, 4.53MB/s]
  4%|▍         | 2.62M/66.0M [00:00<00:17, 3.57MB/s]
  5%|▍         | 3.15M/66.0M [00:00<00:17, 3.64MB/s]
  6%|▌         | 3.67M/66.0M [00:00<00:17, 3.66MB/s]
  6%|▋         | 4.19M/66.0M [00:01<00:16, 3.66MB/s]
  7%|▋         | 4.72M/66.0M [00:01<00:16, 3.69MB/s]
  8%|▊         | 5.24M/66.0M [00:01<00:17, 3.57MB/s]
  9%|▊         | 5.77M/66.0M [00:01<00:16, 3.70MB/s]
 10%|▉         | 6.29M/66.0M [00:01<00:16, 3.69MB/s]
 10%|█         | 6.82M/66.0M [00:01<00:15, 3.90MB/s]
 11%|█         | 7.34M/66.0M [00:01<00:14, 4.09MB/s]
 12%|█▏        | 7.86M/66.0M [00:01<

In [57]:
# Load the downloaded reviews from the current working directory and preview them.
df = pd.read_csv(Path.cwd() / 'imdb_data.csv')
df.head()

Unnamed: 0,Review,Rating,Sentiment
0,Imagine The Big Chill with a cast of twenty-so...,2,0
1,I'd have to say that I've seen worse Sci Fi Ch...,3,0
2,Director Fabio Barreto got a strange Academy N...,1,0
3,Pretty bad PRC cheapie which I rarely bother t...,4,0
4,This is a very intriguing short movie by David...,8,1


### Text Preprocessing

In [3]:
def text_preprocessing(text):
    """Normalise a raw review string before tokenisation.

    Steps, in order:
      1. expand the contraction suffix "n't" to " not " (negation matters for sentiment);
      2. replace HTML tags with a space so the words on either side stay separate;
      3. remove URLs (http://, https:// or www. up to the next whitespace);
      4. collect emoticons such as :) ;-( =D before punctuation is stripped;
      5. lower-case the text and collapse every run of non-word characters to one space;
      6. append the emoticons at the end, with the nose dropped (":-)" becomes ":)").

    Args:
        text: Raw review text, possibly containing HTML markup and URLs.

    Returns:
        The cleaned, lower-cased text followed by any emoticons, separated by
        single spaces and without leading or trailing whitespace.
    """
    # Replacing n't with not since it could be really important in sentiment analysis
    text = re.sub(r"n't", ' not ', text)
    # Removing HTML tags first: they contain '/' and '"' characters that would
    # otherwise interfere with URL matching. A space keeps "end.<br />Start" apart.
    text = re.sub(r'<[^>]*>', ' ', text)
    # Removing URLs. The match stops at the first whitespace; the previous greedy
    # pattern ran up to the LAST '/' in the review and deleted real content.
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    # Extracting emoticons
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|P|D|]|})', text)
    # Removing punctuations
    cleaned = re.sub(r'\W+', ' ', text.lower()).strip()
    # Adding emoticons at end and converting :-) to :)
    normalised = [emoticon.replace('-', '') for emoticon in emoticons]
    return ' '.join(part for part in [cleaned, *normalised] if part)

In [4]:
# Clean every review with text_preprocessing. assign() returns a new frame with the
# Review column replaced, which is then bound back to the same name.
df = df.assign(Review=df['Review'].apply(text_preprocessing))
df.head()

Unnamed: 0,Review,Rating,Sentiment
0,imagine the big chill with a cast of twenty so...,2,0
1,i d have to say that i ve seen worse sci fi ch...,3,0
2,director fabio barreto got a strange academy n...,1,0
3,pretty bad prc cheapie which i rarely bother t...,4,0
4,this is a very intriguing short movie by david...,8,1


### Splitting data into train, validation, and test sets

In [5]:
# Features are the review strings; targets come from the Sentiment column.
X, y = df['Review'].values, df['Sentiment'].values

# First cut: hold out half of the data as the test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=42,
    stratify=y,
)

# Second cut: 20% of the remaining half becomes the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

### Tokenizer

In [6]:
# Tokenizer for the pretrained 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [8]:
# Tokenize each split with identical settings (truncation and padding enabled).
# The splits are processed in train, valid, test order.
train_tokenized, valid_tokenized, test_tokenized = (
    tokenizer(list(texts), truncation=True, padding=True)
    for texts in (X_train, X_valid, X_test)
)

### Dataset

In [19]:
class CustomDataset(Dataset):
    """Pairs tokenizer output with labels so a DataLoader can batch them.

    Args:
        tokenized: Mapping from feature name (e.g. 'input_ids', 'attention_mask')
            to a per-example sequence of values, as returned by the tokenizer.
        labels: Sequence of labels, indexed like the examples in ``tokenized``.
    """

    def __init__(self, tokenized, labels):
        super().__init__()
        self.tokenized = tokenized
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, index):
        """Return one example as a dict of tensors, with its label under 'labels'."""
        # Read from this instance's own encodings. Using the module-level
        # ``train_tokenized`` here made every dataset (validation and test
        # included) serve training inputs alongside its own labels.
        item = {key: torch.tensor(value[index]) for key, value in self.tokenized.items()}
        item['labels'] = torch.tensor(self.labels[index])
        return item

### DataLoader

In [23]:
# Only the training loader is shuffled. Validation and test batches are consumed
# purely for aggregate metrics, so a fixed order keeps evaluation repeatable.
train_data = CustomDataset(train_tokenized, y_train)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

valid_data = CustomDataset(valid_tokenized, y_valid)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False)

test_data = CustomDataset(test_tokenized, y_test)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

### Accuracy

In [40]:
def get_accuracy(model, data_loader):
    """Evaluate ``model`` on every batch of ``data_loader`` without tracking gradients.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``'loss'`` and ``'logits'``.
        data_loader: Loader yielding dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        Tuple ``(accuracy, loss)``. Accuracy is the fraction of correct argmax
        predictions; loss is the sum of each batch loss times its batch size.
        Both are divided by ``len(data_loader.dataset)``.

    Note:
        The model is switched to eval mode and is left in that mode on return.
    """
    correct_total = 0.0
    loss_total = 0.0
    batch_keys = ('input_ids', 'attention_mask', 'labels')
    with torch.no_grad():
        model.eval()
        for batch in data_loader:
            # Move the three tensors the model needs onto the compute device.
            input_ids, attention_mask, labels = (batch[key].to(device) for key in batch_keys)

            outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
            # Weight the batch loss by batch size so a smaller final batch
            # does not skew the dataset-level average.
            loss_total += outputs['loss'].item() * len(labels)
            predictions = torch.argmax(outputs['logits'], 1)
            correct_total += (predictions == labels).float().sum().item()
    dataset_size = len(data_loader.dataset)
    return correct_total / dataset_size, loss_total / dataset_size

### Training

In [41]:
def training(model, model_name, optimizer, train_data_loader, valid_data_loader, num_epochs = 10):
    """Fine-tune ``model`` and record training/validation metrics after every epoch.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``'loss'``.
        model_name: Display name of the model. Currently unused; kept so existing
            callers keep working.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_data_loader: Loader of training batches (dicts with 'input_ids',
            'attention_mask' and 'labels').
        valid_data_loader: Loader of validation batches in the same format.
        num_epochs: Number of passes over ``train_data_loader``.

    Returns:
        Tuple ``(model, history)``. ``history`` holds one row per epoch:
        ``[training_accuracy, training_loss, valid_accuracy, valid_loss]``,
        with both accuracies expressed in percent.
    """
    history = []
    for epoch in tqdm(range(num_epochs)):
        # get_accuracy() leaves the model in eval mode, so re-enable training
        # mode at the start of every epoch.
        model.train()
        for batch_idx, batch in enumerate(train_data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask = attention_mask, labels = labels)

            loss = outputs['loss']
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if batch_idx % 200 == 0 :
                print(f'Epoch No. {epoch}/{num_epochs} | Batch No. {batch_idx}/{len(train_data_loader)} | Loss = {loss.item():.5f}')

        # get_accuracy() returns an (accuracy, loss) tuple. Multiplying that tuple
        # by 100 repeats it 100 times and breaks the unpacking, so unpack first
        # and convert only the accuracy to a percentage.
        training_accuracy, training_loss = get_accuracy(model, train_data_loader)
        valid_accuracy, valid_loss = get_accuracy(model, valid_data_loader)
        training_accuracy *= 100
        valid_accuracy *= 100

        print(f'Training Accuracy = {training_accuracy:.2f}%, Loss = {training_loss:.4f}')
        print(f'Valid Accuracy = {valid_accuracy:.2f}%, Loss = {valid_loss:.4f}')
        history.append([training_accuracy, training_loss, valid_accuracy, valid_loss])
    return model, history

### Post Training

In [46]:
def post_training(trained_model, test_loader, history, model_name):
    """Score the model on the test loader and plot the per-epoch training curves.

    Args:
        trained_model: Model to evaluate with ``get_accuracy``.
        test_loader: Loader of test batches.
        history: Per-epoch rows of
            ``[training_accuracy, training_loss, valid_accuracy, valid_loss]``.
        model_name: Name shown in the figure title.

    Returns:
        Tuple ``(test_loss, test_acc)``. Note the order: loss first. The accuracy
        is passed through unchanged from ``get_accuracy``.
    """
    test_acc, test_loss = get_accuracy(trained_model, test_loader)

    curves = np.array(history)
    epoch_axis = np.arange(1, len(curves) + 1)
    fig, axes = plt.subplots(1, 2, figsize = (12, 4))

    # One entry per subplot: (axis, y-label, (history column, legend label) pairs).
    panels = (
        (axes[0], 'Accuracy', ((0, "Training Accuracy"), (2, "Validation Accuracy"))),
        (axes[1], 'Loss', ((1, "Training Loss"), (3, "Validation Loss"))),
    )
    for ax, y_label, series in panels:
        for column, legend_label in series:
            ax.plot(epoch_axis, curves[:, column], label = legend_label, marker = '.')
        ax.set_xlabel('Epochs')
        ax.set_ylabel(y_label)
        ax.legend()

    fig.suptitle(f'Training for {model_name}')
    plt.show()
    return test_loss, test_acc

### Training using DistilBert Model

In [38]:
# Seed PyTorch's global RNG before anything random happens. The classifier head is
# newly initialised (not loaded from the checkpoint) and the training DataLoader
# shuffles, so without a seed every run starts from a different state.
torch.manual_seed(42)

# Pretrained DistilBERT encoder with a sequence-classification head on top.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
trained_model, history = training(model, 'DistilBERT', optimizer, train_loader, valid_loader, num_epochs = 5)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### Evaluating on the test set

In [None]:
# Plot the training curves and score the model on the held-out test set.
# post_training returns (loss, accuracy), with the accuracy as a fraction in [0, 1].
test_loss, test_acc = post_training(trained_model, test_loader, history, 'DistilBERT')
# Report the test metrics; without this the cell computes them but shows nothing.
print(f'Test Accuracy = {test_acc * 100:.2f}%, Loss = {test_loss:.4f}')