# Import necessary libraries

In [1]:
!pip install transformers



In [76]:
import os

import kagglehub
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

from transformers import AutoModelForSequenceClassification, AutoTokenizer

import torch
from torch import cuda
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


# Download model and tokenizer

# DistilBERT

In [3]:
# Hugging Face Hub checkpoint name (DistilBERT, uncased); the model and its tokenizer
# are both loaded from this identifier.
MODEL_CHECKPOINT = "distilbert-base-uncased"

In [4]:
# Load DistilBERT with a 2-class sequence-classification head, plus the matching tokenizer.
# The classifier layers are not part of the checkpoint and start out freshly initialised,
# so the model has to be fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if cuda.is_available() else "cpu"
device

'cuda'

In [6]:
# Move the model's parameters to the selected device (the cell output is the model's repr)
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# Download Dataset

In [7]:
# Fetch the latest version of the IMDB 50k movie-review dataset through kagglehub.
# `path` is the local directory that contains the downloaded files.
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'imdb-dataset-of-50k-movie-reviews' dataset.
Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [8]:
# List the files inside the downloaded dataset directory
os.listdir(path)

['IMDB Dataset.csv']

In [9]:
# Build the full path to the dataset's CSV file and load it into a DataFrame
main_path = os.path.join(path, "IMDB Dataset.csv")
df = pd.read_csv(main_path)

In [10]:
# Preview 10 randomly drawn rows (no random_state is set, so the rows differ between runs)
df.sample(10)

Unnamed: 0,review,sentiment
7782,After watching John preform this one of a kind...,positive
49344,"I liked the first movie, but this is a textboo...",negative
30234,It was dumb. Sort of like an Adam Sandler movi...,negative
33574,A crackling and magnificent thriller about a c...,positive
46600,"this movie is a very relaxed, romantic-comedy,...",positive
40174,The first episode of 'Man to Man with Dean Lea...,positive
12058,I seriously don´t know why this movie got such...,positive
3645,"This version is pretty insipid, I'm afraid. Ja...",negative
48142,"Though I saw this movie dubbed in French, so I...",positive
5634,I am currently on vacation in Israel for summe...,positive


# Preprocess Dataset

In [11]:
# Encode the sentiment strings as integer class ids: "positive" -> 1, "negative" -> 0
df["labels"] = df["sentiment"].map({"positive": 1, "negative": 0})

# `Series.map` yields NaN for any value missing from the mapping; fail fast here instead of
# letting invalid labels reach the training loop, then pin an integer dtype.
assert df["labels"].notna().all(), "found sentiment values other than 'positive'/'negative'"
df["labels"] = df["labels"].astype("int64")

In [12]:
# Display the frame (pandas truncates the rendering) to confirm the new `labels` column
df

Unnamed: 0,review,sentiment,labels
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1
...,...,...,...
49995,I thought this movie did a down right good job...,positive,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,0
49997,I am a Catholic taught in parochial elementary...,negative,0
49998,I'm going to have to disagree with the previou...,negative,0


In [13]:
# Sanity-check the tokenizer on the first review: pad/truncate to 256 tokens and add special tokens.
# NOTE(review): this preview uses max_length=256 and return_token_type_ids=True, whereas
# IMDBDataset is later built with max_length=512 and does not request token type ids.
tokenizer(
    df["review"][0],
    max_length=256,
    padding="max_length",
    truncation=True,
    return_token_type_ids=True,
    add_special_tokens=True
)

{'input_ids': [101, 2028, 1997, 1996, 2060, 15814, 2038, 3855, 2008, 2044, 3666, 2074, 1015, 11472, 2792, 2017, 1005, 2222, 2022, 13322, 1012, 2027, 2024, 2157, 1010, 2004, 2023, 2003, 3599, 2054, 3047, 2007, 2033, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 1996, 2034, 2518, 2008, 4930, 2033, 2055, 11472, 2001, 2049, 24083, 1998, 4895, 10258, 2378, 8450, 5019, 1997, 4808, 1010, 2029, 2275, 1999, 2157, 2013, 1996, 2773, 2175, 1012, 3404, 2033, 1010, 2023, 2003, 2025, 1037, 2265, 2005, 1996, 8143, 18627, 2030, 5199, 3593, 1012, 2023, 2265, 8005, 2053, 17957, 2007, 12362, 2000, 5850, 1010, 3348, 2030, 4808, 1012, 2049, 2003, 13076, 1010, 1999, 1996, 4438, 2224, 1997, 1996, 2773, 1012, 1026, 7987, 1013, 1028, 1026, 7987, 1013, 1028, 2009, 2003, 2170, 11472, 2004, 2008, 2003, 1996, 8367, 2445, 2000, 1996, 17411, 4555, 3036, 2110, 7279, 4221, 12380, 2854, 1012, 2009, 7679, 3701, 2006, 14110, 2103, 1010, 2019, 6388, 2930, 1997, 1996, 3827, 2073, 2035, 1996, 4442, 2031, 3221, 21430,

In [14]:
# dataset class
class IMDBDataset(Dataset):
  """Map-style dataset that tokenizes one review per item.

  Args:
    tokenizer: callable that accepts a string plus the keyword arguments `max_length`,
      `padding`, `truncation` and `add_special_tokens`, and returns a mapping with
      "input_ids" and "attention_mask" sequences.
    dataframe: pandas DataFrame with a "review" (text) column and a "labels" (integer) column.
    max_length: every review is padded or truncated to exactly this many tokens.
  """

  def __init__(self, tokenizer, dataframe, max_length):
    self.tokenizer = tokenizer
    self.dataframe = dataframe
    self.max_length = max_length

  def __len__(self):
    """Return the number of rows (samples) in the wrapped DataFrame."""
    return len(self.dataframe)

  def __getitem__(self, index):
    """Return the tokenized sample at position `index`.

    Returns:
      dict with long tensors "ids" and "attention_mask" (length `max_length`) and a
      scalar long tensor "labels".
    """
    # Positional lookup (.iloc) so the dataset works whatever the frame's index labels are.
    text = self.dataframe["review"].iloc[index]
    text = text.lower()

    # Tokenize with the tokenizer handed to the constructor, not a global of the same name.
    inputs = self.tokenizer(
        text,
        max_length=self.max_length,
        padding="max_length",
        truncation=True,
        add_special_tokens=True
    )

    label = int(self.dataframe["labels"].iloc[index])

    # output structure
    output = {
        "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
        "attention_mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
        "labels": torch.tensor(label, dtype=torch.long)
    }

    return output

In [15]:
# Stratified 80/10/10 split: hold out 20% of the rows first, then halve that hold-out
# into validation and test sets. Stratifying on `labels` keeps the class ratio equal in
# every split, and the fixed random_state makes the split repeatable.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["labels"]
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df["labels"]
)

In [16]:
# Renumber each split's rows 0..n-1 (dropping the old index) so that an integer position
# and the row's index label refer to the same sample.
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [17]:
# Report how many samples ended up in each split
print(f"Len of Train data: {len(train_df)}")
print(f"Len of Validation data: {len(val_df)}")
print(f"Len of Test data: {len(test_df)}")

Len of Train data: 40000
Len of Validation data: 5000
Len of Test data: 5000


In [19]:
# Wrap every split in an IMDBDataset; each review is padded/truncated to 512 tokens
train_dataset = IMDBDataset(tokenizer, train_df, max_length=512)
val_dataset = IMDBDataset(tokenizer, val_df, max_length=512)
test_dataset = IMDBDataset(tokenizer, test_df, max_length=512)

In [20]:
# Batch each split 32 samples at a time; only the training loader shuffles its data
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

# Train Model

In [21]:
# Optimizer: Adam updating all model parameters with a learning rate of 3e-5
optimizer = Adam(model.parameters(), lr=3e-5)

In [22]:
# Loss function: cross-entropy between the model's logits and the integer class labels
criterion = nn.CrossEntropyLoss()

In [23]:
# function to calculate accuracy
def accuracy_(outputs, labels):
  """Count the correct predictions in one batch.

  Despite its name, this returns a count, not a ratio; callers divide by the
  number of samples themselves.

  Args:
    outputs: 2-D tensor of per-class scores, one row per sample.
    labels: 1-D tensor holding the true class index of each sample.

  Returns:
    NumPy integer: the number of rows whose highest-scoring class equals the label.
  """
  # Work on detached CPU copies so NumPy can read the values
  scores = outputs.detach().cpu().numpy()
  targets = labels.detach().cpu().numpy()

  # A row is a hit when its arg-max class matches the target
  hits = scores.argmax(axis=1) == targets
  return hits.sum()

In [24]:
# training function
def train(epoch):
  """Run one training pass over `train_loader`, then score the model on `val_loader`.

  Uses the notebook-level `model`, `optimizer`, `criterion`, `device` and the two
  loaders. Prints one line with the mean per-batch loss and the accuracy for both
  splits; nothing is returned.

  Args:
    epoch: epoch number, used only in the log line.
  """
  # ---- training phase ----
  model.train()
  train_loss_sum = 0
  train_hits = 0

  for batch in train_loader:
    input_ids = batch["ids"].to(device)
    mask = batch["attention_mask"].to(device)
    targets = batch["labels"].to(device)

    # forward pass
    logits = model(input_ids=input_ids, attention_mask=mask).logits

    # clear stale gradients, back-propagate the batch loss, update the weights
    optimizer.zero_grad()
    batch_loss = criterion(logits, targets)
    batch_loss.backward()
    optimizer.step()

    train_loss_sum += batch_loss.item()
    train_hits += accuracy_(outputs=logits, labels=targets)

  # ---- validation phase (eval mode, no gradient tracking) ----
  model.eval()
  val_loss_sum = 0
  val_hits = 0

  with torch.no_grad():
    for batch in val_loader:
      input_ids = batch["ids"].to(device)
      mask = batch["attention_mask"].to(device)
      targets = batch["labels"].to(device)

      logits = model(input_ids=input_ids, attention_mask=mask).logits

      val_hits += accuracy_(logits, targets)
      val_loss_sum += criterion(logits, targets).item()

  # Losses are averaged over batches, accuracies over individual samples
  avg_train_loss = train_loss_sum / len(train_loader)
  avg_val_loss = val_loss_sum / len(val_loader)

  avg_train_accuracy = train_hits / len(train_loader.dataset)
  avg_val_accuracy = val_hits / len(val_loader.dataset)

  # logging
  print(f"Epoch: {epoch}, train_loss: {avg_train_loss}, val_loss: {avg_val_loss}, train_accuracy: {avg_train_accuracy}, val_accuracy: {avg_val_accuracy}")

In [25]:
# Fine-tune for 5 epochs; each call trains over the training set once and then
# prints the training and validation metrics for that epoch.
for epoch in range(5):
  train(epoch)

Epoch: 0, train_loss: 0.2627703068673611, val_loss: 0.2232728799816909, train_accuracy: 0.890825, val_accuracy: 0.9058
Epoch: 1, train_loss: 0.15353035072162746, val_loss: 0.2128683170363022, train_accuracy: 0.94245, val_accuracy: 0.9158
Epoch: 2, train_loss: 0.0804797277382575, val_loss: 0.27262398739386895, train_accuracy: 0.9726, val_accuracy: 0.9186
Epoch: 3, train_loss: 0.04368640862959437, val_loss: 0.2858190594721514, train_accuracy: 0.98595, val_accuracy: 0.9148
Epoch: 4, train_loss: 0.03102095611700788, val_loss: 0.31928907923109734, train_accuracy: 0.99, val_accuracy: 0.9154


In [26]:
# Evaluate the fine-tuned model on the held-out test split
model.eval()  # switch the model to evaluation mode

total_test_correct = 0  # running count of correctly classified test samples

with torch.no_grad():  # gradients are not needed for evaluation
  for data in test_loader:
    ids = data['ids'].to(device)
    attention_mask = data["attention_mask"].to(device)
    labels = data['labels'].to(device)

    outputs = model(input_ids=ids, attention_mask=attention_mask)

    # accuracy_ returns the number of correct predictions in the batch
    total_test_correct += accuracy_(outputs.logits, labels)
  # Accuracy = correct predictions divided by the number of test samples
  avg_test_accuracy = total_test_correct / len(test_loader.dataset)

  print(f"test_accuracy: {avg_test_accuracy}")

test_accuracy: 0.9198


# Testing model

In [66]:
# function to classify text: positive or negative
def classify_text(text):
  """Classify a single piece of text as "POSITIVE" or "NEGATIVE".

  Uses the notebook-level `tokenizer`, `model` and `device`.

  Args:
    text: the raw review string to classify.

  Returns:
    "POSITIVE" when the model's highest-scoring class is 1, otherwise "NEGATIVE".
  """
  # A single sequence needs no padding, so the model only processes the real tokens;
  # truncation still caps the input at 512 tokens. return_tensors="pt" yields tensors
  # that already carry the batch dimension.
  inputs = tokenizer(
      text,
      add_special_tokens=True,
      truncation=True,
      max_length=512,
      return_tensors="pt"
  )
  ids = inputs["input_ids"].to(device)
  attention_mask = inputs["attention_mask"].to(device)

  model.eval()
  with torch.no_grad():
    # here we get a result
    logits = model(input_ids=ids, attention_mask=attention_mask).logits

  # Softmax is monotonic, so the arg-max of the raw logits selects the same class.
  predicted_class = torch.argmax(logits, dim=1).item()

  return "POSITIVE" if predicted_class == 1 else "NEGATIVE"

In [67]:
# Smoke test: a clearly negative review is expected to come back as 'NEGATIVE'
classify_text("I hate this movie. I was very bad. I do not watch this sheet again")

'NEGATIVE'

In [68]:
# Smoke test: a clearly positive review is expected to come back as 'POSITIVE'
classify_text("This movie was very good!")

'POSITIVE'

# Save model

In [70]:
from google.colab import files

In [74]:
# Save only the fine-tuned weights (the state_dict), not the model object itself
torch.save(model.state_dict(), "distilbert_.pth")

In [75]:
# Hand the saved weights file to google.colab's download helper
files.download("distilbert_.pth")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>