# Pipeline for IMDB Reviews Classification Dataset

In [1]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /Users/mbarth/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mbarth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mbarth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
import torch

if torch.cuda.is_available():
    device = torch.device("cuda")  
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else: 
    device = torch.device("cpu")
device

device(type='mps')

In [292]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
import re
from sklearn.model_selection import train_test_split
from datasets import Dataset

I'm using the IMDB Reviews dataset which consists of written reviews for movies along with their sentiment. It is perfectly balanced with 50% of reviews being positive or negative respectively.

In [293]:
from datasets import load_dataset

dataset = load_dataset("imdb")

## Pre-Processing

Exploration

In [302]:
#switch to pandas for pre-processing
df_train = dataset['train'].to_pandas()
df_test = dataset['test'].to_pandas()
#dataset.set_format('pandas')

In [303]:
df_train

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [304]:
df_train['label'].value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

In [305]:
df_test['label'].value_counts()

label
0    12500
1    12500
Name: count, dtype: int64

In [306]:
stopwords = set(stopwords.words('english'))

In [307]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text

In [308]:
df_train['text'] = df_train['text'].apply(denoise_text)
df_test['text'] = df_test['text'].apply(denoise_text)

  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")


In [309]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern,'',text)
    return text

In [310]:
df_train['text'] = df_train['text'].apply(remove_special_characters)
df_test['text'] = df_test['text'].apply(remove_special_characters)

In [311]:
def stem(text):
    stemmer = PorterStemmer()
    text= ' '.join([stemmer.stem(w) for w in text.split()])
    return text

In [312]:
#Stem all words
df_train['text'] = df_train['text'].apply(stem)
df_test['text'] = df_test['text'].apply(stem)

In [313]:
#Setting English stopwords
def remove_stopwords(text):
    tokens = word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_words = [w for w in tokens if w not in stopwords]
    filtered_text = ' '.join(filtered_words)    
    return filtered_text

In [314]:
#Removing the stopwords
df_train['text'] = df_train['text'].apply(remove_stopwords)
df_test['text'] = df_test['text'].apply(remove_stopwords)

In [315]:
dataset['train'] = Dataset.from_pandas(df_train, split='train')

In [316]:
dataset['test'] = Dataset.from_pandas(df_test, split='test')

In [317]:
#switch back for tokenization
dataset.reset_format()

In [318]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [319]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    return tokenizer(examples["text"], padding=True, truncation=True, return_tensors="pt").to(device)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [00:05<00:00, 4688.10 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [00:04<00:00, 5286.35 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:13<00:00, 3781.41 examples/s]


In [321]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [322]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [324]:
small_train_dataset = small_train_dataset.remove_columns(["text"])
small_train_dataset = small_train_dataset.rename_column("label", "labels")
small_train_dataset.set_format("torch")

small_eval_dataset = small_eval_dataset.remove_columns(["text"])
small_eval_dataset = small_eval_dataset.rename_column("label", "labels")
small_eval_dataset.set_format("torch")

In [325]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    small_train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    small_eval_dataset, batch_size=8, collate_fn=data_collator
)

In [326]:
#Using a foundation model based on the BERT architecture
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2).to(device) 

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [327]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)



In [328]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

375


## Training

In [329]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 39/39 [38:09<00:00, 58.71s/it]
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 375/375 [04:03<00:00,  1.56it/s]

In [331]:
torch.save(model.state_dict(), '../data/models/sentiment_bert.pt')

In [None]:
#load again with
#model = DistilBertForSequenceClassification(*args, **kwargs)
#model.load_state_dict(torch.load('../data/models/sentiment_bert.pt'))

## Evaluation

In [276]:
#Using an already pre-trained model checkpoint for sentiment analysis
#model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english', num_labels=2).to(device) 

In [291]:
import evaluate

metrics = []

metrics.append(evaluate.load('accuracy'))
metrics.append(evaluate.load('precision'))
metrics.append(evaluate.load('recall'))
metrics.append(evaluate.load('f1'))

model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    for k in metrics:
        k.add_batch(predictions=predictions, references=batch["labels"])
    
results = []
for k in metrics:
    results.append(k.compute())

print(results)

[{'accuracy': 0.61}, {'precision': 0.9}, {'recall': 0.19148936170212766}, {'f1': 0.3157894736842105}]
