# Imports

In [1]:
import pandas as pd

In [2]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
pip install --upgrade accelerate

Note: you may need to restart the kernel to use updated packages.


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
import torch
print(torch.__version__)

2.6.0


In [6]:
torch.version.cuda

'12.6'

In [7]:
if torch.cuda.is_available():
    print("CUDA is available. Training on GPU.")
    device = torch.device("cuda")
else:
    print("CUDA is not available. Training on CPU.")
    device = torch.device("cpu")


CUDA is available. Training on GPU.


In [8]:
data = pd.read_feather("../data/movie_reviews_4k.feather")

In [9]:
data.shape

(4000, 2)

In [10]:
data

Unnamed: 0,text,label
0,I wanted to vote zero or lower. I loved the co...,0
1,"Karen(Bobbie Phillips)mentions, after one of h...",0
2,This review applies for the cut of the film th...,0
3,"The best film on the battle of San Antonio, Te...",1
4,"In theory, 'Director's Commentary' should have...",0
...,...,...
3995,Excellent show. Instead of watching the same o...,1
3996,"It's hard to believe an ""action"" packed Jet Li...",0
3997,Me and my girlfriend went to see this movie as...,0
3998,This movie is my all time favorite!!! You real...,1


## Load Tokenizer

In [11]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [12]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

## Train test split

In [13]:
texts = data['text'].tolist()
labels = data['label'].tolist()

encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)

df_encodings = pd.DataFrame({'input_ids': encodings['input_ids'], 
                             'attention_mask': encodings['attention_mask'], 
                             'labels': labels})

In [14]:
train_df, val_df = train_test_split(df_encodings, test_size=0.2, random_state=42)

In [15]:
train_dataset = SentimentDataset({'input_ids': train_df['input_ids'].tolist(), 
                                  'attention_mask': train_df['attention_mask'].tolist()}, 
                                 train_df['labels'].tolist())

val_dataset = SentimentDataset({'input_ids': val_df['input_ids'].tolist(), 
                                'attention_mask': val_df['attention_mask'].tolist()}, 
                               val_df['labels'].tolist())

## Load Pretrained model

In [16]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Setup training Hyperparams

In [17]:
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=3,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=50,                
    weight_decay=0.01,               
    logging_dir='./logs',            
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

## Train and evaluate

In [None]:
%%time
trainer.train()

Step,Training Loss
10,0.0751
20,0.1012
30,0.1591
40,0.1464
50,0.2054
60,0.0533
70,0.1713
80,0.1689
90,0.1851
100,0.0886


CPU times: total: 36.6 s
Wall time: 39 s


In [19]:
trainer.evaluate()

{'eval_loss': 0.5596940517425537,
 'eval_runtime': 0.8763,
 'eval_samples_per_second': 912.914,
 'eval_steps_per_second': 14.835,
 'epoch': 3.0}

In [20]:
model.save_pretrained("sentiment_classification_DistillBert")

## Make classifications

In [60]:
def classify_sentences(model, tokenizer, sentences, device):
    # Tokenize the input
    encoded_input = tokenizer(
        sentences,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128
    )
    model.eval() 
    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in encoded_input.items()}
    
    # (Optional) Ensure model is on the correct device as well
    # model.to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        
    # Move predictions back to CPU if you want to convert them to NumPy
    return torch.argmax(predictions, dim=-1).cpu().numpy()




In [63]:
model.eval() 

pred_sentences  =[ "I absolutely hate this movie, total dissaster", 
                  "Most beaytiful movie ever, I watched it 10 times, very good",
                  "Reasonably good movie"]

y_pred = classify_sentences(model, tokenizer, pred_sentences, device )

In [64]:
y_pred

array([0, 1, 1])

In [65]:
import numpy as np

In [None]:
y_pred

array([0, 1, 1])

In [68]:
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
val_data.reset_index(inplace=True, drop=True)

In [69]:
y_pred = classify_sentences(model, tokenizer, val_data.text.tolist(), device)

In [70]:
val_data["label_pred"] = y_pred

In [71]:
val_data["correct_prediction"] = val_data['label'] == val_data["label_pred"]

In [72]:
val_data["correct_prediction"].mean()

np.float64(0.8475)