In [1]:
import pandas as pd
import numpy as np
import transformers
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm

import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)


Using device: cuda


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load full dataset
df = pd.read_csv("/kaggle/input/imdb-dataset/IMDB Dataset.csv")

# Convert labels to integers
df['label'] = df['sentiment'].map({'negative': 0, 'positive': 1})

# Check label distribution in full dataset
print(df['label'].value_counts(normalize=True))

# Split entire dataset into train+val and test (stratified)
trainval_df, test_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df['label'],
    random_state=42
)

# From trainval_df, take a small subset for faster training/experiments
subset_frac = 0.1  # e.g., 10%
subset_df = trainval_df.sample(frac=subset_frac, random_state=42)

train_subset, val_subset = train_test_split(
    subset_df,
    test_size=0.1,
    stratify=subset_df['label'],
    random_state=42
)

print(f"Subset train size: {len(train_subset)} | val size: {len(val_subset)}")



label
1    0.5
0    0.5
Name: proportion, dtype: float64
Subset train size: 4050 | val size: 450


In [3]:
def compute_f1(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, preds, average='weighted')
    return {"f1": f1}


In [4]:
def tokenize(batch, tokenizer):
    return tokenizer(batch["review"], padding='max_length', truncation=True, max_length=256)


In [5]:
def prepare_datasets(train_df, val_df, tokenizer):
    train_ds = Dataset.from_pandas(train_df)
    val_ds = Dataset.from_pandas(val_df)

    train_ds = train_ds.map(lambda x: tokenize(x, tokenizer), batched=True)
    val_ds = val_ds.map(lambda x: tokenize(x, tokenizer), batched=True)

    columns = ['input_ids', 'attention_mask', 'label']
    train_ds.set_format(type='torch', columns=columns)
    val_ds.set_format(type='torch', columns=columns)
    return train_ds, val_ds


In [6]:
candidate_models = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base",
    "distilroberta-base",
    "google/electra-base-discriminator"
]


In [7]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

def finetune_and_evaluate(model_name, train_df, val_df):
    print(f"\nTraining {model_name}...")
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    train_ds, val_ds = prepare_datasets(train_df, val_df, tokenizer)
    
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
    
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name.replace("/", "_")}',
        save_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_steps=10,
        load_best_model_at_end=False,
        disable_tqdm=False,
        report_to=[],
        save_total_limit=1,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_f1,
    )
    
    trainer.train()
    
    eval_results = trainer.evaluate()
    print(f"Model {model_name} - F1: {eval_results['eval_f1']:.4f}")
    
    return model_name, eval_results['eval_f1'], tokenizer, model

# Run finetuning on subset for all models
results = []
for model_name in candidate_models:
    res = finetune_and_evaluate(model_name, train_subset, val_subset)
    results.append(res)

# Sort by best F1 score
results = sorted(results, key=lambda x: x[1], reverse=True)
best_model_name, best_f1, best_tokenizer, best_model = results[0]
print(f"\nBest model: {best_model_name} with F1: {best_f1:.4f}")


2025-08-10 08:18:29.778328: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754813909.950869      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754813909.998881      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered



Training distilbert-base-uncased...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4050 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
10,0.6881
20,0.6094
30,0.3736
40,0.478
50,0.3777
60,0.3203
70,0.2897
80,0.3047
90,0.3263
100,0.3579


Model distilbert-base-uncased - F1: 0.8978

Training bert-base-uncased...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4050 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
10,0.6978
20,0.6579
30,0.5085
40,0.3951
50,0.3357
60,0.2804
70,0.2592
80,0.3126
90,0.2465
100,0.3272


Model bert-base-uncased - F1: 0.8978

Training roberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/4050 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
10,0.686
20,0.5565
30,0.3534
40,0.3988
50,0.2971
60,0.2788
70,0.25
80,0.2832
90,0.2268
100,0.2824


Model roberta-base - F1: 0.8933

Training distilroberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/4050 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
10,0.6669
20,0.4778
30,0.4
40,0.3583
50,0.3512
60,0.3261
70,0.289
80,0.3072
90,0.2939
100,0.3029


Model distilroberta-base - F1: 0.8889

Training google/electra-base-discriminator...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/4050 [00:00<?, ? examples/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Step,Training Loss
10,0.6831
20,0.556
30,0.3863
40,0.3456
50,0.319
60,0.2599
70,0.2457
80,0.2297
90,0.2249
100,0.2609


Model google/electra-base-discriminator - F1: 0.9000

Best model: google/electra-base-discriminator with F1: 0.9000


In [8]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

tokenizer = transformers.AutoTokenizer.from_pretrained(best_model_name)
model = AutoModelForSequenceClassification.from_pretrained(best_model_name, num_labels=2).to(device)

# Prepare full train+val datasets with tokenizer
train_full, val_full = train_test_split(trainval_df, test_size=0.1, random_state=42, stratify=trainval_df['label'])
train_ds_full, val_ds_full = prepare_datasets(train_full, val_full, tokenizer)

training_args_full = TrainingArguments(
    output_dir=f'./results/{best_model_name.replace("/", "_")}_full',
    eval_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,
    report_to=[],
)

trainer_full = Trainer(
    model=model,
    args=training_args_full,
    train_dataset=train_ds_full,
    eval_dataset=val_ds_full,
    tokenizer=tokenizer,
    compute_metrics=compute_f1,
)

trainer_full.train()
trainer_full.save_model()


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/40500 [00:00<?, ? examples/s]

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

  trainer_full = Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.1906,0.205134,0.934442
2,0.1034,0.177544,0.943999
3,0.0449,0.241092,0.943556


In [9]:
from transformers import pipeline

clf_pipeline = pipeline(
    "text-classification",
    model=trainer_full.model,
    tokenizer=tokenizer,
    device=0 if device == "cuda" else -1,
    batch_size=32
)

test_samples = test_df.sample(10, random_state=42)
reviews = test_samples["review"].tolist()
true_labels = test_samples["label"].tolist()

preds = clf_pipeline(reviews, truncation=True)

for i, (review, pred, true_label) in enumerate(zip(reviews, preds, true_labels)):
    pred_label = 1 if pred['label'].lower() == 'positive' else 0
    print(f"\nReview #{i+1}:")
    print(review[:300] + ("..." if len(review) > 300 else ""))
    print(f"Predicted: {pred['label']} (score={pred['score']:.4f}), True: {'positive' if true_label == 1 else 'negative'}")


Device set to use cuda:0



Review #1:
A plane carrying a rich scientist's daughter goes down in thick wilderness. He assembles a group to go and find her and the others, but the rescue party soon suspects that something is stalking them. Then ulterior motives for the expedition are revealed and that only adds to the already existing ten...
Predicted: LABEL_0 (score=0.9985), True: negative

Review #2:
If any movie ever made Italians look bad, this is it.<br /><br />Duke Mitchell - what an A--HOLE. Duke Mitchell, I s--t on your grave. Seeing as practically every person gunned down in this film by the cowardly Mimi is either black or of some other racial or ethnic minority, it's hard not to become ...
Predicted: LABEL_0 (score=0.9982), True: negative

Review #3:
I desperately want to give this movie a 10...I really do. Some movies, especially horror movies are so budget that they are good. A wise-cracking ninja scarecrow who can implement corn cobs as lethal weaponry...definitely fits this 'budget to brilliance' s