In [1]:
!pip install -U transformers datasets accelerate -q


In [3]:
!pip uninstall -y transformers
!pip uninstall -y transformers  # run twice just in case
!pip install transformers==4.44.2 datasets accelerate -q  # known stable recent version
import transformers
print(transformers.__version__)
from transformers import TrainingArguments
help(TrainingArguments.__init__)


Found existing installation: transformers 4.56.0
Uninstalling transformers-4.56.0:
  Successfully uninstalled transformers-4.56.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m84.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m111.6 MB/s[0m eta [36m0:00:00[0m
[?25h4.56.0
Help on function __init__ in module transformers.training_args:

    Initialize self.  See help(type(self)) for accurate signature.



In [7]:
!pip install -U transformers datasets accelerate kagglehub -q

import pandas as pd
import numpy as np
import torch
import random
import kagglehub
from huggingface_hub import login

# -------------------
# 0. Login to Hugging Face
# -------------------
# Replace with YOUR token (Read access is enough)
HF_TOKEN = "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"
login(token=HF_TOKEN)

import transformers
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm

# -------------------
# 1. Download Dataset
# -------------------
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")
print("Path to dataset files:", path)

df = pd.read_csv(f"{path}/IMDB Dataset.csv")
df.rename(columns={"sentiment": "label"}, inplace=True)
df["label"] = df["label"].map({"negative": 0, "positive": 1})

# -------------------
# 2. Subset for testing models quickly
# -------------------
df_subset = df.sample(frac=0.1, random_state=42)  # 10% of data
train, val = train_test_split(df_subset, test_size=0.2, random_state=42)

# -------------------
# 3. Models to test
# -------------------
models_to_test = [
    "bert-base-uncased",
    "roberta-base",
    "microsoft/deberta-base",
    "google/electra-base-discriminator",
    "distilbert-base-uncased"
]

# -------------------
# 4. Function to run training
# -------------------
def run_model(model_name):
    print(f"\n🚀 Training model: {model_name}\n")

    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
    model = transformers.AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    def tokenize(batch):
        return tokenizer(batch["review"], padding="max_length", truncation=True)

    train_ds = Dataset.from_pandas(train)
    val_ds = Dataset.from_pandas(val)

    train_ds = train_ds.map(tokenize, batched=True)
    val_ds = val_ds.map(tokenize, batched=True)

    columns = ['input_ids', 'attention_mask', 'label']
    train_ds.set_format(type='torch', columns=columns)
    val_ds.set_format(type='torch', columns=columns)

    def compute_f1(eval_pred):
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=1)
        return {"f1": f1_score(labels, preds)}

    training_args = transformers.TrainingArguments(
        output_dir=f"./results_{model_name.replace('/', '_')}",
        eval_strategy="epoch",   # updated for 4.56.0
        save_strategy="epoch",
        logging_dir="./logs",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        weight_decay=0.01,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1"
    )

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        compute_metrics=compute_f1,
    )

    trainer.train()
    eval_results = trainer.evaluate()
    print(f"✅ {model_name} F1 score: {eval_results['eval_f1']}")
    return eval_results["eval_f1"], model_name

# -------------------
# 5. Train all models & find best
# -------------------
results = []
for m in models_to_test:
    f1, name = run_model(m)
    results.append((name, f1))

results_df = pd.DataFrame(results, columns=["Model", "F1"])
print("\n📊 Results:")
print(results_df)

best_model_name = results_df.loc[results_df["F1"].idxmax(), "Model"]
print(f"\n🏆 Best Model: {best_model_name}")

# -------------------
# 6. Fine-tune best model on full dataset
# -------------------
print("\n🚀 Training best model on full dataset...\n")

train_full, val_full = train_test_split(df, test_size=0.2, random_state=42)

tokenizer = transformers.AutoTokenizer.from_pretrained(best_model_name)
model = transformers.AutoModelForSequenceClassification.from_pretrained(best_model_name, num_labels=2)

def tokenize(batch):
    return tokenizer(batch["review"], padding="max_length", truncation=True)

train_ds = Dataset.from_pandas(train_full)
val_ds = Dataset.from_pandas(val_full)

train_ds = train_ds.map(tokenize, batched=True)
val_ds = val_ds.map(tokenize, batched=True)

columns = ['input_ids', 'attention_mask', 'label']
train_ds.set_format(type='torch', columns=columns)
val_ds.set_format(type='torch', columns=columns)

def compute_f1(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {"f1": f1_score(labels, preds)}

training_args = transformers.TrainingArguments(
    output_dir=f"./final_{best_model_name.replace('/', '_')}",
    eval_strategy="epoch",  # updated
    save_strategy="epoch",
    logging_dir="./logs_final",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_f1,
)

trainer.train()
trainer.save_model("best_model_final")

# -------------------
# 7. Inference on 10 random reviews
# -------------------
from transformers import pipeline

clf_pipeline = pipeline(
    'text-classification',
    model="best_model_final",
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

sample_reviews = val_full.sample(10, random_state=42)["review"].tolist()
predictions = clf_pipeline(sample_reviews)

for review, pred in zip(sample_reviews, predictions):
    print(f"\nReview: {review[:200]}...")
    print(f"Prediction: {pred['label']}, Score: {pred['score']:.4f}")


Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews

🚀 Training model: bert-base-uncased



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = transformers.Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.2861,0.331841,0.906433
2,0.1368,0.35944,0.911111


✅ bert-base-uncased F1 score: 0.9111111111111111

🚀 Training model: roberta-base



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = transformers.Trainer(


Epoch,Training Loss,Validation Loss,F1
1,0.3228,0.308861,0.924855
2,0.1538,0.295307,0.93857


✅ roberta-base F1 score: 0.9385699899295066

🚀 Training model: microsoft/deberta-base



tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = transformers.Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 2, 'bos_token_id': 1}.


OutOfMemoryError: CUDA out of memory. Tried to allocate 376.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 76.12 MiB is free. Process 24758 has 14.66 GiB memory in use. Of the allocated memory 14.18 GiB is allocated by PyTorch, and 365.54 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [6]:
import wandb
wandb.login(key="8496a2b8a623938673a877c9be61359ca6781377")


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mnavya-rajesh2004[0m ([33mnavya-rajesh2004-svkm-s-narsee-monjee-institute-of-manag[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [1]:
!pip install -U transformers datasets accelerate -q
import transformers
print(transformers.__version__)


4.56.0


In [2]:
import transformers
from transformers import TrainingArguments
print(transformers.__version__)
print(TrainingArguments.__module__)


4.56.0
transformers.training_args
