In [42]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, DistilBertConfig
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset
import evaluate
import numpy as np
import torch

In [43]:
# Load the imdb dataset
dataset = load_dataset('imdb')

# Sample a smaller dataset to reduce training time
small_train = dataset["train"].shuffle(seed=42).select(range(2000))
small_test = dataset["test"].shuffle(seed=42).select(range(1000))

In [44]:
# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Tokenize the data
def tokenize(example):
    return tokenizer(
        example['text'],
        padding="max_length",
        truncation=True,
        max_length=256
    )

tokenized_train = small_train.map(tokenize, batched=True)
tokenized_test = small_test.map(tokenize, batched=True)

#Set format for PyTorch
tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [45]:
# Load the pre-trained DistilBERT model
config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=2, dropout=0.3, attention_dropout=0.3)
model = DistilBertForSequenceClassification(config)

In [46]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

In [47]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.1,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    label_smoothing_factor=0.1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [48]:
# Define a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

  trainer = Trainer(


In [49]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7405,0.705803,0.52
2,0.7107,0.691438,0.52


TrainOutput(global_step=500, training_loss=0.7028825359344483, metrics={'train_runtime': 601.6291, 'train_samples_per_second': 3.324, 'train_steps_per_second': 0.831, 'total_flos': 132467398656000.0, 'train_loss': 0.7028825359344483, 'epoch': 2.0})

In [50]:
trainer.evaluate()

{'eval_loss': 0.7058034539222717,
 'eval_accuracy': 0.52,
 'eval_runtime': 12.3815,
 'eval_samples_per_second': 16.153,
 'eval_steps_per_second': 1.05,
 'epoch': 2.0}

In [51]:
# Evaluate the model
trainer.save_model("./distilbert-sentiment-model")