In [3]:
!pip install transformers datasets
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch



In [4]:
dataset = load_dataset("amazon_polarity")

# Use a subset for faster training
train_dataset = dataset["train"].shuffle(seed=42).select(range(10000))
test_dataset = dataset["test"].shuffle(seed=42).select(range(2000))

# 2. Preprocessing
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
def preprocess(data):
    return tokenizer(data['content'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

encoded_train_dataset = train_dataset.rename_column("label", "labels")
encoded_test_dataset = test_dataset.rename_column("label", "labels")

# Set format for PyTorch compatibility
encoded_train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
encoded_test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# 3. Load the pre-trained model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=9e-3,
    per_device_train_batch_size=128,
    num_train_epochs=1,  # Reduced epochs for faster training
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
)

# 5. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_test_dataset,
)



In [7]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mml_monster[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.6926,0.693836


TrainOutput(global_step=79, training_loss=1.3015858795069442, metrics={'train_runtime': 8536.0157, 'train_samples_per_second': 1.172, 'train_steps_per_second': 0.009, 'total_flos': 331168496640000.0, 'train_loss': 1.3015858795069442, 'epoch': 1.0})

In [None]:
# 7. Save the model
model.save_pretrained("./fast_sentiment_model")
tokenizer.save_pretrained("./fast_sentiment_model")

In [None]:
# 8. Inference
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Positive" if prediction == 1 else "Negative"

# Test the model
test_review = "This product is amazing!"
print(predict_sentiment(test_review))