In [None]:
# ========================
# 1. Install dependencies
# ========================
!pip install -q transformers datasets scikit-learn accelerate -U

# ========================
# 2. Import libraries
# ========================
import shutil

import numpy as np
import torch
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline

# ========================
# 3. Load dataset
# ========================
# Load the "imdb" dataset through the Hugging Face `datasets` library. The
# cells below index the result by its "train" and "test" splits.
dataset = load_dataset("imdb")

# ========================
# 4. Tokenization
# ========================
# One checkpoint name is shared by the tokenizer here and the model loaded in
# section 5, so both are built from the same pretrained checkpoint.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize a batch of reviews for the model.

    Args:
        batch: Mapping with a "text" entry holding the raw review strings,
            as supplied by `dataset.map(tokenize, batched=True)`.

    Returns:
        The tokenizer output for those texts, truncated to the model's
        maximum length but not padded.
    """
    # Padding is deliberately deferred to batch time. The Trainer in this
    # notebook is given the tokenizer, and per the Transformers docs it then
    # collates with DataCollatorWithPadding, padding each batch only to its
    # longest sequence. Padding every review to the maximum length here
    # (`padding="max_length"`) spends compute and memory on pad tokens.
    return tokenizer(batch["text"], truncation=True)

# Select the small train/eval subsets BEFORE tokenizing, so only the
# 2 x 1000 reviews that are actually used get tokenized (previously every split
# of the dataset was tokenized first and almost all of that work was discarded).
# The shuffle seed and subset size are unchanged, so the selection is expected
# to be the same rows as before.
# Note: `dataset` itself is left as the raw, untokenized DatasetDict.
small_train_dataset = dataset["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = dataset["test"].shuffle(seed=42).select(range(1000))


def _to_model_inputs(split):
    """Tokenize one split and expose only the model input columns as torch tensors."""
    split = split.map(tokenize, batched=True)
    split = split.rename_column("label", "labels")
    split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return split


small_train_dataset = _to_model_inputs(small_train_dataset)
small_eval_dataset = _to_model_inputs(small_eval_dataset)


# ========================
# 5. Load model
# ========================
# Human-readable class names are stored in the model config, so predictions are
# reported with these names rather than generic label ids.
label_map = {0: "NEGATIVE", 1: "POSITIVE"}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    id2label=label_map,
    # Inverse lookup: class name -> class id.
    label2id={name: idx for idx, name in label_map.items()},
)

# ========================
# 6. Metrics function
# ========================
def compute_metrics(pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the reference labels).

    Returns:
        dict with "accuracy" and "f1", both computed with scikit-learn's
        default settings.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score in each row.
    y_pred = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
    }

# ========================
# 7. Training arguments
# ========================
# Mixed precision is intentionally not enabled. On a GPU it can be turned on by
# passing bf16=True when the device's compute capability major version is >= 8,
# or fp16=True on older GPUs.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Cadence: log every 50 steps; evaluate and checkpoint once per epoch.
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Optimisation hyperparameters.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# ========================
# 8. Trainer
# ========================
# The tokenizer is passed as `processing_class`: the older `tokenizer=` keyword
# is deprecated in Transformers, and this notebook installs the latest release.
# Handing the tokenizer to the Trainer also lets it pad batches and save the
# tokenizer alongside model checkpoints.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,  # 1,000-example training subset
    eval_dataset=small_eval_dataset,    # 1,000-example evaluation subset
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# ========================
# 9. Train model
# ========================
# Fine-tune with the settings in `training_args`; this updates `model` in place.
trainer.train()

# ========================
# 10. Evaluate
# ========================
# Evaluate on the eval dataset given to the Trainer. The returned dict holds
# the loss, the values from `compute_metrics`, and runtime statistics.
metrics = trainer.evaluate()
print(metrics)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6501,0.43818,0.836,0.843511
2,0.3869,0.35519,0.85,0.854651


{'eval_loss': 0.35518962144851685, 'eval_accuracy': 0.85, 'eval_f1': 0.8546511627906976, 'eval_runtime': 14.317, 'eval_samples_per_second': 69.847, 'eval_steps_per_second': 4.4, 'epoch': 2.0}


In [None]:
# ========================
# 11. Test on new text
# ========================
# Wrap the fine-tuned model and its tokenizer in an inference pipeline.
# No manual label remapping is needed: the model was loaded with an id2label
# mapping, so results already carry the NEGATIVE / POSITIVE names.
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

text1 = "The product quality is amazing and delivery was fast!"
text2 = "Worst experience ever, I want a refund."

# For a single string the pipeline returns a one-element list; keep the dict.
result1, result2 = [classifier(text)[0] for text in (text1, text2)]
print(result1, result2, sep="\n")

Device set to use cuda:0


{'label': 'POSITIVE', 'score': 0.7757363319396973}
{'label': 'NEGATIVE', 'score': 0.7918445467948914}


In [None]:
# ========================
# 12. Save model for GitHub
# ========================
# Write the model weights/config and the tokenizer files to one directory.
save_dir = "sentiment_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

# Build `sentiment_model.zip` (e.g. to download from Colab). The archive is
# rewritten from scratch on every run, so re-running the cell cannot leave
# stale entries behind the way updating an existing zip with `zip -r` can, and
# no external `zip` binary is required. Entries keep the `sentiment_model/`
# prefix, matching the previous archive layout.
archive_path = shutil.make_archive(save_dir, "zip", root_dir=".", base_dir=save_dir)
print(archive_path)

updating: sentiment_model/ (stored 0%)
updating: sentiment_model/vocab.txt (deflated 53%)
updating: sentiment_model/tokenizer_config.json (deflated 75%)
updating: sentiment_model/config.json (deflated 46%)
updating: sentiment_model/special_tokens_map.json (deflated 42%)
updating: sentiment_model/tokenizer.json (deflated 71%)
updating: sentiment_model/model.safetensors (deflated 8%)


First, install the `huggingface_hub` library.

In [None]:
!pip install huggingface_hub -q

Next, log in to your Hugging Face account. You will be prompted to enter your token.

In [None]:
from huggingface_hub import notebook_login

# Shows an interactive login widget. The access token is typed into the widget,
# so it never has to appear in the notebook source.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Finally, push your model and tokenizer to the Hub. Replace the repository ID used in the cell below (`"mhemon/seintiment-distilbert-base-uncased"`) with your own, in the form `"your-username/your-model-name"`.

In [None]:
# Single source of truth for the target repository, so the model and the
# tokenizer are always pushed to the same place.
# NOTE(review): the repo name is spelled "seintiment"; it is left unchanged
# because the inference cell further down loads this exact ID.
repo_id = "mhemon/seintiment-distilbert-base-uncased"
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  /tmp/tmptmtcy60o/model.safetensors    :   2%|2         | 5.67MB /  268MB            

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/mhemon/seintiment-distilbert-base-uncased/commit/b2d43a5efd05e3a5a8092ee2395de7fc7caafc0f', commit_message='Upload tokenizer', commit_description='', oid='b2d43a5efd05e3a5a8092ee2395de7fc7caafc0f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mhemon/seintiment-distilbert-base-uncased', endpoint='https://huggingface.co', repo_type='model', repo_id='mhemon/seintiment-distilbert-base-uncased'), pr_revision=None, pr_num=None)

In [None]:
import torch
from transformers import pipeline

# Load the published model back from the Hub as a sanity check.
# The device is chosen at runtime so this cell also works on CPU-only machines
# (a hard-coded "cuda" raises when no GPU is available).
# NOTE(review): this assignment rebinds the name `pipeline` from the
# transformers factory function to the pipeline object, so any earlier cell
# that calls `pipeline(task, ...)` fails if re-run afterwards. The name is kept
# only because the next cell calls `pipeline(text)`; consider renaming it in
# both cells together.
pipeline = pipeline(
    "text-classification",
    model="mhemon/seintiment-distilbert-base-uncased",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

config.json:   0%|          | 0.00/689 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Device set to use cuda


In [None]:
# Classify one sentence with the Hub-loaded model. `pipeline` here is the
# pipeline object created in the previous cell (which rebound the name), not
# the transformers factory function.
pipeline("The product quality is amazing and delivery was fast!")

[{'label': 'POSITIVE', 'score': 0.7757363319396973}]