In [1]:
!pip install -U transformers



## Local Inference on GPU
Model page: https://huggingface.co/distilbert/distilbert-base-uncased

⚠️ If the generated code snippets do not work, please open an issue on either the [model repo](https://huggingface.co/distilbert/distilbert-base-uncased)
			and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("fill-mask", model="distilbert/distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [3]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-uncased")

## Remote Inference via Inference Providers
Ensure you have a valid **HF_TOKEN** set in your environment. You can get your token from [your settings page](https://huggingface.co/settings/tokens). Note: running this may incur charges above the free tier.
The following Python example shows how to run the model remotely on HF Inference Providers, automatically selecting an available inference provider for you.
For more information on how to use the Inference Providers, please refer to our [documentation and guides](https://huggingface.co/docs/inference-providers/en/index).

In [4]:
import os
os.environ['HF_TOKEN'] = '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'

In [5]:
import os
from huggingface_hub import InferenceClient

client = InferenceClient(
    provider="auto",
    api_key=os.environ["HF_TOKEN"],
)

result = client.fill_mask(
    "The answer to the universe is [MASK].",
    model="distilbert/distilbert-base-uncased",
)

In [6]:
!pip install datasets transformers scikit-learn -q


In [7]:
import pandas as pd
from datasets import Dataset, DatasetDict

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)


In [8]:
train_df = pd.read_csv("both_train.csv")
val_df   = pd.read_csv("both_val.csv")
test_df  = pd.read_csv("both_test.csv")

TITLE_COL = "title"
BODY_COL  = "post"
LABEL_COL = "class_id"

# Keep only text + label, drop rows with missing values
train_df = train_df[[TITLE_COL, BODY_COL, LABEL_COL]].dropna()
val_df   = val_df[[TITLE_COL, BODY_COL, LABEL_COL]].dropna()
test_df  = test_df[[TITLE_COL, BODY_COL, LABEL_COL]].dropna()

#combining title and post into one string
train_df["text"] = train_df[TITLE_COL].fillna("") + " " + train_df[BODY_COL].fillna("")
val_df["text"]   = val_df[TITLE_COL].fillna("") + " " + val_df[BODY_COL].fillna("")
test_df["text"]  = test_df[TITLE_COL].fillna("") + " " + test_df[BODY_COL].fillna("")

train_df["labels"] = train_df[LABEL_COL].astype(int)
val_df["labels"]   = val_df[LABEL_COL].astype(int)
test_df["labels"]  = test_df[LABEL_COL].astype(int)

num_labels = train_df["labels"].nunique()
print("num_labels:", num_labels)


num_labels: 6


In [9]:
train_dataset = Dataset.from_pandas(train_df[["text", "labels"]])
val_dataset   = Dataset.from_pandas(val_df[["text", "labels"]])
test_dataset  = Dataset.from_pandas(test_df[["text", "labels"]])

datasets = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

In [10]:
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def preprocess_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,   # you can increase to 256 if you want
    )

tokenized_datasets = datasets.map(preprocess_function, batched=True)


Map:   0%|          | 0/13727 [00:00<?, ? examples/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

Map:   0%|          | 0/1488 [00:00<?, ? examples/s]

In [12]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels,
        preds,
        average="macro"   # or "weighted" if your classes are imbalanced
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "f1": f1,
        "recall": recall,
        "precision": precision,
    }


In [14]:
training_args = TrainingArguments(
    output_dir="./distilbert_reddit_6class_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [15]:
# Train
trainer.train()

# Validation metrics
val_metrics = trainer.evaluate(tokenized_datasets["validation"])
print("Validation metrics:", val_metrics)

# Test metrics
test_metrics = trainer.evaluate(tokenized_datasets["test"])
print("Test metrics:", test_metrics)


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


[34m[1mwandb[0m: Detected [huggingface_hub.inference] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall,Precision
1,0.8852,0.484541,0.834677,0.835645,0.834677,0.84168
2,0.4274,0.449026,0.844758,0.845766,0.844758,0.848308
3,0.3001,0.471794,0.850806,0.851947,0.850806,0.856254


Validation metrics: {'eval_loss': 0.47179409861564636, 'eval_accuracy': 0.8508064516129032, 'eval_f1': 0.8519473341881333, 'eval_recall': 0.8508064516129034, 'eval_precision': 0.8562544432841587, 'eval_runtime': 4.8025, 'eval_samples_per_second': 309.837, 'eval_steps_per_second': 9.787, 'epoch': 3.0}
Test metrics: {'eval_loss': 0.4712755084037781, 'eval_accuracy': 0.8407258064516129, 'eval_f1': 0.8409323801245906, 'eval_recall': 0.8407258064516129, 'eval_precision': 0.8437203214806978, 'eval_runtime': 4.7946, 'eval_samples_per_second': 310.346, 'eval_steps_per_second': 9.803, 'epoch': 3.0}
