<a href="https://colab.research.google.com/github/matthiaswong/MLA_Project/blob/main/DistilBERT_Optimized_Text_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ðŸš€ DistilBERT Optimized Text Classifier (Reddit Sentiment Dataset)

In [1]:
!nvidia-smi

Wed Nov  5 02:36:17 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   47C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

Library Load

In [8]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils import resample

Pre-processing Data

In [9]:
df = pd.read_csv(r"/content/labeled_data_1k (2).csv") #Read Data (Change as needed)
df = df.dropna(subset=["text"]) #Drop Empty Text
df["combined_text"] = df["title"].fillna('') + " " + df["text"].fillna('') #Combine Title and Text
df["label"] = df["post_sentiment"].astype("category").cat.codes # Sentiment as Categories

# Optional balancing, if there is oversampling of a certain label, resample to get equal representation of each label
max_class_size = df["label"].value_counts().max()
balanced_df = pd.concat([
    resample(sub_df, replace=True, n_samples=max_class_size, random_state=42)
    for _, sub_df in df.groupby("label")
])

Tokenisation

In [10]:
dataset = Dataset.from_pandas(balanced_df[["combined_text", "label"]])
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["combined_text"], truncation=True, padding=True, max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2320 [00:00<?, ? examples/s]

Split

In [11]:
# Split (80/20 split)
split_dataset = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

Model

In [12]:
# Model
num_labels = len(df["label"].unique())
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Metrics

In [13]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds, average="weighted"),
        "recall": recall_score(labels, preds, average="weighted"),
        "f1": f1_score(labels, preds, average="weighted"),
    }

Training

In [17]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./distilbert_colab",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs_colab",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.598333,0.913793,0.924054,0.913793,0.914073
2,No log,0.390393,0.935345,0.936196,0.935345,0.935251
3,No log,0.414947,0.928879,0.929285,0.928879,0.928475
4,No log,0.439528,0.926724,0.927272,0.926724,0.92618
5,0.001700,0.464594,0.926724,0.927019,0.926724,0.926072
6,0.001700,0.457164,0.926724,0.927272,0.926724,0.92618


TrainOutput(global_step=696, training_loss=0.0012259739598302149, metrics={'train_runtime': 319.2887, 'train_samples_per_second': 34.878, 'train_steps_per_second': 2.18, 'total_flos': 737617936711680.0, 'train_loss': 0.0012259739598302149, 'epoch': 6.0})

Evaluation and results

In [15]:
# ----------------------------
# STEP 9: Save model and predictions (Fixed)
# ----------------------------
trainer.save_model("./distilbert_colab_model")
tokenizer.save_pretrained("./distilbert_colab_model")

# Generate predictions
preds = trainer.predict(test_dataset)

# Convert to numpy arrays
y_pred = preds.predictions.argmax(-1)
y_true = preds.label_ids

# Create a DataFrame with text + true/predicted labels
test_texts = [t for t in test_dataset["combined_text"]]
results_df = pd.DataFrame({
    "text": test_texts,
    "true_label": y_true,
    "predicted_label": y_pred
})

# Map back numeric labels to sentiment names
label_map = dict(enumerate(df["post_sentiment"].astype("category").cat.categories))
results_df["true_sentiment"] = results_df["true_label"].map(label_map)
results_df["predicted_sentiment"] = results_df["predicted_label"].map(label_map)

# Save to CSV
results_df.to_csv("distilbert_predictions.csv", index=False)

print("âœ… Model and predictions saved successfully!")


âœ… Model and predictions saved successfully!
