In [1]:
#!pip3 install -q --upgrade datasets huggingface_hub fsspec
#!pip3 install -U -q "transformers>=4.41.0" "datasets>=2.19.0" "evaluate" "accelerate"
import importlib, os, sys

In [2]:
import transformers, datasets, torch, accelerate, evaluate

In [3]:
from datasets import load_from_disk
ds = load_from_disk("../datasets/20_newsgroups_ds")
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text', 'text_clean', 'text_lemm', 'tokens', 'tokens_nostop'],
        num_rows: 11314
    })
    test: Dataset({
        features: ['text', 'label', 'label_text', 'text_clean', 'text_lemm', 'tokens', 'tokens_nostop'],
        num_rows: 7532
    })
})

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = ds.map(tokenize_function, batched=True)

small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(200))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(200))

# Setup evaluation
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Load pretrained model and evaluate model after each epoch
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased" , num_labels=20)
# training_args = TrainingArguments(output_dir="test_trainer")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
from transformers import TrainingArguments, Trainer
training_args = TrainingArguments(
    output_dir='model',
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()



Step,Training Loss


TrainOutput(global_step=50, training_loss=2.8845989990234373, metrics={'train_runtime': 118.7355, 'train_samples_per_second': 1.684, 'train_steps_per_second': 0.421, 'total_flos': 26501984256000.0, 'train_loss': 2.8845989990234373, 'epoch': 1.0})

In [6]:
eval_results = trainer.evaluate()
print(eval_results)



{'eval_loss': 2.969437837600708, 'eval_accuracy': 0.155, 'eval_runtime': 21.3556, 'eval_samples_per_second': 9.365, 'eval_steps_per_second': 4.683, 'epoch': 1.0}


In [11]:
import optuna  # or use Optuna or SigOpt instead

def model_init():
    return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=20)


def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 4),
    }


trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

best_run = trainer.hyperparameter_search(
    direction="maximize",
    hp_space=hp_space,
    n_trials=10
)

print(best_run)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-07-05 02:28:59,530] A new study created in memory with name: no-name-1642d884-6879-4969-b92d-81ab4057c362
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.155
eval/loss,2.96944
eval/runtime,21.3556
eval/samples_per_second,9.365
eval/steps_per_second,4.683
total_flos,26501984256000.0
train/epoch,1.0
train/global_step,50.0
train_loss,2.99562
train_runtime,183.8856




Step,Training Loss




[I 2025-07-05 02:42:48,838] Trial 0 finished with value: 0.22 and parameters: {'learning_rate': 2.6405523746987287e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4}. Best is trial 0 with value: 0.22.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.22
eval/loss,2.86542
eval/runtime,20.1387
eval/samples_per_second,9.931
eval/steps_per_second,4.966
total_flos,106007937024000.0
train/epoch,4.0
train/global_step,52.0
train_loss,2.88543
train_runtime,807.1637




Step,Training Loss


[I 2025-07-05 02:54:27,245] Trial 1 finished with value: 0.085 and parameters: {'learning_rate': 1.0140929546661396e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 0 with value: 0.22.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.085
eval/loss,2.98614
eval/runtime,18.702
eval/samples_per_second,10.694
eval/steps_per_second,5.347
total_flos,106007937024000.0
train/epoch,4.0
train/global_step,28.0
train_loss,2.97793
train_runtime,678.4782




Step,Training Loss


[I 2025-07-05 03:02:20,613] Trial 2 finished with value: 0.09 and parameters: {'learning_rate': 2.1355469359003155e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 0 with value: 0.22.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.09
eval/loss,2.97338
eval/runtime,17.6142
eval/samples_per_second,11.354
eval/steps_per_second,5.677
total_flos,79505952768000.0
train/epoch,3.0
train/global_step,21.0
train_loss,2.96823
train_runtime,454.6598




Step,Training Loss


[I 2025-07-05 03:10:22,030] Trial 3 finished with value: 0.085 and parameters: {'learning_rate': 1.690448328323836e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 0 with value: 0.22.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.085
eval/loss,2.98088
eval/runtime,17.6073
eval/samples_per_second,11.359
eval/steps_per_second,5.679
total_flos,79505952768000.0
train/epoch,3.0
train/global_step,21.0
train_loss,2.97513
train_runtime,462.8004




Step,Training Loss


[I 2025-07-05 03:14:21,633] Trial 4 finished with value: 0.075 and parameters: {'learning_rate': 1.6174661099003372e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 0 with value: 0.22.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.075
eval/loss,2.98358
eval/runtime,17.7878
eval/samples_per_second,11.244
eval/steps_per_second,5.622
total_flos,53003968512000.0
train/epoch,2.0
train/global_step,26.0
train_loss,2.97977
train_runtime,220.822




Step,Training Loss


[I 2025-07-05 03:22:01,610] Trial 5 finished with value: 0.155 and parameters: {'learning_rate': 2.0559816333636157e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4}. Best is trial 0 with value: 0.22.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.155
eval/loss,2.91257
eval/runtime,17.5467
eval/samples_per_second,11.398
eval/steps_per_second,5.699
total_flos,106007937024000.0
train/epoch,4.0
train/global_step,52.0
train_loss,2.9174
train_runtime,441.4514




Step,Training Loss


[I 2025-07-05 03:32:01,691] Trial 6 finished with value: 0.23 and parameters: {'learning_rate': 4.691500611276419e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 4}. Best is trial 6 with value: 0.23.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.23
eval/loss,2.8363
eval/runtime,17.3552
eval/samples_per_second,11.524
eval/steps_per_second,5.762
total_flos,106007937024000.0
train/epoch,4.0
train/global_step,28.0
train_loss,2.86355
train_runtime,581.6977




Step,Training Loss


[I 2025-07-05 03:39:49,038] Trial 7 finished with value: 0.265 and parameters: {'learning_rate': 4.916213280213728e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4}. Best is trial 7 with value: 0.265.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.265
eval/loss,2.67844
eval/runtime,18.0979
eval/samples_per_second,11.051
eval/steps_per_second,5.525
total_flos,106007937024000.0
train/epoch,4.0
train/global_step,52.0
train_loss,2.75231
train_runtime,448.042




Step,Training Loss


[I 2025-07-05 03:43:51,424] Trial 8 finished with value: 0.08 and parameters: {'learning_rate': 2.184373287357567e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2}. Best is trial 7 with value: 0.265.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/accuracy,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁
train/global_step,▁▁

0,1
eval/accuracy,0.08
eval/loss,2.97633
eval/runtime,17.4047
eval/samples_per_second,11.491
eval/steps_per_second,5.746
total_flos,53003968512000.0
train/epoch,2.0
train/global_step,26.0
train_loss,2.97363
train_runtime,223.9049




Step,Training Loss


[I 2025-07-05 03:51:17,733] Trial 9 finished with value: 0.095 and parameters: {'learning_rate': 1.4394631241931684e-05, 'per_device_train_batch_size': 32, 'num_train_epochs': 3}. Best is trial 7 with value: 0.265.


BestRun(run_id='7', objective=0.265, hyperparameters={'learning_rate': 4.916213280213728e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 4}, run_summary=None)


In [None]:
from huggingface_hub import notebook_login
!huggingface-cli login --token TOKEN
!huggingface-cli login -h
#notebook_login()

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `NLP_Colab` has been saved to /Users/marissa/.cache/huggingface/stored_tokens
Your token has been saved to /Users/marissa/.cache/huggingface/token
Login successful.
The current active token is: `NLP_Colab`
usage: huggingface-cli <command> [<args>] login [-h] [--token TOKEN]
                                                [--add-to-git-credential]

options:
  -h, --help            show this help message and exit
  --token TOKEN         Token generated from
                        https://huggingface.co/settings/tokens
  --add-to-git-credential
                        Optional: Save token to git credential helper.


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
!pip3 install git-lfs

Collecting git-lfs
  Downloading git_lfs-1.6-py2.py3-none-any.whl.metadata (1.2 kB)
Downloading git_lfs-1.6-py2.py3-none-any.whl (5.6 kB)
Installing collected packages: git-lfs
Successfully installed git-lfs-1.6

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m


In [23]:
import evaluate

In [24]:
repo_name = "lighthouse-NLP-assignment"
training_args = TrainingArguments(
   output_dir=repo_name,
   push_to_hub=True,
)

In [33]:
trainer.evaluate()



{'eval_loss': 2.888995885848999,
 'eval_accuracy': 0.26,
 'eval_runtime': 20.2488,
 'eval_samples_per_second': 9.877,
 'eval_steps_per_second': 4.939,
 'epoch': 1.0}

In [34]:
trainer.push_to_hub()

Uploading...:   0%|          | 0.00/8.31G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mfaz3/model/commit/b29db90b03345f8fdf462fdb7e1c32bded04b664', commit_message='End of training', commit_description='', oid='b29db90b03345f8fdf462fdb7e1c32bded04b664', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mfaz3/model', endpoint='https://huggingface.co', repo_type='model', repo_id='mfaz3/model'), pr_revision=None, pr_num=None)