In [1]:
# Install Pytorch & other libraries
%pip install -qqq torch torchvision setuptools scikit-learn

# Install Hugging Face libraries
%pip install  --upgrade datasets -qqq accelerate hf-transfer transformers

In [10]:
from datasets import load_dataset

# Dataset id from huggingface.co/dataset
dataset_id = "burtenshaw/PleIAs_common_corpus_code_classification"

# Load raw dataset
dataset = load_dataset(dataset_id)

In [11]:
print(len(dataset["train"]))
print(dataset["train"][0])

127723
{'text': '/*\n * Copyright (c) 2000 Kungliga Tekniska Högskolan\n * (Royal Institute of Technology, Stockholm, Sweden).\n * All rights reserved.\n *\n * Redistribution and use in source and binary forms, with or without\n * modification, are permitted provided that the following conditions\n * are met:\n *\n * 1. Redistributions of source code must retain the above copyright\n *    notice, this list of conditions and the following disclaimer.\n *\n * 2. Redistributions in binary form must reproduce the above copyright\n *    notice, this list of conditions and the following disclaimer in the\n *    documentation and/or other materials provided with the distribution.\n *\n * 3. Neither the name of the Institute nor the names of its contributors\n *    may be used to endorse or promote products derived from this software\n *    without specific prior written permission.\n *\n * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS\'\' AND\n * ANY EXPRESS OR IMPLIED W

In [12]:
from transformers import AutoTokenizer

# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Tokenize helper function
def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, return_tensors="pt")

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize, batched=True, remove_columns=["text"])

tokenized_dataset["train"].features.keys()
# dict_keys(['labels', 'input_ids', 'attention_mask'])


Map:   0%|          | 0/127723 [00:00<?, ? examples/s]

Map:   0%|          | 0/14192 [00:00<?, ? examples/s]

dict_keys(['labels', 'input_ids', 'attention_mask'])

In [13]:
from transformers import AutoModelForSequenceClassification

# Model id to load the tokenizer
model_id = "answerdotai/ModernBERT-base"

# Prepare model labels - useful for inference
labels = list(set(tokenized_dataset["train"]["labels"]))
num_labels = len(labels)
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = str(i)
    id2label[str(i)] = label

In [14]:
# Download the model from huggingface.co/models
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels, label2id=label2id, id2label=id2label,
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
import numpy as np
from sklearn.metrics import f1_score

# Metric helper method
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    score = f1_score(
            labels, predictions, labels=labels, pos_label=1, average="weighted"
        )
    return {"f1": float(score) if score == 1 else score}


In [20]:
import wandb

# Initialize Weights & Biases for experiment tracking
wandb.init(project="transformer-fine-tuning", name="bert-mrpc-analysis")

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mparvezamm3[0m ([33mparvezamm3-usmr[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [26]:
from huggingface_hub import HfFolder
from transformers import Trainer, TrainingArguments

# # Define training args
# training_args = TrainingArguments(
#     output_dir= "ModernBERT-code-classifier",
#     per_device_train_batch_size=2,
#     per_device_eval_batch_size=2,
#     gradient_accumulation_steps=16,
#     learning_rate=5e-5,
#     num_train_epochs=5,
#     bf16=True, # bfloat16 training
#     optim="adamw_torch_fused", # improved optimizer
#     # logging & evaluation strategies
#     logging_strategy="steps",
#     logging_steps=100,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     metric_for_best_model="f1",
#     # push to hub parameters
#     push_to_hub=True,
#     hub_strategy="every_save",
#     hub_token=HfFolder.get_token(),
#     report_to="wandb"
# )
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=50,
    save_steps=100,
    logging_steps=10,  # Log metrics every 10 steps
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to="wandb",  # Send logs to Weights & Biases
)



# Overfitting

In [28]:
limited_dataset = tokenized_dataset["train"].select(range(100))

# # Create a Trainer instance
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=limited_dataset,
#     eval_dataset=tokenized_dataset["test"],
#     compute_metrics=compute_metrics,
# )
# trainer.train()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=limited_dataset,
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Train and automatically log metrics
trainer.train()

NameError: name 'data_collator' is not defined

In [18]:
# clear memory

import torch
torch.cuda.empty_cache()

del trainer
del model
del limited_dataset

NameError: name 'trainer' is not defined

# Underfitting

In [19]:
# define a low learning rate
training_args.learning_rate = 1e-7

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=limited_dataset,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-68649a34-0a2e7a03666de61f3d67612f;ffcc82f2-f60b-4a60-b529-eb83b7cf6303)

Invalid username or password.

In [None]:
# clear memory

import torch
torch.cuda.empty_cache()

del trainer
del model

# Just right! 🥣

In [None]:
# define a valid learning rate
training_args.learning_rate = 5e-5

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=limited_dataset,
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
)
trainer.train()

# Inference

In [None]:
from transformers import pipeline

# load model from huggingface.co/models using our repository id
classifier = pipeline(
    task="text-classification",
    model="argilla/ModernBERT-domain-classifier",
    device=0,
)

sample = """def add_numbers(a, b):
    return a + b"""

classifier(sample)
