# Fine-tuning Sandbox


In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


## Dataset


In [3]:
# # How dataset was generated
# # Load the imdb dataset
# imdb_dataset = load_dataset("imdb")
# # Subsample size (this is the size of the dataset used for training and testing)
# N = 1000
# # Generate indexes for random subsample (we don't want to use the entire dataset for training and testing, as it would take too long to train and test the model)
# rand_idx = np.random.randint(24999, size=N) # array of N random indexes
# # Extract train and test data
# x_train = imdb_dataset['train'][rand_idx]['text']
# y_train = imdb_dataset['train'][rand_idx]['label']

# x_test = imdb_dataset['test'][rand_idx]['text']
# y_test = imdb_dataset['test'][rand_idx]['label']

# # Create new dataset
# train_dataset = DatasetDict({
# 	'train': Dataset.from_dict({'text': x_train, 'label': y_train}),
# 	'test': Dataset.from_dict({'text': x_test, 'label': y_test})
# })

In [4]:
# Load the dataset
train_dataset = load_dataset("shawhin/imdb-truncated")
train_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [5]:
# Display % of training data with label 1 (positive)
np.array(train_dataset["train"]["label"]).sum() / len(train_dataset["train"]["label"])

0.5

## Model


In [6]:
model_checkpoint = (
    "distilbert-base-uncased"  # We use a smaller model for faster training
)

# Define label maps
id2label = {0: "negative", 1: "positive"}
label2id = {"negative": 0, "positive": 1}

# Generate classification model from model checkpoint
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Display architecture of the model
model.config

DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "negative",
    "1": "positive"
  },
  "initializer_range": 0.02,
  "label2id": {
    "negative": 0,
    "positive": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.38.2",
  "vocab_size": 30522
}

## Preprocessing data


In [8]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint, add_prefix_space=True
)  # add_prefix_space=True is used to add a space before the first token to avoid a warning. The warning is due to the fact that the model was trained with a space before the first token, but the tokenizer does not add it by default.

# Add pad token to the tokenizer if it does not exist. The pad token is used to pad sequences to the same length.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

In [9]:
# Create tokenize function
def tokenize_function(examples):
    # Extract text from examples
    text = examples["text"]

    # Tokenize and truncate text
    tokenizer.truncate_side = "left"  # truncate from the left side because the model is trained to read from left to right
    tokenized_inputs = tokenizer(
        text, return_tensors="np", truncation=True, max_length=512
    )
    return tokenized_inputs

In [10]:
# Tokenize training and validation datasets
tokenized_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [11]:
# Create data collator: This is used to pad sequences to the same length. The model requires sequences to be of the same length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluation


In [12]:
accuracy = evaluate.load("accuracy")

In [13]:
# Define an evaluation function that will be called at the end of each epoch. Epoch is a complete pass through the entire training dataset. For example, if the training dataset has 1000 samples and the batch size is 10, then there are 100 batches in an epoch.


def compute_metrics(p):
    # Extract predictions and labels
    predictions, labels = (
        p  # predictions are the model's output. For sequence classification, the output is the logits (scores) for each class. The class with the highest score is the predicted class. Example: [[-1.2, 1.5], [0.9, -2.1]] means the first sample is predicted as class 1 and the second sample is predicted as class 0. labels are the true labels. Example: [1, 0] means the first sample is class 1 and the second sample is class 0.
    )

    # Convert logits to predicted class
    predictions = np.argmax(
        predictions, axis=1
    )  # convert logits to predicted class. Example: [[-1.2, 1.5], [0.9, -2.1]] becomes [1, 0]. [-1.2, 1.5] and [0.9, -2.1] are the logits for the first and second samples, respectively.

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

## Apply untrained model to text data


In [14]:
# Define list of text examples
text_list = [
    "This is a great movie!",
    "This is a bad movie!",
    "This movie is not good.",
    "This movie is not bad.",
]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # Tokenize text
    inputs = tokenizer.encode(
        text, return_tensors="pt"
    )  # return_tensors="pt" is used to return PyTorch tensors
    # Compute logits (Logits are the scores for each class. The class with the highest score is the predicted class)
    logits = model(inputs).logits
    # Convert logits to label
    predictions = torch.argmax(logits, dim=1)
    # Convert list of tensors to list of integers
    predictions = predictions.tolist()
    for i, prediction in enumerate(predictions):
        print(f"Text: {text_list[i]}")
        print(f"Predicted label: {id2label[prediction]}")
        print(f"Logits: {logits[i]}")
        print()

Untrained model predictions:
----------------------------
Text: This is a great movie!
Predicted label: negative
Logits: tensor([ 0.0271, -0.1207], grad_fn=<SelectBackward0>)

Text: This is a great movie!
Predicted label: negative
Logits: tensor([ 0.0248, -0.1351], grad_fn=<SelectBackward0>)

Text: This is a great movie!
Predicted label: negative
Logits: tensor([ 0.0656, -0.0649], grad_fn=<SelectBackward0>)

Text: This is a great movie!
Predicted label: negative
Logits: tensor([ 0.0603, -0.0611], grad_fn=<SelectBackward0>)



# Train model


In [15]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,  # task type. In this case, it is sequence classification
    r=4,  # number of layers in the PEFT model
    lora_alpha=32,  # alpha parameter for LoRA. LoRA is used to control the flow of information between layers in the PEFT model. Alpha is a hyperparameter that controls the strength of the connections between layers. A higher alpha means stronger connections between layers.
    lora_dropout=0.01,  # dropout rate for LoRA. Dropout is used to prevent overfitting. It randomly sets a fraction of the input units to 0 at each update during training time, which helps prevent overfitting.
    target_modules=[
        "q_lin"
    ],  # The target modules are the modules that are controlled by LoRA. In this case, we are controlling the linear layers (q_lin) in the PEFT model.
)

In [16]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.SEQ_CLS: 'SEQ_CLS'>, inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False)

In [17]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9306847223789819


In [18]:
# Hyperparameters
lr = 1e-3  # This is the learning rate. It controls the step size during training. A higher learning rate means larger steps and faster training, but it can also lead to divergence. A lower learning rate means smaller steps and slower training, but it can also lead to better convergence.
batch_size = 4  # The number of samples in a batch. A batch is used to update the model's weights. A smaller batch size means the model is updated more frequently, but it also means the training process is slower. A larger batch size means the model is updated less frequently, but it also means the training process is faster. What inside batch are model inputs and labels.

num_epochs = 10  # The number of times the entire training dataset is passed through the model. Each pass is called an epoch. A higher number of epochs means the model is trained for a longer time, but it can also lead to overfitting. A lower number of epochs means the model is trained for a shorter time, but it can also lead to underfitting.

In [19]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=model_checkpoint
    + "-lora-text-classification",  # The output directory where the model predictions and checkpoints will be written
    learning_rate=lr,  # The initial learning rate for training the model
    per_device_train_batch_size=batch_size,  # The batch size for training. The number of samples in a batch used to update the model's weights during training.
    per_device_eval_batch_size=batch_size,  # The batch size for evaluation. The number of samples in a batch used to evaluate the model during training.
    num_train_epochs=num_epochs,  # The number of epochs for training the model (the number of times the entire training dataset is passed through the model)
    weight_decay=0.01,  # Weight decay is a regularization technique. It adds a penalty term to the loss function to prevent overfitting. A higher weight decay means a stronger penalty, which means the model is more regularized.
    evaluation_strategy="epoch",  # The evaluation strategy to use at the end of each epoch. In this case, we evaluate the model at the end of each epoch.
    save_strategy="epoch",  # The strategy to save the model. In this case, we save the model at the end of each epoch.
    load_best_model_at_end=True,  # Whether or not to load the best model at the end of training
)

In [20]:
# Create trainer object
trainer = Trainer(
    model=model,  # The model to train
    args=training_args,  # The training arguments
    train_dataset=tokenized_dataset["train"],  # The training dataset
    eval_dataset=tokenized_dataset["validation"],  # The evaluation dataset
    tokenizer=tokenizer,  # The tokenizer used to tokenize the data
    data_collator=data_collator,  # The data collator used to pad sequences to the same length
    compute_metrics=compute_metrics,  # The function used to compute metrics at the end of each epoch
)
# Train the model
trainer.train()

                                                  
 10%|█         | 250/2500 [04:07<14:57,  2.51it/s]

{'eval_loss': 0.55406653881073, 'eval_accuracy': {'accuracy': 0.83}, 'eval_runtime': 102.3622, 'eval_samples_per_second': 9.769, 'eval_steps_per_second': 2.442, 'epoch': 1.0}


 20%|██        | 500/2500 [06:34<15:07,  2.20it/s]   

{'loss': 0.4226, 'grad_norm': 2.205007791519165, 'learning_rate': 0.0008, 'epoch': 2.0}


                                                  
 20%|██        | 500/2500 [06:59<15:07,  2.20it/s]

{'eval_loss': 0.44499048590660095, 'eval_accuracy': {'accuracy': 0.87}, 'eval_runtime': 24.8647, 'eval_samples_per_second': 40.218, 'eval_steps_per_second': 10.054, 'epoch': 2.0}


                                                    
 30%|███       | 750/2500 [09:15<13:23,  2.18it/s]

{'eval_loss': 0.7752700448036194, 'eval_accuracy': {'accuracy': 0.862}, 'eval_runtime': 23.8076, 'eval_samples_per_second': 42.003, 'eval_steps_per_second': 10.501, 'epoch': 3.0}


 40%|████      | 1000/2500 [10:56<06:50,  3.65it/s] 

{'loss': 0.1821, 'grad_norm': 0.060890164226293564, 'learning_rate': 0.0006, 'epoch': 4.0}


                                                   
 40%|████      | 1000/2500 [11:18<06:50,  3.65it/s]

{'eval_loss': 0.6749710440635681, 'eval_accuracy': {'accuracy': 0.894}, 'eval_runtime': 22.1852, 'eval_samples_per_second': 45.075, 'eval_steps_per_second': 11.269, 'epoch': 4.0}


                                                     
 50%|█████     | 1250/2500 [13:02<05:19,  3.91it/s]

{'eval_loss': 0.810627818107605, 'eval_accuracy': {'accuracy': 0.895}, 'eval_runtime': 21.9733, 'eval_samples_per_second': 45.51, 'eval_steps_per_second': 11.377, 'epoch': 5.0}


 60%|██████    | 1500/2500 [14:24<03:20,  4.99it/s]  

{'loss': 0.0431, 'grad_norm': 0.013209199532866478, 'learning_rate': 0.0004, 'epoch': 6.0}


                                                   
 60%|██████    | 1500/2500 [14:49<03:20,  4.99it/s]

{'eval_loss': 0.8653650879859924, 'eval_accuracy': {'accuracy': 0.895}, 'eval_runtime': 24.9744, 'eval_samples_per_second': 40.041, 'eval_steps_per_second': 10.01, 'epoch': 6.0}


                                                     
 70%|███████   | 1750/2500 [16:33<02:56,  4.26it/s]

{'eval_loss': 0.996708869934082, 'eval_accuracy': {'accuracy': 0.886}, 'eval_runtime': 27.1848, 'eval_samples_per_second': 36.785, 'eval_steps_per_second': 9.196, 'epoch': 7.0}


 80%|████████  | 2000/2500 [17:45<01:49,  4.55it/s]  

{'loss': 0.013, 'grad_norm': 0.1288709193468094, 'learning_rate': 0.0002, 'epoch': 8.0}


                                                   
 80%|████████  | 2000/2500 [18:08<01:49,  4.55it/s]

{'eval_loss': 0.9884560108184814, 'eval_accuracy': {'accuracy': 0.89}, 'eval_runtime': 23.4856, 'eval_samples_per_second': 42.579, 'eval_steps_per_second': 10.645, 'epoch': 8.0}


                                                     
 90%|█████████ | 2250/2500 [19:52<01:08,  3.66it/s]

{'eval_loss': 0.9701987504959106, 'eval_accuracy': {'accuracy': 0.887}, 'eval_runtime': 24.0707, 'eval_samples_per_second': 41.544, 'eval_steps_per_second': 10.386, 'epoch': 9.0}


100%|██████████| 2500/2500 [20:57<00:00,  4.20it/s]

{'loss': 0.0102, 'grad_norm': 4.3668656871886924e-05, 'learning_rate': 0.0, 'epoch': 10.0}


                                                   
100%|██████████| 2500/2500 [21:21<00:00,  4.20it/s]

{'eval_loss': 0.9624693989753723, 'eval_accuracy': {'accuracy': 0.892}, 'eval_runtime': 23.2819, 'eval_samples_per_second': 42.952, 'eval_steps_per_second': 10.738, 'epoch': 10.0}


100%|██████████| 2500/2500 [21:21<00:00,  1.95it/s]

{'train_runtime': 1281.6845, 'train_samples_per_second': 7.802, 'train_steps_per_second': 1.951, 'train_loss': 0.13418660144805908, 'epoch': 10.0}





TrainOutput(global_step=2500, training_loss=0.13418660144805908, metrics={'train_runtime': 1281.6845, 'train_samples_per_second': 7.802, 'train_steps_per_second': 1.951, 'train_loss': 0.13418660144805908, 'epoch': 10.0})

## Generate predictions

In [21]:
model.to('mps') # moving to mps for Mac (can alternatively do 'cpu')
print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("mps") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
This is a great movie! - positive
This is a bad movie! - negative
This movie is not good. - negative
This movie is not bad. - negative
