<a href="https://colab.research.google.com/github/mit1280/fined-tuning/blob/main/phi_2_classification_fine_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U accelerate peft bitsandbytes trl transformers einops datasets evaluate

In [None]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import bitsandbytes as bnb
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          TrainingArguments,
                          Trainer)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split
from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model
)

from datasets import load_dataset

In [None]:
# Financial PhraseBank, "sentences_50agree" configuration.
dataset_name = 'financial_phrasebank'
# Ask for the "train" split as a one-element list and unpack its single Dataset.
(dataset,) = load_dataset(dataset_name, "sentences_50agree", split=["train"])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Set label ids

In [None]:
# Read the class names from the dataset's `label` feature and show them.
label_feature = dataset.features['label']
labels = label_feature.names
print(labels)

['negative', 'neutral', 'positive']


In [None]:
# Two-way mapping between integer class ids and label names; a name's position
# in `labels` is used as its id.
id2label = dict(enumerate(labels))
label2id = {name: class_id for class_id, name in id2label.items()}
print(id2label)

{0: 'negative', 1: 'neutral', 2: 'positive'}


## Load model and tokenizer

In [None]:
model_name = 'microsoft/phi-2'

# Tokenizer. Padding behaviour is chosen per call when the texts are tokenized,
# so no `padding` argument is passed at load time.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # enable 4-bit quantization
    bnb_4bit_quant_type='nf4',  # information-theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant=True,  # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

# LoRA config
lora_config = LoraConfig(
    r=16,  # the dimension of the low-rank matrices
    lora_alpha=8,  # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules=['q_proj', 'k_proj', 'v_proj', 'dense'],  # modules that receive LoRA adapters
    lora_dropout=0.05,  # dropout probability of the LoRA layers
    bias='none',  # do not train any bias weights
    task_type='SEQ_CLS',  # sequence classification
)

# Load the quantized base model with a classification head sized to the label set.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)
# Prepare the quantized model for training, then wrap it with the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# Keep the model config's padding id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Tokenize data

In [None]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['sentence', 'label'],
    num_rows: 4846
})

In [None]:
from datasets import Dataset, DatasetDict

# Rename the text column once; the guard keeps this cell safe to re-run after
# the column has already been renamed.
if 'sentence' in dataset.column_names:
    dataset = dataset.rename_column('sentence', 'text')

# Split the dataset into training and validation sets (80% train, 20% validation).
# `test_size` is the held-out fraction, so it must be 0.2; 0.8 would leave only
# 20% of the rows for training.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Rebuild Dataset objects from the split columns.
split_columns = ('text', 'label')
train_ds = Dataset.from_dict({name: train_data[name] for name in split_columns})
test_ds = Dataset.from_dict({name: test_data[name] for name in split_columns})


train_ds

Dataset({
    features: ['text', 'label'],
    num_rows: 969
})

In [None]:
# Train and evaluate on a small subset (presumably to keep run time manageable).
N_TRAIN_SAMPLES = 600
N_EVAL_SAMPLES = 200

# Clamp to the available rows so `select` never indexes past the end of a split.
train_ds = train_ds.select(range(min(N_TRAIN_SAMPLES, len(train_ds))))
test_ds = test_ds.select(range(min(N_EVAL_SAMPLES, len(test_ds))))

In [None]:
def tokenize_batch(examples):
    """Tokenize the 'text' column of a batch of examples.

    Sequences are truncated but deliberately not padded here. Padding with
    `padding="max_length"` stretched every sentence to 2048 tokens; the
    `DataCollatorWithPadding` handed to the `Trainer` pads each batch to its
    own longest sequence instead.
    """
    return tokenizer(examples['text'], truncation=True)


train_ds = train_ds.map(tokenize_batch, batched=True)
test_ds = test_ds.map(tokenize_batch, batched=True)

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Return only the model inputs and the label, as PyTorch tensors, from both splits.
torch_columns = ['input_ids', 'attention_mask', 'label']
for split_ds in (train_ds, test_ds):
    split_ds.set_format(type="torch", columns=list(torch_columns))

In [None]:
# Sanity check: take one formatted example and print each field with its tensor shape.
batch = train_ds[1]
for field, tensor in batch.items():
    print(field, tensor.shape)

label torch.Size([])
input_ids torch.Size([2048])
attention_mask torch.Size([2048])


In [None]:
# from torch.utils.data import DataLoader

# train_dataloader = DataLoader(train_ds, batch_size=4, shuffle=True)
# test_dataloader = DataLoader(test_ds, batch_size=4)

# batch = next(iter(train_dataloader))
# for k,v in batch.items():
#   print(k,v.shape)

In [None]:
# Decode the example's token ids back into text to check the tokenization round trip.
tokenizer.decode(batch['input_ids'])

'Sarantel, based in Wellingborough, UK, designs high-performance antennas for portable wireless devices.<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|

In [None]:
import evaluate
from transformers import DataCollatorWithPadding
import numpy as np

In [None]:
# Collator that pads each batch with the tokenizer and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
# Accuracy metric from the `evaluate` library, used by `compute_metrics`.
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation pass.

    Args:
        eval_pred: A pair ``(scores, references)``; ``scores`` holds one row of
            per-class scores per example and ``references`` the true class ids.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Training configuration: one epoch, evaluating and checkpointing at the end of
# the epoch, and reloading the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="phi-2-fine-tune-sentiment-classifier",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Automatic pushing is off: it needs Hub credentials before training, but
    # this notebook only logs in after training. The explicit
    # `trainer.push_to_hub()` call later in the notebook performs the upload.
    push_to_hub=False,
)

# Trainer wiring: LoRA-wrapped model, tokenized splits, the padding collator
# and the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned TrainOutput (steps, loss, runtime metrics) is shown as the cell result.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.951979,0.625


TrainOutput(global_step=300, training_loss=0.9531647745768229, metrics={'train_runtime': 8180.084, 'train_samples_per_second': 0.073, 'train_steps_per_second': 0.037, 'total_flos': 1.8638362902528e+16, 'train_loss': 0.9531647745768229, 'epoch': 1.0})

In [None]:
# Log in to the Hugging Face Hub; in a notebook this shows an interactive login widget.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the training results to the Hugging Face Hub.
trainer.push_to_hub()

events.out.tfevents.1707074786.3fad6da30f19.1876.0:   0%|          | 0.00/5.91k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Mit1208/phi-2-fine-tune-sentiment-classifier/commit/657d89f59475ab262709883196cb4b76cdae12aa', commit_message='End of training', commit_description='', oid='657d89f59475ab262709883196cb4b76cdae12aa', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the tokenizer files to the same Hub repository as the model.
tokenizer.push_to_hub("Mit1208/phi-2-fine-tune-sentiment-classifier")


README.md:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Mit1208/phi-2-fine-tune-sentiment-classifier/commit/ccca7d0d22eaf181055f6caa58f58ff0b3205a27', commit_message='Upload tokenizer', commit_description='', oid='ccca7d0d22eaf181055f6caa58f58ff0b3205a27', pr_url=None, pr_revision=None, pr_num=None)

## References


1.   https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Perceiver/Fine_tune_Perceiver_for_text_classification.ipynb
2.   https://medium.com/@lukas.hauzenberger/multilabel-classification-using-mistral-7b-on-a-single-gpu-with-quantization-and-lora-8f848b5237f3

