In [None]:
!pip install datasets -U -q
!pip install transformers -U -q
!pip install peft -U -q
!pip install bitsandbytes -U -q
!pip install torch -U -q

In [None]:
import os
import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    PeftModel,
    PeftConfig
)
import bitsandbytes as bnb
import re

In [None]:
# List the GPUs visible to this runtime.
!nvidia-smi -L
# Prompt for Hugging Face Hub credentials (huggingface_hub.notebook_login).
notebook_login()

In [None]:
# IMPORTANT
# This parameter value establishes the model and tokenizer used
# in the rest of the notebook.

base_model = "microsoft/phi-2"

# Load the configuration published for `base_model` and display it as the
# cell output.
base_config = AutoConfig.from_pretrained(base_model)
base_config

# Data Preprocessing

In [None]:
"""
https://huggingface.co/datasets/nvidia/HelpSteer

Dataset Description
HelpSteer contains 37,120 samples, each containing a prompt,
a response as well as five human-annotated attributes of the response,
each ranging between 0 and 4 where higher means better for each attribute.

These attributes are:

HELPFULNESS: Overall helpfulness of the response to the prompt.
CORRECTNESS: Inclusion of all pertinent facts without errors.
COHERENCE: Consistency and clarity of expression.
COMPLEXITY: Intellectual depth required to write response
(i.e. whether the response can be written by anyone with basic language
competency or requires deep domain expertise).
VERBOSITY: Amount of detail included in the response, relative to what is asked for in the prompt.

"""

# Load only the 'train' split of HelpSteer; the bare name below displays the
# dataset summary as the cell output.
dataset = load_dataset("nvidia/HelpSteer", split='train')
dataset

In [None]:
def _meets_quality_thresholds(example):
    """Return True when an example passes every attribute threshold.

    An example is kept when helpfulness and correctness are at least 3,
    coherence and complexity are at least 2, and verbosity is at most 2.
    """
    return (
        example['helpfulness'] >= 3
        and example['correctness'] >= 3
        and example['coherence'] >= 2
        and example['complexity'] >= 2
        and example['verbosity'] <= 2
    )


# Keep only the examples that satisfy all thresholds.
filtered_dataset = dataset.filter(_meets_quality_thresholds)

filtered_dataset

In [None]:
# Inspect the first example that passed the filter.
filtered_dataset[0]

In [None]:
# Check the documentation of the tokenizer used for the model you're
# fine-tuning. A print statement below outputs the specific tokenizer
# class that AutoTokenizer returns. Refer to the Hugging Face
# documentation to see which parameters that specific tokenizer
# accepts.

def format_data_examples(example):
    """Build the training text for one prompt/response pair.

    Args:
        example: Mapping providing the 'prompt' and 'response' entries.

    Returns:
        A single string with the prompt under a '### Query:' marker and the
        response under a '### Response:' marker, wrapped in '<|endoftext|>'
        markers at both ends.
    """
    prompt = example['prompt']
    response = example['response']
    return f"<|endoftext|> ### Query: {prompt} \n ### Response: {response} <|endoftext|>"

# Load the tokenizer that matches `base_model`.
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

print(type(tokenizer)) # Prints the concrete tokenizer class returned by AutoTokenizer

def tokenize_format_data_examples(example):
    """Format one dataset row into the training text and tokenize it.

    Args:
        example: Mapping providing the 'prompt' and 'response' entries
            (passed through to `format_data_examples`).

    Returns:
        The tokenizer output for the formatted text, truncated to at most
        2048 tokens and left unpadded.
    """
    formatted_pair = format_data_examples(example)
    # Deliberately no `padding='max_length'`: padding every row to 2048 tokens
    # makes all rows the same length (so the length histogram is meaningless)
    # and forces every training batch to the full 2048 width. The
    # DataCollatorForLanguageModeling used for training pads each batch to
    # its own longest sequence instead.
    tokenized_pair = tokenizer(
        formatted_pair,
        truncation=True,
        max_length=2048,
    )
    return tokenized_pair

# Format and tokenize every filtered example; the bare name below displays
# the resulting dataset summary.
tokenized_dataset = filtered_dataset.map(tokenize_format_data_examples)
tokenized_dataset

In [None]:
# Inspect the first tokenized example.
tokenized_dataset[0]

In [None]:
# OPTIONAL
# Plots the lengths of the tokenized inputs

import matplotlib.pyplot as plt

def plot_data_lengths(tokenized_train_dataset):
    """Plot a histogram of the `input_ids` lengths of a tokenized dataset.

    Also prints the number of examples that were measured.

    Args:
        tokenized_train_dataset: Iterable of examples, each exposing an
            'input_ids' sequence.
    """
    # Measure the dataset that was passed in, not a notebook-level global.
    lengths = [len(x['input_ids']) for x in tokenized_train_dataset]
    print(len(lengths))

    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=20, alpha=0.5)
    plt.xlabel('Length of input_ids')
    plt.ylabel('Frequency')
    plt.title('Distribution of Lengths of input_ids')
    plt.show()

# Plot the input-length distribution of the tokenized dataset.
plot_data_lengths(tokenized_dataset)

# Model Configuration

In [None]:
# Options controlling how the base checkpoint is loaded: allow the repo's
# custom code, let the library place the weights on the available devices,
# and load the weights in float16.
load_options = dict(
    trust_remote_code=True,
    device_map='auto',
    torch_dtype=torch.float16,
)
model = AutoModelForCausalLM.from_pretrained(base_model, **load_options)

# Show the module structure of the loaded model.
print(model)

In [None]:
# OPTIONAL
# Freeze the original model layers
# and adjust layer precision

for weight in model.parameters():
    # Every existing parameter is frozen.
    weight.requires_grad = False
    if weight.ndim != 1:
        continue
    # 1-D parameters are additionally stored as float32.
    weight.data = weight.data.to(torch.float32)

# Turn on gradient checkpointing and make the inputs require gradients
# (both are methods provided by the loaded model).
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutputToFloat(torch.nn.Sequential):
    """Sequential container whose forward output is converted to float32.

    The wrapped modules run unchanged; only the tensor they return is cast.
    """

    def forward(self, x):
        """Run the wrapped modules on `x` and return the result as float32."""
        result = super().forward(x)
        return result.to(torch.float32)

# Wrap the LM head so that its output is returned as float32.
model.lm_head = CastOutputToFloat(model.lm_head)

In [None]:
# OPTIONAL
# Collect the names of the model's linear layers (candidate LoRA targets)

# Walk the module tree instead of regex-parsing the printed model: the printed
# layout is meant for display, while the isinstance check states the intent
# directly. The last component of each dotted module path is the name the
# layer is registered under in its parent module.
linear_layers = [
    name.rsplit('.', 1)[-1]
    for name, module in model.named_modules()
    if isinstance(module, torch.nn.Linear)
]

# De-duplicate and sort so the printed list is stable from run to run.
target_modules = sorted(set(linear_layers))
print(target_modules)

In [None]:
# LoRA Configuration
# NOTE: The 'target_modules' parameter in LoraConfig assigns the LoRA layers.
# If you do not declare them, they will be assigned automatically based on the
# config of the model on Hugging Face. There have been published experiments
# that suggest targeting all the linear layers improves performance.
# I've commented the parameter out for now, but please experiment and
# see what works best for your use case.


def print_trainable_params(model):
    """
    Report how many of the model's parameters are trainable.

    Prints the trainable element count, the total element count and the
    trainable share as a percentage, based on `model.named_parameters()`.
    """
    # One (element count, requires_grad) pair per named parameter.
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, trainable in sizes if trainable)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

# Adapter hyperparameters. 'target_modules' is deliberately left unset here;
# an explicit alternative to experiment with would be
# target_modules=['q_proj', 'k_proj', 'v_proj', 'dense', 'fc1', 'fc2', 'lm_head'].
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the LoRA adapters described by `config`.
lora_model = get_peft_model(model, config)

# Show the wrapped structure and how much of it is trainable.
print(lora_model)
print_trainable_params(lora_model)


# Training

In [None]:
# Release cached GPU memory that PyTorch is holding but not using, before training.
torch.cuda.empty_cache()

In [None]:
# Model fine-tuning run

run_name = "phi-2-instruct-finetune"
output_dir = "./" + run_name

trainer = Trainer(
    # Train the PEFT wrapper returned by get_peft_model (not the raw base
    # model) so that the checkpoints written to `output_dir` are saved
    # through the PEFT model and can be reloaded with PeftModel.from_pretrained.
    model=lora_model,
    train_dataset=tokenized_dataset,
    args=TrainingArguments(
        output_dir=output_dir,
        fp16=True,
        warmup_steps=10,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        max_steps=500,
        learning_rate=2e-5,
        logging_steps=1,
        logging_dir=f"./{run_name}/logs",
        save_strategy="steps",
        save_steps=50,
        # No step-based evaluation is configured: this Trainer receives no
        # eval_dataset, and requesting evaluation without one raises an error.
    ),
    # mlm=False selects causal-LM collation (labels derived from the inputs).
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
# Publish to Hugging Face Hub

# Push the PEFT wrapper (`lora_model`) rather than the raw base model, so the
# Hub repo holds the fine-tuned adapter that PeftConfig/PeftModel can load.
# Authentication relies on the credentials stored by the earlier
# notebook_login() call, so the deprecated `use_auth_token` argument is omitted.
lora_model.push_to_hub(
    "peterkchung/phi-2-Steer-Concise",
    commit_message="Steer-Concise fine tuning on Phi-2",
)

# Inference

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

# NOTE(review): this repo id differs from the "peterkchung/phi-2-Steer-Concise"
# repo that the publish cell pushes to — confirm which adapter is intended.
peft_model_id = "peterkchung/phi-2-Instruct"
# The adapter config records which base model the adapter was trained on.
config = PeftConfig.from_pretrained(peft_model_id)

# Load that base model in 8-bit and let the library place it on the
# available devices.
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    load_in_8bit=True,
    device_map='auto'
)

tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

# Move the tokenized prompt to the model's device; otherwise the input tensors
# stay on the CPU while the model has been placed by device_map='auto'.
batch = tokenizer("“Training models with PEFT and LoRa is cool” ->: ", return_tensors='pt').to(model.device)

with torch.cuda.amp.autocast():
  output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model_id = "microsoft/phi-2"
# Reload the base model in 8-bit for evaluation.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Phi2, same as before
    device_map="auto",
    trust_remote_code=True,
    load_in_8bit=True,
    torch_dtype=torch.float16,
)

eval_tokenizer = AutoTokenizer.from_pretrained(base_model_id, add_bos_token=True, trust_remote_code=True, use_fast=False)
# Use this tokenizer's own EOS token as padding, rather than reading it from
# a different tokenizer object left over from an earlier cell.
eval_tokenizer.pad_token = eval_tokenizer.eos_token

from peft import PeftModel

# Load the adapter from the final checkpoint of this notebook's training run
# (output_dir "./phi-2-instruct-finetune", max_steps=500).
ft_model = PeftModel.from_pretrained(base_model, "phi-2-instruct-finetune/checkpoint-500")

eval_prompt = " The following is a note by Eevee the Dog: # Today I "
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")

# Switch to eval mode and disable gradient tracking for generation.
ft_model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(
        ft_model.generate(
            **model_input,
            max_new_tokens=100,
            repetition_penalty=1.11
        )[0],
        skip_special_tokens=True
      )
    )

References:
