# LoRA

In [1]:
import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from peft import (
    LoraConfig,
    get_peft_model,
    # PeftModel,
    # PeftConfig
)
import bitsandbytes as bnb
import re
import os

from pprint import pprint

  from .autonotebook import tqdm as notebook_tqdm


In [35]:
# List the GPUs visible to this kernel (shell escape; output is shown below).
!nvidia-smi -L
# Alternative to the environment variable below: log in to the Hub interactively.
# notebook_login()
# Read the Hugging Face access token from the environment. os.getenv returns
# None when HUGGINGFACE_TOKEN is not set.
hf_token = os.getenv('HUGGINGFACE_TOKEN')

GPU 0: NVIDIA GeForce RTX 2060 (UUID: GPU-f131edbd-d9d0-b2b0-d0b2-1faa134b1031)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
# IMPORTANT
# This parameter value establishes the model and tokenizer used
# in the rest of the notebook.

base_model = "microsoft/phi-2"

# Fetch only the model configuration (no weights) and display it as the
# cell's output so the architecture settings can be inspected.
base_config = AutoConfig.from_pretrained(base_model)
base_config

PhiConfig {
  "_name_or_path": "microsoft/phi-2",
  "architectures": [
    "PhiForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/phi-2--configuration_phi.PhiConfig",
    "AutoModelForCausalLM": "microsoft/phi-2--modeling_phi.PhiForCausalLM"
  },
  "bos_token_id": 50256,
  "embd_pdrop": 0.0,
  "eos_token_id": 50256,
  "hidden_act": "gelu_new",
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 10240,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "phi",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "partial_rotary_factor": 0.4,
  "qk_layernorm": false,
  "resid_pdrop": 0.1,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.37.0",
  "use_cache": true,
  "vocab_size": 51200
}

# Data Preprocessing

In [3]:
"""
https://huggingface.co/datasets/peterkchung/commonsense_cot_partial_annotated_v0.1

### EXAMPLE ###

{'id': '1fe48d12b6f6e4e38f4445f3ec60d5c5',
 'question': 'What can happen  to someone too sure of their learning?',
 'question_concept': 'learning',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
              'text': ['growth',
                       'gaining knowledge',
                       'enlightenment',
                       'knowing more',
                       'overconfidence']},
  'answerKey': 'E',
  'rationale': 'When someone is too sure of their learning, they become '
               'overconfident, thinking that they know everything. This can '
               'prevent them from learning more, as they stop seeking new '
               'knowledge and ideas. They might also miss out on '
               'enlightenment, as they close themselves off to new '
               'perspectives. Overall, their growth might be stunted, as they '
               'stop challenging themselves and expanding their '
               'understanding. So, out of the given choices, the most '
               'appropriate answer is overconfidence.'}

"""

# Load the annotated CommonsenseQA chain-of-thought dataset from the Hub.
dataset = load_dataset("peterkchung/commonsense_cot_partial_annotated_v0.1")

# Only the "train" split is used in this notebook; displaying it shows the
# available columns and the row count.
training_dataset = dataset["train"]
training_dataset

Dataset({
    features: ['id', 'question', 'question_concept', 'choices', 'answerKey', 'rationale'],
    num_rows: 100
})

In [38]:
# Inspect the first raw record to see the fields used for prompt formatting.
training_dataset[0]

{'id': '1fe48d12b6f6e4e38f4445f3ec60d5c5',
 'question': 'What can happen  to someone too sure of their learning?',
 'question_concept': 'learning',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['growth',
   'gaining knowledge',
   'enlightenment',
   'knowing more',
   'overconfidence']},
 'answerKey': 'E',
 'rationale': 'When someone is too sure of their learning, they become overconfident, thinking that they know everything. This can prevent them from learning more, as they stop seeking new knowledge and ideas. They might also miss out on enlightenment, as they close themselves off to new perspectives. Overall, their growth might be stunted, as they stop challenging themselves and expanding their understanding. So, out of the given choices, the most appropriate answer is overconfidence.'}

In [4]:
# Check the documentation of the tokenizer used for the model you're
# fine-tuning. The print statement below writes the concrete tokenizer
# class to the console. Refer to the Hugging Face documentation to see
# which parameters that specific tokenizer accepts.

def format_data_examples(example):
    """Render one dataset record as a single query/response training string.

    Args:
        example: Mapping with the keys 'question', 'choices' (holding the
            parallel lists 'label' and 'text'), 'answerKey' and 'rationale'.

    Returns:
        A string of the form
        "### Query: <question>\\n### Response: The answer is <answer text>. "
        "<rationale> <|endoftext|>".

    Raises:
        ValueError: If 'answerKey' is not one of the choice labels.
    """
    choices = example['choices']
    # Position of the correct label; the answer text lives at the same index.
    answer_position = choices['label'].index(example['answerKey'])

    pieces = [
        f"### Query: {example['question']}\n",
        f"### Response: The answer is {choices['text'][answer_position]}. ",
        f"{example['rationale']} <|endoftext|>",
    ]
    return "".join(pieces)

# Load the tokenizer that matches the base model chosen above.
tokenizer = AutoTokenizer.from_pretrained(base_model)
# Reuse the end-of-sequence token for padding.
# NOTE(review): presumably needed because this tokenizer ships without a
# dedicated pad token -- confirm.
tokenizer.pad_token = tokenizer.eos_token

print(type(tokenizer)) # Prints the concrete tokenizer class resolved by AutoTokenizer


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<class 'transformers.models.codegen.tokenization_codegen_fast.CodeGenTokenizerFast'>


In [28]:
# Sanity check: show the formatted training text for the first record.
pprint(format_data_examples(dataset['train'][0]))

('### Query: What can happen  to someone too sure of their learning?\n'
 '### Response: The answer is overconfidence. When someone is too sure of '
 'their learning, they become overconfident, thinking that they know '
 'everything. This can prevent them from learning more, as they stop seeking '
 'new knowledge and ideas. They might also miss out on enlightenment, as they '
 'close themselves off to new perspectives. Overall, their growth might be '
 'stunted, as they stop challenging themselves and expanding their '
 'understanding. So, out of the given choices, the most appropriate answer is '
 'overconfidence. <|endoftext|>')


In [5]:
def tokenize_format_data_examples(example, max_length=2048):
    """Format one dataset record and tokenize it for causal-LM fine-tuning.

    Args:
        example: A dataset row accepted by ``format_data_examples``.
        max_length: Upper bound on the number of tokens kept; longer
            sequences are truncated. Defaults to 2048, the
            ``max_position_embeddings`` value shown in the model config.

    Returns:
        The tokenizer output for the formatted text (``input_ids`` and
        ``attention_mask``), left unpadded.
    """
    formatted_pair = format_data_examples(example)
    # No padding at this stage: padding every record to max_length would make
    # each batch 2048 tokens wide regardless of the real text length. The data
    # collator pads each batch to its longest member instead.
    return tokenizer(
        formatted_pair,
        truncation=True,
        max_length=max_length,
    )

tokenized_dataset = training_dataset.map(tokenize_format_data_examples)
tokenized_dataset

Dataset({
    features: ['id', 'question', 'question_concept', 'choices', 'answerKey', 'rationale', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [40]:
# Inspect the first tokenized record: the original columns plus the
# 'input_ids' and 'attention_mask' added by the tokenizer.
pprint(tokenized_dataset[0])

{'answerKey': 'E',
 'attention_mask': [1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
               

# Model Configuration

In [6]:
# Load the base causal-LM weights for the checkpoint selected in `base_model`.
# NOTE(review): the training cell's recorded output is
# "NotImplementedError: Cannot copy out of meta tensor; no data!". Presumably
# device_map='auto' offloaded part of the model because the full-precision
# weights do not fit on the GPU listed above -- confirm, and consider loading
# a quantized or smaller-dtype model if so.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    trust_remote_code = True,
    # Let the library decide where each part of the model is placed.
    device_map = 'auto',
    # torch_dtype = torch.float16,
  )

# Print the module tree; the layer names shown here are the ones referenced
# later as LoRA target modules.
print(model)

Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.83s/it]


PhiForCausalLM(
  (model): PhiModel(
    (embed_tokens): Embedding(51200, 2560)
    (embed_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x PhiDecoderLayer(
        (self_attn): PhiAttention(
          (q_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (k_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (v_proj): Linear(in_features=2560, out_features=2560, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (rotary_emb): PhiRotaryEmbedding()
        )
        (mlp): PhiMLP(
          (activation_fn): NewGELUActivation()
          (fc1): Linear(in_features=2560, out_features=10240, bias=True)
          (fc2): Linear(in_features=10240, out_features=2560, bias=True)
        )
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (final_layernorm): LayerNorm((2560,),

In [42]:
# OPTIONAL
# Collect the names of all linear layers as candidate LoRA target modules.

# Walk the module tree directly instead of parsing the printed repr. The last
# component of each dotted path is the layer name shown in the model summary
# (e.g. "model.layers.0.self_attn.q_proj" -> "q_proj").
linear_layer_names = [
    name.rsplit('.', 1)[-1]
    for name, module in model.named_modules()
    if isinstance(module, torch.nn.Linear)
]

# De-duplicate and sort so the printed list is stable across kernel restarts.
target_modules = sorted(set(linear_layer_names))
print(target_modules)

['q_proj', 'fc1', 'dense', 'v_proj', 'k_proj', 'lm_head', 'fc2']


In [8]:
# LoRA Configuration
# NOTE: The 'target_modules' parameter in LoraConfig assigns the LoRA layers.
# If you do not declare them, they will be automatically assigned based on the
# config of the model on Hugging Face. There have been published experiments
# that suggest targeting all the linear layers improves performance.
# All linear layers are targeted below, but please experiment and
# see what works best for your use case.


def print_trainable_params(model):
    """Print how many of the model's parameters are trainable.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Prints a single line with the trainable count, the total count and the
    trainable share as a percentage. Returns nothing.
    """
    # One (element count, is trainable) pair per parameter tensor.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    percent = 100 * trainable_params / all_param
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {percent}")

# LoRA hyperparameters: rank-32 adapters with alpha 64 and 5% dropout,
# attached to every layer named in target_modules (the attention
# projections, both MLP layers and lm_head).
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'dense', 'fc1', 'fc2', 'lm_head'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the base model with the adapters; the printed summary below shows the
# injected lora_A / lora_B layers.
lora_model = get_peft_model(model, config)

print(lora_model)
print_trainable_params(lora_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PhiForCausalLM(
      (model): PhiModel(
        (embed_tokens): Embedding(51200, 2560)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x PhiDecoderLayer(
            (self_attn): PhiAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2560, out_features=2560, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2560, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=2560, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear(
       


# Training

In [9]:
# Release cached, currently unused GPU memory held by PyTorch before training.
torch.cuda.empty_cache()

In [10]:
# Model fine-tuning run

run_name = "phi-2-lora-commonsensecot"
output_dir = "./" + run_name

training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,   # effective batch size of 8 per device
    gradient_checkpointing=True,
    # Warm up over a fraction of the run rather than a fixed 100 steps: with
    # 100 rows, an effective batch of 8 and 2 epochs there are only about 25
    # optimizer steps, so a 100-step warmup would never reach learning_rate.
    warmup_ratio=0.1,
    # max_steps=500,
    num_train_epochs=2,
    learning_rate=2e-5,
    fp16=True,
    output_dir=output_dir,
    logging_steps=1,
    logging_dir=f"./{run_name}/logs",
    # NOTE(review): save_steps=50 is also larger than the ~25 total steps, so
    # no intermediate checkpoint is written -- lower it if checkpoints matter.
    save_strategy="steps",
    save_steps=50,
)


trainer = Trainer(
    model = lora_model,
    args = training_args,
    train_dataset = tokenized_dataset,
    # mlm=False selects causal-LM labels (a copy of input_ids). The default
    # mlm=True is for masked-LM training and raises for tokenizers that have
    # no mask token.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# The key/value cache is only useful for generation; turn it off for training
# with gradient checkpointing.
model.config.use_cache = False
trainer.train()

NotImplementedError: Cannot copy out of meta tensor; no data!

In [None]:
# Publish to Hugging Face Hub

# Push through the PEFT wrapper so the LoRA adapter is what gets uploaded.
# `model` is the base network that get_peft_model modified in place, so
# pushing it would upload the full weights with the LoRA layers mixed in.
lora_model.push_to_hub(
    "peterkchung/phi-2-commonsense-cot",
    # Token read from HUGGINGFACE_TOKEN earlier in the notebook; when it is
    # None the locally cached login is used. Replaces the deprecated
    # use_auth_token argument.
    token=hf_token,
    commit_message="Phi-2 fine-tuned on partial CommonsenseCOT dataset.",
  )

# Inference

In [None]:
# Quick inference check

References:
