In [None]:
'''
References
https://github.com/edumunozsala/llama-2-7B-4bit-python-coder/blob/main/Llama-2-finetune-qlora-python-coder.ipynb
'''


In [1]:
# Proxy setup
# Each setting is exported under both its lower-case and its upper-case name.
import os

os.environ.update({
    'http_proxy': 'http://internet.ford.com:83',
    'https_proxy': 'http://internet.ford.com:83',
    'no_proxy': '.ford.com,localhost,19.0.0.0/8,127.0.0.1,10.0.0.0/8,19.*',
    'HTTP_PROXY': 'http://internet.ford.com:83',
    'HTTPS_PROXY': 'http://internet.ford.com:83',
    'NO_PROXY': '.ford.com,localhost,19.0.0.0/8,127.0.0.1,10.0.0.0/8,19.*',
})

In [2]:
# Point both Hugging Face cache variables at the same cache directory.
import os

os.environ.update(
    dict.fromkeys(
        ('HF_HOME', 'TRANSFORMERS_CACHE'),
        '/s/njavaed/ai-software-engineering/hf_cache/',
    )
)

In [3]:
!pip install wandb --upgrade

Defaulting to user installation because normal site-packages is not writeable
Collecting wandb
  Downloading wandb-0.17.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Downloading wandb-0.17.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: wandb
  Attempting uninstall: wandb
    Found existing installation: wandb 0.17.7
    Uninstalling wandb-0.17.7:
      Successfully uninstalled wandb-0.17.7
[0mSuccessfully installed wandb-0.17.8

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [4]:
from random import randrange

import pandas as pd
import torch
from datasets import Dataset
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

In [5]:
# Model Parameters
# Base model to fine-tune, given as a Hugging Face Hub repository id
# model_id = "codellama/CodeLlama-7b-hf"
model_id = "NousResearch/Llama-2-7b-hf"
# Hub id of the instruction dataset.
# NOTE(review): the training data is read from a local parquet file further down;
# this name is not used for loading in the visible cells.
dataset_name = "iamtarun/python_code_instructions_18k_alpaca"
#dataset_name = "HuggingFaceH4/CodeAlpaca_20K"
# Dataset split
dataset_split= "train"
# Directory name for the fine-tuned model (also assigned to output_dir below).
# NOTE(review): the name says "codellama" although model_id above is Llama-2.
new_model = "fine_tuned_model/codellama-7b-int4-python-code-18k"
# Huggingface repository
#hf_model_repo="edumunozsala/"+new_model
# Load the entire model on GPU 0: the empty-string key assigns every module to device 0
device_map = {"": 0} # all layers on one GPU; layers could instead be spread over several GPUs

################################################################################
# bitsandbytes parameters
################################################################################
# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models, given as the name of a torch dtype
# (it is resolved with getattr(torch, ...) before being passed to BitsAndBytesConfig).
# NOTE(review): the training arguments below enable bf16 while this is float16 - confirm the mismatch is intended.
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
# Nested (double) quantization quantizes the quantization constants produced by the first
# quantization a second time, which saves some additional memory.
use_double_nested_quant = False 

################################################################################
# QLoRA parameters
################################################################################
# LoRA attention dimension (rank): controls the size of the small adapter that is added to the base model.
lora_r = 64
# Alpha parameter for LoRA scaling
# Think of lora_alpha as a volume knob for how strongly the adapter's update is applied.
# A higher lora_alpha makes the adapter's contribution larger, which can speed up learning but also raises the risk of overfitting.
# 16 is a commonly used value.
lora_alpha = 16
# Dropout probability for LoRA layers
# Dropout randomly switches off part of the adapter's connections during training.
# This discourages memorizing the training data and pushes the adapter towards more general patterns.
# 0.1 means 10% of the connections are dropped at random.
lora_dropout = 0.1

################################################################################
# TrainingArguments parameters
################################################################################
# Output directory where the model predictions and checkpoints will be stored
output_dir = new_model
# Number of training epochs
num_train_epochs = 1
# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = True
# Batch size per GPU for training
per_device_train_batch_size = 4
# Number of update steps to accumulate the gradients for
# Gradients from this many batches are summed before one optimizer step is taken;
# with a value of 1 every batch triggers its own update.
gradient_accumulation_steps = 1 # 2
# Enable gradient checkpointing
# This is a memory-saving technique: activations are recomputed during the backward pass
# instead of being kept in memory, trading extra compute time for a smaller memory footprint.
gradient_checkpointing = True
# Maximum gradient norm (gradient clipping)
# Gradients whose norm exceeds this value are scaled down, which keeps single update steps from becoming too large and destabilizing training.
max_grad_norm = 0.3
# Initial learning rate (AdamW optimizer)
# Controls how far the weights move per update. A higher learning rate means bigger adjustments, potentially faster learning but also more instability.
learning_rate = 1.0e-5 #2e-4 
# Weight decay to apply to all layers except bias/LayerNorm weights
# Gently pulls the weights towards zero, which acts as a regularizer against overfitting.
weight_decay = 0.001
# Optimizer to use
# paged_adamw_32bit is the paged 32-bit AdamW optimizer option.
optim = "paged_adamw_32bit"
# Learning rate schedule
# cosine: after the warmup phase the learning rate decays from its peak towards zero along a cosine curve.
lr_scheduler_type = "cosine" #"constant"
# Number of training steps (overrides num_train_epochs); -1 leaves the epoch count in control
max_steps = -1
# Ratio of steps for a linear warmup (from 0 to learning rate)
# The learning rate is ramped up gradually over this fraction of the training steps.
warmup_ratio = 0.03
# Group sequences into batches with same length (turned off here)
group_by_length = False
# Save checkpoint every X updates steps
# NOTE(review): TrainingArguments is built with save_strategy="epoch", so this step interval is presumably not what triggers saving - confirm.
save_steps = 0
# Log every X updates steps
logging_steps = 25
# Disable tqdm
# False keeps the progress bar that shows how much of the training is finished.
disable_tqdm= False

################################################################################
# SFTTrainer parameters
################################################################################
# These two settings control how the formatted examples are turned into training sequences.
# Maximum sequence length to use
# Upper bound, in tokens, on the length of one training sequence.
max_seq_length = 2048 #None
# Pack multiple short examples in the same input sequence to increase efficiency
# With packing enabled, several short examples are concatenated into one sequence of up to max_seq_length tokens.
packing = True #False

In [6]:
# Load the local training parquet file into a Hugging Face Dataset.
# pandas (pd) and Dataset are imported in the notebook's import cell.
train_parquet_path = '18k_added_train-00000-of-00001-8b6e212f3e1ece96.parquet'
df = pd.read_parquet(train_parquet_path, engine='pyarrow')

# format_instruction() indexes these three fields of every record, so fail here
# with a clear message rather than with a KeyError in the middle of trainer setup.
required_columns = {'instruction', 'input', 'output'}
missing_columns = required_columns - set(df.columns)
assert not missing_columns, f"training data is missing columns: {sorted(missing_columns)}"

# Use the split name from the config cell instead of repeating the literal.
dataset = Dataset.from_pandas(df, split=dataset_split)

In [7]:
# Prompt template for records shaped like iamtarun/python_code_instructions_18k_alpaca
def format_instruction(sample):
    """Render one training record as a prompt followed by its reference answer.

    Args:
        sample: Mapping that provides the 'instruction', 'input' and 'output' fields.

    Returns:
        The training text: a fixed instruction header, then the task, the input
        and the response, each under its own '###' heading, ending in a newline.
    """
    task, given_input, answer = sample['instruction'], sample['input'], sample['output']
    return (
        "### Instruction:\n"
        "Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:\n"
        "\n"
        "### Task:\n"
        f"{task}\n"
        "\n"
        "### Input:\n"
        f"{given_input}\n"
        "\n"
        "### Response:\n"
        f"{answer}\n"
    )

In [9]:
# Turn the dtype name from the config cell into the corresponding torch dtype object
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# 4-bit quantization settings that are passed to from_pretrained() when the model is loaded
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_use_double_quant=use_double_nested_quant,
    load_in_4bit=use_4bit,
)

In [10]:
# Load the pretrained base model, quantized according to bnb_config.
# - model_id comes from the config cell; the tokenizer is loaded from model_id too,
#   so model and tokenizer cannot drift apart when the id is changed.
# - The generation cache is only enabled when gradient checkpointing is off: the two
#   do not work together during training (transformers otherwise warns and turns the
#   cache off itself).
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=not gradient_checkpointing,
    device_map=device_map,
)
# To load from the local cache instead, pass its path in place of model_id, e.g.
# "/s/njavaed/ai-software-engineering/hf_cache/models--NousResearch--Llama-2-7b-hf"

# pretraining_tp is the tensor-parallelism degree that was used during pretraining.
# A value of 1 selects the ordinary, unsliced computation of the linear layers.
model.config.pretraining_tp = 1




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Load the tokenizer that belongs to the base model.
# trust_remote_code=False: custom tokenizer code shipped in the Hub repository is not executed.
# Only set it to True for a repository whose code you trust.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=False)
# Pad on the right and reuse the end-of-sequence token as the padding token.
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

In [10]:
# LoRA adapter settings (following the QLoRA paper), taken from the config cell
peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
# The model is not wrapped by hand here; peft_config is handed to SFTTrainer instead.
# Manual equivalent, kept for reference:
# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, peft_config)

In [15]:
# Collect the hyperparameters from the config cell into a TrainingArguments object.
args = TrainingArguments(
    # --- run identity and reporting ---
    run_name="llama2-fine-tune-18k-added",
    report_to="wandb",
    seed=42,
    output_dir=output_dir,
    # --- training length and batching ---
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,  # 6 if use_flash_attention else 4
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=group_by_length,
    # --- optimizer and learning-rate schedule ---
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    # --- precision and memory ---
    fp16=fp16,
    bf16=bf16,
    gradient_checkpointing=gradient_checkpointing,
    # --- checkpointing and logging ---
    save_strategy="epoch",
    save_steps=save_steps,
    logging_steps=logging_steps,
    disable_tqdm=disable_tqdm,
)

In [17]:
# Report the model architecture, its memory footprint and a FLOP estimate
training_config = {
    "model": {"pretrained_name": model_id, "max_length": 2048},
    "datasets": {"path": dataset},
    "verbose": True,
}

# FLOP estimate for a single sequence of max_length tokens,
# multiplied by the number of gradient accumulation steps.
model_flops = args.gradient_accumulation_steps * model.floating_point_ops(
    {"input_ids": torch.zeros((1, training_config["model"]["max_length"]))}
)

print(model)
print("Memory footprint", model.get_memory_footprint() / 1e9, "GB")
print("Flops", model_flops / 1e9, "GFLOPs")

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )


In [16]:
# Build the supervised fine-tuning trainer from the objects prepared above
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    peft_config=peft_config,
    # data: every record is turned into text by format_instruction
    train_dataset=dataset,
    formatting_func=format_instruction,
    # sequence construction
    max_seq_length=max_seq_length,
    packing=packing,
)



In [21]:
# Set the WANDB_DISABLED environment flag.
# NOTE(review): TrainingArguments is created with report_to="wandb" and a later cell
# logs in to wandb - confirm whether this cell should run in a top-to-bottom execution.
import os

os.environ.update(WANDB_DISABLED="True")

In [29]:
# wandb setup: point the client at the internal server, log in and start a run
import os

# Single definition of the project name, used for both the environment variable and wandb.init
wandb_project = "codellama-finetuned-njavaed"

os.environ['WANDB_BASE_URL'] = 'https://www.wandb.ford.com' # PROD
# NOTE(review): provide the key outside the notebook (environment or `wandb login`);
# never commit a real key here.
os.environ["WANDB_API_KEY"] = ''
os.environ["REQUESTS_CA_BUNDLE"] = '/s/njavaed/ai-software-engineering/ford-wandb-prod.pem' # change location to your .pem file whereever it's located
os.environ["WANDB_PROJECT"] = wandb_project
# os.environ["WANDB_AGENT_PYTHON"] = "python3"
# os.environ["WANDB_LOG_MODEL"] = "checkpoint"
# os.environ["WANDB_NOTEBOOK_NAME"] = "wandb-working-training.ipynb"

import wandb
wandb.login()

wandb.init(
    # Set the project where this run will be logged
    project=wandb_project,
    # Track hyperparameters and run metadata.
    # The values are read from the config cell so that the logged metadata always
    # matches what TrainingArguments receives (hard-coded copies had drifted).
    config={
        "learning_rate": learning_rate,
        "dataset": "18k_added_train-00000-of-00001-8b6e212f3e1ece96.parquet",
        "epochs": num_train_epochs,
    },
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mnjavaed[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [19]:
# Run the fine-tuning loop.
# disable_tqdm is False in the config cell, so the progress bar is not turned off;
# the training loss is logged every `logging_steps` steps.
trainer.train()

# Save the trained model locally (no path is passed, so the trainer's configured output directory is used)
trainer.save_model()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
25,1.0564
50,0.8264
75,0.7292
100,0.6573
125,0.6419
150,0.5966
175,0.6065
200,0.6007
225,0.5985
250,0.5961


In [20]:
# End the current wandb run now that training has finished
wandb.finish()

VBox(children=(Label(value='0.007 MB of 0.007 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▂▂▂▂▃▃▄▄▄▄▅▅▅▆▆▇▇▇▇██
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇▇██
train/learning_rate,▁▃▄▆▇█████████████████
train/loss,█▅▃▂▂▁▂▂▁▁▁▁▁▁▂▁▁▁▁▁▂▁
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,0.12
train/global_step,564.0
train/learning_rate,0.0002
train/loss,0.5979
train/total_flos,9.420869524783104e+16
train/train_loss,0.63472
train/train_runtime,2436.5988
train/train_samples_per_second,7.64
train/train_steps_per_second,1.91


In [1]:
# Release the training objects before the model is reloaded in fp16 for merging.
import gc

import torch

# Garbage collection can only reclaim objects that nothing references any more,
# so drop the notebook's references to the trainer and the quantized training model first.
# pop() with a default keeps the cell re-runnable once the names are gone.
for _name in ("trainer", "model"):
    globals().pop(_name, None)

gc.collect()
# Hand cached, now-unused GPU memory back to the driver
torch.cuda.empty_cache()
gc.collect()

0

In [None]:
# Reload the trained and saved adapter model in float16, merge the LoRA weights
# into the base model and save the result as one standalone model.

from peft import AutoPeftModelForCausalLM

# Named peft_model rather than new_model, so the output-directory string that the
# config cell stores in new_model is not replaced by a model object.
peft_model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

# Merge LoRA and base model
merged_model = peft_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model", safe_serialization=True)

In [15]:
# Store the tokenizer files in the same directory as the merged model weights
tokenizer.save_pretrained("merged_model")

('merged_model/tokenizer_config.json',
 'merged_model/special_tokens_map.json',
 'merged_model/tokenizer.json')

In [18]:
# Test the merged model on a randomly chosen training example

sample = dataset[randrange(len(dataset))]

prompt = f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
{sample['instruction']}

### Input:
{sample['input']}

### Response:
"""

# No truncation argument: without a max_length it truncated nothing and only produced a warning.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
# with torch.inference_mode():
outputs = merged_model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.5)

# The returned sequence starts with the prompt tokens. Decode only the tokens after them,
# instead of cutting the decoded text at len(prompt), which is only correct when decoding
# reproduces the prompt character for character.
generated_text = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)

print(f"Prompt:\n{prompt}\n")
print(f"\nGenerated instruction:\n{generated_text}")
print(f"\nGround truth:\n{sample['output']}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Prompt:
### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
Create a Python function that takes a string and outputs the count of the number of uppercase letters in the string.

### Input:
‘Hello World’

### Response:



Generated instruction:
def count_uppercase(string):
    uppercase_count = 0
    for letter in string:
        if letter.isupper():
            uppercase_count += 1
    return uppercase_count

print(count_uppercase('Hello World'))


Ground truth:
def countUppercase(myStr):
    uppercaseCount = 0
    for l in myStr:
        if l.isupper():
            uppercaseCount+=1
    return uppercaseCount

print(countUppercase('Hello World'))


In [23]:
# Hand-written task without a reference answer, used to probe the merged model
sample = dict(
    instruction="Given a nested fstring, find the sum of the integers in the fstring",
    input="n1  = 1, n2 = 2",
    output="",
)

In [24]:
# Build the prompt for the current `sample` and generate a completion with the merged model
prompt = f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
{sample['instruction']}

### Input:
{sample['input']}

### Response:
"""

# No truncation argument: without a max_length it truncated nothing and only produced a warning.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
# with torch.inference_mode():
outputs = merged_model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.5)

# The returned sequence starts with the prompt tokens. Decode only the tokens after them,
# instead of cutting the decoded text at len(prompt), which is only correct when decoding
# reproduces the prompt character for character.
generated_text = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)

print(f"Prompt:\n{prompt}\n")
print(f"\nGenerated instruction:\n{generated_text}")
print(f"\nGround truth:\n{sample['output']}")

Prompt:
### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
Given a nested fstring, find the sum of the integers in the fstring

### Input:
n1  = 1, n2 = 2

### Response:



Generated instruction:
import sys

def fstring_sum(n1, n2):
    result = 0
    for i in range(n1, n2+1):
        result += i
    return result

if __name__ == '__main__':
    print(fstring_sum(1, 2))


Ground truth:



In [25]:
# Function copied unchanged from the model's generated answer above, run here to check that it executes.
# It adds up every integer from n1 through n2 inclusive.
# NOTE(review): this sums an integer range and does not parse any f-string, so it
# does not appear to address the task as it was worded in the prompt.
def fstring_sum(n1, n2):
    result = 0
    for i in range(n1, n2+1):
        result += i
    return result

# 1 + 2
print(fstring_sum(1, 2))

3


In [27]:
# Second hand-written task without a reference answer
sample = dict(
    instruction="Generate a python script using PyGWalker from 2024 Python release",
    input="list_of_edges = [(1, 2), (2, 3), (3, 4), (4, 1)]",
    output="",
)

In [28]:
# Build the prompt for the current `sample` and generate a completion with the merged model
prompt = f"""### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
{sample['instruction']}

### Input:
{sample['input']}

### Response:
"""

# No truncation argument: without a max_length it truncated nothing and only produced a warning.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
# with torch.inference_mode():
outputs = merged_model.generate(input_ids=input_ids, max_new_tokens=100, do_sample=True, top_p=0.9,temperature=0.5)

# The returned sequence starts with the prompt tokens. Decode only the tokens after them,
# instead of cutting the decoded text at len(prompt), which is only correct when decoding
# reproduces the prompt character for character.
generated_text = tokenizer.decode(outputs[0][input_ids.shape[1]:], skip_special_tokens=True)

print(f"Prompt:\n{prompt}\n")
print(f"\nGenerated instruction:\n{generated_text}")
print(f"\nGround truth:\n{sample['output']}")

Prompt:
### Instruction:
Use the Task below and the Input given to write the Response, which is a programming code that can solve the following Task:

### Task:
Generate a python script using PyGWalker from 2024 Python release

### Input:
list_of_edges = [(1, 2), (2, 3), (3, 4), (4, 1)]

### Response:



Generated instruction:
import pygwalker as pgw

pgw.walk(list_of_edges)


Ground truth:



In [33]:
# Start a run for uploading the merged model as a wandb artifact
wandb.init(
    # Set the project where this run will be logged
    project="codellama-finetuned-njavaed",
)

art = wandb.Artifact("codellama-7b-int4-python-code-18k", type="model")
# Add the whole export directory. The merged model is saved as two safetensors shards
# and the tokenizer files are stored alongside them; adding only the first shard
# would produce an artifact that cannot be loaded.
art.add_dir("merged_model")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


<Artifact codellama-7b-int4-python-code-18k>

In [34]:
# Log the artifact built in the previous cell to the active run
wandb.log_artifact(art)

<Artifact codellama-7b-int4-python-code-18k>

In [35]:
# Log the merged model as an artifact under an explicit entity.
import wandb
wandb.init(entity='njavaed', project='codellama-finetuned-njavaed')
art = wandb.Artifact('codellama-7b-int4-python-code-18k', type='model')
# Fill the artifact before logging it: without any added content this call
# logs an artifact that contains no files.
art.add_dir('merged_model')
wandb.log_artifact(art)

VBox(children=(Label(value='0.048 MB of 9514.455 MB uploaded\r'), FloatProgress(value=5.04889963236871e-06, ma…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112279196580251, max=1.0…

<Artifact codellama-7b-int4-python-code-18k>