In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, set_seed
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
import accelerate
import tqdm
from torch import nn
import datasets
import bitsandbytes as bnb
import pandas as pd
import numpy as np
from huggingface_hub import login as hf_login
from trl import SFTTrainer
from wandb import login as wandb_login
import wandb
from datetime import datetime

#note for LongRoPE - we need to copy the repo into the dist utils folder to get it in path. 
# Also move the utils folder into the rope folder and modify the init file in the rope folder from import utils.save_memory to import .utils.save_memory

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open('/run/media/mkuprian/SSD2/Programing/VA_git/VA_work/AI_models/COT_Router/HF_token', 'r') as f:
    HF_token = f.read()
    f.close()
hf_login(HF_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/mkuprian/ubuntu_utils/.cache/huggingface/token
Login successful


In [None]:
#Ensure cuda 
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(1))

torch.device("cuda:1")

True
NVIDIA GeForce RTX 3060


In [4]:
#For change in models uncomment to get base model and tokenizer to analyze config and save locally

# model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-3B")
# tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
# model.config
# model.summary()
# for layer in model.layers:
#     print(layer)
# model.save_pretrained('model/llama-3.2-3B')
# tokenizer.save_pretrained('model/llama-3.2-3B')

In [5]:
#Future step
#With LongRoPE update the max embedding size to 16K (and then will future update to 32K)


In [13]:
#Set bits and bytes config 
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)



In [18]:
%%bash 
export CUDA_VISIBLE_DEVICES=1

%%python3
#Rebuild the model with quantization
model = AutoModelForCausalLM.from_pretrained("/run/media/mkuprian/SSD2/Programing/VA_git/VA_work/AI_models/COT_Router/model/llama-3.2-3B", local_files_only=True,  quantization_config=bnb_config, device_map="auto")
#Since we are not using the instruction model we do need to add the special tokens for system, user, and end of turn
tokenizer = AutoTokenizer.from_pretrained("/run/media/mkuprian/SSD2/Programing/VA_git/VA_work/AI_models/COT_Router/model/llama-3.2-3B", 
            local_files_only=True,
            additional_special_tokens=["<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>"])

bash: line 3: fg: no job control
bash: line 5: syntax error near unexpected token `('
bash: line 5: `model = AutoModelForCausalLM.from_pretrained("/run/media/mkuprian/SSD2/Programing/VA_git/VA_work/AI_models/COT_Router/model/llama-3.2-3B", local_files_only=True,  quantization_config=bnb_config, device_map="auto")'


CalledProcessError: Command 'b'export CUDA_VISIBLE_DEVICES=1\n\n%%python3\n#Rebuild the model with quantization\nmodel = AutoModelForCausalLM.from_pretrained("/run/media/mkuprian/SSD2/Programing/VA_git/VA_work/AI_models/COT_Router/model/llama-3.2-3B", local_files_only=True,  quantization_config=bnb_config, device_map="auto")\n#Since we are not using the instruction model we do need to add the special tokens for system, user, and end of turn\ntokenizer = AutoTokenizer.from_pretrained("/run/media/mkuprian/SSD2/Programing/VA_git/VA_work/AI_models/COT_Router/model/llama-3.2-3B", \n            local_files_only=True,\n            additional_special_tokens=["<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>"])\n'' returned non-zero exit status 2.

In [8]:
tokenizer.SPECIAL_TOKENS_ATTRIBUTES
tokenizer.all_special_tokens

['<|begin_of_text|>',
 '<|end_of_text|>',
 '<|start_header_id|>',
 '<|end_header_id|>',
 '<|eot_id|>']

In [9]:
for param in model.parameters():
    param.requires_grad = False
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

class CastOutfutToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)
model.lm_head = CastOutfutToFloat(model.lm_head)

In [37]:
def find_all_linear(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names= name. split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return lora_module_names

In [38]:
modules = find_all_linear(model)
print(list(modules))

['k_proj', 'up_proj', 'down_proj', 'v_proj', 'q_proj', 'gate_proj', 'o_proj']


In [39]:
#Set QLora config

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules= ['gate_proj', 'k_proj', 'v_proj', 'down_proj', 'up_proj', 'o_proj', 'q_proj']
)
model = get_peft_model(model, peft_config)



In [13]:
from typing import TypedDict
class ds_prepare(TypedDict):
    source: list[str]
    target: list[str]
    rationale: list[str]
    text: list[str]

In [14]:
counter = 0
import re
ds_dict = ds_prepare(source=[], target=[], rationale=[], text=[])
with open("/workspaces/VA_work/AI_models/external_repos/datasets/CoT-Collection/data/CoT_collection_en.json", 'r') as j:
    for line in j:
         
print(len(ds_dict['target']))
print(len(ds_dict["source"]))
print(len(ds_dict['rationale']))  
print(len(ds_dict['text']))

1837928
1837928
1837928
1837928


In [15]:
cot_dataset = datasets.Dataset.from_dict(ds_dict)


In [16]:
cot_dataset = cot_dataset.map(batched=True, 
                remove_columns=['source', 'target', 'rationale'],)

Map:   0%|          | 0/1837928 [00:00<?, ? examples/s]

In [17]:
cot_dataset

Dataset({
    features: ['text'],
    num_rows: 1837928
})

In [18]:
cot_dataset = cot_dataset.map(lambda x: tokenizer(x['text']), batched=True)

Map:   0%|          | 0/1837928 [00:00<?, ? examples/s]

In [19]:
seed = 2
set_seed(seed)
cot_dataset = cot_dataset.shuffle(seed=seed)

In [20]:
# def text_prompt_generator(source, target, rationale):
#     prompt = f""" 
#     <|begin_of_text|><|start_header_id|>system<|end_header_id|>
#     You are a helpful AI assistant that utilizes reasoning to answer questions. For the following instructions please use only the content in the question. Please provide your reasoning and then the answer to the question
#     <|eot_id|><|start_header_id|>user<|end_header_id|>
#     [Question]: {source}
#     <|eot_id|><|start_header_id|>assistant<|end_header_id|>
#     [Reasoning]: {rationale}
#     [Answer]: {target}
#     <|end_of_text|>
#     """
#     return prompt



In [21]:
# cot_dataset = cot_dataset.map(lambda x: {'text': text_prompt_generator(x['source'], x['target'], x['rationale'])}) 
# #map function is taking about 6 min and 30 sec

In [22]:
cot_dataset[0]

{'text': ' \n            <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n            You are a helpful AI assistant that utilizes reasoning to answer questions. For the following instructions please use only the content in the question. Please provide your reasoning and then the answer to the question\n            <|eot_id|><|start_header_id|>user<|end_header_id|>\n            [Question]: Use information from the paragraph to answer the question.\\n\\nParagraph :\\n\\nMost nutrients are washed into ocean water from land. Therefore, water closer to shore tends to have more nutrients.\\n\\nQuestion:\\n\\n\\nDave is taking water samples from various parts of Lake Erie. He takes the first sample very close to the shore line. The second sample he takes from the middle of the lake. He expects that the second or first sample will have more nutrients in it/?\n            <|eot_id|><|start_header_id|>assistant<|end_header_id|>\n            [Reasoning]: The paragraph states that wat

In [40]:
print(model.config)
# model.config
# print(model.summary())

LlamaConfig {
  "_name_or_path": "model/llama-3.2-3B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 24,
  "num_hidden_layers": 28,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"

In [24]:

#max length of the model
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length

In [41]:
tokenizer.pad_token=tokenizer.eos_token

In [42]:
memory_used = model.get_memory_footprint()
print(f"memory footprint: {memory_used}")

memory footprint: 2294904064


In [45]:
from trl import SFTTrainer, SFTConfig
model.use_cache = False
training_args = SFTConfig(packing=True,
                          max_seq_length=8192,
                          auto_find_batch_size=True,
                          dataset_text_field='text',
                          output_dir='model/llama/checkpoints',
                          gradient_accumulation_steps=4,
                          per_device_train_batch_size=1,
                          logging_steps=1,
                          optim='paged_adamw_8bit',
                          report_to="wandb",
                          run_name=f"llama-3.2-3B-COT - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",)

trainer = SFTTrainer(
    model=model,
    train_dataset=cot_dataset,
    peft_config=peft_config,
    tokenizer=tokenizer,
    args=training_args,
)
with wandb.init(project="Llama-COT", job_type="train", # the project I am working on
           tags=['llama-3.2', 'COT'],
           notes =f"Fine tuning llama 3.2 with COT-Collection. CoT Prompt Instruction and QLora"):

            trainer.train()
trainer.save_model("model/llama-3.2-3B-COT")

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
Could not load library libcuda.so. Error: libcuda.so: cannot open shared object file: No such file or directory
Traceback (most recent call last):
  File "/tmp/ipykernel_607737/1354107928.py", line 26, in <module>
    trainer.train()
  File "/usr/local/lib/python3.10/dist-packages/trl/trainer/sft_trainer.py", line 434, in train
    output = super().train(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2052, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/accelerate/utils/memory.py", line 157, in decorator
    return function(batch_size, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2

RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/parallel/parallel_apply.py", line 96, in _worker
    output = module(*input, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/peft/peft_model.py", line 1644, in forward
    return self.base_model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/peft/tuners/tuners_utils.py", line 197, in forward
    return self.model.forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1189, in forward
    outputs = self.model(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 1000, in forward
    layer_outputs = decoder_layer(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 729, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/transformers/models/llama/modeling_llama.py", line 655, in forward
    attn_output = torch.nn.functional.scaled_dot_product_attention(
RuntimeError: cuDNN Frontend error: [cudnn_frontend] Error: No execution plans support the graph.
