
#
# Fine-tune LLM using: huggingface TRL SFTTrainer, PEFT, BNB
# LLM: "mistralai/Mixtral-8x7B-Instruct-v0.1"
# Task: generate executable SQL for the given 'plain English' request
# MSS 20240210
#


# table of contents

# read prepared trainset from disk

In [11]:
!ls -ltA *.json

-rw-rw-r-- 1 ghtw30s ghtw30s 1182876 Feb 10 09:02 test_dataset.json
-rw-rw-r-- 1 ghtw30s ghtw30s 4755289 Feb 10 09:02 train_dataset.json


In [11]:
from datasets import load_dataset

print(time.asctime( time.localtime( time.time() ) ))

# Load jsonl data from disk
dataset = load_dataset("json", data_files="train_dataset.json", split="train")


print(time.asctime( time.localtime( time.time() ) ))

Sun Feb 11 14:09:50 2024
Sun Feb 11 14:09:51 2024


In [13]:

# mss: inspect data format sent to llm for training

dataset

Dataset({
    features: ['messages'],
    num_rows: 10000
})

In [19]:
dataset[34:39]

{'messages': [[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_name_90 (team_1 VARCHAR)',
    'role': 'system'},
   {'content': 'Name the 2nd leg for team 1 of hamburg', 'role': 'user'},
   {'content': 'SELECT 2 AS nd_leg FROM table_name_90 WHERE team_1 = "hamburg"',
    'role': 'assistant'}],
  [{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE table_name_54 (season VARCHAR, lead VARCHAR, third VARCHAR)',
    'role': 'system'},
   {'content': 'what is the season when the lead is john shuster and third is shawn rojeski?',
    'role': 'user'},
   {'content': 'SELECT season FROM table_name_54 WHERE lead = "john shuster" AND third = "shawn rojeski"',
    'role': 'assistant'}],
  [{'content': 'You are an text to SQL query tr

In [12]:

# Hugging Face model id
#https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1
model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"





import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
# TRL - Transformer Reinforcement Learning   https://huggingface.co/docs/trl/en/index
from trl import setup_chat_format

# QA
print('model_id' ,model_id)



# BitsAndBytesConfig int-4 config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

print(time.asctime( time.localtime( time.time() ) ))



# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    #attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config
)



tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right' # to prevent warnings

print(time.asctime( time.localtime( time.time() ) ))






#  *** REMOVE IF YOU START FROM A FINE-TUNED MODEL *** !!!

# set chat template to OAI chatML, remove if you start from a fine-tuned model
model, tokenizer = setup_chat_format(model, tokenizer)

"""
https://huggingface.co/docs/trl/en/sft_trainer
The setup_chat_format() function in trl easily sets up a model and tokenizer for conversational AI tasks. This function:
    Adds special tokens to the tokenizer, e.g. <|im_start|> and <|im_end|>, to indicate the start and end of a conversation.
    Resizes the model’s embedding layer to accommodate the new tokens.
    Sets the chat_template of the tokenizer, which is used to format the input data into a chat-like format. The default is chatml from OpenAI.
    optionally you can pass resize_to_multiple_of to resize the embedding layer to a multiple of the resize_to_multiple_of argument, e.g. 64. If you want to see more formats being supported in the future, please open a GitHub issue on trl
"""




print(time.asctime( time.localtime( time.time() ) ))

2024-02-11 14:10:25.014218: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-11 14:10:25.062786: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-11 14:10:25.062833: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-11 14:10:25.064069: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-11 14:10:25.071893: I tensorflow/core/platform/cpu_feature_guar

model_id mistralai/Mixtral-8x7B-Instruct-v0.1
Sun Feb 11 14:10:27 2024


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

Sun Feb 11 14:11:17 2024
Sun Feb 11 14:11:17 2024


# SFTTrainer integration with peft, efficiently tune LLMs using QLoRA

In [14]:

from peft import LoraConfig


print(time.asctime( time.localtime( time.time() ) ))




# LoRA config based on QLoRA paper & Sebastian Raschka experiment
peft_config = LoraConfig(
    
    lora_alpha=128,

    lora_dropout=0.05,
    
    r=256,
    
    bias="none",
    
    target_modules="all-linear",
        
    task_type="CAUSAL_LM", 
)




print(time.asctime( time.localtime( time.time() ) ))

Sun Feb 11 14:11:50 2024
Sun Feb 11 14:11:50 2024


In [15]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='CAUSAL_LM', inference_mode=False, r=256, target_modules='all-linear', lora_alpha=128, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})

# define the hyperparameters (TrainingArguments

In [17]:



# better to make 'output_dir' uniq (so prior sessions files are preserved), especially when >1 code running in parallel

## auto name
# parm
FlPthBase = "/media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/"
#
import time
import datetime
import random
#
epoch_time = int(time.time())
current_date = datetime.datetime.now()
yymmdd = int(current_date.strftime("%Y%m%d%H%M%S"))
rndint = random.randint(100, 999)

FlPth = FlPthBase + 'MergedLora_' + str(yymmdd) + '_' + str(epoch_time) + '_' + str(rndint) + '/'
print('' , 'dir to save checkpoints: ' , FlPth)








from transformers import TrainingArguments



# MssJupy_FineTune_example_202402101507_2.ipynb : push_to_hub=Fale; 2.use more of gpu ram!
# MSS 20240211: to prevent pusshing to hugfac hub, ==prevent needing internet, then set push_to_hub to False:
args = TrainingArguments(

    output_dir= FlPth , # directory to save and repository id

    push_to_hub=False,                      # MSS 20240211: to prevent pusshing to hugfac hub, ==prevent needing internet, then set push_to_hub to False:
    num_train_epochs=1,
    per_device_train_batch_size=3,

    gradient_accumulation_steps=2,
    
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    
    #optim="adamw_torch_fused",              # use fused adamw optimizer
    optim="paged_adamw_8bit",              # use fused adamw optimizer
    
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=2e-4,                     # learning rate, based on QLoRA paper
    #bf16=True,                              # use bfloat16 precision
    #tf32=True,                              # use tf32 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    report_to="tensorboard",                # report metrics to tensorboard
    
)



print(time.asctime( time.localtime( time.time() ) ))

 dir to save checkpoints:  /media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/MergedLora_20240211141257_1707678777_508/
Sun Feb 11 14:12:57 2024


In [18]:
args

TrainingArguments(
_n_gpu=2,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointing=True,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_len

# create our SFTTrainer

In [21]:

from trl import SFTTrainer


max_seq_length = 3072




print(time.asctime( time.localtime( time.time() ) ))


trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False, # No need to add additional separator token
    }
)




print(time.asctime( time.localtime( time.time() ) ))

Sun Feb 11 14:13:47 2024
Sun Feb 11 14:14:30 2024


# start training

In [22]:


# e2e time
start00 = time.time()
print('11' , time.asctime( time.localtime( time.time() ) ))




# start training, the model will be automatically saved to the hub and the output directory
trainer.train()

print('22' , time.asctime( time.localtime( time.time() ) ))


# e2e time
end00 = time.time()
#print("Took {} seconds to pull {} websites.".format(end11 - start11, len(work101)))
print("Took {} seconds to run end-to-end.".format(end00 - start00 ))
#
TimeTookE2E = end00 - start00


print( '99' , time.asctime( time.localtime( time.time() ) ))

11 Sun Feb 11 14:16:27 2024


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,0.8722
20,0.5267
30,0.4934
40,0.4658
50,0.4564
60,0.4488




22 Sun Feb 11 17:15:51 2024
Took 10764.772048711777 seconds to run end-to-end.
99 Sun Feb 11 17:15:51 2024


In [23]:
# record ram usage cpu gpu
!free -g -h -t
!nvidia-smi
ArrGpuRamUsage.append( GetGpuRamUsage() )
ArrGpuRamUsage

               total        used        free      shared  buff/cache   available
Mem:           1.2Ti        28Gi       904Gi        94Mi       326Gi       1.2Ti
Swap:          2.0Gi          0B       2.0Gi
Total:         1.2Ti        28Gi       906Gi
Sun Feb 11 18:24:11 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 8000                Off | 00000000:3B:00.0  On |                  Off |
| 34%   33C    P8              33W / 260W |  48556MiB / 49152MiB |      0%      Defaul

[[1065.0, 6.0, 'Sun Feb 11 14:06:05 2024'],
 [12078.0, 13946.0, 'Sun Feb 11 14:11:18 2024'],
 [48556.0, 48588.0, 'Sun Feb 11 18:24:11 2024']]

# save tuned neural layers to disk

In [24]:

print('11' , time.asctime( time.localtime( time.time() ) ))

# save model
trainer.save_model()



print('99' , time.asctime( time.localtime( time.time() ) ))

11 Sun Feb 11 18:24:18 2024




99 Sun Feb 11 18:26:43 2024


In [25]:
args.output_dir

'/media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/MergedLora_20240211141257_1707678777_508/'

# free the memory

In [None]:

# since next section uses same names for model and etc, then it overwrites anyway!


# free the memory again
"""
del model
del trainer
torch.cuda.empty_cache()
"""


In [27]:
!ls -ltAR $args.output_dir ; date

/media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/MergedLora_20240211141257_1707678777_508/:
total 16165678
-rwxrwxrwx 1 ghtw30s ghtw30s        4920 Feb 11 18:26 training_args.bin
-rwxrwxrwx 1 ghtw30s ghtw30s     1795677 Feb 11 18:26 tokenizer.json
-rwxrwxrwx 1 ghtw30s ghtw30s      493443 Feb 11 18:26 tokenizer.model
-rwxrwxrwx 1 ghtw30s ghtw30s          51 Feb 11 18:26 added_tokens.json
-rwxrwxrwx 1 ghtw30s ghtw30s         557 Feb 11 18:26 special_tokens_map.json
-rwxrwxrwx 1 ghtw30s ghtw30s        1606 Feb 11 18:26 tokenizer_config.json
-rwxrwxrwx 1 ghtw30s ghtw30s         684 Feb 11 18:26 adapter_config.json
-rwxrwxrwx 1 ghtw30s ghtw30s 16551333320 Feb 11 18:26 adapter_model.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s        5112 Feb 11 18:24 README.md
drwxrwxrwx 1 ghtw30s ghtw30s         408 Feb 11 17:15 checkpoint-67
drwxrwxrwx 1 ghtw30s ghtw30s         216 Feb 11 14:16 runs

/media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/M

# Merge LoRA adapter in to the original model

In [None]:

"""

When using QLoRA, we only train adapters and not the full model. 
This means when saving the model during training we only save the adapter weights and not the full model. 
If you want to save the full model, which makes it easier to use with Text Generation Inference you can merge the adapter weights 
into the model weights using the merge_and_unload method and then save the model with the save_pretrained method. 
This will save a default model, which can be used for inference.

"""


In [28]:

from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import AutoPeftModelForCausalLM

print('11' , time.asctime( time.localtime( time.time() ) ))


# Load PEFT model on CPU
config = PeftConfig.from_pretrained(args.output_dir)
print('11' , time.asctime( time.localtime( time.time() ) ))

model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path,low_cpu_mem_usage=True)
print('11' , time.asctime( time.localtime( time.time() ) ))

tokenizer = AutoTokenizer.from_pretrained(args.output_dir)


print('99' , time.asctime( time.localtime( time.time() ) ))

11 Sun Feb 11 19:27:59 2024
11 Sun Feb 11 19:27:59 2024


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


11 Sun Feb 11 19:28:26 2024
99 Sun Feb 11 19:28:26 2024


In [29]:
# record ram usage cpu gpu
!free -g -h -t
!nvidia-smi
ArrGpuRamUsage.append( GetGpuRamUsage() )
ArrGpuRamUsage

               total        used        free      shared  buff/cache   available
Mem:           1.2Ti       203Gi       697Gi        96Mi       357Gi       1.0Ti
Swap:          2.0Gi          0B       2.0Gi
Total:         1.2Ti       203Gi       699Gi
Sun Feb 11 19:28:35 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 8000                Off | 00000000:3B:00.0  On |                  Off |
| 34%   33C    P8              35W / 260W |  21630MiB / 49152MiB |      0%      Defaul

[[1065.0, 6.0, 'Sun Feb 11 14:06:05 2024'],
 [12078.0, 13946.0, 'Sun Feb 11 14:11:18 2024'],
 [48556.0, 48588.0, 'Sun Feb 11 18:24:11 2024'],
 [21630.0, 25162.0, 'Sun Feb 11 19:28:35 2024']]

In [30]:
args.output_dir

'/media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/MergedLora_20240211141257_1707678777_508/'

In [31]:

print('11' , time.asctime( time.localtime( time.time() ) ))


#
model.resize_token_embeddings(len(tokenizer))
print('13' , time.asctime( time.localtime( time.time() ) ))

model = PeftModel.from_pretrained(model, args.output_dir)
print('15' , time.asctime( time.localtime( time.time() ) ))

model = AutoPeftModelForCausalLM.from_pretrained(
    args.output_dir,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)



print('99' , time.asctime( time.localtime( time.time() ) ))

11 Sun Feb 11 19:30:12 2024
13 Sun Feb 11 19:30:16 2024
15 Sun Feb 11 19:31:09 2024


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


99 Sun Feb 11 19:33:14 2024


In [32]:
# record ram usage cpu gpu
!free -g -h -t
!nvidia-smi
ArrGpuRamUsage.append( GetGpuRamUsage() )
ArrGpuRamUsage

               total        used        free      shared  buff/cache   available
Mem:           1.2Ti       136Gi       764Gi        84Mi       357Gi       1.1Ti
Swap:          2.0Gi          0B       2.0Gi
Total:         1.2Ti       136Gi       766Gi
Sun Feb 11 19:33:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 8000                Off | 00000000:3B:00.0  On |                  Off |
| 34%   37C    P2              70W / 260W |  37316MiB / 49152MiB |      0%      Defaul

[[1065.0, 6.0, 'Sun Feb 11 14:06:05 2024'],
 [12078.0, 13946.0, 'Sun Feb 11 14:11:18 2024'],
 [48556.0, 48588.0, 'Sun Feb 11 18:24:11 2024'],
 [21630.0, 25162.0, 'Sun Feb 11 19:28:35 2024'],
 [37316.0, 25162.0, 'Sun Feb 11 19:33:21 2024']]

# save the merged model

In [33]:


# uniq dir to save

print( 'args.output_dir' ,args.output_dir)

## auto name
# parm
FlPthBase = "/media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/"
#
import time
import datetime
import random
#
epoch_time = int(time.time())
current_date = datetime.datetime.now()
yymmdd = int(current_date.strftime("%Y%m%d%H%M%S"))
rndint = random.randint(100, 999)

FlPth = FlPthBase + 'MergedLora_' + str(yymmdd) + '_' + str(epoch_time) + '_' + str(rndint) + ''
print('' , 'dir to save model: ' , FlPth)
#
args.output_dir = FlPth






print( 'args.output_dir' ,args.output_dir)
#args.output_dir /media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/MergedLora_20240210093118_1707575478_896


print('99' , time.asctime( time.localtime( time.time() ) ))

args.output_dir /media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/MergedLora_20240211141257_1707678777_508/
 dir to save model:  /media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/MergedLora_20240211193450_1707698090_754
args.output_dir /media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/MergedLora_20240211193450_1707698090_754
99 Sun Feb 11 19:34:50 2024


In [34]:

print('11' , time.asctime( time.localtime( time.time() ) ))


# Merge LoRA and base model and save
merged_model = model.merge_and_unload()
print('13' , time.asctime( time.localtime( time.time() ) ))


merged_model.save_pretrained(args.output_dir,safe_serialization=True, max_shard_size="2GB")




print('99' , time.asctime( time.localtime( time.time() ) ))

11 Sun Feb 11 19:35:03 2024
13 Sun Feb 11 19:36:05 2024
99 Sun Feb 11 19:48:00 2024


In [35]:
# record ram usage cpu gpu
!free -g -h -t
!nvidia-smi
ArrGpuRamUsage.append( GetGpuRamUsage() )
ArrGpuRamUsage

               total        used        free      shared  buff/cache   available
Mem:           1.2Ti       137Gi       587Gi        84Mi       534Gi       1.1Ti
Swap:          2.0Gi          0B       2.0Gi
Total:         1.2Ti       137Gi       589Gi
Sun Feb 11 19:48:06 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 8000                Off | 00000000:3B:00.0  On |                  Off |
| 34%   33C    P8              34W / 260W |  37290MiB / 49152MiB |      0%      Defaul

[[1065.0, 6.0, 'Sun Feb 11 14:06:05 2024'],
 [12078.0, 13946.0, 'Sun Feb 11 14:11:18 2024'],
 [48556.0, 48588.0, 'Sun Feb 11 18:24:11 2024'],
 [21630.0, 25162.0, 'Sun Feb 11 19:28:35 2024'],
 [37316.0, 25162.0, 'Sun Feb 11 19:33:21 2024'],
 [37290.0, 25162.0, 'Sun Feb 11 19:48:06 2024']]

In [36]:
!du -hs $args.output_dir ; date
!ls -ltA $args.output_dir

87G	/media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/MergedLora_20240211193450_1707698090_754
Sun Feb 11 07:48:09 PM EST 2024
total 91216713
-rwxrwxrwx 1 ghtw30s ghtw30s      92658 Feb 11 19:48 model.safetensors.index.json
-rwxrwxrwx 1 ghtw30s ghtw30s  614507328 Feb 11 19:48 model-00048-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1996490952 Feb 11 19:47 model-00047-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019128 Feb 11 19:47 model-00046-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019120 Feb 11 19:47 model-00045-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1996490952 Feb 11 19:47 model-00044-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019128 Feb 11 19:46 model-00043-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019120 Feb 11 19:46 model-00042-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1996490952 Feb 11 19:46 model-00041-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019128 Feb 11 19:46 model-

# Test Model and run Inference

In [37]:
# record ram usage cpu gpu
!free -g -h -t
!nvidia-smi
ArrGpuRamUsage.append( GetGpuRamUsage() )
ArrGpuRamUsage

               total        used        free      shared  buff/cache   available
Mem:           1.2Ti       139Gi       583Gi       112Mi       536Gi       1.1Ti
Swap:          2.0Gi          0B       2.0Gi
Total:         1.2Ti       139Gi       585Gi
Mon Feb 12 09:22:03 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 8000                Off | 00000000:3B:00.0  On |                  Off |
| 34%   34C    P8              42W / 260W |  37336MiB / 49152MiB |     36%      Defaul

[[1065.0, 6.0, 'Sun Feb 11 14:06:05 2024'],
 [12078.0, 13946.0, 'Sun Feb 11 14:11:18 2024'],
 [48556.0, 48588.0, 'Sun Feb 11 18:24:11 2024'],
 [21630.0, 25162.0, 'Sun Feb 11 19:28:35 2024'],
 [37316.0, 25162.0, 'Sun Feb 11 19:33:21 2024'],
 [37290.0, 25162.0, 'Sun Feb 11 19:48:06 2024'],
 [37336.0, 25162.0, 'Mon Feb 12 09:22:04 2024']]

In [38]:
args.output_dir

'/media/ghtw30s/SSD-PUT/mss20230718/LLM/EXECUTE_ExecuteOnAsusG4/FineTune/models/MergedLora_20240211193450_1707698090_754'

In [39]:
!ls -ltA $args.output_dir ; date

total 91216713
-rwxrwxrwx 1 ghtw30s ghtw30s      92658 Feb 11 19:48 model.safetensors.index.json
-rwxrwxrwx 1 ghtw30s ghtw30s  614507328 Feb 11 19:48 model-00048-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1996490952 Feb 11 19:47 model-00047-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019128 Feb 11 19:47 model-00046-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019120 Feb 11 19:47 model-00045-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1996490952 Feb 11 19:47 model-00044-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019128 Feb 11 19:46 model-00043-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019120 Feb 11 19:46 model-00042-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1996490952 Feb 11 19:46 model-00041-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019128 Feb 11 19:46 model-00040-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1963019120 Feb 11 19:46 model-00039-of-00048.safetensors
-rwxrwxrwx 1 ghtw30s ghtw30s 1996490952 Feb

In [46]:

#peft_model_id = "./code-llama-7b-text-to-sql"
model_id = args.output_dir



import torch
from transformers import AutoTokenizer, pipeline 



print('11' , time.asctime( time.localtime( time.time() ) ))




# this to load llm that lora adapter is already merged with main nn
from transformers import AutoModelForCausalLM
#
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    #attn_implementation="flash_attention_2",
    
    #torch_dtype=torch.bfloat16,
    #RuntimeError: cutlassF: no kernel found to launch!
    #https://github.com/Lightning-AI/lit-gpt/issues/327
    
    #quantization_config=bnb_config
)





print('33' , time.asctime( time.localtime( time.time() ) ))

# load into pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)





print('99' , time.asctime( time.localtime( time.time() ) ))

11 Mon Feb 12 10:01:44 2024


Loading checkpoint shards:   0%|          | 0/48 [00:00<?, ?it/s]

33 Mon Feb 12 10:02:40 2024
99 Mon Feb 12 10:02:40 2024


In [47]:
# record ram usage cpu gpu
!free -g -h -t
!nvidia-smi
ArrGpuRamUsage.append( GetGpuRamUsage() )
ArrGpuRamUsage

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


               total        used        free      shared  buff/cache   available
Mem:           1.2Ti       360Gi       362Gi       118Mi       536Gi       891Gi
Swap:          2.0Gi          0B       2.0Gi
Total:         1.2Ti       360Gi       364Gi


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Mon Feb 12 10:03:53 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.154.05             Driver Version: 535.154.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 8000                Off | 00000000:3B:00.0  On |                  Off |
| 34%   37C    P8              35W / 260W |  40198MiB / 49152MiB |      4%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Quadro RTX 8000                Off | 00000000:5E:00.0 Off |  

[[1065.0, 6.0, 'Sun Feb 11 14:06:05 2024'],
 [12078.0, 13946.0, 'Sun Feb 11 14:11:18 2024'],
 [48556.0, 48588.0, 'Sun Feb 11 18:24:11 2024'],
 [21630.0, 25162.0, 'Sun Feb 11 19:28:35 2024'],
 [37316.0, 25162.0, 'Sun Feb 11 19:33:21 2024'],
 [37290.0, 25162.0, 'Sun Feb 11 19:48:06 2024'],
 [37336.0, 25162.0, 'Mon Feb 12 09:22:04 2024'],
 [27596.0, 47346.0, 'Mon Feb 12 09:38:07 2024'],
 [40198.0, 47346.0, 'Mon Feb 12 10:03:54 2024']]

# Let’s load our test dataset try to generate an instruction.

In [44]:
from datasets import load_dataset 
from random import randint


print('11' , time.asctime( time.localtime( time.time() ) ))

# Load our test dataset
eval_dataset = load_dataset("json", data_files="test_dataset.json", split="train")
rand_idx = randint(0, len(eval_dataset))


print('11' , time.asctime( time.localtime( time.time() ) ))

11 Mon Feb 12 09:47:57 2024
11 Mon Feb 12 09:47:57 2024


In [48]:

print('11' , time.asctime( time.localtime( time.time() ) ))


# Test on sample 
prompt = pipe.tokenizer.apply_chat_template(eval_dataset[rand_idx]["messages"][:2], tokenize=False, add_generation_prompt=True)

outputs = pipe(prompt, max_new_tokens=256, do_sample=False, temperature=0.1, top_k=50, top_p=0.1, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
#RuntimeError: cutlassF: no kernel found to launch!



print('11' , time.asctime( time.localtime( time.time() ) ))





print(f"Query:\n{eval_dataset[rand_idx]['messages'][1]['content']}")
print(f"Original Answer:\n{eval_dataset[rand_idx]['messages'][2]['content']}")
print(f"Generated Answer:\n{outputs[0]['generated_text'][len(prompt):].strip()}")

print('99' , time.asctime( time.localtime( time.time() ) ))

11 Mon Feb 12 10:03:59 2024
11 Mon Feb 12 10:10:56 2024
Query:
How many patients do each physician take care of? List their names and number of patients they take care of.
Original Answer:
SELECT T1.name, COUNT(*) FROM physician AS T1 JOIN patient AS T2 ON T1.employeeid = T2.PCP GROUP BY T1.employeeid
Generated Answer:
SELECT T2.name, COUNT(*) FROM patient AS T1 JOIN physician AS T2 ON T1.PCP = T2.employeeid GROUP BY T2.name
11 Mon Feb 12 10:10:56 2024


In [49]:
eval_dataset[rand_idx]

{'messages': [{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE patient (PCP VARCHAR); CREATE TABLE physician (name VARCHAR, employeeid VARCHAR)',
   'role': 'system'},
  {'content': 'How many patients do each physician take care of? List their names and number of patients they take care of.',
   'role': 'user'},
  {'content': 'SELECT T1.name, COUNT(*) FROM physician AS T1 JOIN patient AS T2 ON T1.employeeid = T2.PCP GROUP BY T1.employeeid',
   'role': 'assistant'}]}

In [50]:
eval_dataset[rand_idx]["messages"][:2]

[{'content': 'You are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE patient (PCP VARCHAR); CREATE TABLE physician (name VARCHAR, employeeid VARCHAR)',
  'role': 'system'},
 {'content': 'How many patients do each physician take care of? List their names and number of patients they take care of.',
  'role': 'user'}]

In [51]:
prompt

'<|im_start|>system\nYou are an text to SQL query translator. Users will ask you questions in English and you will generate a SQL query based on the provided SCHEMA.\nSCHEMA:\nCREATE TABLE patient (PCP VARCHAR); CREATE TABLE physician (name VARCHAR, employeeid VARCHAR)<|im_end|>\n<|im_start|>user\nHow many patients do each physician take care of? List their names and number of patients they take care of.<|im_end|>\n<|im_start|>assistant\n'

# eval(FTed LLM) over a set of testset examples

In [52]:

from tqdm import tqdm


def evaluate(sample):
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2], tokenize=False, add_generation_prompt=True)
    outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95, eos_token_id=pipe.tokenizer.eos_token_id, pad_token_id=pipe.tokenizer.pad_token_id)
    predicted_answer = outputs[0]['generated_text'][len(prompt):].strip()
    if predicted_answer == sample["messages"][2]["content"]:
        return 1 
    else:
        return 0




print('11' , time.asctime( time.localtime( time.time() ) ))


success_rate = []


number_of_eval_samples = 1000
number_of_eval_samples = 100


# iterate over eval dataset and predict
for s in tqdm(eval_dataset.shuffle().select(range(number_of_eval_samples))):
    success_rate.append(evaluate(s))

# compute accuracy
accuracy = sum(success_rate)/len(success_rate)

print(f"Accuracy: {accuracy*100:.2f}%")  




print('99' , time.asctime( time.localtime( time.time() ) ))

11 Mon Feb 12 10:32:06 2024


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [11:49:57<00:00, 425.98s/it]

Accuracy: 77.00%
99 Mon Feb 12 22:22:04 2024





In [35]:

import os, gc, re, warnings
# Garbage Collection

print(time.asctime( time.localtime( time.time() ) ))

gc.collect()


print(time.asctime( time.localtime( time.time() ) ))

Fri Jan 19 08:29:34 2024
Fri Jan 19 08:29:35 2024
