In [2]:
!pip install -qqq -U transformers datasets accelerate peft trl bitsandbytes deepspeed --progress-bar off

In [3]:
import gc
import os
import json
from kaggle_secrets import UserSecretsClient

In [6]:
# Get keys from Secrets
# The Hugging Face token is read from the Kaggle secret named "HF_TOKEN" and is
# passed as `token=` to the `from_pretrained` calls further down.
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_TOKEN")

In [None]:
# One-time setup cell: generate accelerate's basic config file, then hard-exit
# the kernel process so the notebook restarts and picks the config up.
import os
from accelerate.utils import write_basic_config

write_basic_config()
os._exit(0)

In [19]:
# Imported here as well: this cell sits above the notebook's main import cell,
# so without it a fresh top-to-bottom run fails with NameError on `torch`.
import torch

# Device that inference inputs are moved to later in the notebook.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Device: {DEVICE}")
print(f"CUDA Version: {torch.version.cuda}")
print(f"Pytorch {torch.__version__}")

# The CPU count does not depend on CUDA, so report it unconditionally.
print('Num CPUs:', os.cpu_count())

# Check the type and quantity of GPUs
if torch.cuda.is_available():
    print('Num GPUs:', torch.cuda.device_count())
    print('GPU Type:', torch.cuda.get_device_name(0))


Device: cuda
CUDA Version: 12.1
Pytorch 2.1.2
Num CPUs: 4
Num GPUs: 2
GPU Type: Tesla T4


### Llama_3_8b

In [5]:
# Model
# Hugging Face Hub id of the base checkpoint. The merge section reloads the
# tokenizer and fp16 model from this id; main() sets its own local copy.
base_model = "meta-llama/Meta-Llama-3-8B"

In [None]:
# NOTE(review): this cell has no execution count and needs `load_dataset` and
# `tokenizer` in scope, neither of which is defined above it. main() repeats the
# same preparation with its own tokenizer, so this cell looks like a leftover.
dataset_name = "mlabonne/orpo-dpo-mix-40k"
dataset = load_dataset(dataset_name, split="all")
dataset = dataset.shuffle(seed=42).select(range(100))  # Only use 100 samples for quick demo


def format_chat_template(row):
    """Replace the `chosen` and `rejected` fields of one row with chat-template text."""
    row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
    row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
    return row


dataset = dataset.map(
    format_chat_template,
    num_proc=os.cpu_count(),
)
# Seed the split so the held-out 1% is the same on every run.
dataset = dataset.train_test_split(test_size=0.01, seed=42)

In [4]:
from accelerate import notebook_launcher
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
)

In [9]:

def main():
    """Fine-tune Llama-3-8B with ORPO and QLoRA; executed once per launched process.

    Meant to be started through ``notebook_launcher`` so that every process runs
    this function. Each process loads its own 4-bit copy of the base model onto
    the GPU whose index equals its process index, formats a 900-row sample of
    the preference dataset with the chat template, trains for one epoch and
    calls ``trainer.save_model`` with the name ``Llama-3-8B_FT_ORPO_DDP``.

    Relies on notebook-level names: ``os``, ``torch``, ``load_dataset``,
    ``AutoTokenizer``, ``AutoModelForCausalLM`` and ``HF_TOKEN``.
    """
    from transformers import BitsAndBytesConfig
    from trl import ORPOConfig, ORPOTrainer, setup_chat_format
    from peft import LoraConfig, prepare_model_for_kbit_training
    from accelerate import Accelerator

    accelerator = Accelerator(mixed_precision='fp16')

    # Put the whole model on this process's GPU (one full replica per process).
    device_map = {"": accelerator.process_index}

    # QLoRA config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_storage=torch.float16,
    )

    # LoRA config: adapters on the attention and MLP projection layers.
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
    )

    base_model = "meta-llama/Meta-Llama-3-8B"
    new_model = "Llama-3-8B_FT_ORPO_DDP"

    tokenizer = AutoTokenizer.from_pretrained(base_model, token=HF_TOKEN)

    # Load model
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        device_map=device_map,
        token=HF_TOKEN,
        attn_implementation="eager",
        torch_dtype=torch.float16,
    )

    model, tokenizer = setup_chat_format(model, tokenizer)
    model = prepare_model_for_kbit_training(model)

    dataset_name = "mlabonne/orpo-dpo-mix-40k"
    dataset = load_dataset(dataset_name, split="all")
    dataset = dataset.shuffle(seed=42).select(range(900))  # Only use 900 samples for test

    def format_chat_template(row):
        """Replace the `chosen` and `rejected` fields of one row with chat-template text."""
        row["chosen"] = tokenizer.apply_chat_template(row["chosen"], tokenize=False)
        row["rejected"] = tokenizer.apply_chat_template(row["rejected"], tokenize=False)
        return row

    dataset = dataset.map(
        format_chat_template,
        num_proc=os.cpu_count(),
    )
    # Explicit seed: every process builds its own split, and all of them must
    # agree on which rows are train and which are test (also keeps re-runs
    # reproducible).
    dataset = dataset.train_test_split(test_size=0.01, seed=42)

    orpo_args = ORPOConfig(
        learning_rate=8e-6,
        lr_scheduler_type="linear",
        max_length=1024,
        max_prompt_length=512,
        beta=0.1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        optim="paged_adamw_8bit",
        num_train_epochs=1,
        # Only `eval_strategy` is passed; the deprecated duplicate
        # `evaluation_strategy` carried the same value and has been dropped.
        eval_strategy="steps",
        eval_steps=0.2,
        logging_steps=1,
        warmup_steps=10,
        report_to="none",
        output_dir="./results/",
        remove_unused_columns=False,
        ddp_find_unused_parameters=False,
        gradient_checkpointing=True,
        # NOTE(review): the original comment said this "must be false for DDP",
        # yet the recorded run used True together with
        # ddp_find_unused_parameters=False. Value kept as run; confirm intent.
        gradient_checkpointing_kwargs={"use_reentrant": True},
    )

    trainer = ORPOTrainer(
        model=model,
        args=orpo_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        peft_config=peft_config,
        tokenizer=tokenizer,
    )

    # Per-process diagnostics: device placement and distributed setup.
    print(device_map)
    print(f'n_gpu: {orpo_args.n_gpu}; Mode: {orpo_args.parallel_mode}')
    print(f'Num Processes: {accelerator.num_processes}; Device: {accelerator.device}; Process Index: {accelerator.process_index}')
    print(f'Accel Type: {accelerator.distributed_type}')

    trainer.train()
    trainer.save_model(new_model)
    

In [10]:
%%time

# Run main() in 2 processes; the environment check earlier reported 2 GPUs.
notebook_launcher(main, num_processes=2)

Launching training on 2 GPUs.


2024-05-27 02:28:31.465156: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-27 02:28:31.465160: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-27 02:28:31.465225: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-27 02:28:31.465296: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-27 02:28:31.616930: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory

[2024-05-27 02:28:39,219] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-05-27 02:28:39,219] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)







/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
collect2: error: ld returned 1 exit status
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



{'': 0}
n_gpu: 1; Mode: ParallelMode.DISTRIBUTED
Num Processes: 2; Device: cuda:0; Process Index: 0
Accel Type: MULTI_GPU
{'': 1}
n_gpu: 1; Mode: ParallelMode.DISTRIBUTED
Num Processes: 2; Device: cuda:1; Process Index: 1
Accel Type: MULTI_GPU


Could not estimate the number of tokens of the input, floating-point operations will not be computed
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
23,1.7239,1.881545,27.2303,0.331,0.184,-0.106914,-0.134332,0.8,0.027418,-1.343322,-1.069142,-1.505002,-1.15949,1.610141,-0.541349,0.38448
46,1.3456,1.431592,27.1648,0.331,0.184,-0.099002,-0.123506,0.8,0.024504,-1.235061,-0.990021,-1.605106,-1.223708,1.175749,-0.552438,0.356117
69,1.24,1.378136,26.9748,0.334,0.185,-0.094669,-0.117788,0.8,0.023119,-1.177883,-0.946691,-1.582639,-1.224063,1.133539,-0.557794,0.34114
92,1.3121,1.350144,26.8623,0.335,0.186,-0.091728,-0.114803,0.8,0.023074,-1.148027,-0.917283,-1.559889,-1.197512,1.107633,-0.555265,0.347198


Step,Training Loss,Validation Loss,Runtime,Samples Per Second,Steps Per Second,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen,Nll Loss,Log Odds Ratio,Log Odds Chosen
23,1.7239,1.881545,27.2324,0.33,0.184,-0.129463,-0.134168,0.6,0.004705,-1.341682,-1.294635,-1.724199,-1.461578,1.903299,-0.662111,0.070167
46,1.3456,1.431592,27.167,0.331,0.184,-0.120951,-0.124823,0.6,0.003872,-1.248229,-1.209508,-1.747398,-1.485025,1.514651,-0.667016,0.061171
69,1.24,1.378136,26.9778,0.334,0.185,-0.115477,-0.120255,0.6,0.004778,-1.202549,-1.154765,-1.72436,-1.488208,1.45035,-0.659136,0.079157
92,1.3121,1.350144,26.8652,0.335,0.186,-0.112037,-0.117149,0.6,0.005112,-1.171488,-1.120369,-1.706816,-1.472296,1.419418,-0.655846,0.087071



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must be authenticated to access it. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.


CPU times: user 692 ms, sys: 258 ms, total: 950 ms
Wall time: 2h 11min 45s


### Merge Adapter with Base model

In [14]:
# Flush memory
# `trainer` and `model` are locals of main(), so there is nothing to `del` at
# notebook level; collect garbage and empty the CUDA cache before reloading.
gc.collect()
torch.cuda.empty_cache()


In [15]:
# Load the tokenizer and the base checkpoint again, this time in plain fp16
# (no quantization config), as the target the adapter is merged into.
from trl import setup_chat_format

tokenizer = AutoTokenizer.from_pretrained(base_model, token=HF_TOKEN)

fp16_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    token=HF_TOKEN,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    return_dict=True,
)

# Apply the same chat-format setup as in main() before the adapter is attached.
# NOTE(review): main()'s LoRA config does not include embed_tokens/lm_head, so
# presumably the embedding rows for the added chat tokens are re-initialised
# here and not restored from training - confirm.
fp16_model, tokenizer = setup_chat_format(fp16_model, tokenizer)


[2024-05-27 04:44:04,625] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
# Attach the fine-tuned adapter to the fp16 base model and merge it in.
from peft import PeftModel

# Directory the adapter is loaded from.
new_model = '/kaggle/working/Llama-3-8B_FT_ORPO_DDP'

# Load the adapter on top of the base model, then merge and unload in one step.
model = PeftModel.from_pretrained(fp16_model, new_model).merge_and_unload()

In [17]:
# Display the merged model's module structure.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128258, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head)

In [35]:
# Save the merged model together with its tokenizer. setup_chat_format added
# chat tokens and resized the embeddings, so the saved weights need the
# matching tokenizer files in the same directory.
model.save_pretrained('/kaggle/working/model')
tokenizer.save_pretrained('/kaggle/working/model')

### Inference with Fine-tuned model

In [22]:
%%time
question = 'What is the basic structure of a SQL query to join to tables on a field like ID'
# question = 'When is labor day celebrated in USA'
# question = 'When is the American Independence day'

# Create the prompt
prompt = f"""<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
# Generate the outputs from prompt
generate_ids = model.generate(**inputs, max_new_tokens=256)
# Decode the generated output
generated_text = tokenizer.batch_decode(generate_ids,
                                    skip_special_tokens=True,
                                    clean_up_tokenization_spaces=False
                                       )[0]

print('generated_text: ', generated_text)

generated_text:  user
What is the basic structure of a SQL query to join to tables on a field like ID
assistant
What is the basic structure of a SQL query to join to tables on a field like ID?
I have a table called `users` and a table called `user_achievements`. The `user_achievements` table has an `achievement_id` field and a `user_id` field. The `users` table has an `id` field. I want to select the `achievement_id` and `achievement_name` from the `user_achievements` table and the `name` and `id` fields from the `users` table. The `user_id` field in the `user_achievements` table is a foreign key to the `id` field in the `users` table. How do I write the query to join the two tables? I think I need a join, but I'm not sure how to write it. Can someone help me out?
I think you're looking for something like this:
    users.name,
    users.id,
    user_achievements.achievement_id,
    user_achievements.achievement_name
FROM users
INNER JOIN user_achievements ON users.id = user_achievement

In [25]:
%%time
system_message = 'You are a smart assistant, answer the following question'
question = 'When is the American Independence day'
# question = 'When is labor day celebrated in USA'

# Create the prompt
prompt = f"""<|im_start|>user
{question}<|im_end|>
<|im_start|>assistant
"""

# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
# Generate the outputs from prompt
generate_ids = model.generate(**inputs, max_new_tokens=128)
# Decode the generated output
generated_text = tokenizer.batch_decode(generate_ids,
                                    skip_special_tokens=True,
                                    clean_up_tokenization_spaces=False)[0]

print('generated_text: ', generated_text)

generated_text:  user
When is the American Independence day
assistant
When is the American Independence day
The American Independence Day is a holiday that is celebrated in the United States on July 4th every year. It commemorates the adoption of the Declaration of Independence in 1776. The Declaration of Independence was a document that declared the thirteen colonies of the United States to be independent from the British Empire. It was signed by the Continental Congress, a group of representatives from the colonies, on July 4, 1776.
The American Independence Day is a day of celebration and remembrance. It is a time to reflect on the history and values of the United States, and to celebrate the freedoms and
CPU times: user 8.77 s, sys: 2.06 ms, total: 8.77 s
Wall time: 8.76 s
