In [3]:
%%capture

!pip install bitsandbytes accelerate peft trl

import time
from random import randrange, sample, seed

import torch
import os
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

seed(42)


import torch
print(torch.cuda.is_available())

use_flash_attention2 = False

# Replace attention with flash attention 
if torch.cuda.get_device_capability()[0] >= 8:
    use_flash_attention2 = True

print(f"Using flash attention 2: {use_flash_attention2}")


%%capture

if use_flash_attention2:
    !pip install flash-attn --no-build-isolation --upgrade


from datasets import load_dataset
dataset = load_dataset("json", data_files="/home/srvadm001/nucleo-ia/finetuning/Fine-tuning/llama3_7b/snptee-instruction-dataset.json", split="train")
  
def format_instruction(sample):
	return f"""    
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{sample['instruction']}

### Input:
{sample['input']}

### Response:
{sample['output']}
"""

from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()



from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Hugging Face model id
model_id = "meta-llama/Meta-Llama-3-8B"
# model_id = "mistralai/Mistral-7B-v0.1"

# BitsAndBytesConfig int-4 config 
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if use_flash_attention2 else torch.float16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto",
    token=os.environ["HF_TOKEN"],  # if model is gated like llama or mistral
    attn_implementation="flash_attention_2" if use_flash_attention2 else "sdpa"
)
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],  # if model is gated like llama or mistral
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Specify the directory where you want to save the model and tokenizer
save_directory = "./saved_model"

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)


# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj", 
            "up_proj", 
            "down_proj",
        ]
)

# Prepare model for training
model = prepare_model_for_kbit_training(model)


args = TrainingArguments(
    output_dir="fine-tuned-snptee",
    num_train_epochs=1,
    per_device_train_batch_size=6 if use_flash_attention2 else 2, # you can play with the batch size depending on your hardware
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=use_flash_attention2,
    fp16=not use_flash_attention2,
    tf32=use_flash_attention2,
    max_grad_norm=0.3,
    warmup_steps=5,
    lr_scheduler_type="linear",
    disable_tqdm=False,
    report_to="none"
)

model = get_peft_model(model, peft_config)


trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction, 
    args=args,
)


# train
trainer.train()

# save model
trainer.save_model()



In [None]:
print('instructions', instructions)
print('inputs', inputs)
print('outputs', outputs)

instructions ['Summarize the main contribution of LoRA-Guard in content moderation of large language models.', 'Explain the limitation of existing model-based guardrails for content moderation of large language models.', 'Describe the advantage of LoRA-Guard over existing approaches in terms of parameter overhead and accuracy.']
inputs ['We introduce LoRA-Guard, a parameter-efficient guardrail adaptation method that relies on knowledge sharing between LLMs and guardrail models.', 'Existing model-based guardrails have not been designed for resource-constrained computational portable devices, such as mobile phones, more and more of which are running LLM-based applications locally.', 'We show that LoRA-Guard outperforms existing approaches with 100-1000x lower parameter overhead while maintaining accuracy, enabling on-device content moderation.']
outputs ['LoRA-Guard is a parameter-efficient method for content moderation of LLMs that adapts language features from LLMs using low-rank adapt

In [None]:
print('instructions', instructions)
print('inputs', inputs)
print('outputs', outputs)

instructions ['Summarize the main contribution of LoRA-Guard in content moderation of large language models.', 'Explain the limitation of existing model-based guardrails for content moderation of large language models.', 'Describe the advantage of LoRA-Guard over existing approaches in terms of parameter overhead and accuracy.']
inputs ['We introduce LoRA-Guard, a parameter-efficient guardrail adaptation method that relies on knowledge sharing between LLMs and guardrail models.', 'Existing model-based guardrails have not been designed for resource-constrained computational portable devices, such as mobile phones, more and more of which are running LLM-based applications locally.', 'We show that LoRA-Guard outperforms existing approaches with 100-1000x lower parameter overhead while maintaining accuracy, enabling on-device content moderation.']
outputs ['LoRA-Guard is a parameter-efficient method for content moderation of LLMs that adapts language features from LLMs using low-rank adapt

In [1]:
%%capture

!pip install bitsandbytes accelerate peft trl

In [2]:
import time
from random import randrange, sample, seed

import torch
import os
from datasets import load_dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import torch
print(torch.cuda.is_available())


True


In [4]:
use_flash_attention2 = False

# Replace attention with flash attention 
if torch.cuda.get_device_capability()[0] >= 8:
    use_flash_attention2 = True

print(f"Using flash attention 2: {use_flash_attention2}")

Using flash attention 2: False


In [5]:
%%capture

if use_flash_attention2:
    !pip install flash-attn --no-build-isolation --upgrade

In [37]:
# Load dataset from the hub
# dataset = load_dataset("yahma/alpaca-cleaned", split="train")
from datasets import load_dataset
dataset = load_dataset("json", data_files="/home/srvadm001/nucleo-ia/finetuning/Fine-tuning/llama3_7b/snptee-instruction-dataset.json", split="train")

# print(f"Dataset size: {len(dataset)}")
# print(dataset[randrange(len(dataset))])

# #Reduce dataset to size N
# n_samples = sample(range(len(dataset)), k=1000)
# print(f"First 5 samples: {n_samples[:5]}")
# dataset = dataset.select(n_samples)
# print(f"Reduced dataset size: {len(dataset)}")

Generating train split: 64 examples [00:00, 7171.67 examples/s]


In [39]:
def format_instruction(sample):
	return f"""    
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{sample['instruction']}

### Input:
{sample['input']}

### Response:
{sample['output']}
"""

In [40]:
print(format_instruction(dataset[randrange(len(dataset))]))

    
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
['Resuma o conteúdo da figura 3 do artigo.', 'O que é apresentado na página inicial do Espaço de Gestão do Conhecimento da Gerência de Integração e Acesso?', 'A página inicial do Espaço de Gestão do Conhecimento apresenta o conceito de Gestão do Conhecimento (GC) e um menu com os macroprocessos da Gerência de Integração e Acesso do ONS.']

### Input:
['Descreva a estrutura da página do macroprocesso de Integração das Instalações de Transmissão ao SIN.', 'Como é estruturada a página do macroprocesso de Integração das Instalações de Transmissão ao SIN no Espaço de GC?', 'A página apresenta a apresentação do macroprocesso selecionado, com links rápidos para conteúdos relacionados, e botões de acesso às práticas de GC de Mapeamento de Processos, Checklists e Lições Aprendidas.']

### Response:
['Explique a abord

In [28]:
from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()

True

In [31]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Hugging Face model id
model_id = "meta-llama/Meta-Llama-3-8B"
# model_id = "mistralai/Mistral-7B-v0.1"

# BitsAndBytesConfig int-4 config 
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if use_flash_attention2 else torch.float16
)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    quantization_config=bnb_config, 
    use_cache=False, 
    device_map="auto",
    token=os.environ["HF_TOKEN"],  # if model is gated like llama or mistral
    attn_implementation="flash_attention_2" if use_flash_attention2 else "sdpa"
)
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ["HF_TOKEN"],  # if model is gated like llama or mistral
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Specify the directory where you want to save the model and tokenizer
save_directory = "./saved_model"

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)


Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.56s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/tokenizer.json')

In [35]:
# LoRA config based on QLoRA paper
peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=64,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj", 
            "up_proj", 
            "down_proj",
        ]
)

# Prepare model for training
model = prepare_model_for_kbit_training(model)

In [36]:
args = TrainingArguments(
    output_dir="fine-tuned-snptee",
    num_train_epochs=1,
    per_device_train_batch_size=6 if use_flash_attention2 else 2, # you can play with the batch size depending on your hardware
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=use_flash_attention2,
    fp16=not use_flash_attention2,
    tf32=use_flash_attention2,
    max_grad_norm=0.3,
    warmup_steps=5,
    lr_scheduler_type="linear",
    disable_tqdm=False,
    report_to="none"
)

model = get_peft_model(model, peft_config)

In [41]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    max_seq_length=2048,
    tokenizer=tokenizer,
    packing=True,
    formatting_func=format_instruction, 
    args=args,
)

Generating train split: 10 examples [00:00, 242.20 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  self.dataloader_config.dispatch_batches = dispatch_batches
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [42]:
# train
trainer.train()

# save model
trainer.save_model()



Step,Training Loss


In [49]:
if False:
    # Path to finetuned model
    finetuned_model_dir="/kaggle/working/mistral-int4-alpaca"

    # Load finetuned LLM model and tokenizer
    model = AutoPeftModelForCausalLM.from_pretrained(
        finetuned_model_dir,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        load_in_4bit=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(finetuned_model_dir)

instruction = {
    "instruction": "Responda as seguinte pergunta",
    "input": "Quais os desafios para reduzir as transmissões do efeito estufa?",
    "output": ""
}
prompt = format_instruction(instruction)

input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()

start_time = time.time()
with torch.inference_mode():
    outputs = model.generate(input_ids=input_ids, pad_token_id=tokenizer.eos_token_id, max_new_tokens=100, do_sample=True, top_p=0.5,temperature=0.5)
end_time = time.time()
total_time = end_time - start_time
output_length = len(outputs[0])-len(input_ids[0])

print(f"\nInstruction generated from finetuned model | Inference time - {total_time:.2f}s:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")


Instruction generated from finetuned model | Inference time - 80.98s:
    
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Responda as seguinte pergunta

### Input:
Quais os desafios para reduzir as transmissões do efeito estufa?

### Response:

Os desafios para reduzir as transmissões do efeito estufa são:

- Aumento da temperatura global
- Mudanças no clima
- Mudanças no ciclo hidrológico
- Mudanças na distribuição de chuvas
- Mudanças na distribuição de chuvas
- Mudanças na distribuição de chuvas
- Mudanças na distribuição de chuvas
- Mudanças na distribuição de chu


In [47]:

instruction = {
    "instruction": "Responda as seguinte pergunta",
    "input": "Quais as variáveis usadas para avaliar a satisfação dos clientes?",
    "output": ""
}
prompt = format_instruction(instruction)

prompt

'    \nBelow is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nResponda as seguinte pergunta\n\n### Input:\nQuais as variáveis usadas para avaliar a satisfação dos clientes?\n\n### Response:\n\n'