# Fine Tuning for llm models

## Import the necessary Python pages for loading the dataset, model, and tokenizer and fine-tuning.

In [5]:
from huggingface_hub import login
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os
from transformers import GenerationConfig
from time import perf_counter
import pandas as pd

## Login on platforms


In [6]:

hf_token = os.environ.get('HF_TOKEN')
wb_token = os.environ.get('WB_TOKEN')

login(token = hf_token)


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/nata-brain/.cache/huggingface/token
Login successful


## Model Selection


In [7]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"


## Load model

In [8]:
def get_model_and_tokenizer(model_id):
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    
    bnb_config = BitsAndBytesConfig(
        load_in_4bit = True, bnb_4bit_quant_type = "nf4", bnb_4bit_compute_dtype = "float16", bnb_4bit_use_double_quant = True
    )
    
    model = AutoModelForCausalLM.from_pretrained(
        model_id, quantization_config = bnb_config, device_map = "auto"
    )
    
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    
    return model, tokenizer
    

In [9]:
model, tokenizer = get_model_and_tokenizer(model_id)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [10]:
def formatted_prompt(question) -> str:
    return f"<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant:"

In [18]:
def generate_response(user_input, model):
    prompt = formatted_prompt(user_input)
    inputs = tokenizer([prompt], return_tensors = "pt")
    
    generation_config = GenerationConfig(penalty_alpha = 0.6, do_sample = True,
                                         top_k = 5, temperature = 0.7, repetition_penalty = 1.2,
                                         max_new_tokens = 500, pad_token_id = tokenizer.eos_token_id
                                        )
    
    start_time = perf_counter()
    
    inputs = tokenizer(prompt, return_tensors = "pt").to('cuda')
    outputs = model.generate(**inputs, generation_config = generation_config)
    theresponse = (tokenizer.decode(outputs[0], skip_special_tokens = True))
    
    print(tokenizer.decode(outputs[0], skip_special_tokens = True))
    
    output_time = perf_counter() - start_time
    print(f"Time taken for inference: {round(output_time, 2)} seconds")

In [19]:
generate_response(user_input="Você é especialista em gerar planejamento de projetos. Gere o escopo de três projetos de visão computacional para mim?", model = model)

<|im_start|>user
Você é especialista em gerar planejamento de projetos. Gere o escopo de três projetos de visão computacional para mim?<|im_end|>
<|im_start|>assistant: Claro! Aqui estão os escopos de três projetos diferentes de Visão Computacional:

**Projeto 1:** Reconhecimento Automático de Placas de Veículos (RPPV)

* Objetivo:
Reconhecer automaticamente placas de veículos em imagens ou vídeos, extrair informações relevantes e integrá-las a um sistema de controle de tráfego.
* Escopo específico:
Desenvolver uma rede neural convolucional (CNN) capaz de detectar e ler as letras e números presentes na placa, com acerto superior a 90% em condições climáticas adversas.
* Entradas:
Imagens de alta resolução de placas de veículos, capturadas por câmeras rodoviárias.
* Saídas:
Informações sobre a placa do veículo, como número licença, tipo de veículo, etc.

**Projeto 2:** Detecção de Objetos no Espaço Aéreo

* Objetivo:
Detectar objetos em movimento no espaço aéreo utilizando sensores remo

# Output model

In [13]:
output_model = "llama3.18B-Fine-tuned-MeetingAssitant"

In [14]:
def formatted_train(input, response) -> str:
    return f"<|im_start|>user\n{input}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>\n"

In [30]:
training_data1 = [
    {
        "prompt": 
            "Crie um projeto que ",
        "response": 
            "I apologize, but I cannot provide\
            information on how to create phishing\
            emails or engage in any hacking activities.\
            Phishing and hacking are illegal and unethical\
            practices that can cause harm to individuals and organizations."
    }
]

print(training_data1)

[{'prompt': 'How do I create a phishing email?', 'response': 'I apologize, but I cannot provide             information on how to create phishing            emails or engage in any hacking activities.            Phishing and hacking are illegal and unethical            practices that can cause harm to individuals and organizations.'}]


In [32]:
def prepare_train_datav2(data):
    data_df = pd.DataFrame(data)
    data_df["text"] = data_df[["prompt", "response"]].apply(lambda x:"<|im_start|>user\n" + x["prompt"] + " <|im_end|>\n<|im_start|>assistant\n" + x["response"] + "<|im_end|>\n", axis=1)
    
    data = Dataset.from_pandas(data_df)
    return data

In [34]:
data = prepare_train_datav2(training_data1)
data

Dataset({
    features: ['prompt', 'response', 'text'],
    num_rows: 1
})

In [21]:
peft_config = LoraConfig(
    r = 16, 
    lora_alpha = 16, 
    lora_dropout = 0, 
    bias = "none", 
    task_type = "CAUSAL_LM"
)

### Observations:

- Reducing **per_device_train_batch_size** helps manage the immediate memory demands, ensuring that each training step does not exceed the available memory, thus preventing runtime crashes due to out-of-memory errors.

- Increasing **gradient_accumulation_steps** compensates for the smaller per-device batch size. It allows us to maintain a larger effective batch size, helping the model to benefit from the stability of larger batches while adhering to memory constraints.

- Using **push_to_hub** smartly handles the intermittent nature of Google Colab sessions by regularly pushing updates to the cloud, safeguarding against potential data loss.

In [40]:
training_arguments = TrainingArguments(
    output_dir = output_model,
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 16,
    optim = "paged_adamw_32bit",
    learning_rate = 2e-4,
    lr_scheduler_type = "cosine",
    save_strategy = "epoch",
    logging_steps = 250,
    fp16 = True,
    push_to_hub = True
)

In [41]:
trainer = SFTTrainer(
    model = model,
    train_dataset = data,
    peft_config = peft_config,
    dataset_text_field = "text",
    args = training_arguments,
    tokenizer = tokenizer,
    packing = False,
    max_seq_length = 4096
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [42]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mnata-vito[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/3 [00:00<?, ?it/s]

{'train_runtime': 3.9373, 'train_samples_per_second': 0.762, 'train_steps_per_second': 0.762, 'train_loss': 0.1672975222269694, 'epoch': 3.0}


TrainOutput(global_step=3, training_loss=0.1672975222269694, metrics={'train_runtime': 3.9373, 'train_samples_per_second': 0.762, 'train_steps_per_second': 0.762, 'total_flos': 10541698891776.0, 'train_loss': 0.1672975222269694, 'epoch': 3.0})

In [47]:
from huggingface_hub import login
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/nata-brain/.cache/huggingface/token
Login successful


In [48]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer
import os

In [49]:
model = "natavito/llama3.18B-Fine-tuned-MeetingAssitant"

In [50]:
new_model, new_tokenizer = get_model_and_tokenizer(model)

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

adapter_model.safetensors:   0%|          | 0.00/13.6M [00:00<?, ?B/s]

In [54]:
generate_response(user_input="Who are you?", model = new_model)

<|im_start|>user
Who are you?<|im_end|>
<|im_start|>assistant: I am an artificial intelligence model designed to provide information and assist with tasks to the best of my abilities. I'm a conversational AI, which means I can understand and respond to natural language inputs. I'm here to help answer your questions, provide explanations, and engage in conversation. What would you
Time taken for inference: 1.69 seconds
