In [1]:
# Ambiente configurado para treinamento local em um PC com Placa de Vídeo Nvidia RTX-3060 12GB

# Utilizando miniconda, instalado em um Linux Ubuntu conforme orientações do link: https://docs.anaconda.com/miniconda/
# Utilizando miniconda para criação do ambiente do unsloth conforme orientação no link: https://docs.unsloth.ai/get-started/installation/conda-install

# >> Para configurar o ambiente, remova o comentário ("##") e execute os comandos. Lembre-se de instalar o miniconda previamente

#!pip install nbformat
#!conda install -c conda-forge ipywidgets
#!conda create --name unsloth_env python=3.10 pytorch-cuda=12.1 pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers -y
#!conda activate unsloth_env
#!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
#!pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes

#!pip install accelerate peft bitsandbytes transformers trl

# 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

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Installing More Dependencies
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

from transformers import GenerationConfig
from time import perf_counter

import os
import pandas as pd
import numpy as np
import random

In [4]:
model_id = "unsloth/Meta-Llama-3.1-8B"

SEED = 123
MAX_ROWS = 100

major_version, minor_version = torch.cuda.get_device_capability()
SUPPORTS_BFLOAT16 = False
HAS_FLASH_ATTENTION = False
HAS_FLASH_ATTENTION_SOFTCAPPING = False

if major_version >= 8:
  SUPPORTS_BFLOAT16 = True

# Fixes a weird Torch 2.3 bug which says T4s have bfloat16


def is_bfloat16_supported():
  return SUPPORTS_BFLOAT16


pass

In [5]:
train_prompt = \
"""<|im_start|>user
Product name [{}]<|im_end|>
<|im_start|>assistant 
Review: {}
"""

In [6]:
def torch_fix_seed(seed=123):
  # Python random
  random.seed(seed)
  # Numpy
  np.random.seed(seed)
  # Pytorch
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.use_deterministic_algorithms = True
  os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
  # Enable CUDNN deterministic mode
  torch.backends.cudnn.benchmark = False


torch_fix_seed(SEED)

In [7]:
def print_start_memory_usage():
  gpu_stats = torch.cuda.get_device_properties(0)
  start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
  max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
  print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
  print(f"{start_gpu_memory} GB of memory reserved.")

  return start_gpu_memory, max_memory

In [8]:
def get_model_and_tokenizer(model_id):
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  tokenizer.pad_token = tokenizer.eos_token
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True, 
      bnb_4bit_quant_type="nf4", 
      bnb_4bit_compute_dtype="float16", 
      bnb_4bit_use_double_quant=True
  )
  model = AutoModelForCausalLM.from_pretrained(
      model_id, 
      quantization_config=bnb_config, 
      device_map="auto"
  )
  model.config.use_cache=False
  model.config.pretraining_tp=1
  return model, tokenizer

In [9]:
def formatted_prompt(question) -> str:
  return train_prompt.format(question, '')

In [10]:
def generate_response(user_input, model, tokenizer):
  prompt = formatted_prompt(user_input)
  inputs = tokenizer([prompt], return_tensors="pt")
  
  generation_config = GenerationConfig(
    penalty_alpha=0.6,
    do_sample = True,
    top_k=5,
    temperature=0.1,
    repetition_penalty=1.2,
    max_new_tokens=100,
    pad_token_id=tokenizer.eos_token_id
  )
  
  start_time = perf_counter()
  inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
  outputs = model.generate(**inputs, generation_config=generation_config)
  theresponse = (tokenizer.decode(outputs[0], skip_special_tokens=True))
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
  output_time = perf_counter() - start_time
  print(f"Time taken for inference: {round(output_time,2)} seconds")

In [11]:
def prepare_train_datav2(data_df: pd.DataFrame):
  # Create a new column called "text"
  data_df.to_csv('./data_used_to_train.csv', sep=';', index=False)
  data_df["text"] = data_df[["title", "content"]].apply(lambda x: train_prompt.format(x["title"], x["content"]), axis=1)
  # Create a new Dataset from the DataFrame
  data = Dataset.from_pandas(data_df)

  return data

In [12]:
dataset = pd.read_csv('../data/trn_sample.csv', sep=';', nrows=MAX_ROWS)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    100 non-null    object
 1   content  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB


In [13]:
data = prepare_train_datav2(dataset)

In [14]:
data

Dataset({
    features: ['title', 'content', 'text'],
    num_rows: 100
})

In [15]:
model, tokenizer = get_model_and_tokenizer(model_id)

tokenizer.padding_side = 'right'

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
print_start_memory_usage()

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.65 GB.
5.508 GB of memory reserved.


(5.508, 11.65)

In [17]:
int_randon = random.randint(1, MAX_ROWS)
print(f'Random Int: {int_randon}')
title_predict = dataset.sample(frac=1, random_state=SEED).head(int_randon)['title'].values[0]
print(f'Random Title: [{title_predict}]')

Random Int: 7
Random Title: [Castle Cats]


In [18]:
generate_response(title_predict, model, tokenizer)

<|im_start|>user
Product name [Castle Cats]<|im_end|>
<|im_start|>assistant 
Review: 
The game is a great time killer, but it's not very challenging. I'm on level 10 and have only lost once.
I like the graphics and how you can customize your cat with different hats etc. It would be nice if there were more options for customizing though (like changing fur color or adding accessories).
There are also some issues that need to be fixed:
- The sound effects aren't always accurate; sometimes they play when something happens off-screen instead of what actually happened in-game
Time taken for inference: 6.5 seconds


In [19]:
output_model = "llama3.18B-Fine-tuned_FIAP"

In [20]:
peft_config = LoraConfig(
    r=8, 
    lora_alpha=16, 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [21]:
training_arguments = TrainingArguments(
    output_dir=output_model,
    # per_device_train_batch_size=4,
    # gradient_accumulation_steps=16,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=1,
    num_train_epochs=6,
    #max_steps=250,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    push_to_hub=True,
    weight_decay = 0.01,
    seed=SEED
)

In [22]:
trainer = SFTTrainer(
    model=model,
    train_dataset=data,
    peft_config=peft_config,
    dataset_text_field="text",
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=1024,
)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [23]:
trainer.train()

  0%|          | 0/72 [00:00<?, ?it/s]

{'loss': 2.8427, 'grad_norm': 1.09375, 'learning_rate': 0.0001999048221581858, 'epoch': 0.08}
{'loss': 3.023, 'grad_norm': 0.8046875, 'learning_rate': 0.00019961946980917456, 'epoch': 0.16}
{'loss': 2.9056, 'grad_norm': 0.86328125, 'learning_rate': 0.00019914448613738106, 'epoch': 0.24}
{'loss': 2.9385, 'grad_norm': 0.94921875, 'learning_rate': 0.00019848077530122083, 'epoch': 0.32}
{'loss': 2.8924, 'grad_norm': 1.0, 'learning_rate': 0.00019762960071199333, 'epoch': 0.4}
{'loss': 2.789, 'grad_norm': 1.2421875, 'learning_rate': 0.00019659258262890683, 'epoch': 0.48}
{'loss': 2.6178, 'grad_norm': 1.3046875, 'learning_rate': 0.0001953716950748227, 'epoch': 0.56}
{'loss': 2.3835, 'grad_norm': 1.6484375, 'learning_rate': 0.00019396926207859084, 'epoch': 0.64}
{'loss': 2.6041, 'grad_norm': 1.984375, 'learning_rate': 0.0001923879532511287, 'epoch': 0.72}
{'loss': 2.2032, 'grad_norm': 3.515625, 'learning_rate': 0.000190630778703665, 'epoch': 0.8}
{'loss': 1.9876, 'grad_norm': 3.515625, 'learni

TrainOutput(global_step=72, training_loss=1.8928722540537517, metrics={'train_runtime': 354.6898, 'train_samples_per_second': 1.692, 'train_steps_per_second': 0.203, 'total_flos': 3636165317738496.0, 'train_loss': 1.8928722540537517, 'epoch': 5.76})

In [24]:
print_start_memory_usage()

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.65 GB.
9.738 GB of memory reserved.


(9.738, 11.65)