In [1]:
# Ambiente configurado para treinamento local em um PC com Placa de Vídeo Nvidia RTX-3060 12GB

# Utilizando miniconda, instalado em um Linux Ubuntu conforme orientações do link: https://docs.anaconda.com/miniconda/
# Utilizando miniconda para criação do ambiente do unsloth conforme orientação no link: https://docs.unsloth.ai/get-started/installation/conda-install

# >> Para configurar o ambiente, remova o "#" inicial de cada linha de comando abaixo (deixando o "!") e execute os comandos. Lembre-se de instalar o miniconda previamente.

#!pip install nbformat
#!conda install -c conda-forge ipywidgets
#!conda create --name unsloth_env python=3.10 pytorch-cuda=12.1 pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers -y
#!conda activate unsloth_env
#!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
#!pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes

#!pip install accelerate peft bitsandbytes transformers trl

# 

In [2]:
# Authenticate against the Hugging Face Hub through the interactive notebook widget.
# NOTE(review): presumably required to read the base checkpoint and to push the
# fine-tuned model later (push_to_hub=True in the training arguments) — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# Importing dependencies
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer

from transformers import GenerationConfig
from time import perf_counter

import os
import pandas as pd
import numpy as np
import random

In [4]:
# Checkpoint to fine-tune and the output directory / repo name for the result.
# Alternative choices kept for reference:
#   model_id = "meta-llama/Meta-Llama-3.1-8B"
#   model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
#   output_model = "marcelolimagomes/llama3.18B-Instruct-Fine-tuned_FIAP"
model_id = "marcelolimagomes/llama3.18B-Fine-tuned_FIAP"
output_model = "marcelolimagomes/llama3.18B-Fine-tuned_FIAP_2"

SEED = 123
MAX_ROWS = 10000  # Sample size: ten thousand rows

HAS_FLASH_ATTENTION = False
HAS_FLASH_ATTENTION_SOFTCAPPING = False

# bfloat16 is switched on only when the CUDA compute capability major version is >= 8.
major_version, minor_version = torch.cuda.get_device_capability()
SUPPORTS_BFLOAT16 = major_version >= 8

# Fixes a weird Torch 2.3 bug which says T4s have bfloat16
def is_bfloat16_supported():
  """Return the module-level ``SUPPORTS_BFLOAT16`` flag.

  The flag is computed once in the configuration cell from the CUDA compute
  capability; this function only reads it.
  """
  return SUPPORTS_BFLOAT16

In [5]:
#train_prompt = \
# """<|im_start|>user
# Title of book [{}]<|im_end|>
# <|im_start|>assistant 
# Review of book [{}]: {}<|im_end|>
# """

# Training text template with three positional slots, in order: the book title
# (user turn), the book title again, and the review text (assistant turn).
# The space after "assistant" is part of the template.
# NOTE(review): the template ends with "<|im_end|>\n" and does not append the
# tokenizer's EOS token — confirm this is intended for the target model.
train_prompt = (
    "<|im_start|>user\n"
    "Title of book [{}]<|im_end|>\n"
    "<|im_start|>assistant \n"
    "Customers Review of book [{}]: {}<|im_end|>\n"
)

In [6]:
def torch_fix_seed(seed=123):
  """Seed every RNG the notebook uses and request deterministic kernels.

  Args:
    seed: Integer seed applied to Python's ``random``, NumPy and PyTorch
      (CPU generator and all CUDA devices).

  Side effects:
    Sets ``CUDA_LAUNCH_BLOCKING`` and ``CUBLAS_WORKSPACE_CONFIG`` in
    ``os.environ``, flips the cuDNN deterministic/benchmark flags, and turns
    on PyTorch's deterministic-algorithms mode in warn-only form.
  """
  # Export the CUDA/cuBLAS settings first so they are in place before any
  # later CUDA work in this process picks them up.
  os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
  os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"

  # Python random
  random.seed(seed)
  # Numpy
  np.random.seed(seed)
  # Pytorch (CPU generator, then every visible CUDA device)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)

  # cuDNN: deterministic kernels, no autotuner-driven kernel selection.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

  # `use_deterministic_algorithms` is a function: assigning `True` to it would
  # replace the API with a bool and enable nothing. warn_only=True makes ops
  # without a deterministic implementation warn instead of raising, so
  # training keeps running.
  torch.use_deterministic_algorithms(True, warn_only=True)


# Seed all RNGs once with the notebook-wide SEED before data and model are created.
torch_fix_seed(SEED)

In [7]:
def print_start_memory_usage():
  """Print the name and capacity of GPU 0 and the peak reserved memory.

  Values are converted from bytes by dividing by 1024**3 and rounded to three
  decimals.

  Returns:
    Tuple ``(reserved, total)``: the value of
    ``torch.cuda.max_memory_reserved()`` and the total memory of device 0,
    both in the unit printed as "GB".
  """
  bytes_per_gb = 1024 ** 3
  device_props = torch.cuda.get_device_properties(0)

  reserved_gb = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
  total_gb = round(device_props.total_memory / bytes_per_gb, 3)

  print(f"GPU = {device_props.name}. Max memory = {total_gb} GB.")
  print(f"{reserved_gb} GB of memory reserved.")

  return reserved_gb, total_gb

In [8]:
def get_model_and_tokenizer(model_id):
  """Load the tokenizer and a 4-bit quantized causal LM for ``model_id``.

  Args:
    model_id: Identifier forwarded to both ``from_pretrained`` calls.

  Returns:
    Tuple ``(model, tokenizer)``. The tokenizer's pad token is set to its EOS
    token; the model config has ``use_cache`` disabled and ``pretraining_tp``
    set to 1.
  """
  tok = AutoTokenizer.from_pretrained(model_id)
  # Reuse EOS as the padding token.
  tok.pad_token = tok.eos_token

  # 4-bit NF4 weights with double quantization; compute dtype fixed to float16.
  quant_options = dict(
      load_in_4bit=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype="float16",
      bnb_4bit_use_double_quant=True,
  )
  quantized_model = AutoModelForCausalLM.from_pretrained(
      model_id,
      quantization_config=BitsAndBytesConfig(**quant_options),
      device_map="auto",
  )

  # NOTE(review): presumably the cache is disabled because it is not useful
  # during training — confirm.
  quantized_model.config.use_cache = False
  quantized_model.config.pretraining_tp = 1
  return quantized_model, tok

In [9]:
def prepare_train_datav2(data_df: pd.DataFrame):
  """Build the training ``Dataset`` with a formatted ``text`` column.

  The input frame is first written unchanged to ``./data_used_to_train.csv``
  (``;``-separated, no index) as a record of the rows used for training. Each
  row is then rendered through the module-level ``train_prompt`` template,
  filling its three slots with ``title``, ``title`` and ``content``.

  The caller's DataFrame is not modified: the ``text`` column is added to a
  copy, so re-running the cell or reusing the frame elsewhere is safe.

  Args:
    data_df: Frame with at least the columns ``title`` and ``content``.

  Returns:
    A ``Dataset`` built from the input columns plus ``text``.
  """
  data_df.to_csv('./data_used_to_train.csv', sep=';', index=False)

  # Plain iteration over the two columns also handles an empty frame, where a
  # row-wise ``apply(axis=1)`` does not yield a usable column.
  prompts = [
      train_prompt.format(title, title, content)
      for title, content in zip(data_df["title"], data_df["content"])
  ]
  prepared_df = data_df.assign(text=prompts)

  # Create a new Dataset from the prepared copy
  return Dataset.from_pandas(prepared_df)

In [10]:
# Read at most MAX_ROWS rows of the ';'-separated training sample and print its schema.
dataset = pd.read_csv('../data/trn_sample.csv', sep=';', nrows=MAX_ROWS)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    10000 non-null  object
 1   content  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [11]:
# Build the Dataset whose "text" column holds the formatted training prompts.
data = prepare_train_datav2(dataset)

In [12]:
# Load the 4-bit quantized model and its tokenizer for the configured model_id.
model, tokenizer = get_model_and_tokenizer(model_id)

# tokenizer.padding_side = 'right'

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [13]:
# GPU memory baseline after loading the model and before training starts.
print_start_memory_usage()

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.65 GB.
5.508 GB of memory reserved.


(5.508, 11.65)

In [16]:
# LoRA adapter settings handed to SFTTrainer: rank 8, alpha 16, 5% dropout, no bias terms.
# NOTE(review): target_modules is not set, so the library default for this
# architecture decides which layers get adapters — confirm that is intended.
peft_config = LoraConfig(
    r=8, 
    lora_alpha=16, 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

In [17]:
# Trainer hyper-parameters; checkpoints are written under `output_model`.
training_arguments = TrainingArguments(
    output_dir=output_model,
    # per_device_train_batch_size=4,
    # gradient_accumulation_steps=16,
    # 2 samples per step x 4 accumulation steps = 8 samples per optimizer update (per device).
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    #save_strategy="epoch",
    # NOTE(review): a checkpoint every 10 steps over the 2500-step run, with
    # push_to_hub=True and no save_total_limit, means a large number of
    # checkpoints on disk (and presumably uploads) — confirm this is intended.
    save_strategy="steps",
    save_steps=10, 
    logging_steps=1,
    num_train_epochs=2,
    #max_steps=250,
    # Exactly one mixed-precision mode is enabled, chosen by is_bfloat16_supported().
    # NOTE(review): the model is loaded with a float16 bitsandbytes compute dtype
    # regardless of this choice — confirm the mismatch is acceptable when bf16 is on.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    push_to_hub=True,
    weight_decay = 0.01,
    seed=SEED
)

In [18]:
# Supervised fine-tuning trainer: quantized model + LoRA config, trained on the
# "text" column of `data`, one example per sequence (packing disabled).
# NOTE(review): presumably examples longer than max_seq_length=1024 tokens are
# truncated — confirm against the installed trl version.
trainer = SFTTrainer(
    model=model,
    train_dataset=data,
    peft_config=peft_config,
    dataset_text_field="text",
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=1024,
)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [19]:
# Run the fine-tuning loop (2500 steps in the recorded run; loss is logged every step).
trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]

{'loss': 2.2637, 'grad_norm': 0.8671875, 'learning_rate': 0.0001999999210431752, 'epoch': 0.0}
{'loss': 2.4517, 'grad_norm': 0.7578125, 'learning_rate': 0.00019999968417282543, 'epoch': 0.0}
{'loss': 2.455, 'grad_norm': 0.76171875, 'learning_rate': 0.00019999928938932473, 'epoch': 0.0}
{'loss': 2.1695, 'grad_norm': 0.9140625, 'learning_rate': 0.0001999987366932966, 'epoch': 0.0}
{'loss': 2.3795, 'grad_norm': 0.8359375, 'learning_rate': 0.0001999980260856137, 'epoch': 0.0}
{'loss': 2.2674, 'grad_norm': 1.109375, 'learning_rate': 0.00019999715756739833, 'epoch': 0.0}
{'loss': 2.2396, 'grad_norm': 1.3359375, 'learning_rate': 0.00019999613114002186, 'epoch': 0.01}
{'loss': 2.4467, 'grad_norm': 1.6015625, 'learning_rate': 0.00019999494680510518, 'epoch': 0.01}
{'loss': 2.1962, 'grad_norm': 3.6875, 'learning_rate': 0.0001999936045645186, 'epoch': 0.01}
{'loss': 1.8066, 'grad_norm': 3.015625, 'learning_rate': 0.00019999210442038162, 'epoch': 0.01}
{'loss': 2.0419, 'grad_norm': 5.71875, 'learn

In [None]:
# Report peak reserved GPU memory again, for comparison with the pre-training
# baseline (this cell has no execution count in the saved notebook).
print_start_memory_usage()