In [1]:
## Ambiente configurado para treinamento local em um PC com Placa de Vídeo Nvidia RTX-3060 12GB

## Utilizando miniconda, instalado em um Linux Ubuntu conforme orientações do link: https://docs.anaconda.com/miniconda/
## Utilizando miniconda para criação do ambiente do unsloth conforme orientação no link: https://docs.unsloth.ai/get-started/installation/conda-install

## >> Para configurar o ambiente, remova o marcador de comentário ("#") do início de cada comando abaixo e execute-os. Lembre-se de instalar o miniconda previamente.

#!pip install nbformat
#!conda install -c conda-forge ipywidgets
#!conda create --name unsloth_env python=3.10 pytorch-cuda=12.1 pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers -y
#!conda activate unsloth_env
#!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
#!pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
import helper
import torch; 

import datasets

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Show the installed PyTorch version and the CUDA version it was built against, one per line.
print(torch.__version__, torch.version.cuda, sep="\n")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
2.4.1
12.1


In [3]:
# Load the weights with 4-bit quantization to reduce memory usage (can be set to False).
load_in_4bit = True

# Compute dtype: None lets the loader auto-detect it.
# Float16 suits Tesla T4 / V100; Bfloat16 suits Ampere and newer GPUs.
dtype = None

# Maximum sequence length; any value may be chosen (per the original note,
# RoPE scaling is handled internally by the library).
max_seq_length = 2048

In [4]:
# Index of the base model in the helper's model catalogue.
# The helper logs id 0 as "unsloth/Meta-Llama-3.1-8B" (Llama-3.1, 15 trillion tokens).
# NOTE(review): with load_in_4bit = True this may resolve to the pre-quantized
# "unsloth/Meta-Llama-3.1-8B-bnb-4bit" checkpoint — confirm in helper.py.
MODEL_ID = 0

# Load the base model and its tokenizer using the settings from the configuration cell.
model_name, raw_model, tokenizer = helper.get_model_by_id(MODEL_ID, max_seq_length, dtype, load_in_4bit)

Id Model: 0 - Model Name: unsloth/Meta-Llama-3.1-8B
==((====))==  Unsloth 2024.9: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [5]:
# Every training example must end with the EOS token; otherwise generation
# from the fine-tuned model will go on forever.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func_train(examples):
    """Build the training prompts for a batch of examples.

    Args:
        examples: Batch mapping with parallel 'title' and 'content' sequences,
            as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        A dict with the single key ``"text"`` holding one string per example:
        ``helper.alpaca_prompt`` formatted with the title and the content
        (in that order), followed by ``EOS_TOKEN``.
    """
    titles = examples['title']
    contents = examples['content']
    return {
        "text": [
            helper.alpaca_prompt.format(title, content) + EOS_TOKEN
            for title, content in zip(titles, contents)
        ],
    }

In [6]:
# Prepare the loaded base model for fine-tuning through the project helper.
# NOTE(review): presumably this attaches LoRA/PEFT adapters (the training log
# reports ~42M trainable parameters) — confirm in helper.py.
model = helper.get_fast_language_model(raw_model)

Unsloth 2024.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
# Read the semicolon-separated training sample and add a "text" column holding
# the formatted prompt for each row.
dataset = (
    datasets.Dataset
    .from_csv('../data/trn_sample.csv', sep=';')
    .map(formatting_prompts_func_train, batched=True)
)
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['title', 'content', 'text'],
    num_rows: 10000
})

In [8]:
# Mixed precision: bf16 when the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation hyperparameters.
# Alternatives left by the author: max_steps = 60, learning_rate = 2e-4.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    num_train_epochs = 1,  # One full pass over the training data.
    learning_rate = 3e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    bf16 = use_bf16,
    fp16 = not use_bf16,
    logging_steps = 1,  # Log the loss at every optimizer step.
)

# Supervised fine-tuning on the "text" column built by formatting_prompts_func_train.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    # Packing is disabled; per the original note, enabling it can make
    # training 5x faster for short sequences.
    packing = False,
    args = training_args,
)

Map (num_proc=2):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [9]:
# Print the GPU memory usage before training and keep both returned values;
# they are passed to helper.print_final_memory_usage after training.
start_gpu_memory, max_memory = helper.print_start_memory_usage()

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.65 GB.
5.984 GB of memory reserved.


In [10]:
# Run the fine-tuning loop. The returned stats are consumed by
# helper.print_final_memory_usage in the next cell, so this call must run to
# completion: if it is interrupted, trainer_stats is never assigned.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 1,250
 "-____-"     Number of trainable parameters = 41,943,040


  0%|          | 0/1250 [00:00<?, ?it/s]

{'loss': 2.92, 'grad_norm': 2.6884117126464844, 'learning_rate': 5.9999999999999995e-05, 'epoch': 0.0}
{'loss': 3.0875, 'grad_norm': 1.964647889137268, 'learning_rate': 0.00011999999999999999, 'epoch': 0.0}
{'loss': 2.9715, 'grad_norm': 2.1089975833892822, 'learning_rate': 0.00017999999999999998, 'epoch': 0.0}
{'loss': 3.0611, 'grad_norm': 4.286407470703125, 'learning_rate': 0.00023999999999999998, 'epoch': 0.0}
{'loss': 2.7159, 'grad_norm': 1.6686680316925049, 'learning_rate': 0.0003, 'epoch': 0.0}
{'loss': 2.0985, 'grad_norm': 1.9944469928741455, 'learning_rate': 0.0002997590361445783, 'epoch': 0.0}
{'loss': 2.3526, 'grad_norm': 5.69419527053833, 'learning_rate': 0.0002995180722891566, 'epoch': 0.01}
{'loss': 2.2694, 'grad_norm': 2.912416696548462, 'learning_rate': 0.00029927710843373495, 'epoch': 0.01}
{'loss': 2.3342, 'grad_norm': 1.6300982236862183, 'learning_rate': 0.0002990361445783132, 'epoch': 0.01}
{'loss': 2.1471, 'grad_norm': 1.4643645286560059, 'learning_rate': 0.000298795

KeyboardInterrupt: 

In [None]:
# Print the post-training report from the pre-training baseline values and the
# stats returned by trainer.train().
# NOTE(review): the exact contents of the report are defined in helper.py — confirm there.
helper.print_final_memory_usage(start_gpu_memory, max_memory, trainer_stats)

In [None]:
# Qualitative check of the model after training.
#
# Draw up to 5 random titles from the training set and stream the model's
# prediction for each one. Only the 'title' column is sampled, so the full
# frame is neither shuffled nor copied. The seed (the same value used for
# training) makes the chosen examples reproducible across runs; drop
# `random_state` to see different examples on every execution.
n_examples = min(5, len(dataset))
sample_titles = dataset.to_pandas()['title'].sample(n=n_examples, random_state=3407)

for title in sample_titles:
    print(f"Resultado da predição para o título: [{title}]\n")
    helper.predict_text_streamer(model, tokenizer, title)

In [None]:
# Local saving: write the model and the tokenizer into the same directory.
# NOTE(review): if `model` is a PEFT/LoRA wrapper, this presumably stores only
# the adapter weights rather than the full model — confirm.
save_dir = 'Meta-Llama-3.1-8B'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)