In [1]:
## Environment set up for local training on a PC with an Nvidia RTX 3060 12GB GPU

## Miniconda was installed on Ubuntu Linux following: https://docs.anaconda.com/miniconda/
## The unsloth conda environment was created following: https://docs.unsloth.ai/get-started/installation/conda-install

## >> To set up the environment, remove the leading "#" from the commands below and run them. Install miniconda first.
## >> Note: "conda create" / "conda activate" executed through "!" run in a subshell and do not change the kernel that is
## >> already running; run them in a terminal and start Jupyter from inside the activated unsloth_env environment.

#!pip install nbformat
#!conda install -c conda-forge ipywidgets
#!conda create --name unsloth_env python=3.10 pytorch-cuda=12.1 pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers -y
#!conda activate unsloth_env
#!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
#!pip install --no-deps "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
import helper
import torch 
import pandas as pd

import datasets

from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Report the installed PyTorch version followed by the CUDA version it was
# built against, one value per line.
for version_string in (torch.__version__, torch.version.cuda):
    print(version_string)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
2.4.1
12.1


In [3]:
# Maximum sequence length used when loading the model and when training.
# Any value may be chosen; RoPE scaling is handled internally by Unsloth.
max_seq_length = 2048

# Compute dtype. None lets the library auto-detect; use float16 on Tesla
# T4/V100 and bfloat16 on Ampere or newer GPUs.
dtype = None

# Load the weights with 4-bit quantization to reduce memory usage.
# May be set to False.
load_in_4bit = True

In [4]:
# Load the base model and its tokenizer through the project helper.
model_name, raw_model, tokenizer = helper.get_model_by_id(
    2,  # id 2 -> "unsloth/Meta-Llama-3.1-8B-bnb-4bit" (Llama-3.1, 15 trillion tokens, 2x faster)
    max_seq_length,
    dtype,
    load_in_4bit,
)

Id Model: 2 - Model Name: unsloth/Meta-Llama-3.1-8B-bnb-4bit
==((====))==  Unsloth 2024.9: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce RTX 3060. Max memory: 11.65 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [5]:
# End-of-sequence token appended to every training example; without it the
# model's generation would go on forever.
EOS_TOKEN = tokenizer.eos_token


def formatting_prompts_func_train(examples):
    """Build the training prompt text for a batch of examples.

    Each (title, content) pair is substituted, in that order, into
    ``helper.alpaca_prompt`` and the EOS token is appended to the result.

    Args:
        examples: Mapping with a 'title' entry and a 'content' entry, each an
            iterable of values; the two are paired element by element.

    Returns:
        A dict with the single key "text" holding the list of formatted
        prompts, one per (title, content) pair.
    """
    return {
        "text": [
            helper.alpaca_prompt.format(title, description) + EOS_TOKEN
            for title, description in zip(examples['title'], examples['content'])
        ],
    }

In [6]:
# Pass the loaded base model through the project helper; the returned object
# is the `model` used for training, generation and saving below.
# NOTE(review): presumably this attaches the trainable adapter layers
# (PEFT/LoRA) to the base model — confirm in helper.get_fast_language_model.
model = helper.get_fast_language_model(raw_model)

Unsloth 2024.9 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [7]:
# Read the ';'-separated training sample (limited to 100 rows via `nrows`),
# then add the "text" column produced by formatting_prompts_func_train,
# applied in batches.
dataset = (
    datasets.Dataset
    .from_csv('../data/trn_sample.csv', sep=';', nrows=100)
    .map(formatting_prompts_func_train, batched=True)
)

# Show the resulting dataset summary as the cell output.
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({
    features: ['title', 'content', 'text'],
    num_rows: 100
})

In [8]:
# Probe bf16 support once: train in bf16 when available, otherwise in fp16.
use_bf16 = is_bfloat16_supported()

# Optimisation hyper-parameters. The effective batch size is
# per_device_train_batch_size * gradient_accumulation_steps = 2 * 4 = 8.
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    num_train_epochs=2,  # epoch-based run; use max_steps instead to cap the step count
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
    weight_decay=0.01,
    optim="adamw_8bit",
    bf16=use_bf16,
    fp16=not use_bf16,
    logging_steps=1,  # log the loss at every optimisation step
)

# Supervised fine-tuning trainer over the "text" column of `dataset`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # packing can make training 5x faster for short sequences
    args=training_args,
)

Map (num_proc=2):   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
# Print the GPU memory state before training and keep the two returned values;
# they are passed to helper.print_final_memory_usage once training has finished.
start_gpu_memory, max_memory = helper.print_start_memory_usage()

GPU = NVIDIA GeForce RTX 3060. Max memory = 11.65 GB.
5.984 GB of memory reserved.


In [10]:
# Run the fine-tuning job. The returned object is kept because the final
# memory/time report takes it as an argument.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 100 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 24
 "-____-"     Number of trainable parameters = 41,943,040


  0%|          | 0/24 [00:00<?, ?it/s]

{'loss': 2.9238, 'grad_norm': 5.992072582244873, 'learning_rate': 4e-05, 'epoch': 0.08}
{'loss': 3.0795, 'grad_norm': 3.9728596210479736, 'learning_rate': 8e-05, 'epoch': 0.16}
{'loss': 2.9872, 'grad_norm': 3.2877695560455322, 'learning_rate': 0.00012, 'epoch': 0.24}
{'loss': 3.0473, 'grad_norm': 3.614825487136841, 'learning_rate': 0.00016, 'epoch': 0.32}
{'loss': 2.6963, 'grad_norm': 3.3144521713256836, 'learning_rate': 0.0002, 'epoch': 0.4}
{'loss': 2.1205, 'grad_norm': 3.167720317840576, 'learning_rate': 0.00018947368421052632, 'epoch': 0.48}
{'loss': 2.0921, 'grad_norm': 3.1012771129608154, 'learning_rate': 0.00017894736842105264, 'epoch': 0.56}
{'loss': 2.0548, 'grad_norm': 2.90396785736084, 'learning_rate': 0.00016842105263157895, 'epoch': 0.64}
{'loss': 2.0922, 'grad_norm': 2.794199228286743, 'learning_rate': 0.00015789473684210527, 'epoch': 0.72}
{'loss': 2.1183, 'grad_norm': 2.486328125, 'learning_rate': 0.00014736842105263158, 'epoch': 0.8}
{'loss': 1.8921, 'grad_norm': 1.984

In [11]:
# Print the post-training report, using the values captured before training
# together with the stats object returned by trainer.train().
helper.print_final_memory_usage(start_gpu_memory, max_memory, trainer_stats)

109.5598 seconds used for training.
1.83 minutes used for training.
Peak reserved memory = 6.52 GB.
Peak reserved memory for training = 0.536 GB.
Peak reserved memory % of max memory = 55.966 %.
Peak reserved memory for training % of max memory = 4.601 %.


In [12]:
# Test the model after training: stream a generated description for five titles.
# NOTE: the titles are drawn from `dataset`, i.e. the same rows that were used
# for training, so this is a sanity check rather than a held-out evaluation.

# Shuffle with a fixed seed (the same value the trainer uses) so the same five
# titles are selected on every Restart & Run All.
df = dataset.to_pandas().sample(frac=1, random_state=3407).head(5).copy()

# Only the title column is needed, so iterate over it directly instead of
# materialising a Series per row with iterrows().
for title in df['title']:
    helper.predict_text_streamer(model, tokenizer, title)
    print('\n')
  

<|begin_of_text|>Below is a book that contains a title and description. 

###INSTRUCTION:
Write a resume description that appropriately corresponds to the title.

###TITLE:
Operation Fortitude The True Story of the Key Spy Operation of WWII That Saved DDay

###DESCRIPTION:
A fascinating story that contains a wealth of detail that will appeal to military history buffs and anyone interested in the Second World War. The Times A cracking story of deception and espionage. The Mirror A cracking read. The Mirror<|end_of_text|>


<|begin_of_text|>Below is a book that contains a title and description. 

###INSTRUCTION:
Write a resume description that appropriately corresponds to the title.

###TITLE:
I Miss Mummy The true story of a frightened young girl who is desperate to go home

###DESCRIPTION:
Cathy Glass was a foster carer for twenty years from 1990. She is now an award winning author and writes her foster carer diary entries in her books. She has three teenage children of her own one of 

In [13]:
# Preview the second '/'-separated component of the model id; the next cell
# uses this same expression as the local save directory.
model_name.split('/')[1]

'Meta-Llama-3.1-8B-bnb-4bit'

In [14]:
# Local saving: write the trained model and the tokenizer into one directory
# named after the model id without its organisation prefix.
# Taking the last '/'-separated component (computed once) also works for a
# model name that has no prefix, where indexing [1] would raise IndexError.
save_dir = model_name.split('/')[-1]
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('Meta-Llama-3.1-8B-bnb-4bit/tokenizer_config.json',
 'Meta-Llama-3.1-8B-bnb-4bit/special_tokens_map.json',
 'Meta-Llama-3.1-8B-bnb-4bit/tokenizer.json')