In [None]:
!pip install -q transformers==4.40.1
!pip install -q peft==0.10.0
!pip install -q bitsandbytes==0.46.0
!pip install -q accelerate==0.29.3
!pip install -q datasets==2.19.0
!pip install -q trl==0.8.6
!pip install -q huggingface_hub==0.22.2

In [None]:
import random
import numpy as np
import torch  # PyTorch is used for the model and the training loop

# Seed every random number generator used by this notebook.
seed = 21
random.seed(seed)
np.random.seed(seed)
# Seed torch unconditionally: the CPU generator must be seeded even when no
# GPU is present, otherwise CPU-side torch randomness stays unseeded.
torch.manual_seed(seed)
if torch.cuda.is_available():
  torch.cuda.manual_seed_all(seed)
  # Favor deterministic cuDNN kernels over auto-tuned (possibly non-deterministic) ones.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

# Print one draw from each seeded generator as a quick reproducibility check.
print("Random Python:", random.random())
print("Random NumPy:", np.random.rand())

Random Python: 0.16494947983319797
Random NumPy: 0.04872488080912729


In [None]:
from datasets import load_dataset

# Load the Spider dataset (natural-language questions paired with SQL queries)
# and display its splits.
spider_dataset = load_dataset(path="spider")
spider_dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['db_id', 'query', 'question', 'query_toks', 'query_toks_no_value', 'question_toks'],
        num_rows: 7000
    })
    validation: Dataset({
        features: ['db_id', 'query', 'question', 'query_toks', 'query_toks_no_value', 'question_toks'],
        num_rows: 1034
    })
})

In [None]:
def formatting_to_prompts(example):
    """Build the training prompt for a single example.

    Args:
        example: mapping with a 'question' entry (natural-language question)
            and a 'query' entry (the matching SQL query).

    Returns:
        A dict with a single 'text' key holding the prompt in the layout
        ``<s>[INST] question [/INST] query</s>``.
    """
    question = example['question']
    sql_query = example['query']
    # Instruction-style template: question inside [INST] ... [/INST], SQL as the answer.
    prompt = "<s>[INST] {} [/INST] {}</s>".format(question, sql_query)
    return {"text": prompt}

# Sanity check: format the first training example and display the resulting prompt.
formatting_to_prompts(spider_dataset["train"][0])

{'text': '<s>[INST] How many heads of the departments are older than 56 ? [/INST] SELECT count(*) FROM head WHERE age  >  56</s>'}

In [None]:
# Apply the prompt formatter to every training example; this adds a 'text' column.
train = spider_dataset['train'].map(formatting_to_prompts)

In [None]:
# Inspect the formatted prompt of the first training row.
train[0]['text']

'<s>[INST] How many heads of the departments are older than 56 ? [/INST] SELECT count(*) FROM head WHERE age  >  56</s>'

In [None]:
# Display the dataset summary (columns and row count) after the mapping step.
train

Dataset({
    features: ['db_id', 'query', 'question', 'query_toks', 'query_toks_no_value', 'question_toks', 'text'],
    num_rows: 7000
})

In [None]:
# Check which Python type is returned when a whole column is indexed.
type(train['text'])

list

In [None]:
from datasets import Dataset

# Keep every second formatted prompt (halves the training set) and drop all
# other columns. The 'text' column is read once and sliced, instead of being
# re-materialised on every loop iteration.
# NOTE: `train` is rebuilt from itself, so re-running this cell halves it again.
processed_examples = [{'text': text} for text in train['text'][::2]]
train = Dataset.from_list(processed_examples)

In [None]:
# Display the subsampled dataset summary (only the 'text' column remains).
train

Dataset({
    features: ['text'],
    num_rows: 3500
})

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode access tokens in a notebook: read the token from the HF_TOKEN
# environment variable, or prompt for it interactively without echoing it.
# SECURITY: a token was previously committed in this cell; revoke it at
# https://huggingface.co/settings/tokens and issue a new one.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

# bfloat16 is only used on GPUs with compute capability >= 8 (the same check
# the training arguments use for `bf16`); other GPUs fall back to float16.
bf16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if bf16_supported else torch.float16

# 4-bit NF4 quantization with double quantization (QLoRA setup).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# The model is loaded without `trust_remote_code`: executing code shipped in
# the checkpoint repository is not needed for this architecture.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto", # Automatically maps model to available devices
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Pad on the right for causal-LM training

# Prepare model for k-bit training (important for QLoRA)
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=16, # Rank of the update matrices. Smaller r means fewer trainable parameters.
    lora_alpha=16, # LoRA scaling factor.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], # Modules to apply LoRA to
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM", # For language modeling tasks
)

# Wrap the quantized base model with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Print trainable parameters to verify LoRA setup
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable params: 41,943,040 || all params: 7,283,675,136 || trainable%: 0.5758499550960753


In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer

training_arguments = TrainingArguments(
    output_dir="./results", # Directory to save checkpoints and logs
    num_train_epochs=1, # Number of training epochs

    per_device_train_batch_size=4, # Adjust batch size based on GPU memory. Start small.
    gradient_accumulation_steps=16, # 4 x 16 = 64 examples per optimizer step (per device)

    save_steps=25, # Save checkpoint every X steps
    logging_steps=25, # Log training metrics every X steps
    # 2e-2 made training diverge (logged loss stayed around 8); 2e-4 is the
    # usual starting point for (Q)LoRA fine-tuning.
    learning_rate=2e-4,

    fp16=False, # Set to True if your GPU supports it and you're not using bfloat16 compute_dtype
    bf16=True if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else False, # Use bfloat16 for Ampere and newer GPUs
    max_grad_norm=0.3, # Max gradient norm
    max_steps=-1, # Set to a specific number of steps or -1 for num_train_epochs
    warmup_ratio=0.03, # Linear warmup ratio
    lr_scheduler_type="cosine", # Learning rate scheduler
    report_to="tensorboard", # Report metrics to TensorBoard
    disable_tqdm=False, # Enable tqdm progress bar
)

trainer = SFTTrainer(
    model=model,                             # Model already wrapped with the LoRA adapters (get_peft_model)
    tokenizer=tokenizer,
    train_dataset=train,
    peft_config=lora_config,                 # Same LoRA config used to wrap the model; presumably redundant here - TODO confirm it can be dropped
    dataset_text_field="text",               # Dataset column that holds the formatted prompt
    max_seq_length=512,                      # Maximum sequence length; tune to the available VRAM and the data
    # The prompts already contain the literal <s> / </s> markers, so the
    # tokenizer must not add its own special tokens again (avoids a double BOS).
    dataset_kwargs={"add_special_tokens": False},
    args=training_arguments,
)

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

"\ntrainer = SFTTrainer(\n    model=model,\n    train_dataset=train, # Your formatted Spider dataset\n    peft_config=lora_config if 'lora_config' in locals() else None, # Pass LoRA config if not using Unsloth's direct model patching\n    tokenizer=tokenizer,\n    args=training_arguments,\n    packing=False, # Set to True for packing multiple short examples into one sequence for efficiency\n)\n"

In [None]:
# Run the fine-tuning loop; the returned TrainOutput (step count, loss, metrics) is displayed.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss
50,8.3647


  return fn(*args, **kwargs)


TrainOutput(global_step=54, training_loss=8.190933933964482, metrics={'train_runtime': 5274.6102, 'train_samples_per_second': 0.664, 'train_steps_per_second': 0.01, 'total_flos': 1.4886398192418816e+16, 'train_loss': 8.190933933964482, 'epoch': 0.9874285714285714})

In [None]:
# Save the trained model to the local "mistral_spider_qlora" directory.
trainer.save_model("mistral_spider_qlora")

In [None]:
# import zipfile
# import os
# from google.colab import files

# folder_to_download = "mistral_spider_qlora" #Substitua pelo nome da pasta que você quer baixar
# output_zip_file = f"{folder_to_download}.zip"

# Create the ZIP archive
# zipf = zipfile.ZipFile(output_zip_file, 'w', zipfile.ZIP_DEFLATED)
# for root, dirs, files_in_folder in os.walk(folder_to_download):
#     for file in files_in_folder:
#         zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), os.path.join(folder_to_download, '..')))
# zipf.close()

# print(f"Pasta '{folder_to_download}' compactada para '{output_zip_file}'.")

# files.download(output_zip_file)
# print(f"Download de '{output_zip_file}' iniciado.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>