In [None]:
# ====================  PACKAGE INSTALLATION ====================
# NOTE(review): none of the installs below pins a version, so the resolved
# environment can differ from one run to the next.
# NOTE(review): the captured output shows unsloth resolving transformers inside a
# pinned range (<=4.56.2); the later `--upgrade transformers` may move past that
# pin - confirm the combination is still compatible.
!pip install --upgrade pip
!pip install accelerate bitsandbytes einops sentencepiece
!pip install git+https://github.com/huggingface/peft.git
!pip install unsloth
!pip install --no-deps xformers
!pip install trl  # Latest release, no version constraint
!pip install --upgrade transformers datasets
!pip install tqdm



Collecting git+https://github.com/huggingface/peft.git
  Cloning https://github.com/huggingface/peft.git to /tmp/pip-req-build-wuv6a1d0
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-wuv6a1d0
  Resolved https://github.com/huggingface/peft.git to commit b0954e0daa9b263449cef6d6b4b31f31e862e041
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting transformers!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,<=4.56.2,>=4.51.3 (from unsloth)
  Downloading transformers-4.56.2-py3-none-any.whl.metadata (40 kB)
Downloading transformers-4.56.2-py3-none-any.whl (11.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m39.7 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found 

In [None]:
# ====================  IMPORTS ====================
import gc
import json
import logging
import os
import time
from pathlib import Path

import pandas as pd
import torch
from datasets import load_dataset
from IPython.display import display, clear_output
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    TrainerCallback
)



In [None]:
# ==================== 3️⃣ LOGGING SETUP ====================
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ==================== 4️⃣ MOUNT GOOGLE DRIVE ====================
# Colab-specific import, kept next to its only use.
from google.colab import drive
drive.mount('/content/drive')

# ==================== 5️⃣ DIRECTORY LAYOUT ====================
# Every artefact of the notebook lives under this single Drive folder.
BASE_DIR = Path("/content/drive/MyDrive/FIAP_FINETUNING")

INPUT_FILE = BASE_DIR / "trn.json"                 # raw input, one JSON document per line
CHUNKS_DIR = BASE_DIR / "chunks"                   # chunk_NNN.json files produced by the split
TOKENIZED_DIR = BASE_DIR / "tokenized_datasets"    # tokenized splits saved to disk
OUTPUT_DIR = BASE_DIR / "output"                   # final model and CSV log
CHECKPOINT_DIR = BASE_DIR / "checkpoints"          # intermediate checkpoints
LOG_FILE = OUTPUT_DIR / "training_log.csv"

# Create the same directories as before; TOKENIZED_DIR is created by the cell that
# writes into it.
for _directory in (BASE_DIR, CHUNKS_DIR, OUTPUT_DIR, CHECKPOINT_DIR):
    _directory.mkdir(parents=True, exist_ok=True)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ==================== 6️⃣ SPLIT THE INPUT INTO CHUNKS ====================
def split_json_in_chunks(input_file: Path, output_dir: Path, chunk_size: int = 50000):
    """Split a JSON Lines file into numbered JSON-array chunk files.

    Every non-blank line of ``input_file`` is parsed as one JSON document. Records
    are grouped ``chunk_size`` at a time and written to ``output_dir`` as
    ``chunk_001.json``, ``chunk_002.json``, ... Each file is a pretty-printed JSON
    array encoded as UTF-8 with non-ASCII characters kept as-is. A final, shorter
    chunk holds whatever is left over.

    Args:
        input_file: Path of the JSON Lines file to read.
        output_dir: Directory that receives the chunk files; created if missing.
        chunk_size: Maximum number of records per chunk file (must be >= 1).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    logger.info(f"Iniciando divisão do arquivo: {input_file}")

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    def _write_chunk(records, index):
        """Write one chunk file and log how many records it holds."""
        chunk_path = output_dir / f"chunk_{index:03d}.json"
        with open(chunk_path, "w", encoding="utf-8") as out:
            json.dump(records, out, ensure_ascii=False, indent=2)
        logger.info(f"Chunk {index:03d} salvo ({len(records)} linhas)")

    chunk = []
    chunk_index = 1

    with open(input_file, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (for example a trailing newline or a gap between records)
            # carry no record and would make json.loads fail.
            if not line.strip():
                continue

            chunk.append(json.loads(line))

            if len(chunk) >= chunk_size:
                _write_chunk(chunk, chunk_index)
                chunk = []
                chunk_index += 1

    # Last chunk (remainder)
    if chunk:
        _write_chunk(chunk, chunk_index)

    logger.info("✅ Divisão concluída com sucesso!")

# ==================== 7️⃣ RUN THE SPLIT (only when needed) ====================
# Any *.json file already present in CHUNKS_DIR counts as "already split".
chunks_already_present = any(CHUNKS_DIR.glob("*.json"))

if chunks_already_present:
    logger.info("ℹ️ Chunks já existem, pulando divisão...")
else:
    split_json_in_chunks(INPUT_FILE, CHUNKS_DIR, chunk_size=50000)

# ==================== 8️⃣ LOAD MODEL AND TOKENIZER ====================
logger.info("🔄 Carregando modelo e tokenizer...")

model_name = "unsloth/Llama-3.2-1B-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fall back to the EOS token for padding when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# 4-bit loading is requested through a BitsAndBytesConfig and the compute dtype
# through `dtype`: the bare `load_in_4bit` and `torch_dtype` keywords are both
# reported as deprecated by the installed transformers release (see the warnings
# captured in this cell's output).
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    dtype=torch.float16,
)

# Prepare the quantized model for training
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()

logger.info("✅ Modelo carregado com sucesso!")

# ==================== 9️⃣ LORA CONFIGURATION ====================
# Module names that receive an adapter, listed in two groups for readability.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

# Wrap the base model with the adapters and report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# ==================== 🔟 LOAD DATASETS ====================
# Collect every chunk file as a string path, in sorted order.
json_files = sorted(str(chunk_path) for chunk_path in CHUNKS_DIR.glob("*.json"))

if len(json_files) == 0:
    raise FileNotFoundError("❌ Nenhum arquivo JSON encontrado em chunks/")

logger.info(f"📊 Carregando {len(json_files)} arquivo(s) JSON...")

# OPTION 1: load only the first chunk (for quick tests)
# dataset = load_dataset("json", data_files=str(CHUNKS_DIR / "chunk_001.json"), split="train")

# OPTION 2: load ALL chunks (for the full training run)
dataset = load_dataset("json", split="train", data_files=json_files)

logger.info(f"✅ Dataset carregado com {len(dataset)} entradas")

# ==================== 1️⃣1️⃣ TRAIN / VALIDATION SPLIT ====================
# 90% train, 10% validation; the fixed seed keeps the split reproducible.
train_test_split = dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = train_test_split["train"], train_test_split["test"]

logger.info(f"📊 Divisão: {len(train_dataset)} treino, {len(eval_dataset)} validação")

# ==================== 1️⃣2️⃣ TOKENIZATION ====================
max_length = 1024

def preprocess(examples):
    """Turn a batch of records into tokenized causal-LM training examples.

    Each record becomes the text ``### Título: <title>\\n### Conteúdo: <content>``.
    A missing ``title`` column, or a null title/content value, contributes an empty
    string instead of the literal text "None". The texts are tokenized with
    truncation and padding to ``max_length``.

    Args:
        examples: Mapping of column name to a list of values (one batch), read via
            ``.get("title")`` and ``.get("content")``.

    Returns:
        The tokenizer output with an extra ``labels`` entry that holds an
        independent copy of every ``input_ids`` row.
    """
    contents = examples.get("content", [])
    titles = examples.get("title", [""] * len(contents))

    def _text(value):
        """Map a null field to an empty string so it does not render as 'None'."""
        return "" if value is None else value

    # Structured prompt format
    texts = [
        f"### Título: {_text(t)}\n### Conteúdo: {_text(c)}"
        for t, c in zip(titles, contents)
    ]

    tokenized = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors=None
    )

    # Labels for language modelling: copy row by row so that labels and input_ids
    # never share the same inner list objects.
    tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]

    return tokenized

logger.info("🔄 Tokenizando datasets...")

def _tokenize_split(split, progress_label):
    """Run `preprocess` over one split in batches, dropping the split's original columns."""
    return split.map(
        preprocess,
        batched=True,
        remove_columns=split.column_names,
        desc=progress_label,
    )

tokenized_train = _tokenize_split(train_dataset, "Tokenizando treino")
tokenized_eval = _tokenize_split(eval_dataset, "Tokenizando validação")
logger.info("✅ Tokenização concluída!")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`torch_dtype` is deprecated! Use `dtype` instead!
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Tokenizando validação:   0%|          | 0/136655 [00:00<?, ? examples/s]

In [None]:
# Persist both tokenized splits so later sessions can skip tokenization.
TOKENIZED_DIR = BASE_DIR / "tokenized_datasets"
TOKENIZED_DIR.mkdir(parents=True, exist_ok=True)

logger.info("💾 Salvando datasets tokenizados...")
for split_name, split_data in (("train", tokenized_train), ("eval", tokenized_eval)):
    split_data.save_to_disk(str(TOKENIZED_DIR / split_name))
logger.info(f"✅ Datasets salvos em: {TOKENIZED_DIR}")


Saving the dataset (0/33 shards):   0%|          | 0/1229886 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/136655 [00:00<?, ? examples/s]

In [None]:
from pathlib import Path
from datasets import load_from_disk

# Data locations (re-declared in this cell)
BASE_DIR = Path("/content/drive/MyDrive/FIAP_FINETUNING")
TOKENIZED_DIR = BASE_DIR.joinpath("tokenized_datasets")

# Reload both splits from disk (the paths contain NO spaces)
train_location = str(TOKENIZED_DIR.joinpath("train"))
eval_location = str(TOKENIZED_DIR.joinpath("eval"))
tokenized_train = load_from_disk(train_location)
tokenized_eval = load_from_disk(eval_location)

n_train, n_eval = len(tokenized_train), len(tokenized_eval)
print(f"✅ Train: {n_train} | Eval: {n_eval}")

Loading dataset from disk:   0%|          | 0/33 [00:00<?, ?it/s]

✅ Train: 1229886 | Eval: 136655


In [None]:
# ==================== 1️⃣3️⃣ TESTES DE VALIDAÇÃO ====================
def test_dataset(dataset, name="Dataset"):
    assert len(dataset) > 0, f"{name} está vazio!"
    logger.info(f"✅ {name} válido com {len(dataset)} entradas.")

def test_tokenization(dataset, name="Dataset"):
    sample = dataset[0]
    assert "input_ids" in sample, f"{name} não gerou input_ids!"
    assert len(sample["input_ids"]) <= max_length, f"{name} excedeu max_length!"
    logger.info(f"✅ Tokenização de {name} válida.")

def test_output_dir(output_dir):
    assert output_dir.exists(), f"Diretório {output_dir} não existe!"
    logger.info(f"✅ Diretório de saída {output_dir} pronto.")

# Run the checks: sizes of both splits first, then tokenization of both splits,
# then the output directory.
splits_under_test = (
    (tokenized_train, "Train dataset"),
    (tokenized_eval, "Eval dataset"),
)
for check in (test_dataset, test_tokenization):
    for split, label in splits_under_test:
        check(split, label)
test_output_dir(OUTPUT_DIR)



In [None]:
# ==================== 1️⃣4️⃣ DATA COLLATOR ====================
# Collator options: masked-LM mode switched off, batches returned as PyTorch tensors.
collator_options = {"mlm": False, "return_tensors": "pt"}
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, **collator_options)

# ==================== 1️⃣5️⃣ CUSTOM LOGGING CALLBACK ====================
class CustomLoggingCallback(TrainerCallback):
    """Trainer callback that appends every log event to a CSV file.

    One row is written per `on_log` call with the step, training loss, elapsed
    minutes, evaluation loss and progress percentage. Whenever the step is a multiple
    of 100 the last ten rows of the file are shown in the notebook output.
    """

    # Column order shared by the header and by every appended row.
    COLUMNS = ["step", "loss", "elapsed_min", "eval_loss", "progress_%"]

    def __init__(self, log_file):
        """Remember the CSV path, start the timer and create the file if missing.

        Args:
            log_file: Path (or path string) of the CSV file to append to.
        """
        # The base callback defines no constructor parameters, so nothing is
        # forwarded to it; passing keyword arguments here raises TypeError.
        super().__init__()
        self.log_file = Path(log_file)
        self.start_time = time.time()

        # Create the CSV with its header if it does not exist yet
        if not self.log_file.exists():
            pd.DataFrame(columns=self.COLUMNS).to_csv(self.log_file, index=False)

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Append the current metrics to the CSV and periodically display them.

        Does nothing when `logs` is empty or None.
        """
        if not logs:
            return

        elapsed_min = (time.time() - self.start_time) / 60
        # max_steps may be 0/None; report 0% rather than dividing by it.
        progress = round(state.global_step / state.max_steps * 100, 2) if state.max_steps else 0

        row = {
            "step": state.global_step,
            "loss": logs.get("loss", ""),
            "elapsed_min": round(elapsed_min, 2),
            "eval_loss": logs.get("eval_loss", ""),
            "progress_%": f"{progress}%"
        }

        # Append to the CSV, forcing the declared column order
        pd.DataFrame([row], columns=self.COLUMNS).to_csv(
            self.log_file, mode='a', header=False, index=False
        )

        # Show the table every 100 steps
        if state.global_step % 100 == 0:
            try:
                df = pd.read_csv(self.log_file)
                clear_output(wait=True)
                print("📊 Últimas 10 métricas:")
                display(df.tail(10))
            except Exception as exc:
                # Displaying metrics is best-effort: report the problem and keep
                # training instead of silently discarding it.
                logger.warning(f"Could not display metrics from {self.log_file}: {exc}")




In [None]:

# ==================== TRAINING ARGUMENTS ====================
# Probe bf16 support once; fp16 is used exactly when bf16 is not supported.
bf16_available = torch.cuda.is_bf16_supported()
# Checkpoints and evaluations share the same step interval.
eval_and_save_every = 300

training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir=str(CHECKPOINT_DIR),
    logging_dir=str(OUTPUT_DIR / "logs"),
    report_to="none",

    # Run length
    num_train_epochs=1,
    max_steps=2000,

    # Batching
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,

    # Optimizer and learning-rate schedule
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_steps=100,

    # Numeric precision
    bf16=bf16_available,
    fp16=not bf16_available,
    gradient_checkpointing=True,

    # Logging cadence
    logging_steps=100,
    logging_first_step=True,

    # Checkpointing
    save_strategy="steps",
    save_steps=eval_and_save_every,
    save_total_limit=3,

    # Evaluation
    eval_strategy="steps",
    eval_steps=eval_and_save_every,
    load_best_model_at_end=True,
    metric_for_best_model="loss",

    # Data loading
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
)
# The tokenizer is handed over as `processing_class`; the older `tokenizer=` keyword
# is deprecated in recent transformers releases, and this notebook always installs
# the latest one.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    processing_class=tokenizer,
    data_collator=data_collator,
)

# ==================== 1️⃣7️⃣ LOOK FOR AN EXISTING CHECKPOINT ====================
def _checkpoint_step(checkpoint_dir):
    """Return the numeric suffix of a `checkpoint-<step>` directory name, or -1 if absent."""
    suffix = checkpoint_dir.name.rpartition("-")[2]
    return int(suffix) if suffix.isdecimal() else -1

latest_checkpoint = None
if CHECKPOINT_DIR.exists():
    checkpoints = [d for d in CHECKPOINT_DIR.iterdir() if d.is_dir() and "checkpoint" in d.name]
    if checkpoints:
        # Pick the highest step number, not the last name: compared as strings,
        # "checkpoint-900" would sort after "checkpoint-1200". The name is only a
        # tie-breaker for directories without a numeric suffix.
        newest = max(checkpoints, key=lambda d: (_checkpoint_step(d), d.name))
        latest_checkpoint = str(newest)
        logger.info(f"🔄 Checkpoint encontrado: {latest_checkpoint}")

# ==================== 1️⃣8️⃣ CREATE THE CSV LOG ====================
# The header lists the seven fields that the training loop appends per row, in the
# same order, so the file can be read back with correctly aligned columns.
if not LOG_FILE.exists():
    pd.DataFrame(
        columns=[
            "step", "train_loss", "eval_loss", "learning_rate",
            "elapsed_min", "checkpoint", "progress_%",
        ]
    ).to_csv(LOG_FILE, index=False)

In [None]:
# ==================== ADJUSTED TRAINING LOOP ====================
import pandas as pd

# Display settings for the metrics table shown while training runs.
for option_name, option_value in {
    'display.width': 120,
    'display.max_columns': None,
    'display.max_colwidth': 15,
    'display.float_format': '{:.4f}'.format,
}.items():
    pd.set_option(option_name, option_value)

logger.info("🚀 Iniciando loop de treinamento customizado...")

# --- Initial setup ---
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)
model.train()

# Shuffled training batches, collated by the language-modelling collator.
train_dataloader = DataLoader(
    tokenized_train,
    collate_fn=data_collator,
    batch_size=training_args.per_device_train_batch_size,
    num_workers=training_args.dataloader_num_workers,
    shuffle=True,
    pin_memory=True,
)

# Optimizer and scheduler are built by the Trainer from training_args.
optimizer = trainer.create_optimizer()
lr_scheduler = trainer.create_scheduler(
    optimizer=optimizer,
    num_training_steps=training_args.max_steps,
)

# --- Control variables ---
start_time = time.time()
global_step = 0
table_data = []  # rows of the on-screen metrics table

# --- Main loop ---
progress_bar = tqdm(
    desc="Treinamento em progresso",
    total=training_args.max_steps,
    ncols=200,
)

# Mixed-precision settings. Autocast on CUDA uses float16 unless a dtype is given,
# so the dtype is passed explicitly to honour bf16=True in training_args.
# NOTE(review): no GradScaler is used, so fp16 runs have no loss scaling - confirm
# whether that is acceptable for this model.
use_amp = bool(training_args.fp16 or training_args.bf16)
amp_dtype = torch.bfloat16 if training_args.bf16 else torch.float16
accum_steps = training_args.gradient_accumulation_steps

# Sum of the scaled micro-batch losses since the last optimizer step; because each
# loss is divided by accum_steps, the sum is the mean loss of the window.
window_loss = 0.0

try:
    for epoch in range(int(training_args.num_train_epochs)):
        for batch_idx, batch in enumerate(train_dataloader):
            # Move the batch to the training device
            batch = {k: v.to(device) for k, v in batch.items()}

            # Forward pass under mixed-precision autocast
            with torch.autocast(device_type="cuda", dtype=amp_dtype, enabled=use_amp):
                outputs = model(**batch, use_cache=False)
                loss = outputs.loss

            # Gradient accumulation: scale so the summed gradients form a mean
            loss = loss / accum_steps
            loss.backward()
            window_loss = window_loss + loss.detach().float()

            # Weight update once per accumulation window
            if (batch_idx + 1) % accum_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                progress_bar.update(1)

                # Freeze this window's mean loss and start a new window
                step_loss, window_loss = window_loss, 0.0

                # --- Evaluation ---
                eval_loss_display = "-"
                if global_step % training_args.eval_steps == 0:
                    logger.info("📊 Executando avaliação...")
                    eval_results = trainer.evaluate()
                    eval_loss_display = f"{eval_results['eval_loss']:.4f}"
                    logger.info(f"✅ Resultado da Avaliação: {eval_results}")
                    model.train()  # make sure the model is back in training mode

                # Checkpoint
                if global_step % training_args.save_steps == 0:
                    checkpoint_path = CHECKPOINT_DIR / f"checkpoint-{global_step}"
                    trainer.save_model(str(checkpoint_path))
                    logger.info(f"💾 Checkpoint salvo em: {checkpoint_path}")

                # --- Logging every 50 steps ---
                if global_step % 50 == 0:
                    current_loss = float(step_loss)
                    elapsed = (time.time() - start_time) / 60
                    ckpt = "Salvo" if global_step % training_args.save_steps == 0 else "-"
                    progress = round(global_step / training_args.max_steps * 100, 2)
                    current_lr = lr_scheduler.get_last_lr()[0]
                    row = [global_step, f"{current_loss:.4f}", eval_loss_display, round(elapsed, 2), ckpt, f"{progress}%"]

                    # Append one seven-field row to the CSV log
                    pd.DataFrame([[
                        global_step, current_loss,
                        eval_loss_display if eval_loss_display != '-' else '',
                        current_lr,
                        elapsed,
                        ckpt,
                        f"{progress}%"
                    ]], columns=["step", "train_loss", "eval_loss", "learning_rate", "elapsed_min", "checkpoint", "progress_%"]).to_csv(
                        LOG_FILE, mode='a', header=False, index=False
                    )

                    # Refresh the on-screen table with the five most recent rows
                    table_data.append(row)
                    clear_output(wait=True)
                    print("Últimas métricas registradas:")
                    display(pd.DataFrame(table_data, columns=["Step", "Train Loss", "Eval Loss", "Elapsed (min)", "Checkpoint", "Progresso (%)"]).tail(5))

                if global_step >= training_args.max_steps:
                    break

            # Periodic memory cleanup
            if (batch_idx + 1) % 100 == 0:
                gc.collect()
                torch.cuda.empty_cache()

        # Leave the epoch loop too once the step budget is used up
        if global_step >= training_args.max_steps:
            break

except KeyboardInterrupt:
    logger.warning("⚠️ Treinamento interrompido pelo usuário.")
except Exception as e:
    # Logged with traceback but not re-raised, so the code after this block still runs.
    logger.error(f"❌ Ocorreu um erro inesperado: {e}", exc_info=True)
finally:
    progress_bar.close()
    logger.info("🎉 Loop de treinamento finalizado.")

# ==================== WRAP-UP ====================
logger.info("💾 Salvando modelo final...")
final_model_path = OUTPUT_DIR / "final_model"
final_model_location = str(final_model_path)
# Model weights and tokenizer files go to the same directory.
trainer.save_model(final_model_location)
tokenizer.save_pretrained(final_model_location)
logger.info(f"✅ Modelo final salvo em: {final_model_path}")

# ==================== FINAL EVALUATION =================
# (no evaluation code follows in this cell)

In [None]:
# Show the five most recent rows collected by the training loop.
print("Últimas métricas registradas:")
metrics_columns = ["Step", "Train Loss", "Eval Loss", "Elapsed (min)", "Checkpoint", "Progresso (%)"]
metrics_table = pd.DataFrame(table_data, columns=metrics_columns)
display(metrics_table.tail(5))