In [8]:
%pip install transformers datasets peft accelerate bitsandbytes xformers unsloth

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [5]:
import json
import pandas as pd
from datasets import Dataset
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments
)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Treinando no dispositivo: {device}")

Treinando no dispositivo: cuda


In [3]:
file_path = "trn.json"

data = []
with open(file_path, "r") as file:
    for line in file:
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            print(f"Erro ao decodificar linha: {line}")
            print(f"Detalhes do erro: {e}")

filtered_data = [{"title": entry["title"], "content": entry["content"]} for entry in data]

output_file_path = "filtered_data.json"
with open(output_file_path, "w") as output_file:
    json.dump(filtered_data, output_file, indent=4)

print(f"Arquivo filtrado salvo em: {output_file_path}")

Arquivo filtrado salvo em: filtered_data.json


In [7]:
# Carregar o arquivo JSON
with open('filtered_data.json', 'r') as f:
    data = json.load(f)

# Converter para DataFrame
df = pd.DataFrame(data)

In [85]:
def create_prompt(row):
    return {
        "input": f"You are a book expert and should answer any question involving this title {row['title']}",
        "output": row["content"]
    }

# Aplicar ao DataFrame
fine_tune_data = df.apply(create_prompt, axis=1).tolist()

In [86]:
# Aplicar ao DataFrame
fine_tune_data = df.apply(create_prompt, axis=1).tolist()

# Dividir os dados em treinamento e validação
from sklearn.model_selection import train_test_split

# Separar 10% dos dados para validação
train_data, val_data = train_test_split(fine_tune_data[:10000], test_size=0.2)

In [87]:
with open('train_data.json', 'w') as f:
    json.dump(train_data, f, indent=4)

with open('val_data.json', 'w') as f:
    json.dump(val_data, f, indent=4)

In [88]:
# Carregar os datasets
train_dataset = Dataset.from_json("train_data.json")
val_dataset = Dataset.from_json("val_data.json")


Generating train split: 8000 examples [00:00, 95074.44 examples/s]
Generating train split: 2000 examples [00:00, 154001.36 examples/s]


In [89]:
# Escolha do modelo
model_name = "t5-small"  # Altere para o modelo desejado, como "mistralai/Mistral-7B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Mover o modelo para o dispositivo
model = model.to(device)  # Modificação: Move o modelo para GPU ou CPU
print(f"Modelo carregado no dispositivo: {model.device}")

# Função de tokenização
def tokenize_function(examples):
    return tokenizer(
        examples["input"], 
        text_target=examples["output"], 
        truncation=True, 
        padding="max_length", 
        max_length=256  # Modificação: Reduz o comprimento máximo para economizar memória
    )

# Tokenizar os datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

Modelo carregado no dispositivo: cuda:0


Map: 100%|██████████| 8000/8000 [00:01<00:00, 6614.05 examples/s]
Map: 100%|██████████| 2000/2000 [00:00<00:00, 6583.54 examples/s]


In [90]:
# Configurações de treinamento
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=2,  # Modificação: Reduz o batch size para evitar OOM
    gradient_accumulation_steps=4,  # Modificação: Acumula gradientes para simular batches maiores
    num_train_epochs=3,
    save_steps=500,
    save_total_limit=2,
    fp16=True,  # Modificação: Habilitar Mixed Precision Training
    logging_dir='./logs',
    predict_with_generate=True,
)





In [91]:
# Criar o Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
)

  trainer = Seq2SeqTrainer(


In [92]:
trainer.train()

 17%|█▋        | 500/3000 [02:33<12:49,  3.25it/s]

{'loss': 3.3907, 'grad_norm': 2.348841905593872, 'learning_rate': 2.5090000000000002e-05, 'epoch': 0.5}


 33%|███▎      | 1000/3000 [05:08<10:14,  3.26it/s]

{'loss': 2.4213, 'grad_norm': 2.3896596431732178, 'learning_rate': 2.009e-05, 'epoch': 1.0}



 33%|███▎      | 1000/3000 [05:28<10:14,  3.26it/s]

{'eval_loss': 2.2688496112823486, 'eval_runtime': 19.282, 'eval_samples_per_second': 103.724, 'eval_steps_per_second': 12.965, 'epoch': 1.0}


 50%|█████     | 1500/3000 [08:02<07:42,  3.24it/s]  

{'loss': 2.3836, 'grad_norm': 2.0116751194000244, 'learning_rate': 1.5090000000000001e-05, 'epoch': 1.5}


 67%|██████▋   | 2000/3000 [10:37<05:06,  3.26it/s]

{'loss': 2.4116, 'grad_norm': 1.9103543758392334, 'learning_rate': 1.009e-05, 'epoch': 2.0}



 67%|██████▋   | 2000/3000 [10:57<05:06,  3.26it/s]

{'eval_loss': 2.231015920639038, 'eval_runtime': 19.2696, 'eval_samples_per_second': 103.79, 'eval_steps_per_second': 12.974, 'epoch': 2.0}


 83%|████████▎ | 2500/3000 [13:31<02:32,  3.27it/s]  

{'loss': 2.3986, 'grad_norm': 1.7840808629989624, 'learning_rate': 5.0899999999999995e-06, 'epoch': 2.5}


100%|██████████| 3000/3000 [16:06<00:00,  3.24it/s]

{'loss': 2.3434, 'grad_norm': 1.7220593690872192, 'learning_rate': 9e-08, 'epoch': 3.0}



100%|██████████| 3000/3000 [16:27<00:00,  3.04it/s]

{'eval_loss': 2.2218337059020996, 'eval_runtime': 19.2555, 'eval_samples_per_second': 103.866, 'eval_steps_per_second': 12.983, 'epoch': 3.0}
{'train_runtime': 987.3166, 'train_samples_per_second': 24.308, 'train_steps_per_second': 3.039, 'train_loss': 2.558201904296875, 'epoch': 3.0}





TrainOutput(global_step=3000, training_loss=2.558201904296875, metrics={'train_runtime': 987.3166, 'train_samples_per_second': 24.308, 'train_steps_per_second': 3.039, 'total_flos': 1624101617664000.0, 'train_loss': 2.558201904296875, 'epoch': 3.0})

In [98]:
# Função para perguntas
def ask_question(question, model, tokenizer, device):
    inputs = tokenizer(question, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Modificação: Move os dados para o dispositivo correto
    outputs = model.generate(
    **inputs, 
    max_length=512,  # Corrigido: max_length
    num_beams=10,  # Beam search para respostas melhores
    no_repeat_ngram_size=2,  # Evitar repetições
    temperature=0.7,  # Controlar aleatoriedade
    top_k=50,  # Filtrar os 50 tokens mais prováveis
    top_p=0.95,  # Top-p sampling para diversidade
    do_sample=True
)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Exemplo de teste
question = "You are a book expert and should answer any question involving this title Collins German Unabridged Dictionary 5th Edition (Harpercollins Unabridged Dictionaries)"
response = ask_question(question, model, tokenizer, device)
print("Pergunta:", question)
print("Resposta:", response)

Pergunta: You are a book expert and should answer any question involving this title Collins German Unabridged Dictionary 5th Edition (Harpercollins Unabridged Dictionaries)
Resposta: &#8220;This text refers to the Collins German Unabridged Dictionary 5th Edition of this title.
