In [None]:
pip install transformers datasets torch accelerate

In [1]:
import pandas as pd
from datasets import Dataset
import json

# Load the data from the JSON Lines file (one JSON object per line).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of making json.loads raise.
with open("trn.json", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Build a DataFrame from the parsed records.
df = pd.DataFrame(data)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Keep only the columns needed for fine-tuning ('title' and 'content').
# The explicit .copy() yields an independent frame, so adding the "prompt"
# column below does not operate on a slice of the original DataFrame
# (which would trigger pandas' SettingWithCopyWarning).
df = df[["title", "content"]].copy()

# Build the model input in a FLAN-T5-style prompt format.
# The prompt contains only the question (the title). The answer text must not
# be part of the input: it is the training target, and including it would show
# the model the very text it is asked to produce.
df["prompt"] = "Question: " + df["title"] + " Answer:"

# Convert to a Hugging Face dataset; 'content' is the expected answer.
dataset = Dataset.from_pandas(df[["prompt", "content"]])

# Split into train (90%) and validation (10%). A fixed seed makes the split
# reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
from transformers import AutoTokenizer

MODEL_NAME = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(examples):
    """Tokenize a batch of prompts and their target answers.

    Args:
        examples: A batch from the dataset with the keys "prompt" (model
            input text) and "content" (target text).

    Returns:
        The tokenizer output for the batch, with inputs and labels padded or
        truncated to 512 tokens. Padding positions in "labels" are replaced
        by -100 so that they are ignored by the loss instead of training the
        model to emit pad tokens.
    """
    model_inputs = tokenizer(
        examples["prompt"],
        text_target=examples["content"],
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    # -100 is the label value ignored by the cross-entropy loss used by the
    # Hugging Face seq2seq models.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in model_inputs["labels"]
    ]
    return model_inputs


# Apply the tokenization to both splits.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Drop the original text columns to avoid conflicts with the model inputs.
tokenized_datasets = tokenized_datasets.remove_columns(["prompt", "content"])
tokenized_datasets.set_format("torch")


In [None]:
from transformers import AutoModelForSeq2SeqLM

# Instantiate the pretrained sequence-to-sequence checkpoint named by MODEL_NAME.
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=MODEL_NAME,
)


In [None]:
import inspect

from transformers import TrainingArguments, Trainer

# Newer transformers releases renamed two keyword arguments used here:
#   TrainingArguments(evaluation_strategy=...) -> eval_strategy
#   Trainer(tokenizer=...)                     -> processing_class
# The installed version is not pinned, so pick whichever name the installed
# classes actually accept instead of hard-coding one spelling.
training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
)

trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_key = (
    "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"
)

training_args = TrainingArguments(
    output_dir="./flan-t5-finetuned",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=False,  # Set to True to upload the model to the Hugging Face Hub
    **{eval_strategy_key: "epoch"},  # evaluate at the end of every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    **{tokenizer_key: tokenizer},
)

trainer.train()


In [None]:
# Persist the trained model and its tokenizer into the same directory.
save_dir = "./flan-t5-finetuned"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)


In [None]:
def generate_answer(question):
    """Generate an answer for a question with the fine-tuned model.

    Args:
        question: The question text (a product title in the training data).

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    # Use the same "Question: ... Answer:" framing as the training prompts,
    # so the model sees inputs in the format it was fine-tuned on.
    prompt = f"Question: {question} Answer:"
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # The tokenizer returns CPU tensors; after training the model may live on
    # another device (e.g. a GPU), so move the inputs to the model's device.
    inputs = inputs.to(model.device)

    # Make sure dropout is disabled while generating.
    model.eval()
    outputs = model.generate(**inputs, max_length=150)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Test example
question = "Smartphone Samsung Galaxy"
print(generate_answer(question))
