<a href="https://colab.research.google.com/github/lusabo/fashionAI/blob/main/T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!rm -r /content/results
!rm -r /content/t5-chatbot

rm: cannot remove '/content/results': No such file or directory
rm: cannot remove '/content/t5-chatbot': No such file or directory


In [None]:
!pip install transformers[torch] datasets

In [None]:
import os
import json
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from transformers.trainer_callback import TrainerCallback
import logging

In [None]:
# Load the JSON file with the question/answer records.
file_path = '/content/dataset.json'

# Read as UTF-8 explicitly: the records are expected to contain accented
# Portuguese text, and the platform default encoding is not guaranteed to be UTF-8.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Turn the records into a DataFrame.
df = pd.DataFrame(data)

# Build a Hugging Face Dataset from the DataFrame.
dataset = Dataset.from_pandas(df)

In [None]:
def preprocess_function(examples):
    """Build the prompt and target strings for a batch of examples.

    Args:
        examples: Batch mapping with a 'pergunta' list (questions) and a
            'resposta' list (answers).

    Returns:
        A dict with "input_text" (each question prefixed with "pergunta: ")
        and "target_text" (each answer prefixed with "resposta: ").
    """
    prompt_template = "pergunta: {}"
    target_template = "resposta: {}"
    return {
        "input_text": list(map(prompt_template.format, examples['pergunta'])),
        "target_text": list(map(target_template.format, examples['resposta'])),
    }

# Apply the preprocessing function batch-wise. `dataset` is rebound to the mapped
# result, which carries the "input_text" and "target_text" columns it returns.
dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Sanity check: print the first example of the dataset to confirm the preprocessing output.
print(f"Primeiros exemplos do Dataset após preprocessamento:\n{dataset[0]}")

In [None]:
# Load the T5 tokenizer. `model_name` is also used later to load the model weights.
# NOTE(review): the t5-small vocabulary may not cover every accented Portuguese
# character — confirm that sample texts round-trip through the tokenizer.
model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of prompt/target strings for seq2seq training.

    Args:
        examples: Batch mapping with "input_text" and "target_text" lists of
            strings.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the token ids of the targets.
    """
    # Sequences are deliberately left unpadded. Padding to max_length here would
    # write pad-token ids into `labels`, and the loss would then treat them as
    # real targets. The seq2seq data collator pads each batch at training time
    # and fills label padding with the ignore index instead.
    model_inputs = tokenizer(examples["input_text"], max_length=512, truncation=True)
    labels = tokenizer(examples["target_text"], max_length=512, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the whole dataset in batches; the result is kept under a new name.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Sanity check: print the first tokenized example to confirm the tokenization output.
print(f"Primeiros exemplos do Dataset tokenizado:\n{tokenized_dataset[0]}")

In [None]:
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Hold out part of the data for evaluation. Evaluating on the very examples the
# model is trained on says nothing about how it handles unseen questions.
# The split is seeded so re-runs produce the same partition.
if len(tokenized_dataset) >= 2:
    dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
    train_split, eval_split = dataset_splits["train"], dataset_splits["test"]
else:
    # Too few examples to split: fall back to using the same data for both.
    train_split = eval_split = tokenized_dataset

# Training hyperparameters; evaluation runs once at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01
)

# Collator that pads inputs and labels per batch during training.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    data_collator=data_collator
)

In [None]:
# Run the fine-tuning loop with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the model and the tokenizer into the same directory so both can be
# reloaded from one path. (No verification step is performed here.)
output_dir = "./t5-chatbot"

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Reload the saved model and tokenizer from disk. This rebinds the global
# `model` and `tokenizer` names used by the inference helper below.
model = T5ForConditionalGeneration.from_pretrained("./t5-chatbot")
tokenizer = T5Tokenizer.from_pretrained("./t5-chatbot")

def generate_answer(question, max_new_tokens=50, num_beams=3):
    """Generate an answer for `question` with the loaded T5 model.

    Args:
        question: Question text; it is prefixed with "pergunta: " before
            being tokenized.
        max_new_tokens: Upper bound on the number of tokens to generate.
        num_beams: Number of beams used for beam search.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens removed.
    """
    input_text = f"pergunta: {question}"
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids

    # Forward the caller's settings; previously the literals 50 and 3 were
    # hard-coded here, so the function arguments had no effect.
    outputs = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,
        num_beams=num_beams,
        early_stopping=True,
    )

    # Decode the first (best) generated sequence.
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [None]:
# Smoke test: ask the model a new question and print the generated answer.
print(generate_answer("Como usar calça jeans de forma estilosa?"))