<a href="https://colab.research.google.com/github/marioalexandreantunes/inteligencia_artificial/blob/main/gpt2_small_lang_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Instalação de Bibliotecas Necessárias

In [None]:
!pip install pyarrow==14.0.2 requests==2.31.0 fsspec==2024.6.1 transformers[torch] datasets torch

# Importação de Bibliotecas

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, pipeline
from datasets import Dataset
import json
import pandas as pd

# Carregar Dados do Arquivo JSON

In [None]:
def load_data(file_path):
    """Read a JSON file and return its contents as a pandas DataFrame.

    Args:
        file_path: Path to a JSON file whose top-level value can be passed
            straight to ``pd.DataFrame`` (for example a list of records).

    Returns:
        A ``pd.DataFrame`` built from the parsed JSON.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so accented text is read the same way on every machine.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return pd.DataFrame(data)

# Location of the JSON dataset that the rest of the notebook trains on.
file_path = '/content/dataset.json'

# Parse the file into a DataFrame; later cells read its 'question' and
# 'answer' columns.
df = load_data(file_path=file_path)

# Inicializar Tokenizador e Preparar Dados

In [None]:
# Load the distilgpt2 tokenizer and reuse its end-of-sequence token as the
# padding token, so that padding="max_length" below has a token to pad with.
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token

# Each training example is the question followed by its answer.
df['text'] = df['question'] + " " + df['answer']

def tokenize_function(examples):
    """Tokenize a string (or a list of strings) to a fixed length of 512.

    Args:
        examples: Text, or a batch of texts, to pass to the tokenizer.

    Returns:
        The tokenizer output, padded to and truncated at 512 tokens.
    """
    return tokenizer(examples, padding="max_length", truncation=True, max_length=512)

# The DataFrame used to be tokenized row by row here as well, but that result
# was never read; tokenization now happens once, in tokenize_dataset().
dataset = Dataset.from_pandas(df)

def tokenize_dataset(dataset):
    """Tokenize the 'text' column of a dataset in batches.

    Args:
        dataset: A dataset with 'text', 'question' and 'answer' columns.

    Returns:
        The mapped dataset holding the tokenizer outputs; the three raw text
        columns are removed.
    """
    return dataset.map(
        lambda batch: tokenize_function(batch['text']),
        batched=True,
        remove_columns=["text", "question", "answer"],
    )

tokenized_dataset = tokenize_dataset(dataset)

# Adicionar Labels e Dividir Dataset

In [None]:
def add_labels(examples, ignore_index=-100):
    """Build language-modeling labels for a tokenized batch, skipping padding.

    Labels are a copy of ``input_ids`` where the attention mask is set. Padded
    positions receive ``ignore_index`` so they do not contribute to the loss,
    instead of letting up to 512 padding tokens per example dominate it.

    The first padded slot after the last real token keeps its token id: the
    padding token is the end-of-sequence token, so that slot is the only
    "stop here" target the example contains.

    Args:
        examples: Batch with parallel 'input_ids' and 'attention_mask' lists.
        ignore_index: Label value for positions excluded from the loss.

    Returns:
        A dict with a single 'labels' key, one list of labels per example.
    """
    labels = []
    for input_ids, attention_mask in zip(examples['input_ids'], examples['attention_mask']):
        row = [
            token if keep else ignore_index
            for token, keep in zip(input_ids, attention_mask)
        ]
        last_real = max((i for i, keep in enumerate(attention_mask) if keep), default=-1)
        first_pad = last_real + 1
        # Only applies when padding follows the text; a full-length or
        # left-padded row has no slot after its last real token.
        if first_pad < len(row):
            row[first_pad] = input_ids[first_pad]
        labels.append(row)
    return {'labels': labels}

tokenized_dataset = tokenized_dataset.map(add_labels, batched=True)

# Hold out 15% of the examples for evaluation. The seed is fixed so that
# re-running the notebook produces the same train/test partition.
train_test_split = tokenized_dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Função de Agrupamento de Dados

In [None]:
def data_collator(features):
    """Stack a list of per-example dicts into a batch of long tensors.

    Args:
        features: Sequence of mappings, each holding 'input_ids',
            'attention_mask' and 'labels' as equal-length lists of ints.

    Returns:
        A dict with the keys 'input_ids', 'attention_mask' and 'labels', each
        a ``torch.long`` tensor with one row per example. Any other keys in
        the input mappings are ignored.
    """
    return {
        key: torch.tensor([example[key] for example in features], dtype=torch.long)
        for key in ('input_ids', 'attention_mask', 'labels')
    }

# Inicializar Modelo e Argumentos de Treinamento

In [None]:
# Start from the pretrained distilgpt2 checkpoint.
model = GPT2LMHeadModel.from_pretrained('distilgpt2')

training_args = TrainingArguments(
    # Output and logging locations.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Training schedule: five epochs, evaluating at the end of each one.
    num_train_epochs=5,
    eval_strategy="epoch",
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimizer settings.
    learning_rate=3e-5,
    weight_decay=0.01,
)

# Configurar o Trainer e Iniciar o Treinamento

In [None]:
# Bring the model, training arguments, both dataset splits and the custom
# collator together in a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop.
trainer.train()

# Salvar o Modelo e Tokenizador Treinados

In [None]:
# Write the fine-tuned model and the tokenizer files into the same
# ./gpt2-chatbot directory so both can be reloaded from that one path.
model.save_pretrained("./gpt2-chatbot")
tokenizer.save_pretrained("./gpt2-chatbot")

# Carregar Modelo e Tokenizador Treinados

In [None]:
# Reload both artifacts from the directory written by the save step, which
# replaces the in-memory training objects bound to these two names.
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-chatbot")
model = GPT2LMHeadModel.from_pretrained("./gpt2-chatbot")

# Função para Gerar Resposta

In [None]:
def gerar_resposta(model, tokenizer, input_text, max_length=50, num_return_sequences=1):
    """Generate one or more text continuations for ``input_text``.

    Args:
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``; called to encode the prompt
            and used to decode the generated ids.
        input_text: Prompt string to continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_return_sequences: How many sequences to return. Values above 1
            turn sampling on, because a single greedy decode cannot produce
            several distinct sequences.

    Returns:
        A list of ``num_return_sequences`` decoded strings, with special
        tokens stripped.
    """
    # Let the tokenizer produce the attention mask together with the ids
    # rather than rebuilding it by hand.
    encoded = tokenizer(input_text, return_tensors='pt')

    # Put the inputs on the model's device so the call also works when the
    # model is not on the CPU.
    input_ids = encoded['input_ids'].to(model.device)
    attention_mask = encoded['attention_mask'].to(model.device)

    # Only override the decoding strategy when several sequences are asked
    # for; the default single-sequence call is left exactly as before.
    extra_kwargs = {}
    if num_return_sequences > 1:
        extra_kwargs['do_sample'] = True

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.eos_token_id,
        **extra_kwargs,
    )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Exemplo de Uso da Função de Geração de Resposta

In [None]:
# Sample prompt for a quick check of the reloaded model.
input_text = "What are the dimensions of a regulation soccer field?"

# gerar_resposta returns a list of strings; print the list as-is.
resposta = gerar_resposta(model, tokenizer, input_text)
print(resposta)