<a href="https://colab.research.google.com/github/marcos-2002/shakespeare-made-in-brazil/blob/main/fine-tunning_modelo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Mount Google Drive; the dataset and results paths used by later cells
# all live under this mount point ("/content/drive/MyDrive/...").
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
!pip uninstall -y transformers accelerate
!pip install transformers[torch] accelerate -U

In [None]:
!pip install datasets

In [6]:
import torch
import os
import accelerate
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset, DatasetDict

In [None]:
# Settings
model_name = "MarioJ/Portuguese-Poems-Small-Gpt2"
train_file = '/content/drive/MyDrive/Shakespeare made in Brazil/dataset/musicas-train.json'  # Path to the training file
test_file = '/content/drive/MyDrive/Shakespeare made in Brazil/dataset/musicas-test.json'    # Path to the test file
output_dir = '/content/drive/MyDrive/Shakespeare made in Brazil/results' # Path where results are written

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Tokenization in this notebook pads every example with padding='max_length',
# which requires the tokenizer to have a pad token. GPT-2 style tokenizers may
# not define one, so fall back to the EOS token only when none is configured.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the dataset: two JSON files exposed as the 'train' and 'test' splits
dataset = load_dataset('json', data_files={'train': train_file, 'test': test_file})

# Tokenization function
def tokenize_function(examples, max_length=128):
    """Tokenize the 'Letra' field and build causal-LM labels.

    Uses the module-level `tokenizer`. Every example is truncated/padded to
    `max_length` tokens. Labels are a copy of `input_ids`, except that
    positions whose attention mask is 0 (padding) are set to -100 so they are
    ignored by the loss instead of training the model to predict padding.

    Args:
        examples: Mapping with a 'Letra' entry; either a list of texts
            (as passed by `dataset.map(..., batched=True)`) or a single text.
        max_length: Fixed sequence length to truncate/pad to (default 128).

    Returns:
        The tokenizer output with an added "labels" entry, shaped like
        "input_ids".
    """
    tokenized = tokenizer(examples['Letra'], truncation=True, padding='max_length', max_length=max_length)

    input_ids = tokenized["input_ids"]
    attention = tokenized["attention_mask"]

    if input_ids and isinstance(input_ids[0], int):
        # Single (non-batched) example: input_ids is a flat list of token ids.
        labels = [tok if keep else -100 for tok, keep in zip(input_ids, attention)]
    else:
        # Batched call: one list of token ids per example.
        labels = [
            [tok if keep else -100 for tok, keep in zip(ids, mask)]
            for ids, mask in zip(input_ids, attention)
        ]

    tokenized["labels"] = labels
    return tokenized

# Tokenize the dataset (both splits), dropping the raw text columns afterwards
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["Poema:", "Letra"])

# Folder for the final model. This is the same path the generation cell of this
# notebook loads from; keeping it separate from `output_dir` itself also keeps
# the final model apart from the intermediate checkpoints written there.
final_model_dir = os.path.join(output_dir, 'modelo-final')

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir, # Directory where checkpoints are saved
    overwrite_output_dir=True,
    num_train_epochs=3, # Number of passes over the training split
    per_device_train_batch_size=2, # Examples per device per training step
    per_device_eval_batch_size=2, # Examples per device per evaluation step
    warmup_steps=500, # Learning-rate warmup steps
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps",  # Evaluate every N steps
    save_strategy="steps",  # Save checkpoints every N steps
    save_total_limit=3,  # Maximum number of checkpoints to keep
    load_best_model_at_end=True,
    save_steps=500,  # Save a checkpoint every 500 steps
    eval_steps=500,  # Evaluate every 500 steps (same cadence as saving)
)

# Create the Trainer
# NOTE(review): recent transformers releases rename the `tokenizer` argument to
# `processing_class`; confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer
)

# Train the model
trainer.train()  # Starts a fresh run (no checkpoint is resumed)

# Save the final model and tokenizer where the generation cell expects them
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

In [None]:
# Reload the fine-tuned tokenizer and model from the saved folder on Drive.
generation_model_dir = "/content/drive/MyDrive/Shakespeare made in Brazil/results/modelo-final"
tokenizer = AutoTokenizer.from_pretrained(generation_model_dir)
model = AutoModelForCausalLM.from_pretrained(generation_model_dir)

# Prompt the generated text continues from.
input_text = "Quando gero um poema"

# Take the attention mask straight from the tokenizer. Deriving it by comparing
# input_ids with tokenizer.pad_token_id fails when the tokenizer has no pad
# token (pad_token_id is None) and would mask real tokens whenever the pad
# token shares its id with EOS.
encoded = tokenizer(input_text, return_tensors="pt")
input_ids = encoded.input_ids
attention_mask = encoded.attention_mask

# Sample one continuation (top-k / nucleus sampling with a repetition penalty).
output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_length=100,  # total length in tokens, prompt included
    num_return_sequences=1,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.eos_token_id
)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

print(decoded_output)