In [1]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from datasets import load_dataset
from peft import get_peft_model
import pandas as pd
import numpy as np
import re

  from .autonotebook import tqdm as notebook_tqdm
2024-06-01 15:44:49.424936: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
from peft import LoraConfig, TaskType

# LoRA adapter configuration shared by the whole notebook; `train` reads this
# module-level name when it wraps the base model with `get_peft_model`.
# Settings: causal-LM task, training mode (inference_mode=False), r=8,
# lora_alpha=32 and a LoRA dropout of 0.1.
peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)

In [3]:
def load_dataset(file_path, tokenizer, block_size = 128):
    """Build a ``TextDataset`` over the text file at ``file_path``.

    NOTE(review): this definition shadows ``datasets.load_dataset``, which is
    imported under the same name in the notebook's first cell. After this cell
    runs, the name refers to this helper.

    Args:
        file_path: Path of the training text file, forwarded to ``TextDataset``.
        tokenizer: Tokenizer instance, forwarded to ``TextDataset``.
        block_size: Forwarded to ``TextDataset`` unchanged (defaults to 128).

    Returns:
        The constructed ``TextDataset``.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)


def load_data_collator(tokenizer, mlm = False):
    """Create the ``DataCollatorForLanguageModeling`` used for batching.

    Args:
        tokenizer: Tokenizer instance handed to the collator.
        mlm: Forwarded to the collator's ``mlm`` flag; defaults to ``False``.

    Returns:
        The constructed ``DataCollatorForLanguageModeling``.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)


def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    """Fine-tune a GPT-2 checkpoint with a LoRA adapter on a text file.

    The tokenizer and the (still untrained) adapter are written to
    ``output_dir`` before training starts, and the trained model is saved to
    the same directory once ``Trainer.train()`` finishes.

    Args:
        train_file_path: Path of the training text file.
        model_name: Name or path of the pretrained GPT-2 checkpoint.
        output_dir: Directory receiving the tokenizer, checkpoints and model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments`` as the checkpoint
            interval, in steps.

    Returns:
        None.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    tokenizer.save_pretrained(output_dir)

    # GPT-2 reads its dropout rates from `attn_pdrop`, `resid_pdrop` and
    # `embd_pdrop`. They must be supplied while the model is being built:
    # assigning other attribute names on `model.config` afterwards does not
    # touch the dropout modules that already exist.
    model = GPT2LMHeadModel.from_pretrained(
        model_name,
        attn_pdrop=0.1,   # dropout inside the attention layers
        resid_pdrop=0.1,  # dropout on the residual / projection outputs
        embd_pdrop=0.1,   # dropout on the embeddings
    )
    # Wrap the base model with the notebook-level LoRA configuration.
    model = get_peft_model(model, peft_config)

    # Snapshot taken before training; `trainer.save_model()` below writes to
    # the same directory again once training completes.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,  # previously accepted but never forwarded
        learning_rate=1e-3,
        logging_steps=20,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()

In [8]:
# Training configuration consumed by the `train(...)` call in the next cell.
# NOTE: both paths below are absolute and machine-specific; edit them before
# running this notebook anywhere else.

# Base checkpoint and data locations.
model_name = 'gpt2'
train_file_path = '/home/vickcoghi/code/mmf3358/Conta_tu_o_Conto/raw_data/treino_5000.csv'
output_dir = '/home/vickcoghi/code/mmf3358/Conta_tu_o_Conto'
overwrite_output_dir = True

# Optimisation schedule.
num_train_epochs = 5.0
per_device_train_batch_size = 16
save_steps = 20


In [9]:
# Launch fine-tuning with the values defined in the parameters cell above.
# Original author's note: it takes about 30 minutes to train in Colab.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)

Repo card metadata block was not found. Setting CardData to empty.


Step,Training Loss
20,3.9574
40,3.6555
60,3.5938
80,3.5884
100,3.5123
120,3.5077
140,3.4994
160,3.5047
180,3.456
200,3.4211




KeyboardInterrupt: 

In [None]:
def load_model(model_path):
    """Load a ``GPT2LMHeadModel`` from ``model_path``.

    Args:
        model_path: Directory or identifier passed to ``from_pretrained``.

    Returns:
        The loaded ``GPT2LMHeadModel``.
    """
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Load a ``GPT2Tokenizer`` from ``tokenizer_path``.

    Args:
        tokenizer_path: Directory or identifier passed to ``from_pretrained``.

    Returns:
        The loaded ``GPT2Tokenizer``.
    """
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length,
                  model_path="/home/vickcoghi/code/mmf3358/Conta_tu_o_Conto"):
    """Sample a continuation of ``sequence`` and print it.

    The model and tokenizer are both loaded from ``model_path`` on every call.

    Args:
        sequence: Prompt to continue; it is formatted to a string first.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        model_path: Directory holding the fine-tuned model and tokenizer.
            Defaults to the output directory used by the training cell, so
            existing two-argument calls behave as before.

    Returns:
        None. The decoded text is printed.
    """
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Call the tokenizer directly (instead of `.encode`) so the attention mask
    # comes back together with the input ids and can be handed to `generate`.
    encoded = tokenizer(f'{sequence}', return_tensors='pt')

    final_outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        # The end-of-sequence id doubles as the padding id here.
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))

In [None]:
# sequence = input()
# max_len = 50 # 20
# generate_text(sequence, max_len)

In [None]:
# Reload the model from the directory used as `output_dir` in the training
# parameters cell (absolute, machine-specific path).
model = load_model('/home/vickcoghi/code/mmf3358/Conta_tu_o_Conto')

In [None]:
# Text-generation pipeline built from the `model` object loaded in the
# previous cell; the tokenizer is given as a path to the same directory.
generator = pipeline(
    'text-generation',
    model=model,
    tokenizer='/home/vickcoghi/code/mmf3358/Conta_tu_o_Conto',
)