In [1]:
import torch
import pytorch_lightning as pl
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, GPT2LMHeadModel, GPT2Tokenizer,BertForQuestionAnswering, BertTokenizer
from torch.utils.data import DataLoader, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = AutoModelForCausalLM.from_pretrained('microsoft/DialoGPT-small')
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small', padding_side='left') 
tokenizer.pad_token = tokenizer.eos_token

In [3]:
myDataEnglish = "The solar system consists of the Sun and all celestial objects orbiting it. There are eight major planets in our solar system, with Earth being the only one known to support life. The largest planet is Jupiter, while the smallest is Mercury. Beyond the planets, there are numerous asteroids, comets, and dwarf planets, such as Pluto. The study of our solar system continues to reveal its vast complexity and beauty."
# myDataEnglish = "asd, aasddsds"
batch_size = 2
num_epochs = 5


In [4]:
class CustomDataSet(Dataset):
    def __init__(self, data):
        self.data = data.split(',')
        pass
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        return self.data[index]
    
dataset = CustomDataSet(myDataEnglish)
dataLoader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [5]:
for epoch in range(num_epochs):
    total_loss = 0
    model.train()
    for bach in dataLoader:
        idsTraining = tokenizer(bach, return_tensors='pt', padding=True, truncation=True)
        input_ids = idsTraining['input_ids']
        optimizer.zero_grad()
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # print(loss)
    
    avg_loss = total_loss / len(dataLoader)
    # print(avg_loss)
    print(f'Epoch [{epoch + 1}/{num_epochs}] Loss: {loss.item()}')


model.config.pad_token_id = model.config.eos_token_id

Epoch [1/5] Loss: 8.95949935913086
Epoch [2/5] Loss: 7.261989593505859
Epoch [3/5] Loss: 4.7879791259765625
Epoch [4/5] Loss: 2.95976185798645
Epoch [5/5] Loss: 9.934455871582031


In [6]:
inputs = tokenizer(["what ocean did the titanic sink in ?"], return_tensors="pt")

# What are the major components of our solar system?
# print(inputs)
# what ocean did the titanic sink in ?
# inputs = tokenizer('Qaundo o titanic afundou ?', return_tensors="pt")
# outputs = model(asd['input_ids'])
# print(inputs['input_ids'])
outputs = model.generate(
    inputs['input_ids'],
    max_new_tokens=20,
    num_beams=2,
    num_return_sequences=2,
    return_dict_in_generate=True,
    output_scores=True,

    # max_length=50,  # Comprimento máximo do texto gerado
    # min_length=10,  # Comprimento mínimo do texto gerado
    # num_return_sequences=1,  # Três sequências geradas
    temperature=10,  # Temperatura moderada para aleatoriedade
    # top_k=50,  # Limita as escolhas de token
    # top_p=0.9,  # Controla a proporção acumulada de probabilidade
    # repetition_penalty=1.2,  # Penalização de repetição de tokens
    # pad_token_id=tokenizer.pad_token_id,  # Token de preenchimento
    # eos_token_id=tokenizer.eos_token_id,  # Token de fim de sequência
    eos_token_id=4013,  # Token de fim de sequência
    # length_penalty=0.1
)


# print(outputs)

output_ids = outputs['sequences']
# output_ids = outputs[0]
print(output_ids)
# print(len(output_ids))
# resposta = tokenizer.decode(output_ids[0])
for id in output_ids:
    resposta = tokenizer.decode(id)
    print(resposta)
# resposta = tokenizer.decode()
# print(resposta)

# resposte = tokenizer.decode(outputs['sequences'])
# print(resposte)
# print(outputs['sequences'])sequences_scores, beam_indices



tensor([[10919,  9151,   750,   262,  5259, 26277, 14595,   287,  5633, 50256,
            13,  1318,   318,   691,   530,  1900,  1688,  5440,   284,  1104,
          1204,    13, 22409,    13,   383,  4387,   318, 22721,    13],
        [10919,  9151,   750,   262,  5259, 26277, 14595,   287,  5633, 50256,
            13,  1318,   318,   691,   530,  1900,  1688,  5440,   284,  1104,
          1204,    13, 22409,    13,  1318,   389,    13,  1318,   389]])
what ocean did the titanic sink in?<|endoftext|>. There is only one known major planet to support life. intercourse. The largest is Jupiter.
what ocean did the titanic sink in?<|endoftext|>. There is only one known major planet to support life. intercourse. There are. There are
