### Dataset

Volvemos a descargar el dataset de chistes

In [1]:
from datasets import load_dataset

jokes = load_dataset("Maximofn/short-jokes-dataset")
jokes

DatasetDict({
    train: Dataset({
        features: ['ID', 'Joke'],
        num_rows: 231657
    })
})

Creamos un subset por si se tiene poca memoria

In [2]:
percent_of_train_dataset = 0.0005    # If you want 50% of the dataset, set this to 0.5

subset_dataset = jokes["train"].select(range(int(len(jokes["train"]) * percent_of_train_dataset)))
subset_dataset

Dataset({
    features: ['ID', 'Joke'],
    num_rows: 115
})

Dividimos el dataset en subsets de entrenamiento, validación y test

In [3]:
percent_of_train_dataset = 0.90

split_dataset = subset_dataset.train_test_split(train_size=int(subset_dataset.num_rows * percent_of_train_dataset), seed=19, shuffle=False)
train_dataset = split_dataset["train"]
validation_test_dataset = split_dataset["test"]

split_dataset = validation_test_dataset.train_test_split(train_size=int(validation_test_dataset.num_rows * 0.5), seed=19, shuffle=False)
validation_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print(f"Size of the train set: {len(train_dataset)}. Size of the validation set: {len(validation_dataset)}. Size of the test set: {len(test_dataset)}")

Size of the train set: 103. Size of the validation set: 6. Size of the test set: 6


### Tokenizador

Iniciamos el tokenizador y asignamos el token de padding al de end of string

In [4]:
from transformers import AutoTokenizer

checkpoints = "openai-community/gpt2"

tokenizer = AutoTokenizer.from_pretrained(checkpoints)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Añadimos los tokens especiales de inicio de chiste y fin de chiste

In [5]:
new_tokens = ['<SJ>', '<EJ>']   # Start and end of joke tokens

num_added_tokens = tokenizer.add_tokens(new_tokens)
print(f"Added {num_added_tokens} tokens")

Added 2 tokens


Los añadimos al dataset

In [6]:
joke_column = "Joke"

def format_joke(example):
    example[joke_column] = '<SJ> ' + example['Joke'] + ' <EJ>'
    return example

remove_columns = [column for column in train_dataset.column_names if column != joke_column]

train_dataset = train_dataset.map(format_joke, remove_columns=remove_columns)
validation_dataset = validation_dataset.map(format_joke, remove_columns=remove_columns)
test_dataset = test_dataset.map(format_joke, remove_columns=remove_columns)
train_dataset, validation_dataset, test_dataset

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

(Dataset({
     features: ['Joke'],
     num_rows: 103
 }),
 Dataset({
     features: ['Joke'],
     num_rows: 6
 }),
 Dataset({
     features: ['Joke'],
     num_rows: 6
 }))

Tokenizamos el dataset

In [7]:
def tokenize_function(examples):
    return tokenizer(examples[joke_column], padding="max_length", truncation=True, max_length=768, return_tensors="pt")

train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=[joke_column])
validation_dataset = validation_dataset.map(tokenize_function, batched=True, remove_columns=[joke_column])
test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=[joke_column])
train_dataset, validation_dataset, test_dataset

Map:   0%|          | 0/103 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

(Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 103
 }),
 Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 6
 }),
 Dataset({
     features: ['input_ids', 'attention_mask'],
     num_rows: 6
 }))

### Modelo

Instanciamos el modelo, asignamos el token de padding y añadimos los nuevos tokens de inicion de chiste y fin de chiste

In [8]:
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(checkpoints)
model.config.pad_token_id = model.config.eos_token_id
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

### Device

Creamos el dispositivo y pasamos el modelo al dispositivo

In [9]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.half().to(device)
print()




### Pytorch Dataset

Creamos un dataset de pytorch

In [10]:
from torch.utils.data import Dataset

class JokesDataset(Dataset):
    def __init__(self, huggingface_dataset):
        self.dataset = huggingface_dataset

    def __getitem__(self, idx):
        input_ids = torch.tensor(self.dataset[idx]['input_ids'])
        attention_mask = torch.tensor(self.dataset[idx]['attention_mask'])
        return input_ids, attention_mask

    def __len__(self):
        return len(self.dataset)

Instanciamos los datasets de entrenamiento, validación y test

In [11]:
train_pytorch_dataset = JokesDataset(train_dataset)
validation_pytorch_dataset = JokesDataset(validation_dataset)
test_pytorch_dataset = JokesDataset(test_dataset)

Veamos una muestra

In [12]:
input_ids, attention_mask = train_pytorch_dataset[0]
input_ids.shape, attention_mask.shape

(torch.Size([768]), torch.Size([768]))

### Pytorch Dataloader

Creamos los dataloaders

In [13]:
from torch.utils.data import DataLoader

BS = 2

train_loader = DataLoader(train_pytorch_dataset, batch_size=BS, shuffle=True)
validation_loader = DataLoader(validation_pytorch_dataset, batch_size=BS)
test_loader = DataLoader(test_pytorch_dataset, batch_size=BS)

Vemos una muestra

In [14]:
# input_ids, attention_mask = next(iter(train_loader))
# input_ids.shape, attention_mask.shape

Se lo pasamos al modelo

In [15]:
# output = model(input_ids.to(device), attention_mask=attention_mask.to(device))
# output.keys()

Como vemos no tenemos valor de `loss`, como hemos visto tenemos que pasarle el `input_ids` y el `labels`

In [16]:
# output = model(input_ids.to(device), attention_mask=attention_mask.to(device), labels=input_ids.to(device))
# output.keys()

Ahora sí tenemos `loss`

In [17]:
# output['loss'].item()

### Optimizador

Creamos un optimizador

In [18]:
from transformers import AdamW

LR = 2e-5
optimizer = AdamW(model.parameters(), lr=5e-5)



### Entrenamiento

Creamos el bucle de entrenamiento

In [19]:
from tqdm import tqdm

EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    progresbar = tqdm(train_loader, total=len(train_loader), desc=f'Epoch {epoch + 1}')
    for input_ids, at_mask in progresbar:
        input_ids = input_ids.to(device)
        at_mask = at_mask.to(device)

        output = model(input_ids=input_ids, attention_mask=at_mask, labels=input_ids)

        loss = output['loss']
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        progresbar.set_postfix({'train_loss': loss.item()})
    train_loss /= len(train_loader)
    progresbar.set_postfix({'train_loss': train_loss})

Epoch 1: 100%|██████████| 52/52 [04:02<00:00,  4.67s/it, train_loss=0.527]
Epoch 2: 100%|██████████| 52/52 [04:02<00:00,  4.66s/it, train_loss=0.758]
Epoch 3: 100%|██████████| 52/52 [04:02<00:00,  4.66s/it, train_loss=0.227]


### Uso del modelo

Probamos el modelo, primero tokenizamos texto

In [20]:
input_tokens = tokenize_function({'Joke': "<SJ> Why didn't the frog cross the road"})
input_tokens['input_ids'].shape, input_tokens['attention_mask'].shape

(torch.Size([1, 768]), torch.Size([1, 768]))

Se pasan los tokens al modelo

In [27]:
tokens_output = model.generate(**input_tokens.to(model.device), max_new_tokens=500, do_sample=True, top_k=0)
tokens_output.shape

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


torch.Size([1, 789])

Decodificamos los tokens de salida

In [30]:
decoded_joke = tokenizer.decode(tokens_output[0], skip_special_tokens=True)
decoded_joke

'<SJ> Why didn\'t the frog cross the road 4 fuck in condemning goodA tights Holden examines\'s overdoses opens hillGuy." C narrated they '