In [1]:
import numpy as np
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Data loading

In [2]:
x_train = np.load('./splitted_data/x_train.npy')
y_train = np.load('./splitted_data/y_train.npy')

x_test = np.load('./splitted_data/x_test.npy')
y_test = np.load('./splitted_data/y_test.npy')

x_val = np.load('./splitted_data/x_val.npy')
y_val = np.load('./splitted_data/y_val.npy')

xy_train_validate = np.load('./splitted_data/xy_train_validate.npy')
xy_test = np.load('./splitted_data/xy_test.npy')

In [3]:
print(f"Train : {len(x_train)} ** Test : {len(x_test)} ** Val : {len(x_val)}")

Train : 493413 ** Test : 61677 ** Val : 61677


In [4]:
print(xy_train_validate, "\n\n", len(xy_train_validate))

['A very clean and well decorated empty bathroom'
 'A panoramic view of a kitchen and all of its appliances.'
 'A blue and white bathroom with butterfly themed wall tiles.' ...
 'Two women sit and pose with stuffed animals.'
 'White Plate with a lot of guacamole and an extra large dollop of sour cream over meat'
 'A dinner plate has a lemon wedge garnishment.'] 

 414113


In [27]:
len(xy_train_validate)

414113

# Data tokenization

In [31]:
tokenizer = CTRLTokenizer.from_pretrained('ctrl')

In [32]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

In [33]:
xy_train_validate_encoded = tokenizer(xy_train_validate.tolist(), return_tensors = 'pt', padding = True, truncation = True)

In [34]:
print(xy_train_validate_encoded)

{'input_ids': tensor([[    75,     82,   4200,  ..., 246534, 246534, 246534],
        [    75,  68927,    971,  ..., 246534, 246534, 246534],
        [    75,   1416,      2,  ..., 246534, 246534, 246534],
        ...,
        [  1010,    571,   2208,  ..., 246534, 246534, 246534],
        [  1480,  14828,     12,  ..., 246534, 246534, 246534],
        [    75,   3724,   5314,  ..., 246534, 246534, 246534]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [36]:
xy_train_validate_encoded.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

# Dataset class definition

In [37]:
class COCO_dataset(Dataset):
    
    def __init__(self, captions, tokenizer): # captions intere o spezzate?
        self.captions = captions
        self.tokenizer = tokenizer
    
    def __getitem__(self, index):
        
        outCaption = str(self.captions[index])
        outDict = self.tokenizer(outCaption, return_tensors = 'pt', padding = True, truncation = True)
        
        return outDict
    
    def __len__(self):
        return len(self.captions)
    

# Dataset building

In [38]:
train = COCO_dataset(xy_train_validate[:300000], tokenizer)
evaluate = COCO_dataset(xy_train_validate[300000:], tokenizer)

# Fine-tuning script

In [None]:
# CTRL model instance
model = CTRLLMHeadModel.from_pretrained('ctrl')
model.train()

In [42]:
# Setting training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

In [43]:
# Trainer instance
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train,         # training dataset
    eval_dataset=evaluate            # evaluation dataset
)

In [44]:
trainer.train() # to train

RuntimeError: stack expects each tensor to be equal size, but got [1, 10] at entry 0 and [1, 9] at entry 1

In [None]:
trainer.evaluate() # to evaluate