In [3]:
import numpy as np
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Data loading

In [4]:
x_train = np.load('./splitted_data/x_train.npy')
y_train = np.load('./splitted_data/y_train.npy')

x_test = np.load('./splitted_data/x_test.npy')
y_test = np.load('./splitted_data/y_test.npy')

x_val = np.load('./splitted_data/x_val.npy')
y_val = np.load('./splitted_data/y_val.npy')

xy_train_validate = np.load('./splitted_data/xy_train_validate.npy')
xy_test = np.load('./splitted_data/xy_test.npy')

In [5]:
print(f"Train : {len(x_train)} ** Test : {len(x_test)} ** Val : {len(x_val)}")

Train : 493413 ** Test : 61677 ** Val : 61677


In [6]:
print(xy_train_validate, "\n\n", len(xy_train_validate))

['A very clean and well decorated empty bathroom'
 'A panoramic view of a kitchen and all of its appliances.'
 'A blue and white bathroom with butterfly themed wall tiles.' ...
 'Two women sit and pose with stuffed animals.'
 'White Plate with a lot of guacamole and an extra large dollop of sour cream over meat'
 'A dinner plate has a lemon wedge garnishment.'] 

 414113


In [7]:
len(xy_train_validate)

414113

# Utilities

In [8]:
def dictPrint(dict):
    for key, value in dict.items():
        print("\n{}:\n\n\t{}".format(key,value))

# Tokenizer management

In [9]:
tokenizer = CTRLTokenizer.from_pretrained('ctrl')

In [10]:
# Setting special tokens
tokenizer.add_special_tokens({'bos_token': '<BOS>', 'eos_token': '<EOS>','pad_token': '[PAD]'})

3

In [11]:
# Max length for padding and truncation
MAX_LENGTH = 20

# Dataset class definition

In [12]:
class COCO_dataset(Dataset):
    
    def __init__(self, captions, tokenizer): # captions intere o spezzate?
        self.captions = captions
        self.tokenizer = tokenizer
    
    def __getitem__(self, index):
        
        outCaption = str(self.captions[index])
        outDict = self.tokenizer(outCaption, return_tensors = 'pt', padding='max_length', max_length=MAX_LENGTH, truncation = True)
        
#        return {'input_ids': outDict['input_ids'],  ###### Dovrebbe essere la stessa cosa
#                'past_key_values': None,
#                'attention_mask': outDict['attention_mask'],
#                'token_type_ids': outDict['token_type_ids'],
#                'position_ids': None,
#                'head_mask': None,
#               }
        return outDict
    
    def __len__(self):
        return len(self.captions)
    

# Dataset building

In [13]:
train = COCO_dataset(xy_train_validate[:300000], tokenizer)
evaluate = COCO_dataset(xy_train_validate[300000:], tokenizer)

# Fine-tuning script

In [None]:
# CTRL model instance
model = CTRLLMHeadModel.from_pretrained('ctrl')
model.train()

In [15]:
model.resize_token_embeddings(len(tokenizer))

Embedding(246537, 1280)

In [16]:
# Setting training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

In [17]:
# Trainer instance
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train,         # training dataset
    eval_dataset=evaluate            # evaluation dataset
)

In [18]:
trainer.train() # to train

KeyError: 'loss'

In [None]:
trainer.evaluate() # to evaluate