In [1]:
import numpy as np
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel, Trainer, TrainingArguments, AdamW
from transformers.modeling_outputs import CausalLMOutputWithPast
from torch.utils.data import Dataset, DataLoader

In [2]:
# Setting device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [42]:
#device = "cpu"

# Data loading

In [3]:
x_train = np.load('./splitted_data/x_train.npy')
y_train = np.load('./splitted_data/y_train.npy')

x_test = np.load('./splitted_data/x_test.npy')
y_test = np.load('./splitted_data/y_test.npy')

x_val = np.load('./splitted_data/x_val.npy')
y_val = np.load('./splitted_data/y_val.npy')

xy_train_validate = np.load('./splitted_data/xy_train_validate.npy')
xy_test = np.load('./splitted_data/xy_test.npy')

In [4]:
print(f"Train : {len(x_train)} ** Test : {len(x_test)} ** Val : {len(x_val)}")

Train : 493413 ** Test : 61677 ** Val : 61677


In [5]:
print(xy_train_validate, "\n\n", len(xy_train_validate))

['A very clean and well decorated empty bathroom'
 'A panoramic view of a kitchen and all of its appliances.'
 'A blue and white bathroom with butterfly themed wall tiles.' ...
 'Two women sit and pose with stuffed animals.'
 'White Plate with a lot of guacamole and an extra large dollop of sour cream over meat'
 'A dinner plate has a lemon wedge garnishment.'] 

 414113


In [6]:
xy_train_validate = xy_train_validate.tolist()

In [7]:
len(xy_train_validate)

414113

# Utilities

In [8]:
def dictPrint(dict):
    for key, value in dict.items():
        print("\n{}:\n\n\t{}".format(key,value))

# Tokenizer management

In [9]:
tokenizer = CTRLTokenizer.from_pretrained('ctrl')

In [10]:
# Setting special tokens
tokenizer.add_special_tokens({'bos_token': '<BOS>', 'eos_token': '<EOS>','pad_token': '[PAD]'})

3

In [11]:
def compute_max_length(sentences):
    out = 0
    for sentence in sentences:
        if len(sentence.split()) > out:
            out = len(sentence.split())
    return out

In [12]:
# Max length for padding
MAX_LENGTH = compute_max_length(xy_train_validate) + 1 # adding one to count the control code
MAX_LENGTH

50

# Dataset class definition

In [13]:
class COCO_dataset(Dataset):
    
    def __init__(self, captions, tokenizer, control_code): # captions intere o spezzate?
        self.captions = captions
        self.tokenizer = tokenizer
        self.control_code = control_code
    
    def __getitem__(self, index):
        
        outCaption = self.control_code + " " + self.captions[index]
        outDict = self.tokenizer(outCaption, return_tensors = 'pt', padding='max_length', max_length=MAX_LENGTH, truncation = True)
        
        return {'input_ids': outDict['input_ids'],
                'attention_mask': outDict['attention_mask'],
                'token_type_ids': outDict['token_type_ids']
               }
        
    
    def __len__(self):
        return len(self.captions)
    

# Dataset building

In [14]:
train = COCO_dataset(xy_train_validate[:300000], tokenizer, "captions")
evaluate = COCO_dataset(xy_train_validate[300000:], tokenizer, "captions")

# Fine-tuning with Trainer

In [None]:
# CTRL model instance
model = CTRLLMHeadModel.from_pretrained('ctrl')
model.train()

In [16]:
model.resize_token_embeddings(len(tokenizer))

Embedding(246537, 1280)

In [17]:
model = model.to(device)

In [30]:
# Setting training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total # of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

In [31]:
# Trainer instance
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train,         # training dataset
    eval_dataset=evaluate            # evaluation dataset
)

In [None]:
trainer.train() # to train

In [None]:
trainer.evaluate() # to evaluate

# Fine-tuning with a custom training loop

In [18]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [60]:
batch_size = 10
accumulation_steps = 10
evaluation_steps = 50
# Create the DataLoader for our training set.
train_sampler = RandomSampler(train)
train_dataloader = DataLoader(train, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_sampler = SequentialSampler(evaluate)
validation_dataloader = DataLoader(evaluate, sampler=validation_sampler, batch_size=batch_size)

In [61]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8)

In [62]:
epochs = 1
total_steps = len(train_dataloader) * epochs

In [63]:
from transformers import get_linear_schedule_with_warmup

In [64]:
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [65]:
import random

In [66]:
import time
import datetime
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [67]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [68]:
# Store the average loss after each epoch so we can plot them.
loss_values = []

In [69]:
# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_loss = 0
    
    model.zero_grad()
    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        
        batch['input_ids'], batch['token_type_ids'], batch['attention_mask'] = batch['input_ids'].to(device), batch['token_type_ids'].to(device), batch['attention_mask'].to(device)
        
        # Logging
        print("Batch {} loaded on device ({})".format(step, device))
        
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        
        
        model.zero_grad()
        print("Gradients put to zero.")
        
        outputs = model(input_ids = batch['input_ids'], 
                        token_type_ids= batch['token_type_ids'] , 
                        attention_mask= batch['attention_mask'],
                        labels = batch['input_ids'])
        
        print("Output computed.")
        
        # The call to `model` may return a tuple or an object of CausalLMOutputWithPast class.
        # In the first case, loss has to be extracted through index, while in the latter case,
        # it's an attribute.
        
        if isinstance(outputs, CausalLMOutputWithPast):
            loss = outputs.loss
        else:
            loss = outputs[0]
        
        
        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        print("Loss: " + str(loss.item()))
        total_loss = total_loss + loss.item()
        
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        
        # Update the learning rate.
        scheduler.step()
        
        """
        #### Accumulating gradient version ####
        
        #Average loss over accumulation steps
        loss = loss / accumulation_steps
        
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
            optimizer.step()                            # Now we can do an optimizer step
            scheduler.step()
            model.zero_grad()                           # Reset gradients tensors
        if (i+1) % evaluation_steps == 0:           # Evaluate the model when we...
            evaluate_model()
        
        
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        
        # Update the learning rate.
        scheduler.step()
        """
    
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation NOT SURE ABOUT VALIDATION BECAUSE 
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    '''
        print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy
        # Track the number of batches
        nb_eval_steps += 1
    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    
    
    
    
    
    '''
print("")
print("Training complete!")


Training...
Batch 0 loaded on device (cuda:0)
Gradients put to zero.


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 11.17 GiB total capacity; 10.67 GiB already allocated; 1.44 MiB free; 10.81 GiB reserved in total by PyTorch)