In [1]:
import warnings
warnings.filterwarnings('ignore',category=FutureWarning)

In [2]:
import numpy as np
import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel, Trainer, TrainingArguments, AdamW
from transformers.modeling_outputs import CausalLMOutputWithPast
from torch.utils.data import Dataset, DataLoader

In [3]:
# Setting device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

# Data loading

In [4]:
#x_train = np.load('./splitted_data/x_train.npy')
#y_train = np.load('./splitted_data/y_train.npy')

#x_test = np.load('./splitted_data/x_test.npy')
#y_test = np.load('./splitted_data/y_test.npy')

#x_val = np.load('./splitted_data/x_val.npy')
#y_val = np.load('./splitted_data/y_val.npy')

xy_train_validate = np.load('./splitted_data/xy_train_validate.npy')
xy_test = np.load('./splitted_data/xy_test.npy')

In [5]:
#print(f"Train : {len(x_train)} ** Test : {len(x_test)} ** Val : {len(x_val)}")

In [6]:
print(xy_train_validate, "\n\n", len(xy_train_validate))

['A very clean and well decorated empty bathroom'
 'A panoramic view of a kitchen and all of its appliances.'
 'A blue and white bathroom with butterfly themed wall tiles.' ...
 'Two women sit and pose with stuffed animals.'
 'White Plate with a lot of guacamole and an extra large dollop of sour cream over meat'
 'A dinner plate has a lemon wedge garnishment.'] 

 414113


In [7]:
xy_train_validate = xy_train_validate.tolist()

In [8]:
len(xy_train_validate)

414113

# Utilities

In [9]:
def dictPrint(dict):
    for key, value in dict.items():
        print("\n{}:\n\n\t{}".format(key,value))

# Tokenizer management

In [10]:
tokenizer = CTRLTokenizer.from_pretrained('ctrl')

In [11]:
# Setting special tokens
tokenizer.add_special_tokens({'bos_token': '<BOS>', 'eos_token': '<EOS>','pad_token': '[PAD]'})

3

In [12]:
def compute_max_length(sentences):
    out = 0
    for sentence in sentences:
        if len(sentence.split()) > out:
            out = len(sentence.split())
    return out

In [13]:
# Max length for padding
MAX_LENGTH = compute_max_length(xy_train_validate) + 1 # adding one to count the control code
MAX_LENGTH

50

# Dataset class definition

In [14]:
class COCO_dataset(Dataset):
    
    def __init__(self, captions, tokenizer, control_code): # captions intere o spezzate?
        self.captions = captions
        self.tokenizer = tokenizer
        self.control_code = control_code
    
    def __getitem__(self, index):
        
        outCaption = self.control_code + " " + self.captions[index]
        outDict = self.tokenizer(outCaption, return_tensors = 'pt', padding='max_length', max_length=MAX_LENGTH, truncation = True)
        
        return {'input_ids': outDict['input_ids'],
                'attention_mask': outDict['attention_mask'],
                'token_type_ids': outDict['token_type_ids']
               }
        
    
    def __len__(self):
        return len(self.captions)
    

# Dataset building

In [15]:
train = COCO_dataset(xy_train_validate[:300000], tokenizer, "captions")
evaluate = COCO_dataset(xy_train_validate[300000:], tokenizer, "captions")

In [121]:
train = COCO_dataset(xy_train_validate[:100000], tokenizer, "captions")
evaluate = COCO_dataset(xy_train_validate[100000:150000], tokenizer, "captions")

# Fine-tuning with Trainer

In [16]:
# CTRL model instance
model = CTRLLMHeadModel.from_pretrained('sshleifer/tiny-ctrl') # tiny version
#model = CTRLLMHeadModel.from_pretrained('ctrl') # full dimension version
model.to(device)
model.train()

CTRLLMHeadModel(
  (transformer): CTRLModel(
    (w): Embedding(246534, 16)
    (dropout): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): EncoderLayer(
        (multi_head_attention): MultiHeadAttention(
          (Wq): Linear(in_features=16, out_features=16, bias=True)
          (Wk): Linear(in_features=16, out_features=16, bias=True)
          (Wv): Linear(in_features=16, out_features=16, bias=True)
          (dense): Linear(in_features=16, out_features=16, bias=True)
        )
        (ffn): Sequential(
          (0): Linear(in_features=16, out_features=2, bias=True)
          (1): ReLU()
          (2): Linear(in_features=2, out_features=16, bias=True)
        )
        (layernorm1): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
        (layernorm2): LayerNorm((16,), eps=1e-06, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): EncoderLayer(
        (multi_head_attentio

In [17]:
model.resize_token_embeddings(len(tokenizer))

Embedding(246537, 16)

# Fine-tuning with a custom training loop

In [18]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [19]:
batch_size = 10
accumulation_steps = 10
evaluation_steps = 50

# Create the DataLoader for our training set.
train_sampler = RandomSampler(train)
train_dataloader = DataLoader(train, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set.
validation_sampler = SequentialSampler(evaluate)
validation_dataloader = DataLoader(evaluate, sampler=validation_sampler, batch_size=batch_size)

In [20]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8)

In [21]:
epochs = 2
total_steps = len(train_dataloader) * epochs

In [22]:
from transformers import get_linear_schedule_with_warmup

In [23]:
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [24]:
import random

In [25]:
import time
import datetime
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [26]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [27]:
# Store the average loss after each epoch so we can plot them.
loss_values = []

In [28]:
# BERT metric
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [29]:
#device="cpu"
#model.to(device)

In [30]:
# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_loss = 0
    
    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()
    
    model.zero_grad()
    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        
        batch['input_ids'], batch['token_type_ids'], batch['attention_mask'] = batch['input_ids'].to(device), batch['token_type_ids'].to(device), batch['attention_mask'].to(device)
        
        # Logging
        #print("Batch {} loaded on device ({})".format(step, device))
        
        # Progress update every 40 batches.
        if step % 500 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        
        
        model.zero_grad()
        #print("Gradients put to zero.")
        
        outputs = model(input_ids = batch['input_ids'], 
                        token_type_ids= batch['token_type_ids'] , 
                        attention_mask= batch['attention_mask'],
                        labels = batch['input_ids'])
        
        #print("Output computed.")
        
        # The call to `model` may return a tuple or an object of CausalLMOutputWithPast class.
        # In the first case, loss has to be extracted through index, while in the latter case,
        # it's an attribute.
        
        if isinstance(outputs, CausalLMOutputWithPast):
            loss = outputs.loss
        else:
            loss = outputs[0]
        
        
        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        
        if step % 100 == 0 and not step == 0:
            print("Loss: " + str(loss.item()))
        
        total_loss = total_loss + loss.item()
        
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        
        # Update the learning rate.
        scheduler.step()
        
        """
        #### Accumulating gradient version ####
        
        #Average loss over accumulation steps
        loss = loss / accumulation_steps
        
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        if (i+1) % accumulation_steps == 0:             # Wait for several backward steps
            optimizer.step()                            # Now we can do an optimizer step
            scheduler.step()
            model.zero_grad()                           # Reset gradients tensors
        if (i+1) % evaluation_steps == 0:           # Evaluate the model when we...
            evaluate_model()
        
        
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        
        # Update the learning rate.
        scheduler.step()
        """
    
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation NOT SURE ABOUT VALIDATION BECAUSE 
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    """
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Evaluate data for one epoch
    for step, batch in enumerate(validation_dataloader):
        
        # Add batch to GPU
        batch['input_ids'], batch['token_type_ids'], batch['attention_mask'] = batch['input_ids'].to(device), batch['token_type_ids'].to(device), batch['attention_mask'].to(device)
                
        # Logging
        #print("Batch {} loaded on device ({})".format(step, device))
    
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(batch['input_ids'], 
                            token_type_ids=batch['token_type_ids'], 
                            attention_mask=batch['attention_mask'],
                            labels = batch['input_ids'])
        print(outputs.loss)
        
        #### CAPIRE COME VALUTARE LA VALIDATION
        '''
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy
        # Track the number of batches
        nb_eval_steps += 1
    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))'''
"""
    
print("")
print("Training complete!")


Training...
Loss: 12.312093734741211
Loss: 12.267667770385742
Loss: 12.213639259338379
Loss: 12.159385681152344
  Batch   500  of  30,000.    Elapsed: 0:01:03.
Loss: 12.100106239318848
Loss: 12.034055709838867
Loss: 11.962601661682129
Loss: 11.920173645019531
Loss: 11.84681510925293
  Batch 1,000  of  30,000.    Elapsed: 0:02:06.
Loss: 11.77114486694336
Loss: 11.692214012145996
Loss: 11.627106666564941
Loss: 11.552595138549805
Loss: 11.485422134399414
  Batch 1,500  of  30,000.    Elapsed: 0:03:09.
Loss: 11.417083740234375
Loss: 11.330798149108887
Loss: 11.255816459655762
Loss: 11.166817665100098
Loss: 11.114569664001465
  Batch 2,000  of  30,000.    Elapsed: 0:04:12.
Loss: 11.016351699829102
Loss: 10.978005409240723
Loss: 10.934349060058594
Loss: 10.83154582977295
Loss: 10.73050594329834
  Batch 2,500  of  30,000.    Elapsed: 0:05:15.
Loss: 10.668656349182129
Loss: 10.616241455078125
Loss: 10.51346492767334
Loss: 10.453363418579102
Loss: 10.35300350189209
  Batch 3,000  of  30,000.  

# Text generation

In [31]:
device = 'cpu'
model = model.to(device)

In [36]:
#without prompt

# generate text until the output length (which includes the context length) reaches 50
greedy_output = model.generate(max_length=50, bos_token_id = tokenizer.encode("<BOS>")[0])

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))


Output:
----------------------------------------------------------------------------------------------------
A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A


In [37]:
# with control code only 

# encode context the generation is conditioned on
input_ids = tokenizer('captions', return_tensors='pt').input_ids
input_ids.to(device)

# generate text until the output length (which includes the context length) reaches 50
greedy_output = model.generate(input_ids, max_length=50)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
captions A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A


In [38]:
# with control code and prompt

# encode context the generation is conditioned on
input_ids = tokenizer('captions I cute dog in', return_tensors='pt').input_ids
input_ids.to(device)

# generate text until the output length (which includes the context length) reaches 50
greedy_output = model.generate(input_ids, max_length=50)

print("Output:\n" + 100 * '-')
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
captions I cute dog in a a a a a a a a


In [39]:
# model saving
model.save_pretrained("./finetuned_models/tinyctrl/2epochs")

# Load a previously fine-tuned model

In [40]:
##### DA SISTEMARE #####

from transformers import PreTrainedModel
model = PreTrainedModel.from_pretrained("mlai2020/Conditional_Text_Generation/finetuned_models/tinyctrl/2epochs/")

AttributeError: 'NoneType' object has no attribute 'from_pretrained'

## comandi utili per metrica BLEU

In [None]:
!git clone https://github.com/geek-ai/Texygen.git Texygen

In [None]:
from Texygen.utils.metrics.Bleu import Bleu

In [None]:
inputs = tokenizer("The cat is on the table", return_tensors="pt")
generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)

In [None]:
tokenizer.decode(token_ids = generation_output[0], skip_special_tokens = False)

In [None]:
f = open("demofile2.txt", "w")
f.write("The cat is on the table and also the pen is on the table")
f.close()

In [None]:
f = open("demofile1.txt", "w")
f.write("The cat is on the table de la table de la table de la table de la table de la")
f.close()

In [None]:
bleu = Bleu(test_text='demofile1.txt', real_text='demofile2.txt', gram=3)
bleu.get_score() #non funziona