In [1]:
import glob
import numpy as np
import pandas as pd
import os
import time
import torch
import torch.nn as nn
from torch import optim
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, random_split
import matplotlib.pyplot as plt
import yaml
import re
import sys
sys.path.append('..')
from APPLICATION.model.tokenization import PrettyMidiTokenizer, BCI_TOKENS, SILENCE_TOKEN
from APPLICATION.model.model import TCN

# Run on the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

# Raise the pandas display limits (rows, columns, line width) used when frames are printed.
pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',504)
pd.set_option('display.width',1000)


# MODEL PARAMETERS
# NOTE: all of these flags are assigned again by the cell that launches the
# training runs, so the values below only apply to cells executed before it.
EPOCHS = 500 # 500
LEARNING_RATE = 1 # 4
BATCH_SIZE = 4 # 16
TRAIN_MODEL = True
FEEDBACK = False
EMPHASIZE_EEG = True
EARLY_STOP = True

# Show the working directory: the dataset and results paths used below are relative to it.
pwd = os.getcwd()
print(pwd)

# Base directory that contains the 'dataset' folder; empty means the current working directory.
DIRECTORY_PATH = ''


cuda:0
/home/marco_bortolotti/project/Affective-AI-Music-Improviser/TCN


In [12]:
'''
Assumptions:
Sequences described as input_#.mid and output_#.mid in the corresponding folders
'''
DATASET_PATH = os.path.join(DIRECTORY_PATH, 'dataset')

print(DATASET_PATH)

# Input and output files are paired purely by their position in the sorted
# listings, so both folders must use the same naming scheme.
input_filenames = sorted(glob.glob(os.path.join(DATASET_PATH, 'input/*.mid')))
print('Number of input files:', len(input_filenames))

output_filenames = sorted(glob.glob(os.path.join(DATASET_PATH, 'output/*.mid')))
print('Number of output files:', len(output_filenames), '\n')

# Fail fast: zip() below would silently drop the unmatched files, and an empty
# dataset would only surface later as an IndexError on sequences[0].
if not input_filenames:
    raise FileNotFoundError(f'No input MIDI files found in {os.path.join(DATASET_PATH, "input")}')
if len(input_filenames) != len(output_filenames):
    raise ValueError(
        f'Input/output file count mismatch: {len(input_filenames)} input files '
        f'vs {len(output_filenames)} output files in {DATASET_PATH}'
    )


# One tokenizer per side: each accumulates its own vocabulary and sequences.
INPUT_TOK = PrettyMidiTokenizer()
OUTPUT_TOK = PrettyMidiTokenizer()

for i, (in_file, out_file) in enumerate(zip(input_filenames, output_filenames)):

    in_file_name = os.path.basename(in_file)
    out_file_name = os.path.basename(out_file)
    print(f'{i + 1}: {in_file_name} -> {out_file_name}')

    # The emotion label is encoded in the input file name.
    # Files tagged EXCITED are mapped to the 'concentrated' BCI token.
    if 'RELAX' in in_file_name:
        emotion_token = BCI_TOKENS['relaxed']
    elif 'EXCITED' in in_file_name:
        emotion_token = BCI_TOKENS['concentrated']
    else:
        raise Exception('Emotion not found in file name. Please add the emotion to the file name.')

    # Only the input side is tokenized with an emotion token and the 'drum' instrument setting.
    in_seq, in_df = INPUT_TOK.midi_to_tokens(in_file, update_vocab=True, update_sequences=True, emotion_token = emotion_token, instrument='drum')
    out_seq, out_df = OUTPUT_TOK.midi_to_tokens(out_file, update_vocab=True, update_sequences=True)

    print(f'Input sequence length: {len(in_seq)}')
    print(f'Emotion token: {emotion_token}\n')

print(f'\nNumber of input sequences: {len(INPUT_TOK.sequences)}')
print(f'Input sequence length: {len(INPUT_TOK.sequences[0])}')
print(f'Input vocabulars size: {len(INPUT_TOK.VOCAB)}')
print(f'\nNumber of output sequences: {len(OUTPUT_TOK.sequences)}')
print(f'Output sequence length: {len(OUTPUT_TOK.sequences[0])}')
print(f'Output vocabulars size: {len(OUTPUT_TOK.VOCAB)}')

print('\nInput vocab:', INPUT_TOK.VOCAB.word2idx)
print('Output vocab:', OUTPUT_TOK.VOCAB.word2idx)

# Debug helper (disabled): dump the first 48 tokens of every input sequence as words.
# with open('training_seq.txt', 'w') as f:    
#     for seq in INPUT_TOK.sequences:
#         for tok in seq[:48]:
#             f.write('\"' + INPUT_TOK.VOCAB.idx2word[tok] + '\", ')
#         f.write('\n')




dataset
Number of input files: 6
Number of output files: 6 

1: 0_Drum_HardRock_EXCITED.mid -> 0_Bass_HardRock_EXCITED.mid
Input sequence length: 26
Emotion token: C

2: 1_Drum_HardRock_EXCITED.mid -> 1_Bass_HardRock_EXCITED.mid
Input sequence length: 32
Emotion token: C

3: 2_Drum_Blues_EXCITED.mid -> 2_Bass_Blues_EXCITED.mid
Input sequence length: 20
Emotion token: C

4: 3_Drum_Blues_EXCITED.mid -> 3_Bass_Blues_EXCITED.mid
Input sequence length: 20
Emotion token: C

5: 4_Drum_PopRock_RELAX.mid -> 4_Bass_PopRock_RELAX.mid
Input sequence length: 35
Emotion token: R

6: 5_Drum_PopRock_RELAX.mid -> 5_Bass_PopRock_RELAX.mid
Input sequence length: 23
Emotion token: R


Number of input sequences: 156
Input sequence length: 192
Input vocabulars size: 16

Number of output sequences: 156
Output sequence length: 192
Output vocabulars size: 87

Input vocab: {'O': 0, 'R': 1, 'C': 2, '36fS': 3, '42pS': 4, '38fS': 5, '36fS_42fS': 6, '38fS_42fS': 7, '42fS': 8, '36fS_42pS': 9, '38pS_42fS': 10, '38fS_

In [13]:
# Create the dataset: one (input tokens, output tokens) pair per sequence, stored on `device`
dataset = TensorDataset(torch.LongTensor(INPUT_TOK.sequences).to(device),
                        torch.LongTensor(OUTPUT_TOK.sequences).to(device))

# Split the dataset into training, evaluation and test sets (60% / 20% / 20%).
# A dedicated, explicitly seeded generator keeps the split identical across
# kernel restarts; without it the membership of the test set changes on every
# run, because no global seed has been set at this point of the notebook.
SPLIT_SEED = 1111
train_set, eval_set, test_set = random_split(
    dataset,
    [0.6, 0.2, 0.2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [14]:
# Augment the training set
def data_augmentation_shift(dataset, shifts):
    '''
    Extend a dataset with circularly shifted copies of its input sequences.

    For every offset in `shifts` and every (input, output) pair in `dataset`,
    the input is rolled by that many positions (np.roll, so tokens wrap
    around) while its first element, the emotion token, stays at index 0.
    The output sequence is copied unchanged.

    Parameters:
    - dataset: iterable of (input_tensor, output_tensor) pairs.
    - shifts: iterable of integer offsets; negative values roll to the left.

    Returns a ConcatDataset made of the original dataset followed by the
    shifted samples, ordered shift by shift.
    '''
    shifted_inputs = []
    paired_outputs = []

    for offset in shifts:
        for source_seq, target_seq in dataset:
            source_np = source_seq.cpu().numpy().copy()

            # keep the leading emotion token out of the rotation
            head, body = source_np[0], source_np[1:]
            rolled_body = np.roll(body, offset)

            # put the emotion token back in front of the rotated tokens
            shifted_inputs.append(np.concatenate(([head], rolled_body)))
            paired_outputs.append(target_seq.cpu().numpy().copy())

    shifted_dataset = TensorDataset(
        torch.LongTensor(shifted_inputs).to(device),
        torch.LongTensor(paired_outputs).to(device),
    )

    # original samples first, augmented samples after
    return torch.utils.data.ConcatDataset([dataset, shifted_dataset])


def data_augmentation_transposition(dataset, transpositions):
    '''
    Transpose the output sequences by a number of semitones to create new sequences.

    Parameters:
    - dataset: iterable of (input_tensor, output_tensor) pairs of token indices.
    - transpositions: a list of integers representing the number of semitones to transpose the sequences.

    Returns a ConcatDataset made of the original dataset followed by the
    transposed samples, ordered transposition by transposition.

    NB: The transposition is done by adding the number of semitones to every
    run of digits found in the token word (each run is treated as a pitch).
    The silence token and the two BCI tokens are left untouched.

    Side effect: every transposed word is added to OUTPUT_TOK.VOCAB, so the
    output vocabulary can grow when this function runs.
    '''

    def _transpose_word(word, semitones):
        # Single-pass substitution: every pitch is rewritten exactly once.
        # Chained str.replace() calls would re-match pitches that were already
        # transposed (e.g. '36fS_39fS' + 3 would end up as '42fS_42fS').
        return re.sub(r'\d+', lambda match: str(int(match.group()) + semitones), word)

    # words that do not represent notes and must not be transposed
    non_note_words = {SILENCE_TOKEN, BCI_TOKENS['relaxed'], BCI_TOKENS['concentrated']}

    input_sequences = []
    augmented_output_sequences = []

    for transposition in transpositions:
        for input_sequence, output_sequence in dataset:

            input_sequence = input_sequence.cpu().numpy().copy()
            new_output_sequence = output_sequence.cpu().numpy().copy()

            # iterate over plain Python ints so the vocabulary is never indexed with a tensor
            for i, token in enumerate(new_output_sequence.tolist()):

                word = OUTPUT_TOK.VOCAB.idx2word[token]
                if word in non_note_words:
                    continue

                transposed_word = _transpose_word(word, transposition)

                # add the new token to the vocabulary
                OUTPUT_TOK.VOCAB.add_word(transposed_word)

                # update the sequence with the new token
                new_output_sequence[i] = OUTPUT_TOK.VOCAB.word2idx[transposed_word]

            # the input is kept as is, only the output is transposed
            input_sequences.append(input_sequence)
            augmented_output_sequences.append(new_output_sequence)

    augmented_dataset = TensorDataset(torch.LongTensor(input_sequences).to(device),
                                      torch.LongTensor(augmented_output_sequences).to(device))

    # Concatenate the original and the augmented dataset
    return torch.utils.data.ConcatDataset([dataset, augmented_dataset])

# Augment the training split only: first add the shifted inputs, then
# transpose the outputs of everything produced so far (originals + shifted).
train_set_augmented = data_augmentation_transposition(
    data_augmentation_shift(train_set, [-3, -2, -1, 1, 2, 3]),
    [3, 5],
)

print(f'Training set size before augmentation: {len(train_set)}')
print(f'Training set size after augmentation: {len(train_set_augmented)}')

Training set size before augmentation: 94
Training set size after augmentation: 1974


In [15]:
def initialize_dataset():
    '''
    Build the DataLoaders for the three splits.

    Reads the module-level train_set_augmented, eval_set and test_set, and
    the current value of BATCH_SIZE. Every loader draws its samples through
    a RandomSampler.

    Returns the tuple (train_dataloader, eval_dataloader, test_dataloader).
    '''

    def _random_loader(split):
        # one randomly sampled loader per split, all sharing BATCH_SIZE
        return DataLoader(split, sampler=RandomSampler(split), batch_size=BATCH_SIZE)

    return (
        _random_loader(train_set_augmented),
        _random_loader(eval_set),
        _random_loader(test_set),
    )

# Build the loaders with the BATCH_SIZE currently in effect.
train_dataloader, eval_dataloader, test_dataloader = initialize_dataset()

# Report the number of samples per split (the training figure includes the augmented samples).
print(f'Train set size: {len(train_set_augmented)}')
print(f'Evaluation set size: {len(eval_set)}')
print(f'Test set size: {len(test_set)}')

Train set size: 1974
Evaluation set size: 31
Test set size: 31


In [16]:
# Set the hyperparameters
# Seed the torch global RNG once, at the time this cell runs.
SEED = 1111
torch.manual_seed(SEED)

'''
IMPORTANT:
to cover all the sequence of tokens k * d must be >= hidden units (see the paper)
k = kernel_size
d = dilation = 2 ^ (n_levels - 1)
'''

# Number of output classes = size of the output vocabulary at the time this cell runs.
OUTPUT_SIZE = len(OUTPUT_TOK.VOCAB)

# The sizes below are derived from the value FEEDBACK has when this cell executes.
if FEEDBACK:
    # with feedback the model input also carries previously predicted output tokens
    INPUT_SIZE = len(INPUT_TOK.VOCAB) + OUTPUT_SIZE
    LEVELS = 8
    HIDDEN_UNITS = INPUT_TOK.SEQ_LENGTH * 2 # 192 * 2 = 384
else:
    INPUT_SIZE = len(INPUT_TOK.VOCAB)
    LEVELS = 7
    HIDDEN_UNITS = INPUT_TOK.SEQ_LENGTH # 192

# NOTE: this prints the input vocabulary size, which differs from INPUT_SIZE when FEEDBACK is True.
print(f'\nInput size: {len(INPUT_TOK.VOCAB)}')


EMBEDDING_SIZE = 20 # size of word embeddings -> Embedding() is used to encode input token into [192, 20] real value vectors (see model.py)
# (LEVELS - 1) layers of HIDDEN_UNITS channels followed by one layer of EMBEDDING_SIZE channels
NUM_CHANNELS = [HIDDEN_UNITS] * (LEVELS - 1) + [EMBEDDING_SIZE] # [192, 192, 192, 192, 192, 192, 20]
# max gradient norm passed to clip_grad_norm_ during training; values <= 0 disable clipping
GRADIENT_CLIP = 0.35


# balance the loss function by assigning a weight to each token related to its frequency
# The resulting tensor is passed as `weight` to nn.CrossEntropyLoss.
LOSS_WEIGTHS = torch.ones([OUTPUT_SIZE], dtype=torch.float, device = device)
# presumably fills VOCAB.weights with one relative frequency per token - TODO confirm in tokenization.py
OUTPUT_TOK.VOCAB.compute_weights()
for i, weigth in enumerate(OUTPUT_TOK.VOCAB.weights):
    # class weight = 1 - value computed by the vocabulary
    LOSS_WEIGTHS[i] = 1 - weigth
    # print(f'{OUTPUT_TOK.VOCAB.idx2word[i]}: {LOSS_WEIGTHS[i]}')


def initialize_model():
    '''
    Create a fresh TCN together with its loss function and optimizer.

    The size-related globals (INPUT_SIZE, LEVELS, HIDDEN_UNITS, NUM_CHANNELS)
    are re-derived here from the *current* value of FEEDBACK and written back
    as globals. The notebook flips FEEDBACK between training runs; without
    this step a feedback model would be built with the embedding size and
    depth computed for the non-feedback configuration, and save_parameters()
    would record stale values.

    Reads the globals FEEDBACK, EMPHASIZE_EEG, INPUT_TOK, OUTPUT_SIZE,
    EMBEDDING_SIZE, LOSS_WEIGTHS, LEARNING_RATE and device.

    Returns the tuple (model, criterion, optimizer).
    '''
    global INPUT_SIZE, LEVELS, HIDDEN_UNITS, NUM_CHANNELS

    # same rules as the hyperparameter cell, evaluated at call time
    if FEEDBACK:
        # with feedback the input also carries previously predicted output tokens
        INPUT_SIZE = len(INPUT_TOK.VOCAB) + OUTPUT_SIZE
        LEVELS = 8
        HIDDEN_UNITS = INPUT_TOK.SEQ_LENGTH * 2
    else:
        INPUT_SIZE = len(INPUT_TOK.VOCAB)
        LEVELS = 7
        HIDDEN_UNITS = INPUT_TOK.SEQ_LENGTH
    NUM_CHANNELS = [HIDDEN_UNITS] * (LEVELS - 1) + [EMBEDDING_SIZE]

    # create the model
    model = TCN(input_size = INPUT_SIZE,
                embedding_size = EMBEDDING_SIZE,
                output_size = OUTPUT_SIZE,
                num_channels = NUM_CHANNELS,
                emphasize_eeg = EMPHASIZE_EEG,
                dropout = 0.45,
                emb_dropout = 0.25,
                kernel_size = 3,
                tied_weights = False) # do not tie encoder and decoder weights

    model.to(device)

    # May use adaptive softmax to speed up training
    # class-weighted cross entropy to compensate for unbalanced token frequencies
    criterion = nn.CrossEntropyLoss(weight = LOSS_WEIGTHS)
    optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

    return model, criterion, optimizer

# Build a first model with the flags currently in effect and show its architecture.
model, criterion, optimizer = initialize_model()

print(f'\nModel created: {model}')
# Show the embedding vector stored for token index 0.
print(model.encoder.weight[0])



Input size: 16

Model created: TCN(
  (encoder): Embedding(16, 20, padding_idx=0)
  (tcn): TemporalConvNet(
    (network): Sequential(
      (0): TemporalBlock(
        (conv1): ParametrizedConv1d(
          20, 192, kernel_size=(3,), stride=(1,), padding=(2,)
          (parametrizations): ModuleDict(
            (weight): ParametrizationList(
              (0): _WeightNorm()
            )
          )
        )
        (chomp1): Chomp1d()
        (relu1): ReLU()
        (dropout1): Dropout(p=0.45, inplace=False)
        (conv2): ParametrizedConv1d(
          192, 192, kernel_size=(3,), stride=(1,), padding=(2,)
          (parametrizations): ModuleDict(
            (weight): ParametrizationList(
              (0): _WeightNorm()
            )
          )
        )
        (chomp2): Chomp1d()
        (relu2): ReLU()
        (dropout2): Dropout(p=0.45, inplace=False)
        (net): Sequential(
          (0): ParametrizedConv1d(
            20, 192, kernel_size=(3,), stride=(1,), padding=(

In [17]:
def save_parameters():
    '''
    Write the artefacts of the latest training run into RESULTS_PATH.

    Files written:
    - losses.png: train and evaluation loss per epoch.
    - input_vocab.txt / output_vocab.txt: the two vocabularies.
    - model_hyperparameters.txt: human-readable summary of dataset sizes,
      optimization settings, model sizes and results.
    - config.yaml: the values needed to rebuild the model.

    Reads the globals set by train() (RESULTS_PATH, train_losses,
    eval_losses, best_train_loss, best_eval_loss, test_loss,
    best_model_epoch) and the hyperparameter globals.
    '''

    # one timestamp for both files, so the text summary and config.yaml always agree
    timestamp = time.strftime("%Y%m%d-%H%M%S")

    # plot the losses over the epochs on a dedicated figure and release it
    # afterwards, so that repeated training runs neither share nor leak figures
    fig, ax = plt.subplots()
    ax.plot(train_losses, label='train')
    ax.plot(eval_losses, label='eval')
    ax.set_xlabel('epoch')
    ax.set_ylabel('loss')
    ax.legend()
    fig.savefig(os.path.join(RESULTS_PATH, 'losses.png'))
    plt.close(fig)

    # save the vocabularies
    INPUT_TOK.VOCAB.save(os.path.join(RESULTS_PATH, 'input_vocab.txt'))
    OUTPUT_TOK.VOCAB.save(os.path.join(RESULTS_PATH, 'output_vocab.txt'))

    # save the model hyperparameters in a txt file
    with open(os.path.join(RESULTS_PATH, 'model_hyperparameters.txt'), 'w') as f:

        f.write(f'DATE: {timestamp}\n\n')

        f.write('-----------------DATASET------------------\n')
        f.write(f'DATASET_PATH: {DATASET_PATH}\n')
        # NB: size of the training split before augmentation
        f.write(f'TRAIN_SET_SIZE: {len(train_set)}\n')
        f.write(f'EVAL_SET_SIZE: {len(eval_set)}\n')
        f.write(f'TEST_SET_SIZE: {len(test_set)}\n\n')

        f.write('----------OPTIMIZATION PARAMETERS----------\n')
        f.write(f'GRADIENT_CLIP: {GRADIENT_CLIP}\n')
        f.write(f'FEEDBACK: {FEEDBACK}\n')
        f.write(f'EARLY STOPPING: {EARLY_STOP}\n')
        f.write(f'EMPHASIZE_EEG: {EMPHASIZE_EEG}\n')
        f.write(f'LEARNING_RATE: {LEARNING_RATE}\n')
        f.write(f'BATCH_SIZE: {BATCH_SIZE}\n')
        f.write(f'EPOCHS: {EPOCHS}\n\n')

        f.write('------------MODEL PARAMETERS--------------\n')
        f.write(f'SEED: {SEED}\n')
        f.write(f'INPUT_SIZE: {INPUT_SIZE}\n')
        f.write(f'EMBEDDING_SIZE: {EMBEDDING_SIZE}\n')
        f.write(f'LEVELS: {LEVELS}\n')
        f.write(f'HIDDEN_UNITS: {HIDDEN_UNITS}\n')
        f.write(f'NUM_CHANNELS: {NUM_CHANNELS}\n')
        f.write(f'OUTPUT_SIZE: {OUTPUT_SIZE}\n')
        f.write(f'LOSS_WEIGTHS: {LOSS_WEIGTHS}\n\n')

        f.write('-------------------RESULTS----------------\n')
        # NB: despite the label, this is the single best (lowest) training loss
        f.write(f'TRAIN_LOSSES: {best_train_loss}\n')
        f.write(f'BEST_EVAL_LOSS: {best_eval_loss}\n')
        f.write(f'TEST_LOSS: {test_loss}\n')
        f.write(f'BEST_MODEL_EPOCH: {best_model_epoch}\n')

    # minimal configuration needed to re-instantiate the model
    data = {
        'DATE': timestamp,
        'INPUT_SIZE': INPUT_SIZE,
        'EMBEDDING_SIZE': EMBEDDING_SIZE,
        'NUM_CHANNELS': NUM_CHANNELS,
        'OUTPUT_SIZE': OUTPUT_SIZE,
        'KERNEL_SIZE': 3
    }

    path = os.path.join(RESULTS_PATH, 'config.yaml')
    with open(path, 'w') as file:
        yaml.safe_dump(data, file)

In [18]:
# Bar length taken from the input tokenizer; epoch_step() keeps the first 3 * BAR_LENGTH tokens and masks the following BAR_LENGTH.
BAR_LENGTH = INPUT_TOK.BAR_LENGTH

def epoch_step(dataloader, mode):
    '''
    Run one full pass over `dataloader` and return the mean loss per batch.

    Parameters:
    - dataloader: yields (data, targets) batches of token indices.
    - mode: 'train' to update the model weights; any other value runs a
      pure evaluation pass (dropout disabled, no weight update).

    Uses the module-level model, criterion and optimizer, together with the
    FEEDBACK, BATCH_SIZE, BAR_LENGTH, OUTPUT_SIZE and GRADIENT_CLIP settings.

    Returns the sum of the batch losses divided by the number of batches.
    '''
    is_training = mode == 'train'

    if is_training:
        model.train()
    else:
        model.eval() # disable dropout

    if FEEDBACK:
        # no prediction exists before the first batch: start from all-zero tokens
        prev_output = torch.zeros([BATCH_SIZE, INPUT_TOK.SEQ_LENGTH], dtype=torch.long, device=device)

    total_loss = 0

    # Autograd is only needed when the weights are updated; evaluation passes
    # run without building a graph, which saves memory and time.
    with torch.set_grad_enabled(is_training):

        for data, targets in dataloader:

            # mask the last bar of the input data: keep the first three bars
            # and replace the remaining BAR_LENGTH positions with token index 1
            # NOTE(review): index 1 is the 'R' token in the input vocabulary
            # printed by the tokenization cell - confirm this is the intended mask value
            batch_size = data.size(0)
            mask = torch.ones([batch_size, BAR_LENGTH], dtype=torch.long, device=device)
            data_masked = torch.cat((data[:, :BAR_LENGTH*3], mask), dim=1)

            if FEEDBACK:
                # append the previous batch prediction along the sequence axis
                # (sliced because the last batch of an epoch can be smaller)
                model_input = torch.cat((data_masked, prev_output[:batch_size, :]), dim=1)
            else:
                model_input = data_masked

            if is_training:
                # reset model gradients to zero
                optimizer.zero_grad()

            # make the prediction and keep only the first SEQ_LENGTH positions
            output = model(model_input)[:, :INPUT_TOK.SEQ_LENGTH]

            if FEEDBACK:
                # most likely token per position, fed back with the next batch
                prev_output = torch.argmax(output, 2)

            # flatten predictions and targets to (batch * seq_len, ...) for the loss
            # NB: the size -1 is inferred from other dimensions
            # NB: contiguous() makes sure the tensor is stored in a contiguous chunk of memory, necessary for view() to work
            final_target = targets.contiguous().view(-1)
            final_output = output.contiguous().view(-1, OUTPUT_SIZE)

            # calculate the loss
            loss = criterion(final_output, final_target)

            if is_training:
                # calculate the gradients
                loss.backward()

                # clip the gradients to avoid exploding gradients
                if GRADIENT_CLIP > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)

                # update the weights
                optimizer.step()

            total_loss += loss.item()

    return total_loss / len(dataloader)


In [19]:
def train(results_path = None):
    '''
    Train the module-level model, keep the best checkpoint, then test and save the results.

    Parameters:
    - results_path: directory for the artefacts of this run. When None, a
      timestamped folder under 'results' is used.

    For each epoch the model is trained on train_dataloader and evaluated on
    eval_dataloader; the state dict is saved to MODEL_PATH every time the
    evaluation loss improves. With EARLY_STOP enabled, training stops once
    the last 15 epochs brought no improvement over the best evaluation loss.

    After training, the best checkpoint is loaded back into the model, so
    that the reported test loss belongs to the saved model (BEST_MODEL_EPOCH)
    rather than to the weights of the last epoch. As a consequence the global
    `model` holds the best weights when this function returns.

    Results are published through globals (RESULTS_PATH, MODEL_PATH,
    best_eval_loss, best_train_loss, best_model_epoch, train_losses,
    eval_losses, test_loss), which save_parameters() reads.
    '''

    global RESULTS_PATH, MODEL_PATH
    global best_eval_loss, best_train_loss, best_model_epoch, train_losses, eval_losses
    global test_loss

    if results_path is None:
        RESULTS_PATH = os.path.join('results', time.strftime("%Y%m%d_%H%M%S"))
    else:
        RESULTS_PATH = results_path

    os.makedirs(RESULTS_PATH, exist_ok=True)

    MODEL_PATH = os.path.join(RESULTS_PATH, 'model_state_dict.pth')

    best_eval_loss = 1e8
    best_train_loss = 1e8
    best_model_epoch = 0
    eval_losses = []
    train_losses = []
    # only used for logging: the learning rate is constant during the run
    lr = LEARNING_RATE

    for epoch in range(1, EPOCHS+1):

        start_time = time.time()

        train_loss = epoch_step(train_dataloader, 'train')

        eval_loss = epoch_step(eval_dataloader, 'eval')

        # Save the model if the validation loss is the best we've seen so far.
        if eval_loss < best_eval_loss:
            torch.save(model.state_dict(), MODEL_PATH)
            best_eval_loss = eval_loss
            best_model_epoch = epoch

        if train_loss < best_train_loss:
            best_train_loss = train_loss

        # # Anneal the learning rate if the validation loss plateaus
        # if epoch > 5 and eval_loss >= max(eval_losses[-5:]):
        #     lr = lr / 2.
        #     if lr < 0.1:
        #         lr = 2
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr

        eval_losses.append(eval_loss)
        train_losses.append(train_loss)

        # Early stopping: none of the last 15 epochs matched the best evaluation loss
        if EARLY_STOP and epoch > 15 and min(eval_losses[-15:]) > best_eval_loss:
            break

        # print the loss and the progress
        elapsed = time.time() - start_time
        print('| epoch {:3d}/{:3d} | lr {:02.5f} | ms/epoch {:5.5f} | train_loss {:5.2f} | eval_loss {:5.2f}' \
                .format(epoch, EPOCHS, lr, elapsed * 1000, train_loss, eval_loss))


    print('\n\n TRAINING FINISHED:\n\n\tBest Loss: {:5.2f}\tBest Model saved at epoch: {:3d} \n\n' \
            .format(best_eval_loss, best_model_epoch))


    # Restore the best checkpoint before testing. best_model_epoch > 0 guarantees
    # that the file was written by this run and is not left over from an older
    # run that used the same results directory.
    if best_model_epoch > 0:
        model.load_state_dict(torch.load(MODEL_PATH, map_location=device))

    # test the model
    test_loss = epoch_step(test_dataloader, 'eval')
    print(f'\n\nTEST LOSS: {test_loss}')

    save_parameters()

In [21]:
# MODEL PARAMETERS
TRAIN_MODEL = True

EPOCHS = 500 # 500
LEARNING_RATE = 2 # 4
BATCH_SIZE = 4 # 16
EARLY_STOP = True

# rebuild the loaders so that they use the BATCH_SIZE set above
train_dataloader, eval_dataloader, test_dataloader = initialize_dataset()

# One training run per (FEEDBACK, EMPHASIZE_EEG) combination, each stored in
# its own results directory. The flags are module-level settings, so they are
# assigned before every model is built and trained.
experiment_runs = [
    (False, False, 'results/model'),
    (False, True, 'results/model_EEG'),
    (True, False, 'results/model_feedback'),
    (True, True, 'results/model_EEG_feedback'),
]

for use_feedback, use_eeg_emphasis, results_dir in experiment_runs:
    FEEDBACK = use_feedback
    EMPHASIZE_EEG = use_eeg_emphasis
    model, criterion, optimizer = initialize_model()
    train(results_dir)

# if TRAIN_MODEL:

#   for i in range(2):

#     if i == 0:
#       FEEDBACK = False
#     else:
#       FEEDBACK = True

#     BATCH_SIZE = 4
#     LEARNING_RATE = 1.0
#     model, criterion, optimizer = initialize_model()
#     train()

#     LEARNING_RATE = 2.0
#     model, criterion, optimizer = initialize_model()
#     train()

#     LEARNING_RATE = 4.0
#     model, criterion, optimizer = initialize_model()
#     train()

#     LEARNING_RATE = 1.0
#     BATCH_SIZE = 8
#     train_dataloader, eval_dataloader, test_dataloader = initialize_dataset()
#     model, criterion, optimizer = initialize_model()
#     train()

#     BATCH_SIZE = 16
#     train_dataloader, eval_dataloader, test_dataloader = initialize_dataset()
#     model, criterion, optimizer = initialize_model()
#     train()

#     BATCH_SIZE = 32
#     train_dataloader, eval_dataloader, test_dataloader = initialize_dataset()
#     model, criterion, optimizer = initialize_model()
#     train()

  return F.conv1d(input, weight, bias, self.stride,


| epoch   1/500 | lr 2.00000 | ms/epoch 12016.55388 | train_loss  3.40 | eval_loss  3.44
| epoch   2/500 | lr 2.00000 | ms/epoch 9182.69706 | train_loss  3.31 | eval_loss  3.46
| epoch   3/500 | lr 2.00000 | ms/epoch 9375.64325 | train_loss  3.24 | eval_loss  3.37
| epoch   4/500 | lr 2.00000 | ms/epoch 9157.55773 | train_loss  3.16 | eval_loss  3.14
| epoch   5/500 | lr 2.00000 | ms/epoch 9452.27933 | train_loss  2.85 | eval_loss  2.89
| epoch   6/500 | lr 2.00000 | ms/epoch 9329.99206 | train_loss  2.69 | eval_loss  3.04
| epoch   7/500 | lr 2.00000 | ms/epoch 9357.37634 | train_loss  2.64 | eval_loss  2.79
| epoch   8/500 | lr 2.00000 | ms/epoch 8954.49948 | train_loss  2.61 | eval_loss  2.83
| epoch   9/500 | lr 2.00000 | ms/epoch 8876.15395 | train_loss  2.59 | eval_loss  2.87
| epoch  10/500 | lr 2.00000 | ms/epoch 9139.60004 | train_loss  2.59 | eval_loss  2.84
| epoch  11/500 | lr 2.00000 | ms/epoch 9495.34225 | train_loss  2.57 | eval_loss  2.78
| epoch  12/500 | lr 2.00000 | 

KeyboardInterrupt: 