In [2]:
import glob
import numpy as np
import pandas as pd
import os
import time
import torch
import torch.nn as nn
from torch import optim
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, random_split
import sys
sys.path.append('..')

from word_cnn.model import TCN
from APPLICATION.model.tokenization import PrettyMidiTokenizer, BCI_TOKENS

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',504)
pd.set_option('display.width',1000)


DIRECTORY_PATH = ''


# MODEL PARAMETERS
EPOCHS = 500 # 500
LEARNING_RATE = 2 # 4
BATCH_SIZE = 16 # 16
TRAIN_MODEL = False
FEEDBACK = False
EMPHAZISE_EEG = True

In [3]:
'''
Assumptions:
Sequences described as 
    input:  drum_genere_emotion.mid 
    output: bass_genere_emotion.mid 
in the corresponding folders
'''
DATASET_PATH = os.path.join(DIRECTORY_PATH, 'dataset')

input_filenames = glob.glob(os.path.join(DATASET_PATH, 'input/*.MID'))
print('Number of input files:', len(input_filenames))

output_filenames = glob.glob(os.path.join(DATASET_PATH, 'output/*.MID'))
print('Number of output files:', len(output_filenames), '\n')

INPUT_TOK = PrettyMidiTokenizer()
OUTPUT_TOK = PrettyMidiTokenizer()

for i, (in_file, out_file) in enumerate(zip(input_filenames, output_filenames)):

    in_file_name = os.path.basename(in_file)
    out_file_name = os.path.basename(out_file)
    print(f'{i + 1}: {in_file_name} -> {out_file_name}')

    if 'RELAX' in in_file_name:
        emotion_token = BCI_TOKENS['relax']
    elif 'EXCITED' in in_file_name:
        emotion_token = BCI_TOKENS['concentrate']
    else:
        raise Exception('Emotion not found in file name. Please add the emotion to the file name.')

    in_seq, in_df = INPUT_TOK.midi_to_tokens(in_file, update_vocab=True, update_sequences=True, emotion_token = emotion_token)
    out_seq, out_df = OUTPUT_TOK.midi_to_tokens(out_file, update_vocab=True, update_sequences=True)

    print(f'Input sequence length: {len(in_seq)}')
    print(f'Emotion token: {emotion_token}\n')

    if i == 0:
        print(in_df)

print(f'\nNumber of input sequences: {len(INPUT_TOK.sequences)}')
print(f'Input sequence length: {len(INPUT_TOK.sequences[0])}')
print(f'Input vocabulars size: {len(INPUT_TOK.VOCAB)}')
print(f'\nNumber of output sequences: {len(OUTPUT_TOK.sequences)}')
print(f'Output sequence length: {len(OUTPUT_TOK.sequences[0])}')
print(f'Output vocabulars size: {len(OUTPUT_TOK.VOCAB)}')

print('\nInput vocab:', INPUT_TOK.VOCAB.word2idx)
print('Output vocab:', OUTPUT_TOK.VOCAB.word2idx)
    

for t in INPUT_TOK.sequences[0]:
    print(INPUT_TOK.VOCAB.idx2word[t])

Number of input files: 6
Number of output files: 6 

1: 0_Drum_HardRock_EXCITED.mid -> 0_Bass_HardRock_EXCITED.mid
Input sequence length: 26
Emotion token: C

    pitch velocity start end bar
0      36      127     0   6   0
1      42       48     5   6   0
2      38      127    11  18   0
3      42       54    17  18   0
4      42      125    23  24   0
5      36      127    23  30   0
6      42       59    29  30   0
7      38      127    35  42   0
8      42       67    41  42   0
9      42      127    47  48   0
10     36      127    47  48   0
11     36      127     0   6   1
12     42      127    11  12   1
13     38      127    11  18   1
14     42       40    17  18   1
15     42      126    23  24   1
16     36      127    23  29   1
17     42      127    35  36   1
18     38      127    35  41   1
19     42      127    47  48   1
20     36      127    47  48   1
21     36      127     0   6   2
22     42       53     5   6   2
23     42      127    11  12   2
24     38      1

In [4]:
# Perform data augmentation
input_shifts = [-3, -2, -1, 1, 2, 3]
output_shifts = list(np.zeros(len(input_shifts)))

INPUT_TOK.data_augmentation_shift(input_shifts)
OUTPUT_TOK.data_augmentation_shift(output_shifts)

print(f'\nNumber of input sequences after data augmentation: {len(INPUT_TOK.sequences)}')
print(f'Number of output sequences after data augmentation: {len(OUTPUT_TOK.sequences)}')


Number of input sequences after data augmentation: 1197
Number of output sequences after data augmentation: 1197


In [5]:
# Create the dataset
dataset = TensorDataset(torch.LongTensor(INPUT_TOK.sequences).to(device),
                        torch.LongTensor(OUTPUT_TOK.sequences).to(device))

# Split the dataset into training, evaluation and test sets
train_set, eval_set, test_set = random_split(dataset, [0.6, 0.2, 0.2])

# Create the dataloaders
train_sampler = RandomSampler(train_set)          
train_dataloader = DataLoader(train_set, sampler=train_sampler, batch_size=BATCH_SIZE)

eval_sampler = RandomSampler(eval_set)
eval_dataloader = DataLoader(eval_set, sampler=eval_sampler, batch_size=BATCH_SIZE)

test_sampler = RandomSampler(test_set)
test_dataloader = DataLoader(test_set, sampler=test_sampler, batch_size=BATCH_SIZE)

print(f'Train set size: {len(train_set)}')
print(f'Evaluation set size: {len(eval_set)}')
print(f'Test set size: {len(test_set)}')


Train set size: 719
Evaluation set size: 239
Test set size: 239


  dataset = TensorDataset(torch.LongTensor(INPUT_TOK.sequences).to(device),


In [6]:
# Set the hyperparameters
SEED = 1111
OUTPUT_SIZE = len(OUTPUT_TOK.VOCAB) 


'''
IMPORTANT:
to cover all the sequence of tokens k * d must be >= hidden units (see the paper)
k = kernel_size
d = dilation = 2 ^ (n_levels - 1) 
'''
if FEEDBACK:
    INPUT_SIZE = len(INPUT_TOK.VOCAB) + OUTPUT_SIZE
    LEVELS = 8
    HIDDEN_UNITS = INPUT_TOK.SEQ_LENGTH * 2 # 192 * 2 = 384
else:
    INPUT_SIZE = len(INPUT_TOK.VOCAB) 
    LEVELS = 7
    HIDDEN_UNITS = INPUT_TOK.SEQ_LENGTH # 192 


EMBEDDING_SIZE = 20 # size of word embeddings -> Embedding() is used to encode input token into [192, 20] real value vectors (see model.py)
NUM_CHANNELS = [HIDDEN_UNITS] * (LEVELS - 1) + [EMBEDDING_SIZE] # [192, 192, 192, 192, 192, 192, 20]
GRADIENT_CLIP = 0.35


# balance the loss function by assigning a weight to each token related to its frequency
LOSS_WEIGTHS = torch.ones([OUTPUT_SIZE], dtype=torch.float, device=device)
OUTPUT_TOK.VOCAB.compute_weights()
for i, weigth in enumerate(OUTPUT_TOK.VOCAB.weights):
    LOSS_WEIGTHS[i] = 1 - weigth
    # print(f'{OUTPUT_TOK.VOCAB.idx2word[i]}: {LOSS_WEIGTHS[i]}')


# create the model
model = TCN(input_size = INPUT_SIZE,
            embedding_size = EMBEDDING_SIZE, 
            output_size = OUTPUT_SIZE, 
            num_channels = NUM_CHANNELS, 
            emphasize_eeg = EMPHAZISE_EEG,
            dropout = 0.45, 
            emb_dropout = 0.25, 
            kernel_size = 3, 
            tied_weights = False) # tie encoder and decoder weights (legare)

model.to(device)

# May use adaptive softmax to speed up training
torch.manual_seed(SEED)
criterion = nn.CrossEntropyLoss(weight = LOSS_WEIGTHS)
optimizer = getattr(optim, 'SGD')(model.parameters(), lr=LEARNING_RATE)

print(f'\nModel created: {model}')



Model created: TCN(
  (encoder): Embedding(92, 20, padding_idx=0)
  (tcn): TemporalConvNet(
    (network): Sequential(
      (0): TemporalBlock(
        (conv1): Conv1d(20, 192, kernel_size=(3,), stride=(1,), padding=(2,))
        (chomp1): Chomp1d()
        (relu1): ReLU()
        (dropout1): Dropout(p=0.45, inplace=False)
        (conv2): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(2,))
        (chomp2): Chomp1d()
        (relu2): ReLU()
        (dropout2): Dropout(p=0.45, inplace=False)
        (net): Sequential(
          (0): Conv1d(20, 192, kernel_size=(3,), stride=(1,), padding=(2,))
          (1): Chomp1d()
          (2): ReLU()
          (3): Dropout(p=0.45, inplace=False)
          (4): Conv1d(192, 192, kernel_size=(3,), stride=(1,), padding=(2,))
          (5): Chomp1d()
          (6): ReLU()
          (7): Dropout(p=0.45, inplace=False)
        )
        (downsample): Conv1d(20, 192, kernel_size=(1,), stride=(1,))
        (relu): ReLU()
      )
      (1): Temp

In [7]:
def save_parameters():
    # plot the losses over the epochs 
    import matplotlib.pyplot as plt
    plt.plot(train_losses, label='train')
    plt.plot(eval_losses, label='eval')
    plt.legend()
    plt.savefig(os.path.join(RESULTS_PATH, 'losses.png'))


    # save the vocabularies
    INPUT_TOK.VOCAB.save(os.path.join(RESULTS_PATH, 'input_vocab.txt'))
    OUTPUT_TOK.VOCAB.save(os.path.join(RESULTS_PATH, 'output_vocab.txt'))

     # save the model hyperparameters in a file txt
    with open(os.path.join(RESULTS_PATH, 'model_hyperparameters.txt'), 'w') as f:

        f.write(f'----------OPTIMIZATION PARAMETERS----------\n')
        f.write(f'DATE: {time.strftime("%Y%m%d-%H%M%S")}\n')
        f.write(f'DATASET_PATH: {DATASET_PATH}\n')
        f.write(f'FEEDBACK: {FEEDBACK}\n')
        f.write(f'SEED: {SEED}\n')
        f.write(f'INPUT_SIZE: {INPUT_SIZE}\n')
        f.write(f'EMBEDDING_SIZE: {EMBEDDING_SIZE}\n')
        f.write(f'LEVELS: {LEVELS}\n')
        f.write(f'HIDDEN_UNITS: {HIDDEN_UNITS}\n')
        f.write(f'NUM_CHANNELS: {NUM_CHANNELS}\n')
        f.write(f'OUTPUT_SIZE: {OUTPUT_SIZE}\n')
        f.write(f'LOSS_WEIGTHS: {LOSS_WEIGTHS}\n')
        f.write(f'LEARNING_RATE: {LEARNING_RATE}\n')
        f.write(f'BATCH_SIZE: {BATCH_SIZE}\n')
        f.write(f'EPOCHS: {EPOCHS}\n')
        f.write(f'GRADIENT_CLIP: {GRADIENT_CLIP}\n')
        f.write(f'------------------------------------------\n')
        f.write(f'----------RESULTS----------\n')
        f.write(f'BEST_TRAIN_LOSSES: {best_train_loss}\n')
        f.write(f'BEST_EVAL_LOSS: {best_eval_loss}\n')
        f.write(f'TEST_LOSS: {test_loss}\n')
        f.write(f'BEST_MODEL_EPOCH: {best_model_epoch}\n')
        f.write(f'------------------------------------------\n')

In [8]:
BAR_LENGTH = INPUT_TOK.BAR_LENGTH

def epoch_step(dataloader, mode):

    if FEEDBACK:
        prev_output = torch.zeros([BATCH_SIZE, INPUT_TOK.SEQ_LENGTH], dtype=torch.long, device=device)

    if mode == 'train':
        model.train()
    else:
        model.eval() # disable dropout
        
    total_loss = 0

    # iterate over the training data
    for batch_idx, (data, targets) in enumerate(dataloader):

        batch_idx += 1

        # mask the last bar of the input data
        batch_size = data.size(0)
        data_masked = torch.cat((data[:, :BAR_LENGTH*3], torch.ones([batch_size, BAR_LENGTH], dtype=torch.long, device=device)), dim = 1) 

        if FEEDBACK:
            input = torch.cat((data_masked, prev_output[:batch_size, :]), dim = 1)
        else:
            input = data_masked
           
        # reset model gradients to zero
        optimizer.zero_grad()

        # make the prediction
        output = model(input)[:, :INPUT_TOK.SEQ_LENGTH] 
        prev_output = torch.argmax(output, 2)# batch, seq_len (hidden units), vocab_size

        # flatten the output sequence
        # NB: the size -1 is inferred from other dimensions
        # NB: contiguous() is used to make sure the tensor is stored in a contiguous chunk of memory, necessary for view() to work
    
        final_target = targets.contiguous().view(-1)    
        final_output = output.contiguous().view(-1, OUTPUT_SIZE)

        # calculate the loss
        loss = criterion(final_output, final_target)

        if mode == 'train':
            # calculate the gradients
            loss.backward()

            # clip the gradients to avoid exploding gradients
            if GRADIENT_CLIP > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)

            # update the weights
            optimizer.step()

        total_loss += loss.data.item()

    return total_loss / len(dataloader)


In [9]:
best_eval_loss = 1e8
best_train_loss = 1e8
best_model_epoch = 0
eval_losses = []
train_losses = []
lr = LEARNING_RATE
early_stop = True


if TRAIN_MODEL:

    RESULTS_PATH = os.path.join(DIRECTORY_PATH, 'results', time.strftime("%Y%m%d_%H%M%S"))
    if not os.path.exists(RESULTS_PATH):
        os.makedirs(RESULTS_PATH)
        
    MODEL_PATH = os.path.join(RESULTS_PATH, 'model_state_dict.pth')

    
    for epoch in range(1, EPOCHS+1):

        start_time = time.time()

        train_loss = epoch_step(train_dataloader, 'train')
        
        eval_loss = epoch_step(eval_dataloader, 'eval')

        # Save the model if the validation loss is the best we've seen so far.
        if eval_loss < best_eval_loss:
            torch.save(model.state_dict(), MODEL_PATH)
            best_eval_loss = eval_loss
            best_model_epoch = epoch 

        if train_loss < best_train_loss:
            best_train_loss = train_loss

        # # Anneal the learning rate if the validation loss plateaus
        # if epoch > 5 and eval_loss >= max(eval_losses[-5:]):
        #     lr = lr / 2.
        #     if lr < 0.1:
        #         lr = 2
        #     for param_group in optimizer.param_groups:
        #         param_group['lr'] = lr


        eval_losses.append(eval_loss)
        train_losses.append(train_loss)

        # Early stopping
        if early_stop:
            if epoch > 15:
                if min(eval_losses[-15:]) > best_eval_loss:
                    break

        # print the loss and the progress
        elapsed = time.time() - start_time
        print('| epoch {:3d}/{:3d} | lr {:02.5f} | ms/epoch {:5.5f} | train_loss {:5.2f} | eval_loss {:5.2f}' \
                .format(epoch, EPOCHS, lr, elapsed * 1000, train_loss, eval_loss))

    print('\n\n TRAINING FINISHED:\n\n\tBest Loss: {:5.2f}\tBest Model saved at epoch: {:3d} \n\n' \
            .format(best_eval_loss, best_model_epoch))
    
    # test the model
    test_loss = epoch_step(test_dataloader, 'eval')
    print(f'\n\nTEST LOSS: {test_loss}')
    save_parameters()

In [10]:
# Load the best saved model.
RESULTS_PATH = 'models/model'
MODEL_PATH = f'{RESULTS_PATH}/model_state_dict.pth'

INPUT_TOK.load_vocab(f'{RESULTS_PATH}/input_vocab.txt')
OUTPUT_TOK.load_vocab(f'{RESULTS_PATH}/output_vocab.txt')

model = TCN(input_size = len(INPUT_TOK.VOCAB),
            embedding_size = EMBEDDING_SIZE, 
            output_size = len(OUTPUT_TOK.VOCAB), 
            num_channels = NUM_CHANNELS, 
            dropout = 0.45, 
            emb_dropout = 0.25, 
            kernel_size = 3, 
            tied_weights = False) # tie encoder and decoder weights (legare)

model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu')))
model.eval()
model.to(device)

# select a genere to be predicted
generes = ['blues', 'rock_relax', 'rock_excited']

for genere in generes:
    # get a sample to be predicted
    sample_path = os.path.join(DATASET_PATH, f'test/drum_{genere}_2.mid')
    sample = INPUT_TOK.midi_to_tokens(sample_path, update_vocab=False) [0]

    print(sample)
    sample = torch.LongTensor(sample)

    # Get the last sequence from the batch and unsqueeze it to add a batch dimension.
    sample = sample[-1].unsqueeze(0)

    # Mask the last bar of the input data.
    sample = torch.cat((sample[:, :BAR_LENGTH*3], torch.ones([1, BAR_LENGTH], dtype=torch.long)), dim = 1)

    # Make the prediction.
    prediction = model(sample.to(device))
    prediction = prediction.contiguous().view(-1, OUTPUT_SIZE)

    # Get the predicted tokens.
    predicted_tokens = torch.argmax(prediction, 1)

    # Get the predicted sequence.
    predicted_sequence = predicted_tokens.cpu().numpy().tolist()

    # Convert the predicted sequence to MIDI.
    out_file_path = os.path.join(RESULTS_PATH, f'predicted_{genere}.mid')
    pitch_ticks_velocity =  OUTPUT_TOK.tokens_to_midi(predicted_sequence, out_file_path = out_file_path, ticks_filter = 3, instrument_name = 'Electric Bass (finger)') 


# # check 
# predicted_sequence_string = []
# for id in predicted_sequence:
#     predicted_sequence_string.append(OUTPUT_TOK.VOCAB.idx2word[id])
# print(predicted_sequence_string)
# print(pitch_ticks_list)

[array([1, 2, 2, 2, 0, 18, 19, 19, 19, 24, 25, 5, 5, 5, 5, 5, 0, 18, 19,
       19, 19, 37, 63, 65, 2, 2, 2, 2, 0, 18, 19, 19, 19, 24, 25, 66, 5,
       5, 5, 5, 0, 0, 18, 19, 19, 19, 7, 27, 1, 2, 2, 2, 2, 0, 0, 0, 0, 0,
       8, 21, 5, 5, 5, 5, 5, 0, 18, 19, 19, 19, 7, 27, 2, 2, 2, 2, 2, 0,
       18, 19, 19, 19, 8, 21, 5, 5, 5, 5, 5, 18, 19, 19, 19, 19, 7, 27, 1,
       2, 2, 2, 0, 32, 33, 33, 33, 33, 8, 21, 5, 5, 5, 5, 5, 0, 18, 19,
       19, 37, 63, 65, 2, 2, 2, 2, 0, 18, 19, 19, 19, 8, 21, 21, 5, 5, 5,
       5, 0, 0, 18, 19, 19, 19, 7, 27, 1, 2, 2, 2, 2, 32, 33, 33, 33, 67,
       68, 69, 5, 5, 5, 5, 0, 18, 19, 19, 19, 37, 63, 65, 2, 2, 2, 0, 0,
       15, 16, 16, 16, 70, 71, 71, 5, 5, 5, 5, 0, 18, 19, 19, 19, 19, 7,
       27], dtype=object)]
MIDI file saved at models/model\predicted_blues.mid
[array([13, 14, 14, 33, 0, 0, 0, 0, 0, 32, 33, 33, 33, 33, 33, 0, 0, 0, 0,
       0, 0, 17, 10, 10, 10, 10, 10, 10, 0, 0, 0, 0, 0, 32, 33, 33, 33,
       33, 33, 0, 0, 0, 0, 0, 7, 27, 27