In [6]:
import glob
import numpy as np
import pandas as pd
import os
import time
import math
import numpy as np
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
from torch import optim
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, random_split
import sys
sys.path.append('..')

from word_cnn.model import TCN
from MIDI.PRETTY_MIDI.pretty_midi_tokenization import PrettyMidiTokenizer, SILENCE_TOKEN

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Widen the pandas display limits so large frames are not truncated when shown
pd.options.display.max_rows = 500
pd.options.display.max_columns = 504
pd.options.display.width = 1000


# Folders are resolved relative to the directory the notebook is run from
DIRECTORY_PATH = './'
DATASET_PATH, MODEL_PATH = (
    os.path.join(DIRECTORY_PATH, folder) for folder in ('dataset', 'model')
)


# Training hyperparameters
EPOCHS, LEARNING_RATE, BATCH_SIZE = 500, 4, 2


In [7]:
# Assumptions:
#   * input files live in <DATASET_PATH>/input and output files in <DATASET_PATH>/output,
#     both with a `.MID` extension
#   * the i-th input file is paired with the i-th output file, in sorted filename order
#
# NOTE: `input` and `output` shadow the Python built-in `input()`; the names are kept because
# the following cells read `input.sequences`, `input.BAR_LENGTH` and `output.VOCAB`.
# NOTE: both names are reassigned on every iteration, so only the tokenizers of the LAST
# file pair are still available once the loop has finished.

# glob.glob() returns matches in arbitrary order: sort both lists so that the input/output
# pairing below is deterministic and does not depend on the file system.
input_filenames = sorted(glob.glob(os.path.join(DATASET_PATH, 'input/*.MID')))
print('Number of input files:', len(input_filenames))

output_filenames = sorted(glob.glob(os.path.join(DATASET_PATH, 'output/*.MID')))
print('Number of output files:', len(output_filenames))

# zip() silently stops at the shortest list, so make dropped files visible
if len(input_filenames) != len(output_filenames):
    print('WARNING: the number of input and output files differs, unmatched files are ignored')


for i, (in_file, out_file) in enumerate(zip(input_filenames, output_filenames)):

    in_file_name = os.path.basename(in_file)
    out_file_name = os.path.basename(out_file)
    print(f'\n\n{i + 1}: {in_file_name} -> {out_file_name}')

    # tokenize the input file and report its statistics
    input = PrettyMidiTokenizer(in_file)
    print(f'\nNumber of input bars: {input.num_bars}')
    print(f'Number of input sequences: {len(input.sequences)}')
    print(f'Input sequence length: {len(input.sequences[0])}')
    print(f'Input vocabulars size: {len(input.VOCAB)}')

    # tokenize the output file and report its statistics
    output = PrettyMidiTokenizer(out_file)
    print(f'\nNumber of output bars: {output.num_bars}')
    print(f'Number of output sequences: {len(output.sequences)}')
    print(f'Output sequence length: {len(output.sequences[0])}')
    print(f'Output vocabulars size: {len(output.VOCAB)}')

    # keep the same number of sequences on both sides so they can be paired one-to-one
    min_length = min(len(input.sequences), len(output.sequences))
    input.sequences = input.sequences[:min_length]
    output.sequences = output.sequences[:min_length]
    print(f'\nNumber of sequences after truncation: {len(input.sequences)}, {len(output.sequences)}')

Number of input files: 1
Number of output files: 1


1: drum_excited.MID -> bass_example.MID

Number of input bars: 24
Number of input sequences: 20
Input sequence length: 192
Input vocabulars size: 13

Number of output bars: 11
Number of output sequences: 7
Output sequence length: 192
Output vocabulars size: 30

Number of sequences after truncation: 7, 7


In [9]:
# Wrap the tokenized sequences in int64 tensors and place them on the selected device
input_data, output_data = (
    torch.LongTensor(tokenizer.sequences).to(device) for tokenizer in (input, output)
)

# Pair every input sequence with the output sequence at the same index
dataset = TensorDataset(input_data, output_data)

# Random 60% / 20% / 20% split into training, evaluation and test subsets
train_set, eval_set, test_set = random_split(dataset, [0.6, 0.2, 0.2])

# One random sampler per subset
train_sampler, eval_sampler, test_sampler = (
    RandomSampler(subset) for subset in (train_set, eval_set, test_set)
)

# One dataloader per subset, all sharing the same batch size
train_dataloader, eval_dataloader, test_dataloader = (
    DataLoader(subset, sampler=sampler, batch_size=BATCH_SIZE)
    for subset, sampler in (
        (train_set, train_sampler),
        (eval_set, eval_sampler),
        (test_set, test_sampler),
    )
)

# Report how many sequences ended up in each subset
print(f'Train set size: {len(train_set)}')
print(f'Evaluation set size: {len(eval_set)}')
print(f'Test set size: {len(test_set)}')


Train set size: 5
Evaluation set size: 1
Test set size: 1


In [138]:
# Set the hyperparameters
SEED = 1111
OUTPUT_VOCAB_SIZE = len(output.VOCAB)
EMBEDDING_SIZE = 20 # size of word embeddings -> Embedding() is used to encode input token into [192, 20] vectors (see model.py)
LEVELS = 7
HIDDEN_UNITS = 192
# (LEVELS - 1) layers of HIDDEN_UNITS channels followed by one layer of EMBEDDING_SIZE channels
NUM_CHANNELS = [HIDDEN_UNITS] * (LEVELS - 1) + [EMBEDDING_SIZE]
GRADIENT_CLIP = 0.35


# Seed the RNG BEFORE the model is built, otherwise the randomly initialised weights
# differ from run to run and the seed has no effect on them.
torch.manual_seed(SEED)


# reduce the weights of the silence token since it is overrepresented in the dataset
# (the weights live on the same device as the model so the loss can use them)
silence_id = output.VOCAB.word2idx[SILENCE_TOKEN]
LOSS_WEIGTHS = torch.ones([OUTPUT_VOCAB_SIZE], dtype=torch.float, device=device)
LOSS_WEIGTHS[silence_id] = 0.3


# create the model and move it to the device that holds the training tensors
model = TCN(input_size = EMBEDDING_SIZE,
            output_size = OUTPUT_VOCAB_SIZE,
            num_channels = NUM_CHANNELS,
            dropout = 0.45,
            emb_dropout = 0.25,
            kernel_size = 3,
            tied_weights = False).to(device) # encoder and decoder weights are not tied


# May use adaptive softmax to speed up training
criterion = nn.CrossEntropyLoss(weight = LOSS_WEIGTHS)
# the optimizer is created after .to(device) so it tracks the parameters actually used
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)


[192, 192, 192, 192, 192, 192, 20]


In [None]:
# Number of batches between two progress reports printed by train()
LOG_INTERVAL = 1
# Length of one bar in tokens, as exposed by the input tokenizer
BAR_LENGTH = input.BAR_LENGTH


def train(dataloader, epoch):
    """Run one training epoch over `dataloader`, updating the global `model` in place.

    For every batch, the first `BAR_LENGTH*3` input tokens are kept and followed by
    `BAR_LENGTH` tokens of id 1 (the mask); the model prediction on this masked input is
    compared with the targets through the global `criterion`.

    Relies on the module-level `model`, `optimizer`, `criterion`, `BAR_LENGTH`,
    `OUTPUT_VOCAB_SIZE`, `GRADIENT_CLIP` and `LOG_INTERVAL`.

    Args:
        dataloader: sized iterable yielding `(data, targets)` batches
            (assumes integer tensors of shape (batch, sequence_length) - TODO confirm).
        epoch: epoch number, only used in the progress log.

    Returns:
        None. Progress is printed every `LOG_INTERVAL` batches.
    """
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()
    # report progress against the dataloader actually being iterated
    num_batches = len(dataloader)

    # iterate over the training data (batches are numbered from 1 for logging)
    for batch_idx, (data, targets) in enumerate(dataloader, start=1):

        # mask the last bar of the input data
        # NB: the mask is created on the same device as the data, otherwise torch.cat fails on GPU
        batch_size = data.size(0)
        mask = torch.ones([batch_size, BAR_LENGTH], dtype=torch.long, device=data.device)
        data_masked = torch.cat((data[:, :BAR_LENGTH*3], mask), dim = 1)

        # reset model gradients to zero
        optimizer.zero_grad()

        # make the prediction
        prediction = model(data_masked)

        # flatten the output sequence
        # NB: the size -1 is inferred from other dimensions
        # NB: contiguous() is used to make sure the tensor is stored in a contiguous chunk of memory, necessary for view() to work
        final_target = targets.contiguous().view(-1)
        final_output = prediction.contiguous().view(-1, OUTPUT_VOCAB_SIZE)

        # calculate the loss
        loss = criterion(final_output, final_target)

        # calculate the gradients
        loss.backward()

        if GRADIENT_CLIP > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), GRADIENT_CLIP)

        # update the weights
        optimizer.step()

        total_loss += loss.item()

        # print the loss and the progress
        if batch_idx % LOG_INTERVAL == 0:
            current_loss = total_loss / LOG_INTERVAL
            elapsed = time.time() - start_time
            # read the learning rate from the optimizer so that annealing is reflected in the log
            current_lr = optimizer.param_groups[0]['lr']
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(epoch,
                                                        batch_idx,
                                                        num_batches,
                                                        current_lr,
                                                        elapsed * 1000 / LOG_INTERVAL,
                                                        current_loss,
                                                        math.exp(current_loss)))
            total_loss = 0
            start_time = time.time()


In [137]:
# Persist only the model parameters (its state_dict) to 'generative_model.pt'.
# NOTE(review): the training/evaluation cells below save and load the whole model object
# through "model.pt" instead, so this file is not read back anywhere in the visible notebook.
torch.save(model.state_dict(), 'generative_model.pt')

In [None]:

def evaluate(dataloader):
    """Compute the average loss of the global `model` over `dataloader` without training.

    Each batch is masked exactly as in training: the first `BAR_LENGTH*3` input tokens are
    kept and followed by `BAR_LENGTH` tokens of id 1. The per-batch loss returned by the
    global `criterion` is weighted by the number of target tokens in the batch, so batches
    of different sizes contribute proportionally to the result.

    Relies on the module-level `model`, `criterion`, `BAR_LENGTH` and `OUTPUT_VOCAB_SIZE`.

    Args:
        dataloader: iterable yielding `(data, targets)` batches
            (assumes integer tensors of shape (batch, sequence_length) - TODO confirm).

    Returns:
        float: token-weighted mean loss, or NaN when the dataloader yields no data.
    """
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.0
    processed_tokens = 0

    # no gradients are needed for evaluation: skip building the autograd graph
    with torch.no_grad():
        for data, targets in dataloader:

            # mask the last bar of the input data
            # NB: the mask is created on the same device as the data, otherwise torch.cat fails on GPU
            batch_size = data.size(0)
            mask = torch.ones([batch_size, BAR_LENGTH], dtype=torch.long, device=data.device)
            data_masked = torch.cat((data[:, :BAR_LENGTH*3], mask), dim = 1)

            # make the prediction
            prediction = model(data_masked)

            # flatten the output sequence
            final_target = targets.contiguous().view(-1)
            final_output = prediction.contiguous().view(-1, OUTPUT_VOCAB_SIZE)

            loss = criterion(final_output, final_target)

            # weight the batch loss by the number of tokens it was computed on
            num_tokens = final_target.numel()
            total_loss += num_tokens * loss.item()
            processed_tokens += num_tokens

    # an empty dataloader has no meaningful loss: avoid dividing by zero
    if processed_tokens == 0:
        return float('nan')

    return total_loss / processed_tokens


# When False, the next cell loads an existing "model.pt" checkpoint instead of running the training loop.
train_model = False

In [None]:

# Best validation loss seen so far (initialised to a value any real loss will beat)
best_vloss = 1e8

if not train_model:
    # Skip training and reuse the checkpoint previously written to "model.pt".
    # NOTE(security): torch.load() unpickles the file, which can execute arbitrary code;
    # only load checkpoints coming from a trusted source.
    # NOTE(review): recent PyTorch releases may refuse fully pickled models unless
    # weights_only=False is passed - confirm against the installed version.
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
    with open("model.pt", 'rb') as f:
        model = torch.load(f, map_location=device)

    next_in = None
    model.eval()

else:
    all_vloss = []
    # current learning rate, halved below whenever the validation loss plateaus
    lr = LEARNING_RATE
    for epoch in range(1, EPOCHS + 1):
        epoch_start_time = time.time()

        # one pass over the training set, then measure the loss on the held-out sets
        train(train_dataloader, epoch)
        val_loss = evaluate(eval_dataloader)
        test_loss = evaluate(test_dataloader)

        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                        val_loss, math.exp(val_loss)))
        print('| end of epoch {:3d} | time: {:5.2f}s | test loss {:5.2f} | '
            'test ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                        test_loss, math.exp(test_loss)))
        print('-' * 89)

        # Save the model if the validation loss is the best we've seen so far.
        if val_loss < best_vloss:
            with open("model.pt", 'wb') as f:
                print('Save model!\n')
                torch.save(model, f)
            best_vloss = val_loss

        # Anneal the learning rate if the validation loss plateaus
        if epoch > 5 and val_loss >= max(all_vloss[-5:]):
            lr = lr / 2.
            # restart from a larger rate once it has become too small to make progress
            if lr < 0.1:
                print("bump lr")
                lr = 2
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr
        all_vloss.append(val_loss)



# Load the best saved model.
# NOTE(security): torch.load() unpickles the file, which can execute arbitrary code;
# only load checkpoints coming from a trusted source.
# map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine.
with open("model.pt", 'rb') as f:
    model = torch.load(f, map_location=device)
# Run on test data (the test dataloader built when the dataset was split).
test_loss = evaluate(test_dataloader)
print('=' * 89)
print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
    test_loss, math.exp(test_loss)))
print('=' * 89)