In [58]:
import glob
import numpy as np
import pandas as pd
import os
import pretty_midi
import collections
import time
import math
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',504)
pd.set_option('display.width',1000)


DRUM_MIDI_DICT = {    
    36: 'Kick',
    38: 'Snare',
    42: 'Closed Hi-Hat',
    43: 'Floor Tom',
    44: 'Pedal Hi-Hat',
    46: 'Open Hi-Hat',
    47: 'Tom 2',
    48: 'Tom 1',
    49: 'Crash',
    51: 'Ride'}

DIRECTORY_PATH = '..'
DATASET_PATH = os.path.join(DIRECTORY_PATH, 'dataset')
CHECKPOINTS_PATH = os.path.join(DIRECTORY_PATH, 'training_checkpoints')

# Model parameters
BATCH_SIZE = 2 # 16
EPOCHS = 50 # 500
LEARNING_RATE = 0.001 # 4

# MIDI parameters
BPM = 120
BEATS_PER_BAR = 4
TICKS_PER_BEAT = 12
BAR_DURATION = BEATS_PER_BAR * (60 / BPM)

# Tokenization parameters
BAR_LENGTH = BEATS_PER_BAR * TICKS_PER_BEAT
SEQ_LENGTH = BAR_LENGTH * 4 # 4 bars
VELOCITY_RANGES = {'p': (0, 64), 'f': (65, 127)}
NOTE_START_TOKEN = 'S'
SILENCE_TOKEN = 'O'
BCI_TOKEN = 'BCI'

In [106]:
def convert_time_to_ticks(time: float):
    pm = pretty_midi.PrettyMIDI(midi_file=None, resolution=TICKS_PER_BEAT, initial_tempo=BPM)
    return pm.time_to_tick(time)

def new_note(pitch, velocity, start, end, bar, convert_to_ticks = True):

    # NB: start and end are relative to the bar they are in
    if convert_to_ticks:
        start = convert_time_to_ticks(start - bar*BAR_DURATION)
        end = convert_time_to_ticks(end - bar*BAR_DURATION)

    new_note = {
        'pitch': pitch,
        'velocity': velocity,
        'start': start,
        'end': end,
        'bar': bar
    }
    
    return new_note


def append_note_to_notes_dict(notes: pd.DataFrame, note: dict):
    for key, value in note.items():
        notes[key].append(value)


class Dictionary(object):
    def __init__(self):
        self.input = input
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)

In [107]:
def midi_to_tokens(midi_file_path: str, bpm = BPM, beats_per_bar = BEATS_PER_BAR) -> pd.DataFrame:

  pm = pretty_midi.PrettyMIDI(midi_file_path)
  instrument = pm.instruments[0]
  notes = collections.defaultdict(list) # Dictionary with values as list
  bar_duration = (60/bpm) * beats_per_bar

  ticks_per_beat = pm.resolution

  # Sort the notes by start time
  sorted_notes = sorted(instrument.notes, key=lambda note: note.start)

  for note in sorted_notes:

    pitch = note.pitch
    velocity = note.velocity
    start = note.start
    end = note.end
    # step = start - prev_start
    duration = end - start
    bar = int(start // bar_duration) # integer part of the division

    # split the note in two if it spans multiple bars
    if start + duration > (bar + 1) * bar_duration: 

      # update the current note to end at the end of the bar and update its duration
      note = new_note(pitch, velocity, start, (bar + 1) * bar_duration, bar)
      append_note_to_notes_dict(notes, note)

      # create new note in the succeeding bar with the remaining duration
      note = new_note(pitch, velocity, (bar + 1) * bar_duration, end, bar + 1)
      append_note_to_notes_dict(notes, note)

    else:
      note = new_note(pitch, velocity, start, end, bar)
      append_note_to_notes_dict(notes, note)

  # create a dataframe from the notes dictionary
  notes_df = pd.DataFrame({name: np.array(value) for name, value in notes.items()})


  # split notes into bars and convert notes ticks into a time serie of strings
  bars_time_series = []
  for bar_id in notes_df['bar'].unique():
    bar_df = notes_df[notes_df['bar'] == bar_id]
    bar_df = bar_df.reset_index(drop=True)

    # fill the beginning and end of each bar with empty notes if necessary
    if bar_df.loc[len(bar_df) - 1, 'end'] != BAR_LENGTH:
      note = new_note(pitch = 0,
                      velocity = 0,
                      start = bar_df.loc[len(bar_df) - 1, 'end'],
                      end = BAR_LENGTH,
                      bar = bar,
                      convert_to_ticks = False)
      bar_df = bar_df.append(note, ignore_index=True)

    if bar_df.at[0, 'start'] != 0:
      note = new_note(pitch = 0,
                      velocity = 0,
                      start = 0,
                      end = bar_df.at[0, 'start'],
                      bar = bar,
                      convert_to_ticks = False)
      bar_df = bar_df.append(note, ignore_index=True) 
      bar_df = bar_df.sort_values(by=['start']) 
      bar_df = bar_df.reset_index(drop=True)


    # convert note ticks into a time serie of strings 
    bar_time_serie = np.empty((BAR_LENGTH), dtype=object)
    bar_time_serie[:] = SILENCE_TOKEN
    for i in range(len(bar_df)):
      note = bar_df.loc[i, 'pitch']
      if note != 0:
        start = bar_df.loc[i, 'start']
        end = bar_df.loc[i, 'end']
        bar_time_serie[start] = str(note)+NOTE_START_TOKEN
        bar_time_serie[start+1:end] = str(note)
    bars_time_series.append(bar_time_serie)


  # flat bars and extract the string vocabulary
  flatten_time_series = np.concatenate(bars_time_series)
  token_list = list(set(flatten_time_series))

  # create the vocabulary
  VOCAB = Dictionary()
  for i in range(0, len(token_list)):
      VOCAB.add_word(token_list[i])

  # create the sequences of tokens for the model 
  sequences=[]
  num_sequences = len(flatten_time_series) - SEQ_LENGTH
  for i in range(0, num_sequences, BAR_LENGTH):
    seq = flatten_time_series[i:(i+SEQ_LENGTH)].copy() # NB: copy is necessary to avoid modifying the original array

    # add the BCI token to the input sequences at each time step
    if 'input' in midi_file_path:
      seq = np.concatenate(([BCI_TOKEN], seq[:-1]))
      VOCAB.add_word(BCI_TOKEN)

    for i in range(len(seq)):
      seq[i] = VOCAB.word2idx[seq[i]] 

    sequences.append(seq)
    

  return sequences, VOCAB, notes_df

In [108]:
'''
Assumptions:
Sequences described as input_#.mid and output_#.mid in the corresponding folders
'''

input_filenames = glob.glob(os.path.join(DATASET_PATH, 'input/*.MID'))
print('Number of input files:', len(input_filenames))

output_filenames = glob.glob(os.path.join(DATASET_PATH, 'output/*.MID'))
print('Number of output files:', len(output_filenames))

for i, (in_file, out_file) in enumerate(zip(input_filenames, output_filenames)):

    in_file_name = os.path.basename(in_file)
    out_file_name = os.path.basename(out_file)
    print(f'\n\n{i + 1}: {in_file_name} -> {out_file_name}')

    input_sequences, INPUT_VOCAB, input_notes_df = midi_to_tokens(in_file)
    n_bar = len(input_notes_df['bar'].unique())
    print(f'\nNumber of input bars: {n_bar}')
    print(f'Number of input sequences: {len(input_sequences)}')
    print(f'Input sequence length: {len(input_sequences[0])}')
    print(f'Input vocabulars size: {len(INPUT_VOCAB)}')

    output_sequences, OUTPUT_VOCAB, output_notes_df = midi_to_tokens(out_file)
    n_bar = len(output_notes_df['bar'].unique())
    print(f'\nNumber of output bars: {n_bar}')
    print(f'Number of output sequences: {len(output_sequences)}')
    print(f'Output sequence length: {len(output_sequences[0])}')
    print(f'Output vocabulars size: {len(OUTPUT_VOCAB)}')

    min_length = min(len(input_sequences), len(output_sequences))
    input_sequences = input_sequences[:min_length]
    output_sequences = output_sequences[:min_length]
    print(f'\nNumber of sequences after truncation: {len(input_sequences)}, {len(output_sequences)}')

    print(INPUT_VOCAB.word2idx)
    print(OUTPUT_VOCAB.word2idx)

Number of input files: 1
Number of output files: 1


1: drum_excited.MID -> bass_example.MID

Number of input bars: 24
Number of input sequences: 20
Input sequence length: 192
Input vocabulars size: 13

Number of output bars: 11
Number of output sequences: 7
Output sequence length: 192
Output vocabulars size: 30

Number of sequences after truncation: 7, 7
{'O': 0, '47S': 1, '38S': 2, '36': 3, '48S': 4, '48': 5, '42': 6, '42S': 7, '49S': 8, '38': 9, '36S': 10, '47': 11, 'BCI': 12}
{'O': 0, '47S': 1, '54': 2, '53': 3, '43S': 4, '60': 5, '50': 6, '50S': 7, '47': 8, '53S': 9, '48S': 10, '43': 11, '49': 12, '58S': 13, '57S': 14, '60S': 15, '52S': 16, '45': 17, '55': 18, '55S': 19, '52': 20, '48': 21, '54S': 22, '58': 23, '44': 24, '44S': 25, '45S': 26, '57': 27, '49S': 28, '59S': 29}


In [123]:
# convert the sequences to LongTensor for PyTorch
input_data = torch.LongTensor(input_sequences).to(device)
output_data = torch.LongTensor(output_sequences).to(device)

# Create the dataset
train_data = TensorDataset(input_data, output_data)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

# iterate over the training data
# for i, (data, targets) in enumerate(train_dataloader):
    # print(f'\nBatch {i + 1}:')
    # print(f'Input sequence shape: {data.shape}')
    # print(f'Output sequence shape: {targets.shape}')

In [136]:
import sys
sys.path.append('../../')
from TCN.word_cnn.model import TCN

EPOCHS = 500 # 500
LEARNING_RATE = 4

seed = 1111 
torch.manual_seed(seed)

eval_batch_size = BATCH_SIZE

starttime = time.time()
runtime = time.time()
started = False
clicks = 0
sm = nn.Softmax()

n_words = len(OUTPUT_VOCAB)


em_size = 20 # size of word embeddings -> Embedding() is used to encode input token into [em_size, output_size] vectors (see model.py)
levels = 7
n_hidden_units = 192
num_chans = [n_hidden_units] * (levels - 1) + [em_size]
k_size = 3
dropout = 0.45
emb_dropout = 0.25
tied = False # tie encoder and decoder weights (legare)
weights = torch.ones([n_words], dtype=torch.float)
corpidx = 0
gradient_clip = 0.35
log_interval = 1 # report interval

print(num_chans)

for word in OUTPUT_VOCAB.idx2word:
    if word==SILENCE_TOKEN:
        weights[corpidx] = 0.3
    corpidx +=1


model = TCN(input_size = em_size, 
            output_size = n_words, 
            num_channels = num_chans, 
            dropout=dropout, 
            emb_dropout=emb_dropout, 
            kernel_size=k_size, 
            tied_weights=tied)


# May use adaptive softmax to speed up training
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = getattr(optim, 'SGD')(model.parameters(), lr=LEARNING_RATE)

 
def train(epoch):
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0
    start_time = time.time()

    # iterate over the training data
    for batch_idx, (data, targets) in enumerate(train_dataloader):

        batch_idx += 1

        # mask the last bar of the input data 
        batch_size = data.size(0)
        data_masked = torch.cat((data[:, :BAR_LENGTH*3], torch.ones([batch_size, BAR_LENGTH], dtype=torch.long)), dim = 1)

        # reset model gradients to zero
        optimizer.zero_grad()

        # make the prediction
        output = model(data_masked)

        # flatten the output sequence
        # NB: the size -1 is inferred from other dimensions
        # NB: contiguous() is used to make sure the tensor is stored in a contiguous chunk of memory, necessary for view() to work
        final_target = targets.contiguous().view(-1)
        final_output = output.contiguous().view(-1, n_words)

        loss = criterion(final_output, final_target)

        loss.backward()

        if gradient_clip > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)

        optimizer.step()

        total_loss += loss.data

        if batch_idx % log_interval == 0 and batch_idx > 0:
            current_loss = total_loss.item() / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(epoch, 
                                                        batch_idx, 
                                                        len(train_dataloader), 
                                                        LEARNING_RATE,
                                                        elapsed * 1000 / log_interval,
                                                        current_loss, 
                                                        math.exp(current_loss)))
            total_loss = 0
            start_time = time.time()


for epoch in range(1, EPOCHS + 1):
    train(epoch)



[192, 192, 192, 192, 192, 192, 20]
| epoch   1 |     1/    4 batches | lr 4.00000 | ms/batch 76.34544 | loss  3.40 | ppl    30.01
| epoch   1 |     2/    4 batches | lr 4.00000 | ms/batch 233.24513 | loss  3.25 | ppl    25.67
| epoch   1 |     3/    4 batches | lr 4.00000 | ms/batch 84.01489 | loss  3.00 | ppl    20.02
| epoch   1 |     4/    4 batches | lr 4.00000 | ms/batch 59.56602 | loss  6.56 | ppl   704.26
| epoch   2 |     1/    4 batches | lr 4.00000 | ms/batch 113.96956 | loss  3.03 | ppl    20.60
| epoch   2 |     2/    4 batches | lr 4.00000 | ms/batch 93.69802 | loss  2.94 | ppl    18.92
| epoch   2 |     3/    4 batches | lr 4.00000 | ms/batch 94.06185 | loss  2.90 | ppl    18.09
| epoch   2 |     4/    4 batches | lr 4.00000 | ms/batch 64.09740 | loss  3.16 | ppl    23.63
| epoch   3 |     1/    4 batches | lr 4.00000 | ms/batch 87.56733 | loss  2.99 | ppl    19.96
| epoch   3 |     2/    4 batches | lr 4.00000 | ms/batch 95.39890 | loss  2.97 | ppl    19.42
| epoch   3 |

In [137]:
torch.save(model.state_dict(), 'generative_model.pt')

In [None]:

def evaluate(data_source):
    model.eval()
    total_loss = 0
    processed_data_size = 0
    for i in range(0, data_source.size(1) - 1, args.seq_len+1):
        if i + args.seq_len - args.validseqlen >= data_source.size(1) - 1:
            continue
        data, targets = get_batch(data_source, i, args, evaluation=True)
        output = model(data)
        
        # Discard the effective history, just like in training
        eff_history = args.seq_len - args.validseqlen
        final_output = output[:, eff_history:].contiguous().view(-1, n_words)
        final_target = targets[:, eff_history:].contiguous().view(-1)
        loss = criterion(final_output, final_target)

        # Note that we don't add TAR loss here
        total_loss += (data.size(1) - eff_history) * loss.data
        processed_data_size += data.size(1) - eff_history

    return total_loss[0] / processed_data_size


if __name__ == "__main__":
    best_vloss = 1e8

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        if not args.train:
            with open("model.pt", 'rb') as f:
                model = torch.load(f)

            next_in = None
            model.eval()
        print(corpus.dictionary.idx2word[29])
        all_vloss = []
        for epoch in range(1, args.epochs+1):
            epoch_start_time = time.time()
            if args.train:
                train()
                val_loss = evaluate(val_data)
                test_loss = evaluate(test_data)
            

                print('-' * 89)
                print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                        'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                val_loss, math.exp(val_loss)))
                print('| end of epoch {:3d} | time: {:5.2f}s | test loss {:5.2f} | '
                    'test ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                                test_loss, math.exp(test_loss)))
                print('-' * 89)

                # Save the model if the validation loss is the best we've seen so far.
                if val_loss < best_vloss:
                    with open("model.pt", 'wb') as f:
                        print('Save model!\n')
                        torch.save(model, f)
                    best_vloss = val_loss

                # Anneal the learning rate if the validation loss plateaus
                if epoch > 5 and val_loss >= max(all_vloss[-5:]):
                    lr = lr / 2.
                    if lr < 0.1:
                        print("bump lr")
                        lr = 2
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr
                all_vloss.append(val_loss)

            

    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Load the best saved model.
    with open("model.pt", 'rb') as f:
        model = torch.load(f)

    # Run on test data.
    test_loss = evaluate(test_data)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)


In [None]:
def get_dataloader(input_sequences, output_sequences, batch_size = BATCH_SIZE):

    train_data = TensorDataset(torch.LongTensor(input_sequences).to(device),
                               torch.LongTensor(output_sequences).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return train_dataloader

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)



def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0



