In [1]:
from loaders import *
from collections import Counter
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F
import json
import numpy as np
import matplotlib.pyplot as plt
import random

In [2]:
""" Some global variables """
_loader = Loader(502) # 500 + SOS + EOS
loader = MIDILoader(_loader)

use_cuda = torch.cuda.is_available()
# Is the tokenizer 1 indexed?
vocabulary_size = 16*128*2 + 32*16 + 100 + 1 # 4708 + 1
vocabulary_size = vocabulary_size + 2 # SOS (index 4709) and EOS (index 4710)
SOS_TOKEN = 4709
EOS_TOKEN = 4710

encoding_size = 500
one_hot_embeddings = np.eye(vocabulary_size)

In [3]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [4]:
class EncoderLSTM(nn.Module):
    # Your code goes here
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size).double()
        if use_cuda:
            self.lstm = self.lstm.cuda()
        
    def forward(self, input, hidden_in):
        _, hidden_out = self.lstm(input, hidden_in) # encoder only outputs hidden
        return hidden_out
    
    def initHidden(self, hidden, batch_size):
        
        if hidden == None:
            result = Variable(torch.zeros(1, batch_size, self.hidden_size)).double()
            
            if use_cuda:
                result = result.cuda()
            return result
        
        else:
            return hidden

In [5]:
class DecoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        self.lstm = nn.LSTM(input_size, hidden_size).double()
        self.out = nn.Linear(hidden_size, output_size).double()
        if use_cuda:
            self.lstm = self.lstm.cuda()
            self.out = self.out.cuda()

    def forward(self, input, hidden):
        output = F.relu(input)
        output, hidden = self.lstm(output, hidden)
        output = self.out(output)
        output = output.squeeze()
        return output.unsqueeze(0), hidden

    def initHidden(self, batch_size):
        result = Variable(torch.zeros(1, batch_size, self.hidden_size)).double()
        if use_cuda:
            return result.cuda()
        else:
            return result

In [6]:
# The next two functions are part of some other deep learning frameworks, but PyTorch
# has not yet implemented them. We can find some commonly-used open source worked arounds
# after searching around a bit: https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1.
def _sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = Variable(seq_range_expand)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def compute_loss(logits, target, length):
    """
    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.

    Returns:
        loss: An average loss value masked by the length.
    """
    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = F.log_softmax(logits_flat)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = _sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.double()
    loss = losses.sum() / length.double().sum()
    return loss

In [7]:
class MetaLearner(nn.Module):
    
    def __init__(self,
                 input_size,
                 hidden_size):
        super(MetaLearner,self).__init__()
        
    

In [8]:
class Learner(nn.Module):
    
    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 learning_rate,
                 embeddings=one_hot_embeddings):
        
        super(Learner,self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.encoder = EncoderLSTM(input_size, hidden_size)
        self.decoder = DecoderLSTM(input_size, hidden_size, output_size)
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=learning_rate)
        self.decoder_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=learning_rate)
        
        self.embeddings = embeddings
        self.criterion = nn.CrossEntropyLoss()
        
    
    def forward(self, sequence, numbered_seq, hidden):
        
        encoder = self.encoder
        decoder = self.decoder
        embeddings = self.embeddings
        criterion = self.criterion
        
        # (seq_length, batch_size, vocab_size)
        seq_size = sequence.size()
        batch_size = seq_size[1]
        sequence_length = seq_size[0]
        loss = 0
        
        # h_n = (1, batch_size, encoding_size)
        encoder_hidden = encoder.initHidden(hidden, batch_size)
        encoder_hidden = (encoder_hidden, encoder_hidden) # Need a tuple

        # Encoder is fed the flipped control sequence
        for index_control in np.arange(sequence_length-1, 0, -1):
            encoder_input = sequence[index_control].unsqueeze(0) # (1, batch_size, vocab_size)
            encoder_hidden = encoder(encoder_input, encoder_hidden)
        
        # feed encoder_hidden
        decoder_input = sequence[1].unsqueeze(0) # One after SOS
        decoder_hidden = encoder_hidden
        predicted_note_index = 0
        
        # Prepare the results tensor
        all_decoder_outputs = Variable(torch.zeros(*sequence.size())).double() # (seq_length, batch_size, vocab_size)
        if use_cuda:
            all_decoder_outputs = all_decoder_outputs.cuda()

        all_decoder_outputs[0] = decoder_input

        for index_control in range(2, sequence_length):
            # decoder_input = decoder_input.view(1, 1, vocabulary_size)
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
            
            if random.random() <= 0.9:
                decoder_input = sequence[index_control].unsqueeze(0)
            else:
                topv, topi = decoder_output.data.topk(1)
                # This is the next input, without teacher forcing it's the predicted output
                decoder_input = torch.stack([Variable(torch.DoubleTensor(embeddings[ni]))
                                         for ni in topi.squeeze()]).unsqueeze(0)
                if use_cuda:
                    decoder_input = decoder_input.cuda()
                    
            # Save the decoder output
            all_decoder_outputs[index_control] = decoder_output
        
        
        seq_lens = Variable(torch.LongTensor(np.ones(batch_size, dtype=int)*sequence_length))
        if use_cuda:
            seq_lens = seq_lens.cuda()
        loss = compute_loss(all_decoder_outputs.transpose(0,1).contiguous(),
                        numbered_seq.transpose(0,1).contiguous(), 
                        seq_lens)
        
        return loss
    
    def map_inference(self, sequence, hidden, embeddings=one_hot_embeddings, max_length=500):
        """ sequence has to be batch_size=1"""
        encoder = self.encoder
        decoder = self.decoder
        
        output_control_sequence = []
    
        # Encoder
        encoder_hidden = encoder.initHidden(hidden, batch_size=1)
        encoder_hidden = (encoder_hidden, encoder_hidden)

        sequence_length = sequence.size()[1]

        for index_control in np.arange(sequence_length-1, 0, -1):
            encoder_input = sequence[0][index_control].view(1, 1, vocabulary_size)
            encoder_hidden = encoder(encoder_input, encoder_hidden) # Gets hidden for next input

        # This point we have last encoder_hidden, feed into decoder
        decoder_hidden = encoder_hidden
        decoder_input = sequence[0][0]
        predicted_control_index = SOS_TOKEN

        cur_length = 0
        while True:
            decoder_input = decoder_input.view(1, 1, vocabulary_size)
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

            # MAP inference
            topv, topi = decoder_output.data.topk(1)
            predicted_control_index = int(topi)
            if predicted_control_index == EOS_TOKEN:
                break
            output_control_sequence.append(predicted_control_index)

            # This is the next input
            decoder_input = torch.from_numpy(embeddings[predicted_control_index])
            decoder_input = Variable(decoder_input).double()
            if use_cuda:
                decoder_input = decoder_input.cuda()

            cur_length += 1
            if cur_length >= max_length:
                break

        return output_control_sequence
    
    def train(self, sequence, numbered_seq, hidden):
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()
        
        loss = self.forward(sequence, numbered_seq, hidden)
        
        loss.backward()
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()
        return loss

learner = Learner(vocabulary_size, 
              encoding_size, 
              vocabulary_size,
              learning_rate=0.01)

In [9]:
input_files = ['bach_846.mid', 'mz_311_1.mid', 'rac_op3_2.mid']
input_variables = []
original_sequences = []

for index, input_file in enumerate(input_files):
    orig_seq = loader.read('../data/' + input_file)
    orig_seq = loader.tokenize(orig_seq)
    
    trunc_seq = orig_seq[0:500]
    trunc_seq = [SOS_TOKEN] + trunc_seq + [EOS_TOKEN]
    original_sequences.append(trunc_seq)
    seq_length = len(trunc_seq)
    
    trunc_seq = torch.from_numpy(np.array(one_hot_embeddings[trunc_seq])) # This is really time consuming
    trunc_seq = trunc_seq.view(seq_length, vocabulary_size)
    trunc_seq = Variable(trunc_seq)
    if use_cuda:
        trunc_seq = trunc_seq.cuda()
    input_variables.append(trunc_seq)
    
original_sequences = np.array(original_sequences, dtype=np.int64)

In [10]:
""" Testing Learner """
print_every = 10
total_epochs = 200
print_loss_total = 0
batch_size = 2
startTime = time.time()
for epoch in range(1, total_epochs+1):
    for batch in range(len(input_variables)//batch_size):
        # lstm input is (seq_len, batch_size, vocab_size)
        start, end = batch*batch_size, (batch+1)*batch_size
        sequences = torch.stack(input_variables[start:end]).transpose(0,1)
        numbered_seqs = torch.stack(Variable(torch.from_numpy(original_sequences[start:end]))).transpose(0,1)
        print(numbered_seqs)
        if use_cuda:
            numbered_seqs = numbered_seqs.cuda()
        loss = learner.train(sequences, numbered_seqs, hidden=None)
        print_loss_total += loss
    
    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print('%s (%d %d%%) %.4f' % (timeSince(startTime, epoch / total_epochs),
                                     epoch, epoch / total_epochs * 100, print_loss_avg))

Variable containing:
 4709  4709
 4144  4143
  188   194
     ⋮      
 2243  2255
  200  4611
 4710  4710
[torch.LongTensor of size 502x2]





KeyboardInterrupt: 

In [36]:
song_index = 0
test_seq = input_variables[song_index][:,1:101] # First dimension is batch
out_seq = learner.map_inference(test_seq, hidden=None)
whole_seq = original_sequences[song_index][0:100].tolist() + out_seq
midi = loader.detokenize(np.array(whole_seq))
midi.write('test.mid')

In [42]:
test_hidden = learner.encoder.initHidden(None).squeeze()
print(torch.stack([test_hidden, test_hidden]).unsqueeze(0))

Variable containing:
( 0 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
[torch.cuda.DoubleTensor of size 1x2x500 (GPU 0)]

