In [28]:
import sys
sys.path.insert(0, '../src')

import pickle
from loaders import *
from episode import *
from dataset import *
from collections import Counter
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F
import json
import numpy as np
import matplotlib.pyplot as plt
import random

In [29]:
""" Some global variables """
# MIDI (de)tokenizer wrapped around a Loader constructed with 502 (500 + SOS + EOS).
_loader = Loader(502)
loader = MIDILoader(_loader)

# True when PyTorch can see a CUDA device; the model classes below consult this flag.
use_cuda = torch.cuda.is_available()

# Vocabulary: 16*128*2 + 32*16 + 100 + 1 = 4709 tokenizer ids, plus two extra
# ids reserved for SOS (4709) and EOS (4710), giving 4711 in total.
# TODO: is the tokenizer 1 indexed?
vocabulary_size = (16*128*2 + 32*16 + 100 + 1) + 2
SOS_TOKEN, EOS_TOKEN = 4709, 4710

# Width of the LSTM hidden state used for the encoding.
encoding_size = 500
# Identity matrix: row i is the one-hot vector for token id i.
one_hot_embeddings = np.eye(vocabulary_size)

In [30]:
import time
import math


def asMinutes(s):
    """Format a duration given in seconds as the string '<minutes>m <seconds>s'.

    Both parts are rendered with '%d', so a fractional number of seconds is
    truncated in the output.
    """
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed time and a linear estimate of the time remaining.

    Args:
        since: start timestamp, as returned by ``time.time()``.
        percent: fraction of the work completed so far; it is used as a
            divisor, so it must be non-zero.

    Returns:
        A string '<elapsed> (- <remaining>)' with both parts formatted by
        ``asMinutes``. The total is projected as elapsed / percent.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [31]:
# The next two functions exist in some other deep learning frameworks, but PyTorch
# has not yet implemented them. These are commonly used open-source workarounds,
# adapted from: https://gist.github.com/jihunchoi/f1434a77df9db1bb337417854b398df1.
def _sequence_mask(sequence_length, max_len=None):
    if max_len is None:
        max_len = sequence_length.data.max()
    batch_size = sequence_length.size(0)
    seq_range = torch.arange(0, max_len).long()
    seq_range_expand = seq_range.unsqueeze(0).expand(batch_size, max_len)
    seq_range_expand = Variable(seq_range_expand)
    if sequence_length.is_cuda:
        seq_range_expand = seq_range_expand.cuda()
    seq_length_expand = (sequence_length.unsqueeze(1)
                         .expand_as(seq_range_expand))
    return seq_range_expand < seq_length_expand


def compute_loss(logits, target, length):
    """
    Cross-entropy averaged over the non-padded positions of a batch.

    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.

    Returns:
        loss: An average loss value masked by the length, i.e. the sum of the
            per-position negative log-likelihoods at positions
            t < length[b], divided by ``length.sum()``.
    """
    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    # Normalise over the class axis explicitly; calling log_softmax without
    # `dim` relies on a deprecated implicit choice of dimension.
    log_probs_flat = F.log_softmax(logits_flat, dim=1)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1) -- negative log-probability of the true class
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len), true where the position is inside the sequence
    mask = _sequence_mask(sequence_length=length, max_len=target.size(1))
    # Zero the contribution of padded positions before averaging.
    losses = losses * mask.double()
    loss = losses.sum() / length.double().sum()
    return loss

In [32]:
class EncoderLSTM(nn.Module):
    """Double-precision LSTM that exposes only its recurrent state.

    The LSTM is moved to the GPU at construction time when the module-level
    ``use_cuda`` flag is set.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        recurrent = nn.LSTM(input_size, hidden_size).double()
        self.lstm = recurrent.cuda() if use_cuda else recurrent

    def forward(self, input, hidden):
        """Run the LSTM over ``input`` from state ``hidden``; return only the new state.

        The per-step outputs of the LSTM are discarded.
        """
        return self.lstm(input, hidden)[1]

    def initHidden(self, batch_size):
        """Return a zero tensor of shape (1, batch_size, hidden_size).

        The tensor has torch's default dtype and is not moved to the GPU here.
        """
        state_shape = (1, batch_size, self.hidden_size)
        return torch.zeros(*state_shape)

In [33]:
class DecoderLSTM(nn.Module):
    """Double-precision LSTM followed by a linear projection to output scores.

    Both layers are moved to the GPU at construction time when the
    module-level ``use_cuda`` flag is set.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        recurrent = nn.LSTM(input_size, hidden_size).double()
        projection = nn.Linear(hidden_size, output_size).double()
        self.lstm = recurrent.cuda() if use_cuda else recurrent
        self.out = projection.cuda() if use_cuda else projection

    def forward(self, input, hidden):
        """Apply ReLU, the LSTM and the projection to ``input``.

        Returns:
            A pair (scores, new_hidden) where ``scores`` is index 0 along the
            first axis of the projected LSTM output and ``new_hidden`` is the
            state returned by the LSTM.
        """
        activated = F.relu(input)
        lstm_out, next_hidden = self.lstm(activated, hidden)
        scores = self.out(lstm_out)
        return scores[0], next_hidden

    def initHidden(self, batch_size):
        """Return a zero tensor of shape (1, batch_size, hidden_size).

        The tensor has torch's default dtype and is not moved to the GPU here.
        """
        state_shape = (1, batch_size, self.hidden_size)
        return torch.zeros(*state_shape)

In [46]:
class Learner(nn.Module):
    """LSTM encoder/decoder trained to reconstruct token sequences.

    The encoder reads every sequence from its last token back to its first.
    The second element of the state it ends with (the LSTM cell state) is
    paired with a fresh zero tensor to initialise the decoder, which starts
    from ``SOS_TOKEN`` and is trained with teacher forcing to emit the
    sequence followed by ``EOS_TOKEN``.

    NOTE: ``forward`` is a complete optimisation step, not a pure forward
    pass: it zeroes gradients, back-propagates and steps both optimizers.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 learning_rate,
                 embeddings=one_hot_embeddings):
        """
        Args:
            input_size: LSTM input width; each embedding row is fed to the
                LSTMs as one input vector, so it must have this length.
            hidden_size: width of the encoder/decoder LSTM state.
            output_size: number of scores the decoder emits per step.
            learning_rate: learning rate for both Adam optimizers.
            embeddings: array indexed by token id to obtain input vectors.
        """
        super(Learner, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.encoder = EncoderLSTM(input_size, hidden_size)
        self.decoder = DecoderLSTM(input_size, hidden_size, output_size)
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=learning_rate)
        self.decoder_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=learning_rate)

        self.embeddings = embeddings
        self.criterion = nn.CrossEntropyLoss()

        if use_cuda:
            self.encoder = self.encoder.cuda()
            self.decoder = self.decoder.cuda()
            self.criterion = self.criterion.cuda()

    @staticmethod
    def _as_batch(token_seqs):
        """Coerce ``token_seqs`` to a 2-D (batch, seq_len) array; a 1-D sequence becomes a batch of one."""
        batch = np.asarray(token_seqs)
        if batch.ndim == 1:
            batch = batch[np.newaxis, :]
        if batch.ndim != 2 or batch.size == 0:
            raise ValueError("token_seqs must be a non-empty (batch, seq_len) array of token ids")
        return batch

    def _zero_state(self, module, batch_size):
        """Return ``module.initHidden(batch_size)`` as a double Variable on the active device."""
        state = Variable(module.initHidden(batch_size)).double()
        return state.cuda() if use_cuda else state

    def _embed(self, token_ids):
        """Embed one token id per batch entry as a (1, batch, -1) double LSTM input."""
        token_ids = np.asarray(token_ids).reshape(-1)
        rows = np.array(self.embeddings[token_ids])
        step = Variable(torch.from_numpy(rows)).double().view(1, len(token_ids), -1)
        return step.cuda() if use_cuda else step

    def _encode(self, token_seqs):
        """Feed ``token_seqs`` to the encoder in reverse order and return its final state tuple."""
        batch_size, seq_len = token_seqs.shape
        state = (self._zero_state(self.encoder, batch_size),
                 self._zero_state(self.encoder, batch_size))
        # Walk from the last token down to index 0 inclusive; stopping at
        # index 1 would leave the first token of every sequence unencoded.
        for i in range(seq_len - 1, -1, -1):
            state = self.encoder(self._embed(token_seqs[:, i]), state)
        return state

    def _initial_decoder_state(self, token_seqs):
        """Encode ``token_seqs`` and build the decoder's starting state.

        Only the second element of the encoder state is carried over; the
        first element is replaced with zeros.
        """
        _, encoder_cell = self._encode(token_seqs)
        return (self._zero_state(self.decoder, len(token_seqs)), encoder_cell)

    def forward(self, token_seqs):
        """Run one teacher-forced training step on a batch of equal-length sequences.

        Args:
            token_seqs: integer array of shape (batch, seq_len), or a single
                1-D sequence, of token ids.

        Returns:
            The loss of this step as a Python float, divided by ``seq_len``.

        Raises:
            ValueError: if ``token_seqs`` is empty or not 1-D/2-D.
        """
        token_seqs = self._as_batch(token_seqs)
        batch_size, seq_len = token_seqs.shape
        self.encoder_optimizer.zero_grad()
        self.decoder_optimizer.zero_grad()

        hidden = self._initial_decoder_state(token_seqs)

        loss = 0
        step_tokens = [SOS_TOKEN] * batch_size
        for i in range(seq_len + 1):
            decoder_output, hidden = self.decoder(self._embed(step_tokens), hidden)

            # Target for this step: the i-th token, or EOS after the last one.
            if i < seq_len:
                step_tokens = token_seqs[:, i]
            else:
                step_tokens = [EOS_TOKEN] * batch_size
            target = Variable(torch.from_numpy(np.array(step_tokens))).long()
            target = target.cuda() if use_cuda else target
            loss += self.criterion(decoder_output, target)
            # Teacher forcing: `step_tokens` (the ground truth) is the next input.

        # NOTE(review): CrossEntropyLoss already averages over the batch, and
        # seq_len + 1 steps are summed, so the extra / batch_size here and the
        # / seq_len below rescale the value. Kept as-is so reported losses
        # stay comparable with earlier runs.
        loss = torch.sum(loss) / batch_size
        loss.backward()
        self.encoder_optimizer.step()
        self.decoder_optimizer.step()

        # `loss.data[0]` fails on 0-dim tensors in newer PyTorch; prefer
        # `.item()` and fall back for versions that lack it.
        loss_value = loss.item() if hasattr(loss, 'item') else loss.data[0]
        return loss_value / seq_len

    def map_inference(self, token_seqs, max_len=500):
        """Greedily decode ``max_len`` tokens for each encoded input sequence.

        Args:
            token_seqs: integer array of shape (batch, seq_len), or a single
                1-D sequence, of token ids to encode.
            max_len: number of decoding steps. Decoding always runs for
                exactly this many steps; it does not stop at ``EOS_TOKEN``.

        Returns:
            A list with one entry per batch element, each a list of
            ``max_len`` predicted token ids.

        Raises:
            ValueError: if ``token_seqs`` is empty or not 1-D/2-D.
        """
        token_seqs = self._as_batch(token_seqs)
        batch_size = len(token_seqs)
        hidden = self._initial_decoder_state(token_seqs)

        step_tokens = [SOS_TOKEN] * batch_size
        predictions = []
        for _ in range(max_len):
            decoder_output, hidden = self.decoder(self._embed(step_tokens), hidden)
            _, topi = decoder_output.data.topk(1)
            # Move to a CPU numpy array before indexing the embedding table;
            # indexing numpy with a (possibly CUDA) tensor is not reliable.
            step_column = topi.cpu().numpy().reshape(batch_size, 1)
            predictions.append(step_column)
            step_tokens = step_column.reshape(-1)

        if not predictions:
            return [[] for _ in range(batch_size)]
        return np.concatenate(predictions, axis=1).tolist()

In [47]:
# Autoencoder over the full token vocabulary with an `encoding_size`-wide state.
learner = Learner(
    input_size=vocabulary_size,
    hidden_size=encoding_size,
    output_size=vocabulary_size,
    learning_rate=0.01,
)
if use_cuda:
    learner = learner.cuda()

# Episode sampler configured from the project's YAML file.
eps = load_sampler_from_config("../src/config.yaml")

In [14]:
# Draw one episode from the sampler and join the arrays in its `support`
# along the first axis into a single array of token sequences.
# (The file names listed in the recorded output are presumably printed while
# the episode is sampled -- TODO confirm.)
songs = np.concatenate(eps.get_episode().support)

Jimi Hendrix hearmytrainacomin.mid
Jimi Hendrix gypsyeyes.mid
Jimi Hendrix thestarspangledbanner.mid
Jimi Hendrix electricladyland.mid
Jimi Hendrix stonefree.mid
Jimi Hendrix messagetolove.mid
Jimi Hendrix drifting.mid
Jimi Hendrix thewindcriesmary.mid
Jimi Hendrix haveyoueverbeen.mid
Jimi Hendrix upfromtheskies.mid
Jimi Hendrix spanishcastlemagicsolo(woodstock).mid


In [26]:
# Display the token array held in `songs`.
# NOTE(review): in the recorded output every visible row starts and ends with
# the same token ids -- confirm the sequences are not all copies of one song.
songs

array([[4248,  551, 4708, ...,  568,  679, 4647],
       [4248,  551, 4708, ...,  568,  679, 4647],
       [4248,  551, 4708, ...,  568,  679, 4647],
       ...,
       [4248,  551, 4708, ...,  568,  679, 4647],
       [4248,  551, 4708, ...,  568,  679, 4647],
       [4248,  551, 4708, ...,  568,  679, 4647]])

In [36]:
# Fit the learner repeatedly on the fixed `songs` batch, reporting the mean
# loss of every `print_every` epochs together with elapsed/remaining time.
print_every, total_epochs = 10, 50
print_loss_total = 0
startTime = time.time()
for epoch in range(1, total_epochs + 1):
    # The same batch is reused each epoch; no new episode is sampled here.
    loss = learner(songs)
    print_loss_total += loss

    if epoch % print_every != 0:
        continue

    print_loss_avg = print_loss_total / print_every
    print_loss_total = 0
    print('%s (%d %d%%) %.4f' % (
        timeSince(startTime, epoch / total_epochs),
        epoch,
        epoch / total_epochs * 100,
        print_loss_avg,
    ))
    # No checkpoint is written during this short run.
        

0m 53s (- 3m 32s) (10 20%) 0.4734
1m 46s (- 2m 40s) (20 40%) 0.3523
2m 41s (- 1m 47s) (30 60%) 0.3376
3m 38s (- 0m 54s) (40 80%) 0.3296
4m 32s (- 0m 0s) (50 100%) 0.3253


In [None]:
# Resume training at epoch 1001: each epoch samples a fresh episode and takes
# one training step per song in its first support entry. A checkpoint is
# written every `print_every` epochs.
print_every = 10
start_epoch = 1001
total_epochs = 2900
print_loss_total = 0
steps_since_print = 0
startTime = time.time()
for epoch in range(start_epoch, total_epochs+1):
    songs = eps.get_episode().support[0]
    for song in songs:
        # Learner.forward indexes its argument as a 2-D (batch, seq_len)
        # array, so wrap the single song into a batch of one instead of
        # passing a flat list.
        loss = learner(np.array([song]))
        print_loss_total += loss
        steps_since_print += 1

    if epoch % print_every == 0:
        # Average over the training steps actually taken; several steps run
        # per epoch, so dividing by `print_every` would report a sum.
        print_loss_avg = print_loss_total / max(steps_since_print, 1)
        print_loss_total = 0
        steps_since_print = 0
        # The timer started at `start_epoch`, so the remaining-time estimate
        # must use the fraction of *this run* that is complete.
        run_fraction = (epoch - start_epoch + 1) / (total_epochs - start_epoch + 1)
        print('%s (%d %d%%) %.4f' % (timeSince(startTime, run_fraction),
                                     epoch, epoch / total_epochs * 100, print_loss_avg))
        torch.save(learner.state_dict(), '../models/baseline_'+str(epoch))
        

In [39]:
# Persist the trained weights.
torch.save(learner.state_dict(), "../models/song.mod")

# Persist the first token sequence alongside the model.
# NOTE: despite the .mid extension this file is a pickle, not a MIDI file.
# The context manager guarantees the handle is flushed and closed.
with open("../models/song.mid", "wb") as song_file:
    pickle.dump(songs[0], song_file)

In [57]:
# Restore the saved weights and the pickled token sequence.
learner.load_state_dict(torch.load("../models/song.mod"))
learner.eval()
# NOTE: pickle.load executes arbitrary code from the file; only load files
# written by this notebook. The context manager closes the handle promptly.
with open("../models/song.mid", "rb") as song_file:
    song = pickle.load(song_file)
song = song.tolist()

# Full original sequence, for reference.
midi = loader.detokenize(np.array(song))
midi.write('orig.mid')

# The first 100 tokens serve as the prompt fed to the model.
prompt = song[:100]
midi = loader.detokenize(np.array(prompt))
midi.write('song.mid')

# Prompt followed by the model's greedy continuation (batch of one).
gen_seq = learner.map_inference(np.array([prompt]))
midi = loader.detokenize(np.array(prompt + gen_seq[0]))
midi.write('pred.mid')

In [None]:
# Checkpoints are written as '../models/baseline_<epoch>'. Build the same
# name here; `epoch` is a number, so it must be converted before it can be
# concatenated with a string.
learner.load_state_dict(torch.load('../models/baseline_'+str(epoch)))

In [None]:
# NOTE(review): `input_variables` and `original_sequences` are not defined in
# the visible part of this notebook -- confirm where they come from before
# running this cell. map_inference indexes its argument as a 2-D array of
# token ids (`token_seqs[:, i]`); presumably `test_seq` is one -- TODO confirm.
song_index = 0
test_seq = input_variables[song_index][:,1:101] # First dimension is batch
# map_inference takes only the token batch (it has no `hidden` argument) and
# returns one list of predicted ids per batch element.
out_seq = learner.map_inference(test_seq)
# Append the prediction for the first batch element to the 100-token prefix.
whole_seq = original_sequences[song_index][0:100].tolist() + out_seq[0]
midi = loader.detokenize(np.array(whole_seq))
midi.write('test.mid')