In [2]:
from loaders import *

In [3]:
# Build the MIDI loader used by every later cell. `Loader` and `MIDILoader`
# are not defined in this notebook; presumably they come from the star import
# of `loaders` above.
# NOTE(review): the meaning of the 500 argument is not visible here - confirm
# against the `loaders` module.
_loader = Loader(500)
loader = MIDILoader(_loader)

In [4]:
# Read one MIDI file and convert it to the token sequence the model works on.
# `_sequence` keeps the raw object returned by `loader.read`; `sequence` is
# the tokenized form that the detokenize round-trip cell consumes.
_sequence = loader.read('../data/bach_846.mid')
sequence = loader.tokenize(_sequence)

In [5]:
from collections import Counter
from gensim.models import Word2Vec
from random import random
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu
from torch import nn
from torch.autograd import Variable

import numpy as np
import torch
import torch.nn.functional as F
import json
import numpy as np
import matplotlib.pyplot as plt
import random

# Global device switch: every model and tensor below consults this flag.
use_cuda = torch.cuda.is_available()

# Token budget of the tokenizer plus two sentinel tokens.
# NOTE(review): the breakdown of the arithmetic (what each factor stands for)
# is not documented in this notebook - confirm against the tokenizer.
# Is the tokenizer 1 indexed?
vocabulary_size = 2*128*16 + 16*32 + 100 + 1  # 4708 + 1
vocabulary_size += 2  # two extra slots: SOS (index 4709) and EOS (index 4710)
SOS_TOKEN, EOS_TOKEN = 4709, 4710

# Width of the LSTM hidden state shared by encoder and decoder.
encoding_size = 1000

# Row i of this identity matrix is the one-hot vector of token i.
one_hot_embeddings = np.eye(vocabulary_size)



In [6]:
# Round-trip check: turn the token sequence back into a MIDI object and write
# it to disk.
# NOTE(review): a later cell writes to the same 'test.mid' path, so this file
# is overwritten when the whole notebook runs.
midi_sequence = loader.detokenize(np.array(sequence))
midi_sequence.write('test.mid')

In [7]:
class EncoderLSTM(nn.Module):
    """Double-precision LSTM encoder that exposes only its recurrent state.

    Args:
        input_size: width of each input vector fed to the LSTM.
        hidden_size: width of the LSTM hidden and cell states.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        recurrent = nn.LSTM(input_size, hidden_size).double()
        # Move the layer to the GPU when the notebook-wide flag asks for it.
        self.lstm = recurrent.cuda() if use_cuda else recurrent

    def forward(self, input, hidden_in):
        """Advance the LSTM and return the new state, dropping the per-step output."""
        # nn.LSTM returns (output, state); only the state is of interest here.
        return self.lstm(input, hidden_in)[1]

    def initHidden(self):
        """Return a zero tensor of shape (1, 1, hidden_size) in double precision.

        The caller is responsible for pairing it up into the (h, c) tuple the
        LSTM expects.
        """
        zeros = Variable(torch.zeros(1, 1, self.hidden_size)).double()
        return zeros.cuda() if use_cuda else zeros

In [8]:
class DecoderLSTM(nn.Module):
    """Double-precision LSTM decoder with a linear read-out layer.

    Args:
        input_size: width of each input vector fed to the LSTM.
        hidden_size: width of the LSTM hidden and cell states.
        output_size: width of the vector produced by the read-out layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderLSTM, self).__init__()
        self.hidden_size = hidden_size

        # Layers are created in the order lstm -> out -> project.
        recurrent = nn.LSTM(input_size, hidden_size).double()
        readout = nn.Linear(hidden_size, output_size).double()
        # Maps a 4096-wide feature vector onto the hidden size; only used by
        # initHidden.
        projection = nn.Linear(4096, self.hidden_size).double()
        if use_cuda:
            recurrent = recurrent.cuda()
            readout = readout.cuda()
            projection = projection.cuda()

        self.lstm = recurrent
        self.out = readout
        self.project = projection

    def forward(self, input, hidden):
        """Run one decoding pass.

        Returns:
            A pair ``(scores, hidden)``: the read-out scores with every
            singleton dimension squeezed away and a leading dimension of 1
            added back, and the updated LSTM state.
        """
        activated = F.relu(input)
        lstm_out, next_hidden = self.lstm(activated, hidden)
        scores = self.out(lstm_out).squeeze()
        return scores.unsqueeze(0), next_hidden

    def initHidden(self, init_size, image_features):
        """Project ``image_features`` to the hidden size and apply ReLU.

        ``init_size`` is accepted but not used by this method.
        """
        projected = F.relu(self.project(image_features))
        return projected.cuda() if use_cuda else projected

In [9]:
def train(sequence,
          encoder,
          decoder,
          encoder_optimizer,
          decoder_optimizer,
          criterion,
          embeddings=one_hot_embeddings,
          teacher_forcing_ratio=0.9):
    """Run one optimisation step of the encoder/decoder pair on one sequence.

    The encoder reads the sequence back to front (skipping position 0), and
    its final state seeds the decoder, which is then asked to predict
    positions 1..length-1 starting from the vector at position 0.

    Args:
        sequence: one-hot tensor viewed as (1, length, vocabulary_size);
            position 0 should be the SOS token.
        encoder: module exposing ``initHidden()`` and returning its new state
            when called with ``(input, state)``.
        decoder: module returning ``(scores, state)`` when called with
            ``(input, state)``.
        encoder_optimizer: optimizer stepping the encoder parameters.
        decoder_optimizer: optimizer stepping the decoder parameters.
        criterion: loss called as ``criterion(scores, target_index)``.
        embeddings: array whose row ``i`` is the input vector for token ``i``.
        teacher_forcing_ratio: probability, per step, of feeding the ground
            truth token as the next decoder input instead of the decoder's
            own top-scoring prediction.

    Returns:
        The summed loss divided by the number of decoder steps, as a float.

    Raises:
        ValueError: if the sequence has fewer than two positions, i.e. there
            is nothing for the decoder to predict.
    """
    # target_variable has (batch_size, length, vocab_size)
    # Without minibatch, this is just one sequence
    sequence_length = sequence.size()[1]
    num_steps = sequence_length - 1
    if num_steps < 1:
        # Without this guard `loss` would stay the int 0 and `.backward()`
        # would fail with an unhelpful AttributeError.
        raise ValueError('train() needs a sequence of at least two tokens')

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    # The same zero tensor serves as both h and c of the LSTM state tuple.
    initial_state = encoder.initHidden()
    encoder_hidden = (initial_state, initial_state)

    # Feed the encoder in reverse order.
    # Skip index=0 which should be SOS
    for index_control in np.arange(sequence_length - 1, 0, -1):
        encoder_input = sequence[0][index_control].view(1, 1, vocabulary_size)
        encoder_hidden = encoder(encoder_input, encoder_hidden)

    # The decoder starts from the vector at position 0 and the encoder state.
    decoder_input = sequence[0][0]
    decoder_hidden = encoder_hidden

    for index_control in range(1, sequence_length):
        decoder_input = decoder_input.view(1, 1, vocabulary_size)
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        if random.random() <= teacher_forcing_ratio:
            # Teacher forcing: next input is the ground-truth token.
            decoder_input = sequence[0][index_control].view(1, 1, vocabulary_size)
        else:
            # Free running: next input is the decoder's own best guess.
            _, topi = decoder_output.data.topk(1)
            predicted_control_index = int(topi)
            decoder_input = torch.from_numpy(embeddings[predicted_control_index])
            # Cast to double so custom embeddings match the model's dtype.
            decoder_input = Variable(decoder_input).double()
            if use_cuda:
                decoder_input = decoder_input.cuda()

        # CrossEntropyLoss takes input1: (N, C) and input2: (N), so recover
        # the class index from the one-hot target row.
        _, actual_control_index = sequence[0][index_control].topk(1)
        if use_cuda:
            actual_control_index = actual_control_index.cuda()

        # Compare current output to next "target" input
        loss += criterion(decoder_output, actual_control_index)

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    # `.item()` extracts the Python float; indexing a 0-dim loss with
    # `loss.data[0]` raises on current PyTorch releases.
    return loss.item() / num_steps

In [10]:
# Re-create the loader so this cell can be run without the setup cell above.
_loader = Loader(500)
loader = MIDILoader(_loader)

# Initialize the encoder with a hidden size of 1000 (encoding_size).
# With one-hot inputs, the input size is the vocabulary size; the decoder
# also emits one score per vocabulary entry.
encoder = EncoderLSTM(vocabulary_size, encoding_size)
decoder = DecoderLSTM(vocabulary_size, encoding_size, vocabulary_size)

In [11]:
# input_files = ['bach_846.mid', 'mz_311_1.mid', 'rac_op3_2.mid']
input_files = ['bach_846.mid']
# Each optimizer must own its own model's parameters. Building the encoder
# optimizer from decoder.parameters() would leave the encoder untrained.
encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=0.001)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# One pre-built one-hot tensor per input file, so the expensive conversion
# happens once rather than every epoch.
input_variables = []

for index, input_file in enumerate(input_files):
    sequence = loader.read('../data/' + input_file)
    sequence = loader.tokenize(sequence)
    # Frame the piece with the sentinel tokens the training step expects.
    sequence = [SOS_TOKEN] + sequence + [EOS_TOKEN]
    seq_length = len(sequence)
    sequence = torch.from_numpy(np.array(one_hot_embeddings[sequence])) # This is really time consuming
    sequence = sequence.view(1, seq_length, vocabulary_size)
    sequence = Variable(sequence)
    if use_cuda:
        sequence = sequence.cuda()
    input_variables.append(sequence)

for epoch in range(10):
    for index, input_file in enumerate(input_files):
        sequence = input_variables[index]
        train(sequence, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

KeyboardInterrupt: 

In [12]:
def map_inference(sequence, encoder, decoder, embeddings=one_hot_embeddings, max_length=1000):
    """Greedily decode a token sequence conditioned on ``sequence``.

    The encoder reads ``sequence`` back to front (skipping position 0); its
    final state seeds the decoder, which starts from the vector at position 0
    and repeatedly feeds its own top-scoring token back in.

    Args:
        sequence: one-hot tensor viewed as (1, length, vocabulary_size);
            position 0 is used as the first decoder input.
        encoder: module exposing ``initHidden()`` and returning its new state
            when called with ``(input, state)``.
        decoder: module returning ``(scores, state)`` when called with
            ``(input, state)``.
        embeddings: array whose row ``i`` is the input vector for token ``i``.
        max_length: upper bound on the number of generated tokens; a value
            of 0 or less yields an empty list.

    Returns:
        List of predicted token indices, ending before the first EOS_TOKEN
        or after ``max_length`` tokens, whichever comes first.
    """
    output_control_sequence = []

    # Encoder: the same zero tensor serves as both h and c of the state tuple.
    initial_state = encoder.initHidden()
    encoder_hidden = (initial_state, initial_state)

    sequence_length = sequence.size()[1]

    for index_control in np.arange(sequence_length - 1, 0, -1):
        encoder_input = sequence[0][index_control].view(1, 1, vocabulary_size)
        encoder_hidden = encoder(encoder_input, encoder_hidden)

    # At this point we have the last encoder state; feed it into the decoder.
    decoder_hidden = encoder_hidden
    decoder_input = sequence[0][0]

    # Checking the cap before each step (rather than after appending) means a
    # non-positive max_length generates nothing instead of one stray token.
    while len(output_control_sequence) < max_length:
        decoder_input = decoder_input.view(1, 1, vocabulary_size)
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)

        # MAP inference: take the single highest-scoring token.
        _, topi = decoder_output.data.topk(1)
        predicted_control_index = int(topi)
        if predicted_control_index == EOS_TOKEN:
            break
        output_control_sequence.append(predicted_control_index)

        # The prediction becomes the next decoder input.
        decoder_input = torch.from_numpy(embeddings[predicted_control_index])
        decoder_input = Variable(decoder_input).double()
        if use_cuda:
            decoder_input = decoder_input.cuda()

    return output_control_sequence

In [13]:
sequence = loader.read('../data/bach_846.mid')
sequence = loader.tokenize(sequence)
# Use the first 500 tokens of the piece as the prompt.
sequence = sequence[0:500]

# map_inference skips position 0 in the encoder and uses it as the first
# decoder input, so position 0 must be SOS. Without it the first prompt token
# is dropped from the encoding and the decoder starts from a note.
# NOTE(review): training sequences also end with EOS_TOKEN; none is appended
# here because the prompt is a truncated piece - confirm that is intended.
model_input = [SOS_TOKEN] + list(sequence)
seq_length = len(model_input)
sequence_var = torch.from_numpy(np.array(one_hot_embeddings[model_input])) # This is really time consuming
sequence_var = sequence_var.view(1, seq_length, vocabulary_size)
sequence_var = Variable(sequence_var)
if use_cuda:
    sequence_var = sequence_var.cuda()

generated_sequence = map_inference(sequence_var, encoder, decoder)
# The written file is the raw prompt (no sentinel tokens) followed by the
# generated tokens.
whole_sequence = sequence + generated_sequence 
midi_sequence = loader.detokenize(np.array(whole_sequence))
midi_sequence.write('test.mid')