In [1]:
import numpy as np 
import pandas as pd 
from io import open
import tensorflow as tf
import glob
import pickle
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import time
import copy

In [2]:
!pip install music21

Collecting music21
  Downloading music21-6.7.1.tar.gz (19.2 MB)
[K     |████████████████████████████████| 19.2 MB 7.9 MB/s eta 0:00:01
Collecting webcolors
  Downloading webcolors-1.11.1-py3-none-any.whl (9.9 kB)
Building wheels for collected packages: music21
  Building wheel for music21 (setup.py) ... [?25ldone
[?25h  Created wheel for music21: filename=music21-6.7.1-py3-none-any.whl size=21941692 sha256=24cff1a309acafa68e8c0df8e06f23d017f13ba894e506e7c60f8f0d542126bf
  Stored in directory: /root/.cache/pip/wheels/72/44/61/90e4e65262ca1b4d9f707527b540729ce3f64e00fc6b38d54c
Successfully built music21
Installing collected packages: webcolors, music21
Successfully installed music21-6.7.1 webcolors-1.11.1


In [3]:
from music21 import converter, instrument, note, chord, stream

In [4]:
def preprocess_input(filename, folder=False):
    # master list of notes
    notes = []
    
    # converting folders with multiple MIDI files
    if folder == True:
        assert os.path.exists('../input/classical-music-midi/'+filename)
        for file in glob.glob('../input/classical-music-midi/'+filename+'/*.mid'):
            notes_per_piece = []
            # read the MIDI file
            midi = converter.parse(file)            
            
            try: # file has instrument parts
                s2 = instrument.partitionByInstrument(midi)
                notes_to_parse = s2.parts[1].recurse() 
            except: # file has notes in a flat structure
                notes_to_parse = midi.flat.notes
            
#             print(notes_to_parse)
            for element in notes_to_parse:
                if isinstance(element, note.Note):
                    notes_per_piece.append(str(element.pitch))
                elif isinstance(element, chord.Chord):
                    notes_per_piece.append('.'.join(str(n) for n in element.normalOrder))
            notes.append(notes_per_piece)
    else:
        assert os.path.exists(filename)
        midi = converter.parse(filename)
        try: # file has instrument parts
            s2 = instrument.partitionByInstrument(midi)
            notes_to_parse = s2.parts[1].recurse() 
        except: # file has notes in a flat structure
            notes_to_parse = midi.flat.notes
#         print(notes_to_parse)
        for element in notes_to_parse:
            if isinstance(element, note.Note):
                notes.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                notes.append('.'.join(str(n) for n in element.normalOrder))
        

    with open('./notes', 'wb') as filepath:
        pickle.dump(notes, filepath)
#     print(notes)
    return notes

In [5]:
notes = preprocess_input('beeth', folder=True)

In [22]:
len(notes[-5:])

5

In [6]:
class Dictionary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = []

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.idx2word)
    
    def words(self):
        return self.idx2word


class Corpus(object):
    def __init__(self, path):
        self.dictionary = Dictionary()
        self.train = self.tokenize(notes[:-12])
        self.valid = self.tokenize(notes[-12:-6])
        self.test = self.tokenize(notes[-6:])

    def tokenize(self, notes):
        """Tokenizes a note sequence"""
        assert len(notes) > 0
        
        # Add notes to the dictionary
        for song in notes:
            for note in song:
                self.dictionary.add_word(note)
#         # Add words to the dictionary
#         with open(path, 'r', encoding="utf8") as f:
#             for line in f:
#                 words = line.split() + ['<eos>']
#                 for word in words:
#                     self.dictionary.add_word(word)

        # Tokenize file content
        idss = []
        
        for song in notes:
            ids = []
            for note in song:
#                 print(note)
                ids.append(self.dictionary.word2idx[note])
        idss.append(torch.tensor(ids).type(torch.int64))
        ids = torch.cat(idss)
            
#         with open(path, 'r', encoding="utf8") as f:
#             idss = []
#             for line in f:
#                 words = line.split() + ['<eos>']
#                 ids = []
#                 for word in words:
#                     ids.append(self.dictionary.word2idx[word])
#                 idss.append(torch.tensor(ids).type(torch.int64))
#             ids = torch.cat(idss)

        return ids

In [7]:
corpus = Corpus('../input/classical-music-midi/beeth')

In [8]:
device = 'cuda'
def batchify(data, bsz):
    # Work out how cleanly we can divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

eval_batch_size = 10
train_data = batchify(corpus.train, eval_batch_size)
val_data = batchify(corpus.valid, eval_batch_size)
test_data = batchify(corpus.test, eval_batch_size)

In [9]:
# EMBEDDING ENCODES THE MEANING OF THE WORDS
class Embedder(nn.Module):
    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)   # the embedding module
    def forward(self, x):
        return self.embed(x)

In [10]:
class PositionalEncoding(nn.Module):
    r"""Inject some information about the relative or absolute position of the tokens
        in the sequence. The positional encodings have the same dimension as
        the embeddings, so that the two can be summed. Here, we use sine and cosine
        functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = sin(pos/10000^(2i/d_model))
        \text{PosEncoder}(pos, 2i+1) = cos(pos/10000^(2i/d_model))
        \text{where pos is the word position and i is the embed idx)
    Args:
        d_model: the embed dim (required).
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model

        # PE is the Positional Encoding matrix 
        # THIS STORES THE POSITIONS OF THE SEQUENCE
        pe = torch.zeros(max_len, d_model)
        
        # Arange - RETURNS A RANGE BETWEEN VALUES, HERE IT IS 0 - max_len
        # unsqueeze - adds a dimension, 1 means that each element in the first list is now in a list
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # division term, here it is (10000 ** ((2 * i)/d_model))
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # calculating the position encoding for the even and odd terms
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Unsqueeze 0 will put PE in one list
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """
        # make embeddings relatively larger
        # This is so we do not lose the importance of the embedding
        x = x * math.sqrt(self.d_model)
        # we add the embedding to the PE 
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [53]:
class TransformerModel(nn.Module):
    """Container module with an encoder, a recurrent or transformer module, and a decoder."""

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        try:
            from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer
        except:
            raise ImportError('TransformerEncoder module does not exist in PyTorch 1.1 or lower.')
        self.model_type = 'Transformer'
        # original mask
        self.src_mask = None
        # positional encoding
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        # encoder
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder.eval()
        # decoder
        decoder_layers = TransformerDecoderLayer(ninp, nhead, nhid, dropout)
        self.decoder = TransformerDecoder(decoder_layers, nlayers)
        self.decoder.eval()
        # embedding encoding
        self.embedding = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
#         self.decoder = nn.Linear(ninp, ntoken)

        # classification layer
        self.classification_layer = nn.Linear(ninp, ntoken)

        self.init_weights()

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask

    def init_weights(self):
        initrange = 0.1
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        nn.init.zeros_(self.classification_layer.weight)
        nn.init.uniform_(self.classification_layer.weight, -initrange, initrange)

    def forward(self, src, has_mask=True):
        if has_mask:
            device = src.device
            if self.src_mask is None or self.src_mask.size(0) != len(src):
                mask = self._generate_square_subsequent_mask(len(src)).to(device)
                self.src_mask = mask
        else:
            self.src_mask = None

#         src = self.embedding(src) * math.sqrt(self.ninp)
        src_embedding = self.embedding(src)
        src_embedding = self.pos_encoder(src_embedding)
        output = self.encoder(src_embedding, self.src_mask)
        output = self.decoder(output, self.embedding(src))
        
        output = self.classification_layer(output) # projection to vocab size

        return F.log_softmax(output, dim=-1)


In [54]:
ntokens = len(corpus.dictionary)
emsize = 400
nhead = 20
nhid = 500
nlayers = 4
dropout = 0.2
# Loop over epochs.
lr = 5
best_val_loss = None
epochs = 400
save = './model.pt'

In [55]:
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

In [56]:
criterion = nn.NLLLoss()

In [57]:
def repackage_hidden(h):
    """Wraps hidden states in new Tensors, to detach them from their history."""

    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)

In [58]:
# get_batch subdivides the source data into chunks of length args.bptt.
# If source is equal to the example output of the batchify function, with
# a bptt-limit of 2, we'd get the following two Variables for i = 0:
# ┌ a g m s ┐ ┌ b h n t ┐
# └ b h n t ┘ └ c i o u ┘
# Note that despite the name of the function, the subdivison of data is not
# done along the batch dimension (i.e. dimension 1), since that was handled
# by the batchify function. The chunks are along dimension 0, corresponding
# to the seq_len dimension in the LSTM.
seq_length = 40
def get_batch(source, i):
    seq_len = min(seq_length, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].view(-1)
    return data, target

In [59]:
def evaluate(data_source):
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    ntokens = len(corpus.dictionary)

    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, seq_length):
            data, targets = get_batch(data_source, i)
            
            output = model(data)
            output = output.view(-1, ntokens)
            
            total_loss += len(data) * criterion(output, targets).item()
    return total_loss / (len(data_source) - 1)

In [60]:
def train():
    # Turn on training mode which enables dropout.
    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    
    for batch, i in enumerate(range(0, train_data.size(0) - 1, seq_length)):
        data, targets = get_batch(train_data, i)
        # Starting each batch, we detach the hidden state from how it was previously produced.
        # If we didn't, the model would try backpropagating all the way to start of the dataset.
        model.zero_grad()
        
        output = model(data)
        output = output.view(-1, ntokens)
     
        loss = criterion(output, targets)
        loss.backward()
        
        clip = 0.25
        log_interval = 200
        
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        for p in model.parameters():
            p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                    'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_data) // seq_length, lr,
                elapsed * 1000 / log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
#         if dry_run:
#             break

In [61]:
# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in range(1, epochs+1):
        epoch_start_time = time.time()
        train()
        val_loss = evaluate(val_data)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                           val_loss, math.exp(val_loss)))
        print('-' * 89)
        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            with open(save, 'wb') as f:
                torch.save(model, f)
            best_val_loss = val_loss
        else:
            # Anneal the learning rate if no improvement has been seen in the validation dataset.
            lr /= 4.0
except KeyboardInterrupt:
    print('-' * 89)
    print('Exiting from training early')

-----------------------------------------------------------------------------------------
| end of epoch   1 | time:  0.49s | valid loss  8.81 | valid ppl  6728.48
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   2 | time:  0.45s | valid loss 10.79 | valid ppl 48760.42
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   3 | time:  0.44s | valid loss  4.53 | valid ppl    92.82
-----------------------------------------------------------------------------------------
-----------------------------------------------------------------------------------------
| end of epoch   4 | time:  0.44s | valid loss  4.41 | valid ppl    82.32
--------------------------------------------------------------------------

In [74]:
n_generate = 500
temperature = 2.0
sequence = []
log_interval = 50 # interval between logs

In [75]:
model.eval()

input = torch.randint(ntokens, (1, 1), dtype=torch.long).to(device)

with open('./output', 'w') as outf:
    with torch.no_grad():  # no tracking history
        for i in range(n_generate):
            
            output = model(input, False)
            word_weights = output[-1].squeeze().div(temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            word_tensor = torch.Tensor([[word_idx]]).long().to(device)
            input = torch.cat([input, word_tensor], 0)


            word = corpus.dictionary.idx2word[word_idx]

            outf.write(word + ('\n' if i % 20 == 19 else ' '))
            
            sequence.append(word)

            if i % log_interval == 0:
                print('| Generated {}/{} notes'.format(i, n_generate))

| Generated 0/500 notes
| Generated 50/500 notes
| Generated 100/500 notes
| Generated 150/500 notes
| Generated 200/500 notes
| Generated 250/500 notes
| Generated 300/500 notes
| Generated 350/500 notes
| Generated 400/500 notes
| Generated 450/500 notes


In [76]:
# Creating a MIDI file from prediction
def create_MIDI(gen, name=""):
    
    # the offset is the time difference between notes, we assume its 0.5 here
    offset = 0
    music = []
    
    for seq in gen:
        # chords are seperated by .
        if ('.' in seq) or seq.isdigit():
            chordnotes = seq.split('.')
            n = []
            
            for cur_note in chordnotes:
                new_n = note.Note(int(cur_note))
                new_n.storedInstrument = instrument.Piano() # single piano instrument only
                n.append(new_n)
            
            new_c = chord.Chord(n)
            new_c.offset = offset
            music.append(new_c)
        
        else:            
            new_n = note.Note(seq)
            new_n.storedInstrument = instrument.Piano() # single piano instrument only
            new_n.offset = offset
            music.append(new_n)
        
        offset += 0.5
    
    print(notes)
    # producing a MIDI stream
    midi = stream.Stream(music)
    
    midi.write('midi', fp='new_music_'+name+'.mid')

In [77]:
create_MIDI(sequence, name='beeth_all_diverse')

[['D4', 'E-4', 'F4', 'B-2', 'B-3', 'D4', 'B-3', 'A4', 'G4', 'E-4', 'F4', 'G4', 'C4', 'A4', 'B-4', 'D4', 'C#4', 'F4', 'D4', 'B-3', 'A4', 'G4', 'E-4', 'F4', 'G4', 'C4', 'A4', 'B-4', 'D4', 'C#4', 'A4', 'B-4', 'D4', 'A3', 'C5', 'D5', 'B-3', 'G3', 'E5', '4.10', 'F3', '5.9', 'F4', 'F5', 'E-5', 'D5', 'E-5', 'F5', 'G5', 'F3', 'C5', 'A3', 'C4', 'D5', 'C5', 'E-4', 'B4', 'C5', 'D5', 'E-5', 'F3', 'A4', 'C4', 'E-4', 'B-4', 'A4', 'F#4', 'G4', 'A4', 'B-4', 'D5', 'C5', 'B4', 'C5', 'D5', 'F5', 'E-5', '5', 'D5', 'E-5', 'F5', 'G5', '3.5', 'A4', 'C5', 'B-3', 'E-4', 'B-4', 'D4', 'D4', 'E-4', 'F4', 'B-2', 'B-3', 'D4', 'B-3', 'A4', 'G4', 'E-4', 'F4', 'G4', 'C4', 'A4', 'B-4', 'D4', 'C#4', 'F4', 'D4', 'B-3', 'A4', 'G4', 'E-4', 'F4', 'G4', 'C4', 'A4', 'B-4', 'D4', 'C#4', 'A4', 'B-4', 'D4', 'A3', 'C5', 'D5', 'B-3', 'G3', 'E5', '4.10', 'F3', '5.9', 'F4', 'F5', 'E-5', 'D5', 'E-5', 'F5', 'G5', 'F3', 'C5', 'A3', 'C4', 'D5', 'C5', 'E-4', 'B4', 'C5', 'D5', 'E-5', 'F3', 'A4', 'C4', 'E-4', 'B-4', 'A4', 'F#4', 'G4', 'A4'