**Music Melody-Conditional Multitrack Generation**

In [None]:
import os
import shutil
import glob
import numpy as np 
import pandas as pd
import pretty_midi
import pypianoroll
import tables
from music21 import converter, instrument, note, chord, stream
import music21
import librosa
import librosa.display
import matplotlib.pyplot as plt
from keras.utils import np_utils
import json
import IPython.display
from datetime import datetime
import random

import torch
import torch.nn as nn
from torch.nn import functional as F
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import random
import itertools
# Project folder, given relative to the working directory, and the directory
# holding the LPD-5 "cleansed" pianoroll files beneath it.
root_dir = 'drive/MyDrive/ProjectMusic'
data_dir = root_dir + '/Lakh Piano Dataset/LPD-5/lpd_5/lpd_5_cleansed'

In [None]:
# Install the system packages (FluidSynth, a General MIDI soundfont, build tools,
# ALSA/JACK headers) and then the Python MIDI / pianoroll libraries.
# NOTE(review): this cell sits after the import cell, so on a fresh runtime it
# has to be executed before those imports can succeed.
!apt-get update -qq && apt-get install -qq libfluidsynth1 fluid-soundfont-gm build-essential libasound2-dev libjack-dev
!pip install -qU pyfluidsynth pretty_midi
!pip install music21
!pip install pypianoroll

In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

**Getting MIDI and Song Metadata**

In [None]:
# Metadata folder inside the dataset directory; msd_id_to_h5 resolves h5 paths against it.
RESULTS_PATH = os.path.join(root_dir, 'Lakh Piano Dataset', 'Metadata')

# Utility functions for retrieving paths
def msd_id_to_dirs(msd_id):
    """Build the nested relative path for an MSD ID.

    The characters at indices 2, 3 and 4 of the ID each become one directory
    level, followed by the full ID itself,
    e.g. TRABCD12345678 -> A/B/C/TRABCD12345678.
    """
    level_1, level_2, level_3 = msd_id[2], msd_id[3], msd_id[4]
    return os.path.join(level_1, level_2, level_3, msd_id)


def msd_id_to_h5(msd_id):
    """Return the path of the h5 file for an MSD ID.

    The file lives under RESULTS_PATH/lmd_matched_h5, in the nested directory
    produced by msd_id_to_dirs, with an '.h5' extension appended.
    """
    relative_h5 = msd_id_to_dirs(msd_id) + '.h5'
    return os.path.join(RESULTS_PATH, 'lmd_matched_h5', relative_h5)

# Locate the midi npz file inside the LMD cleansed folder
def get_midi_npz_path(msd_id, midi_md5):
    """Return the path of the '.npz' file named `midi_md5` for song `msd_id`.

    The file is looked up below `data_dir`, inside the nested directory that
    msd_id_to_dirs derives from the MSD ID.
    """
    song_dir = msd_id_to_dirs(msd_id)
    npz_name = midi_md5 + '.npz'
    return os.path.join(data_dir, song_dir, npz_name)

In [None]:
# Open the cleansed ids file: column 0 holds the cleansed (LPD) file ids and
# column 1 the matching MSD ids.
# The columns are separated by four spaces. pandas treats a multi-character
# separator as a regular expression, which only the Python parsing engine
# supports, so that engine is requested explicitly instead of relying on the
# automatic fallback (which emits a ParserWarning).
cleansed_ids = pd.read_csv(
    os.path.join(root_dir, 'Lakh Piano Dataset', 'cleansed_ids.txt'),
    sep = '    ',
    header = None,
    engine = 'python',
)
# Lookup tables in both directions.
lpd_to_msd_ids = dict(zip(cleansed_ids[0], cleansed_ids[1]))
msd_to_lpd_ids = dict(zip(cleansed_ids[1], cleansed_ids[0]))

In [None]:
# Reading the genre annotations
genre_file_dir = os.path.join(root_dir, 'Lakh Piano Dataset', 'Genre', 'msd_tagtraum_cd1.cls')
ids = []
genres = []
with open(genre_file_dir) as f:
    for line in f:
        # Lines starting with '#' are skipped.
        if line.startswith('#'):
            continue
        fields = line.strip().split("\t")
        # A usable row is a track id followed by one or two genre labels;
        # rows with any other number of tab-separated fields are ignored.
        if len(fields) in (2, 3):
            for label in fields[1:]:
                ids.append(fields[0])
                genres.append(label)

# One row per (track, genre) pair.
genre_df = pd.DataFrame(data={"TrackID": ids, "Genre": genres})

# Track id -> list of all genre labels recorded for that track.
genre_dict = (
    genre_df.groupby('TrackID')['Genre']
    .apply(lambda labels: labels.tolist())
    .to_dict()
)

**Objects that we need**

- cleansed_ids: dictionary of LPD file name : MSD file name
- lmd_metadata: list of dictionaries - each dict has a msd_id field that identifies the song
- Get the lmd_file_name (actual path)

In [None]:
# Load the processed metadata (a JSON list with one record per song)
metadata_path = os.path.join(root_dir, 'Lakh Piano Dataset', 'processed_metadata.json')
with open(metadata_path, 'r') as infile:
  metadata_records = json.load(infile)

# Re-key the records by their 'msd_id' field: MSD_ID -> metadata record
lmd_metadata = {record['msd_id']: record for record in metadata_records}

In [None]:
# MSD IDs of every song whose 'artist_terms' entry contains 'rock'.
# (The variable keeps its original "metal" name so that later cells referring
# to it continue to work.)
metal_song_msd_ids = [k for k, v in lmd_metadata.items() if 'rock' in v['artist_terms']]

# Draw up to 2000 candidate songs; the loading loop below keeps the first 1000
# usable ones. random.sample draws WITHOUT replacement, so no song appears
# twice (random.choices could repeat a song, letting the same song land in
# both a training slice and a test slice later on).
train_ids = random.sample(metal_song_msd_ids, k = min(2000, len(metal_song_msd_ids)))

In [None]:
# Load up to MAX_SONGS songs and stack their five tracks into one tensor each.
# Track name stored in the npz file -> key used in `parts`.
TRACK_NAME_TO_PART = {
  'Piano': 'piano_part',
  'Guitar': 'guitar_part',
  'Bass': 'bass_part',
  'Strings': 'strings_part',
  'Drums': 'drums_part',
}
# Order in which the tracks are stacked: Piano, Guitar, Bass, Strings, Drums
PART_ORDER = ['piano_part', 'guitar_part', 'bass_part', 'strings_part', 'drums_part']
MAX_SONGS = 1000

combined_pianorolls = []
i = 0  # number of songs kept so far
for msd_file_name in train_ids:

  lpd_file_name = msd_to_lpd_ids[msd_file_name]
  # Get the NPZ path
  npz_path = get_midi_npz_path(msd_file_name, lpd_file_name)
  multitrack = pypianoroll.load(npz_path)
  multitrack.set_resolution(2).pad_to_same()

  # Collect the pianoroll of each expected track; a part stays None when the
  # file has no track with that name.
  parts = dict.fromkeys(PART_ORDER)
  for track in multitrack.tracks:
    part_key = TRACK_NAME_TO_PART.get(track.name)
    if part_key is not None:
      parts[part_key] = track.pianoroll

  # Only songs in which all five tracks are present and have at least one time
  # step are kept. Checking for None first avoids the AttributeError that
  # `None.shape` would raise for a missing track.
  if any(roll is None or roll.shape[0] == 0 for roll in parts.values()):
    continue

  # Stack all together - Piano, Guitar, Bass, Strings, Drums.
  # The values still contain velocity information (the force with which the
  # notes are hit); they are normalised in a later cell.
  combined_pianoroll = torch.from_numpy(np.stack([parts[key] for key in PART_ORDER]))
  combined_pianorolls.append(combined_pianoroll)
  i += 1
  print(i)

  if i == MAX_SONGS:
    break

In [None]:
# Record each song's size along dimension 1 before the songs are joined into
# one tensor, so the individual songs can be recovered later.
pianoroll_lengths = []
for song_roll in combined_pianorolls:
  pianoroll_lengths.append(song_roll.shape[1])
combined_pianorolls = torch.hstack(combined_pianorolls)

In [None]:
# Persist the joined pianorolls and the per-song lengths.
# The file names are the ones the loading cell reads back
# ('rock_1000_pianorolls_res2*.pt'); the previous 'metal_1000_*' names were
# never loaded anywhere, so a top-to-bottom run used a file this notebook did
# not produce.
# NOTE(review): running this cell overwrites any existing files of that name.
torch.save(combined_pianorolls, os.path.join(root_dir, 'Lakh Piano Dataset', 'rock_1000_pianorolls_res2.pt'))
pianoroll_lengths = torch.tensor(pianoroll_lengths)
torch.save(pianoroll_lengths, os.path.join(root_dir, 'Lakh Piano Dataset', 'rock_1000_pianorolls_res2_lengths.pt'))

In [None]:
# Loading
dataset_dir = os.path.join(root_dir, 'Lakh Piano Dataset')
combined_pianorolls = torch.load(os.path.join(dataset_dir, 'rock_1000_pianorolls_res2.pt'))
# Per-song lengths as a numpy array, plus their running total, which gives
# the end offset of every song inside the joined tensor.
pianoroll_lengths = torch.load(os.path.join(dataset_dir, 'rock_1000_pianorolls_res2_lengths.pt')).numpy()
pianoroll_cum_lengths = np.cumsum(pianoroll_lengths)

In [None]:
# Normalize by the constant 127
combined_pianorolls = combined_pianorolls / 127.0

# ### Getting the number of notes played in that time step
# # Number of notes per time step per track
# notes_per_time_step = (combined_pianorolls > 0).type(torch.float32).sum(axis = 2)
# # Censor those with more than 10 notes to be 10
# notes_per_time_step[notes_per_time_step > 10] = 10
# # Normalize to be between [0, 4] - very important to get right
# notes_per_time_step = notes_per_time_step / 2
# notes_per_time_step = notes_per_time_step.unsqueeze(2)
# # Concatenate the number vector
# combined_pianorolls = torch.cat((combined_pianorolls, notes_per_time_step), dim = 2)

# Split the joined tensor back into songs, trimming each song to the largest
# multiple of 32 time steps that fits.
# A song ends at its cumulative length and starts where the previous one ended.
song_ends = pianoroll_cum_lengths
song_starts = np.concatenate(([0], song_ends[:-1]))

pianorolls_list = []
for song_start, song_end in zip(song_starts, song_ends):
  song_length = song_end - song_start
  usable_length = song_length - (song_length % 32)
  pianorolls_list.append(combined_pianorolls[:, song_start:(song_start + usable_length), :])

# Join the trimmed songs again
combined_pianorolls = torch.hstack(pianorolls_list)

**Creating Music Dataset**

In [None]:
# Creating dataset and dataloader
from torch.utils.data import Dataset, DataLoader

In [None]:
# Dataset which cuts one track into consecutive, non-overlapping 32-step windows
class CombinedDataset(Dataset):
  """Fixed-size windows of a single track.

  `pianorolls` is indexed as [track, time, :]. Item `index` is the slice of
  track `instrument_id` covering time steps [32 * index, 32 * (index + 1)).
  The dataset length is the number of complete 32-step windows.
  """

  def __init__(self, pianorolls, instrument_id):
    self.data = pianorolls
    self.instrument_id = instrument_id
    self.length = pianorolls.size(1) // 32

  def __len__(self):
    return self.length

  def __getitem__(self, index):
    window_start = index * 32
    return self.data[self.instrument_id, window_start:(window_start + 32), :]

In [None]:
# Generation Dataset - random (input, target) windows over all tracks,
# preferring windows in which none of the five tracks is silent
class GenerationDataset(Dataset):
  """Randomly sampled pairs of consecutive windows covering every track.

  Each song in `list_of_sequences` is indexed as [track, time, :]. Items are
  drawn at random on every access; the `index` argument is ignored and
  `dataset_length` only determines what `len()` reports.

  Returns `(train_sequence, target_sequence)`: all tracks for time steps
  [start, start + seq_length) and [start + seq_length, start + 2 * seq_length),
  where `start` is rounded down to a multiple of 32.

  A window in which any of tracks 0-4 sums to zero is rejected and redrawn,
  except that a rejected window is still accepted with probability 0.1, so a
  minority of samples do contain silent tracks.

  Raises:
    ValueError: if no song has at least 2 * seq_length + 2 time steps.
  """
  def __init__(self, list_of_sequences, dataset_length = 32 * 10000, seq_length = 50):

    # Don't normalize anymore since it was done earlier
    self.data = list_of_sequences
    self.n_songs = len(list_of_sequences)
    self.seq_length = seq_length
    self.length = dataset_length

    # Only songs long enough for an input window plus a target window can be
    # sampled; for a shorter song the start-time range would be empty and
    # random.randint would raise in the middle of an epoch.
    min_song_length = seq_length * 2 + 2
    self.valid_song_ids = [song_id for song_id, song in enumerate(list_of_sequences)
                           if song.size()[1] >= min_song_length]
    if not self.valid_song_ids:
      raise ValueError('No song has at least {} time steps (2 * seq_length + 2)'.format(min_song_length))

  def __getitem__(self, index):

    while True:
      # Choose a random song among those that are long enough
      song_id = self.valid_song_ids[random.randint(0, len(self.valid_song_ids) - 1)]
      song = self.data[song_id]
      # Choose a random start window and align it down to a multiple of 32
      start_time = random.randint(0, song.size()[1] - self.seq_length * 2 - 2)
      start_time = start_time - (start_time % 32)
      window_end = start_time + self.seq_length

      # Piano, guitar, bass, strings, drums: accept when every track has
      # some non-zero entry in the input window ...
      track_sums = [song[track, start_time:window_end, :].sum() for track in range(5)]
      if all(track_sum != 0 for track_sum in track_sums):
        break
      # ... otherwise still accept 10% of the time.
      if random.random() < 0.1:
        break

    train_sequence = song[:, start_time:window_end, :]
    target_sequence = song[:, window_end:(start_time + self.seq_length * 2), :]
    return train_sequence, target_sequence

  def __len__(self):
    return self.length

In [None]:
# Melody prediction dataset - predict the next melody given the current melody
class MelodyDataset(Dataset):
  """Randomly sampled pairs of consecutive windows of track 0 (piano).

  Each song in `list_of_sequences` is indexed as [track, time, :]. Items are
  drawn at random on every access; the `index` argument is ignored and
  `dataset_length` only determines what `len()` reports.

  Returns `(train_sequence, target_sequence)`: track 0 for time steps
  [start, start + seq_length) and [start + seq_length, start + 2 * seq_length).

  Raises:
    ValueError: if no song has at least 2 * seq_length + 2 time steps.
  """
  def __init__(self, list_of_sequences, dataset_length = 32 * 10000, seq_length = 50):

    # Don't normalize anymore since it was done earlier
    self.data = list_of_sequences
    self.n_songs = len(list_of_sequences)
    self.seq_length = seq_length
    self.length = dataset_length

    # Songs shorter than two windows (plus the 2-step margin used when
    # drawing the start) would make random.randint raise on an empty range,
    # so they are excluded from sampling.
    min_song_length = seq_length * 2 + 2
    self.valid_song_ids = [song_id for song_id, song in enumerate(list_of_sequences)
                           if song.size()[1] >= min_song_length]
    if not self.valid_song_ids:
      raise ValueError('No song has at least {} time steps (2 * seq_length + 2)'.format(min_song_length))

  def __getitem__(self, index):
    # Choose a random song among those that are long enough
    song_id = self.valid_song_ids[random.randint(0, len(self.valid_song_ids) - 1)]
    song = self.data[song_id]
    # Choose a random start window
    start_time = random.randint(0, song.size()[1] - self.seq_length * 2 - 2)
    middle = start_time + self.seq_length
    # train_sequence: piano track over the first window
    train_sequence = song[0, start_time:middle, :]
    # target_sequence: piano track over the window that follows
    target_sequence = song[0, middle:(start_time + self.seq_length * 2), :]
    return train_sequence, target_sequence

  def __len__(self):
    return self.length

# Melody-conditional dataset NEW - returns the current melody, the previous
# harmony and the current harmony of one accompanying instrument; prefers
# samples in which all three are non-empty
class ConditionalDataset(Dataset):
  """Randomly sampled (melody, previous harmony, current harmony) triples.

  Each song in `list_of_sequences` is indexed as [track, time, :], with the
  piano on track 0 and the accompanying instruments on the tracks listed in
  `TRACK_IDS`. Items are drawn at random on every access; the `index`
  argument is ignored and `dataset_length` only determines `len()`.

  Returns `(piano_sequence, past_sequence, target_sequence)`:
    piano_sequence  - track 0 over [start + seq_length, start + 2 * seq_length)
    past_sequence   - instrument track over [start, start + seq_length)
    target_sequence - instrument track over the same window as piano_sequence

  A triple in which any of the three sums to zero is rejected and redrawn,
  except that a rejected triple is still accepted with probability 0.1.

  Raises:
    ValueError: if `instrument` is not a key of `TRACK_IDS`, or if no song has
      at least 2 * seq_length + 2 time steps.
  """

  # Instrument name -> track index inside each song tensor
  TRACK_IDS = {'guitar': 1, 'bass': 2, 'strings': 3, 'drums': 4}

  def __init__(self, list_of_sequences, dataset_length = 32 * 1000, seq_length = 50, instrument = 'guitar'):

    # Fail early with a clear message; previously an unknown instrument only
    # surfaced as an AttributeError on None while fetching the first item.
    if instrument not in self.TRACK_IDS:
      raise ValueError('Unknown instrument {!r}; expected one of {}'.format(instrument, sorted(self.TRACK_IDS)))

    self.data = list_of_sequences
    self.n_songs = len(list_of_sequences)
    self.seq_length = seq_length
    self.length = dataset_length
    self.instrument = instrument

    # Songs too short for two consecutive windows would make random.randint
    # raise on an empty range, so they are excluded from sampling.
    min_song_length = seq_length * 2 + 2
    self.valid_song_ids = [song_id for song_id, song in enumerate(list_of_sequences)
                           if song.size()[1] >= min_song_length]
    if not self.valid_song_ids:
      raise ValueError('No song has at least {} time steps (2 * seq_length + 2)'.format(min_song_length))

  def __getitem__(self, index):
    # Looked up on every access so that reassigning `self.instrument` is honoured
    track = self.TRACK_IDS[self.instrument]

    while True:
      # Choose a random song among those that are long enough
      song = self.data[self.valid_song_ids[random.randint(0, len(self.valid_song_ids) - 1)]]

      # Choose a random start window
      start_time = random.randint(0, song.size()[1] - self.seq_length * 2 - 2)
      middle = start_time + self.seq_length
      end = start_time + self.seq_length * 2

      piano_sequence = song[0, middle:end, :]
      past_sequence = song[track, start_time:middle, :]
      target_sequence = song[track, middle:end, :]

      if piano_sequence.sum() != 0 and past_sequence.sum() != 0 and target_sequence.sum() != 0:
        break
      # Keep a minority of samples with silence in them
      if random.random() < 0.1:
        break

    return piano_sequence, past_sequence, target_sequence

  def __len__(self):
    return self.length

In [None]:
# @title Old Datasets
# Melody prediction dataset - target is a single time step after the window
class MelodyDatasetOld(Dataset):
  """Legacy dataset: a random all-track window and one later piano time step.

  Each song is indexed as [track, time, :]. Items are drawn at random on every
  access; `index` is ignored and `dataset_length` only determines `len()`.

  Returns `(train_sequence, target_sequence)`: every track over
  [start, start + seq_length), and track 0 at time step
  start + seq_length + 1.
  NOTE(review): the step at start + seq_length lies between the window and the
  target and is not returned - confirm whether that gap is intended.
  """
  def __init__(self, list_of_sequences, dataset_length = 32 * 10000, seq_length = 50):
    # Don't normalize anymore since it was done earlier
    self.data = list_of_sequences
    self.n_songs = len(list_of_sequences)
    self.seq_length = seq_length
    self.length = dataset_length

  def __len__(self):
    return self.length

  def __getitem__(self, index):
    # Random song, then a random window start inside it
    song = self.data[random.randint(0, self.n_songs - 1)]
    window_start = random.randint(0, song.size()[1] - self.seq_length - 2)
    window_end = window_start + self.seq_length

    # All tracks over the window
    train_sequence = song[:, window_start:window_end, :]
    # Piano track at one single time step
    target_sequence = song[0, window_end + 1, :]
    return train_sequence, target_sequence


# Melody-conditional dataset - only outputs samples in which both the piano
# window and the instrument window are non-empty
class ConditionalDatasetOld(Dataset):
  """Legacy dataset: piano and one other instrument over the same window.

  Each song is indexed as [track, time, :]. Items are drawn at random on every
  access; `index` is ignored and `dataset_length` only determines `len()`.
  Windows are redrawn until both returned sequences have a non-zero sum.

  Returns `(piano_sequence, target_sequence)`: track 0 and the track of
  `instrument` ('guitar' 1, 'bass' 2, 'strings' 3, 'drums' 4), both over
  [start, start + seq_length).
  """
  def __init__(self, list_of_sequences, dataset_length = 32 * 1000, seq_length = 50, instrument = 'guitar'):
    self.data = list_of_sequences
    self.n_songs = len(list_of_sequences)
    self.seq_length = seq_length
    self.length = dataset_length
    self.instrument = instrument

  def __len__(self):
    return self.length

  def __getitem__(self, index):
    track_by_instrument = {'guitar': 1, 'bass': 2, 'strings': 3, 'drums': 4}

    while True:
      # Random song, then a random window inside it
      song = self.data[random.randint(0, self.n_songs - 1)]
      begin = random.randint(0, song.size()[1] - self.seq_length - 2)
      end = begin + self.seq_length

      piano_sequence = song[0, begin:end, :]

      # An unrecognised instrument leaves the target as None
      track = track_by_instrument.get(self.instrument)
      target_sequence = song[track, begin:end, :] if track is not None else None

      if piano_sequence.sum() != 0 and target_sequence.sum() != 0:
        return piano_sequence, target_sequence

**RNN Generation Code**

In [None]:
# @title Encoder-Decoder
class Encoder(nn.Module):
    """GRU encoder that takes a batch-first sequence.

    forward() swaps the first two dimensions of `input` before feeding the
    GRU (which is not batch-first) and returns the GRU's `(output, state)`:
      output - (num_steps, batch_size, hidden_size)
      state  - (num_layers, batch_size, hidden_size)
    """
    def __init__(self, input_size = 128, hidden_size = 64, num_layers = 1):
        super().__init__()
        self.gru = nn.GRU(input_size, hidden_size, num_layers)

    def forward(self, input):
        # (batch, steps, features) -> (steps, batch, features)
        time_major = input.transpose(0, 1)
        output, state = self.gru(time_major)
        return output, state


class EncoderDecoder(nn.Module):
    """Glue module for an encoder-decoder pair.

    forward() runs the encoder on `enc_X`, lets the decoder derive its initial
    state from the encoder outputs via `decoder.init_state`, and returns
    whatever the decoder returns for `dec_X` and that state.
    """
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder, self.decoder = encoder, decoder

    def forward(self, enc_X, dec_X):
        initial_state = self.decoder.init_state(self.encoder(enc_X))
        return self.decoder(dec_X, initial_state)


class Decoder(nn.Module):
    """GRU decoder that conditions every step on the encoder's final state.

    Args:
        vocab_size: number of distinct input ids and size of the output layer.
        embed_size: dimension of the embedding of each input id.
        num_hiddens: GRU hidden size (must match the encoder's hidden size).
        num_layers: number of GRU layers.
        dropout: dropout between GRU layers.

    forward(X, state) expects `X` to hold integer ids of shape
    (batch_size, num_steps) and returns `(output, state)` with shapes
    (batch_size, num_steps, vocab_size) and
    (num_layers, batch_size, num_hiddens).
    NOTE(review): pianoroll rows are not integer ids - confirm the intended
    input representation before using this decoder on pianoroll data.
    """
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super(Decoder, self).__init__()

        # forward() looks up `self.embedding`; it was never created before,
        # so every call failed with AttributeError.
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size + num_hiddens, num_hiddens, num_layers,
                          dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs):
        # The encoder returns (output, state); the decoder starts from `state`.
        return enc_outputs[1]

    def forward(self, X, state):
        # After embedding and permuting, `X` is (`num_steps`, `batch_size`, `embed_size`)
        X = self.embedding(X).permute(1,0,2)
        # Broadcast `context` (last layer of the state) so it has the same `num_steps` as `X`
        context = state[-1].repeat(X.shape[0], 1, 1)

        # Concatenate X and context along the feature dimension
        X_and_context = torch.cat((X, context), 2)

        # Recurrent unit
        output, state = self.gru(X_and_context, state)

        # Linear layer, then back to batch-first
        output = self.dense(output).permute(1,0,2)

        # `output` shape: (`batch_size`, `num_steps`, `vocab_size`)
        # `state` shape: (`num_layers`, `batch_size`, `num_hiddens`)
        return output, state

In [None]:
# @title Time Distributed Layer
class TimeDistributed(nn.Module):
    """Apply `module` to every time step of a sequence independently.

    Inputs with at most two dimensions are passed to `module` unchanged.
    Otherwise all leading dimensions are merged into one, `module` is applied
    to the resulting 2-D tensor, and the result is reshaped using the first
    input dimension (`batch_first=True`) or the second one (`batch_first=False`).
    """
    def __init__(self, module, batch_first=False):
        super().__init__()
        self.module = module
        self.batch_first = batch_first

    def forward(self, x):
        if x.dim() <= 2:
            return self.module(x)

        # Merge samples and timesteps into a single axis: (samples * timesteps, input_size)
        flat_in = x.contiguous().view(-1, x.size(-1))
        flat_out = self.module(flat_in)
        n_features = flat_out.size(-1)

        if self.batch_first:
            # (samples, timesteps, output_size)
            return flat_out.contiguous().view(x.size(0), -1, n_features)
        # (timesteps, samples, output_size)
        return flat_out.view(-1, x.size(1), n_features)

In [None]:
# ConditionalCNN - uses current melody and previous harmony to predict next harmony
class ConditionalCNN(nn.Module):
    """Convolutional autoencoder conditioned on melody and previous harmony.

    Both inputs go through the same three strided convolutions. The total
    strides (4 * 4 * 2 along time, 4 * 4 * 8 along pitch) reduce a 32 x 128
    pianoroll to a 1 x 1 map with 256 channels, which the linear layers rely
    on. The two 256-vectors are concatenated, compressed to `latent_size`, and
    decoded by three transposed convolutions back to a 32 x 128 pianoroll.

    The batch-norm layers are shared by both encoder passes and, for the 128-
    and 64-channel ones, by the decoder as well.

    forward(prev_harmony, melody) takes two tensors of shape
    (batch_size, 32, 128) and returns `(x, latent)` with shapes
    (batch_size, 32, 128) and (batch_size, latent_size). The output passes
    through a final ReLU, so it is non-negative.
    """
    def __init__(self, latent_size = 64):
        super(ConditionalCNN, self).__init__()

        # Encoding layers
        self.enc_conv1 = nn.Conv2d(in_channels = 1, out_channels = 64, kernel_size = (4, 4), stride = (4, 4))
        self.enc_conv2 = nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = (4, 4), stride = (4, 4))
        self.enc_conv3 = nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size = (2, 8), stride = (2, 8))
        self.enc_lin1 = nn.Linear(512, 256)
        self.enc_lin2 = nn.Linear(256, latent_size)

        # Decoding layers
        self.dec_lin = nn.Linear(latent_size, 256)
        self.dec_conv1 = nn.ConvTranspose2d(in_channels = 256, out_channels = 128, kernel_size = (2, 8), stride = (2, 8))
        self.dec_conv2 = nn.ConvTranspose2d(in_channels = 128, out_channels = 64, kernel_size = (4, 4), stride = (4, 4))
        self.dec_conv3 = nn.ConvTranspose2d(in_channels = 64, out_channels = 1, kernel_size = (4, 4), stride = (4, 4))

        self.batch_norm_2d64 = nn.BatchNorm2d(64)
        self.batch_norm_2d128 = nn.BatchNorm2d(128)
        self.batch_norm_2d256 = nn.BatchNorm2d(256)

        self.dropout = nn.Dropout(0.4)

    def _encode(self, pianoroll):
        """Encode one (batch_size, seq_length, n_pitches) pianoroll into a (batch_size, 256) vector."""
        # batch_size x num_channels (1) x seq_length x n_pitches
        x = pianoroll.unsqueeze(1)
        x = F.relu(self.batch_norm_2d64(self.enc_conv1(x)))
        x = self.dropout(x)
        x = F.relu(self.batch_norm_2d128(self.enc_conv2(x)))
        x = self.dropout(x)
        x = F.relu(self.batch_norm_2d256(self.enc_conv3(x)))
        # Drop the two singleton spatial dimensions, never the batch dimension
        return x.squeeze(3).squeeze(2)

    def forward(self, prev_harmony, melody):
        # Inputs: batch_size x seq_length x n_pitches
        prev_harmony = self._encode(prev_harmony)
        melody = self._encode(melody)

        # Concat melody and previous harmony together
        x = torch.cat((prev_harmony, melody), dim = 1)
        x = F.relu(self.enc_lin1(x))
        latent = self.enc_lin2(x)

        x = F.relu(self.dec_lin(latent))
        x = x.unsqueeze(2).unsqueeze(3)
        x = F.relu(self.batch_norm_2d128(self.dec_conv1(x)))
        x = F.relu(self.batch_norm_2d64(self.dec_conv2(x)))
        x = F.relu(self.dec_conv3(x))
        # Remove only the channel dimension. A bare squeeze() would also drop
        # the batch dimension for a batch of one (e.g. at generation time).
        x = x.squeeze(1)
        return x, latent

In [None]:
# MelodyCNN - uses previous melody to predict next melody
class MelodyCNN(nn.Module):
    """Convolutional autoencoder mapping one melody window to the next.

    Three strided convolutions (total stride 4 * 4 * 2 along time and
    4 * 4 * 8 along pitch) reduce a 32 x 128 pianoroll to a 1 x 1 map with
    256 channels, a linear layer compresses it to `latent_size`, and three
    transposed convolutions expand it back to 32 x 128. The 128- and
    64-channel batch-norm layers are shared between encoder and decoder.

    forward(input) takes a tensor of shape (batch_size, 32, 128) and returns
    `(x, latent)` with shapes (batch_size, 32, 128) and
    (batch_size, latent_size). The output passes through a final ReLU, so it
    is non-negative.
    """
    def __init__(self, latent_size = 64):
        super(MelodyCNN, self).__init__()

        # Encoding layers
        self.enc_conv1 = nn.Conv2d(in_channels = 1, out_channels = 64, kernel_size = (4, 4), stride = (4, 4))
        self.enc_conv2 = nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = (4, 4), stride = (4, 4))
        self.enc_conv3 = nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size = (2, 8), stride = (2, 8))
        self.enc_lin = nn.Linear(256, latent_size)

        # Decoding layers
        self.dec_lin = nn.Linear(latent_size, 256)
        self.dec_conv1 = nn.ConvTranspose2d(in_channels = 256, out_channels = 128, kernel_size = (2, 8), stride = (2, 8))
        self.dec_conv2 = nn.ConvTranspose2d(in_channels = 128, out_channels = 64, kernel_size = (4, 4), stride = (4, 4))
        self.dec_conv3 = nn.ConvTranspose2d(in_channels = 64, out_channels = 1, kernel_size = (4, 4), stride = (4, 4))

        self.dropout = nn.Dropout(0.4)
        self.batch_norm_2d64 = nn.BatchNorm2d(64)
        self.batch_norm_2d128 = nn.BatchNorm2d(128)
        self.batch_norm_2d256 = nn.BatchNorm2d(256)

    def forward(self, input):
        # Input: batch_size x seq_length x n_pitches
        input = input.unsqueeze(1)
        # batch_size x num_channels (1) x seq_length x n_pitches
        x = F.relu(self.batch_norm_2d64(self.enc_conv1(input)))
        x = self.dropout(x)
        x = F.relu(self.batch_norm_2d128(self.enc_conv2(x)))
        x = self.dropout(x)
        x = F.relu(self.batch_norm_2d256(self.enc_conv3(x)))

        # Drop the two singleton spatial dimensions
        x = x.squeeze(3).squeeze(2)
        latent = self.enc_lin(x)

        x = F.relu(self.dec_lin(latent))
        x = x.unsqueeze(2).unsqueeze(3)
        x = F.relu(self.batch_norm_2d128(self.dec_conv1(x)))
        x = F.relu(self.batch_norm_2d64(self.dec_conv2(x)))
        x = F.relu(self.dec_conv3(x))
        # Remove only the channel dimension. A bare squeeze() would also drop
        # the batch dimension for a batch of one (e.g. at generation time).
        x = x.squeeze(1)
        return x, latent

In [None]:
# @title Old MelodyRNN/CNN
# Outputs batch_size x 128 in the domain of [0, 1] - after sigmoid
class MelodyRNNOld(nn.Module):
    """Legacy GRU model predicting one 128-pitch vector from a multi-track window.

    Args:
        hidden_size: GRU hidden size.
        input_size: features per time step (tracks * pitches after flattening).
        output_size: size of the predicted vector.
        batch_size: stored on the instance only; not used by forward().
        n_layers: number of GRU layers.

    forward(input, hidden) takes `input` of shape
    (batch, n_tracks, seq_length, pitches) and returns `(output, hidden)`:
    `output` is (batch, output_size) with values in [0, 1] (sigmoid) and
    `hidden` is the final GRU state, (n_layers, batch, hidden_size).
    """
    def __init__(self, hidden_size, input_size = 128 * 5, output_size = 128, batch_size = 32, n_layers=1):
        # The former super(MelodyRNN, self) referred to a class name that no
        # longer exists after the rename and raised NameError.
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.batch_size = batch_size

        self.gru = nn.GRU(input_size, hidden_size, n_layers)
        self.linear = nn.Linear(hidden_size * n_layers, output_size)

    def forward(self, input, hidden):
        # Can use convolutions in the future
        # Input: batch_size x n_tracks x seq_length x pitches
        input = input.permute(0,2,1,3)
        # batch x seq_length x track x pitches
        input = input.flatten(2,3) # Flatten the track and pitches together
        # batch x seq_length x (track x pitches)
        input = input.permute(1,0,2)
        # seq length x batch x (track x pitches)
        _, hidden = self.gru(input, hidden)

        # Hidden: hidden layer at FINAL state
        # hidden dim: (num_layer x num_dir) x batch x hidden_size
        h_n = hidden.permute(1,0,2)
        # h_n is batch x (num_layer x num_dir) x hidden_size
        h_n = h_n.contiguous().flatten(1,2)
        # After flattening: batch x (num_layer x num_dir x hidden_size)
        output = self.linear(h_n)
        output = torch.sigmoid(output)
        return output, hidden

    def init_hidden(self, batch_size):
        """Return a zero initial state, (n_layers, batch_size, hidden_size), on the model's device."""
        # Use the device of the model's own weights rather than a notebook global
        model_device = next(self.parameters()).device
        return torch.zeros(self.n_layers, batch_size, self.hidden_size, device=model_device)

# Outputs batch_size x 128 in the domain of [0, 1] - after sigmoid
class MelodyCNNOld(nn.Module):
    """Legacy CNN predicting one 128-pitch vector from a 5-track window.

    The first convolution spans the whole time axis (`seq_length` steps), the
    next two reduce the 128 pitches by factors of 8 and 16, leaving one
    256-channel value per sample that two linear layers map to 128 outputs.

    forward(input) takes (batch_size, 5, seq_length, 128) and returns a
    (batch_size, 128) tensor with values in [0, 1] (sigmoid).
    """
    def __init__(self, seq_length = 32):
        # The former super(MelodyCNN, self) named an unrelated class, which
        # makes super() raise TypeError for MelodyCNNOld instances.
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels = 5, out_channels = 64, kernel_size = (seq_length, 1), stride = (seq_length, 1))
        self.conv2 = nn.Conv2d(in_channels = 64, out_channels = 128, kernel_size = (1, 8), stride = (1, 8))
        self.conv3 = nn.Conv2d(in_channels = 128, out_channels = 256, kernel_size = (1, 16), stride = (1, 16))
        self.linear = nn.Linear(256, 256)
        self.out = nn.Linear(256, 128)
        self.dropout = nn.Dropout(0.4)

    def forward(self, input):
        # Input: batch_size x n_tracks x seq_length x pitches (batch x 5 x seq_length x 128)
        x = F.relu(self.conv1(input)) # batch x 64 x 1 x 128
        x = self.dropout(x)
        x = F.relu(self.conv2(x)) # batch x 128 x 1 x 16
        x = self.dropout(x)
        x = F.relu(self.conv3(x)) # batch x 256 x 1 x 1
        x = self.dropout(x)
        # Collapse everything except the batch dimension; a bare squeeze()
        # would also remove the batch dimension for a batch of one.
        x = x.flatten(1)
        x = F.relu(self.linear(x))
        out = torch.sigmoid(self.out(x))
        return out

In [None]:
def grad_clipping(net, theta):
    """Rescale the gradients of `net` in place so their global L2 norm is at most `theta`.

    Only trainable parameters that currently hold a gradient take part;
    parameters whose `grad` is None (for example layers that did not
    contribute to the loss) are skipped instead of raising. Nothing happens
    when there are no gradients or when the norm is already within `theta`.

    Args:
        net: module whose parameter gradients are clipped.
        theta: maximum allowed global gradient norm.
    """
    grads = [p.grad for p in net.parameters() if p.requires_grad and p.grad is not None]
    if not grads:
        return

    # L2 norm over all gradient entries of all parameters
    norm = torch.sqrt(sum(torch.sum(g ** 2) for g in grads))

    if norm > theta:
        scale = theta / norm
        for g in grads:
            g.mul_(scale)

In [None]:
# Code to run one epoch for the CNN models - model_type: cond_cnn, melody_cnn

def run_epoch(dataloader, model, optimizer, criterion, is_train = True, model_type = 'melody_cnn'):
    """Run one pass over `dataloader`, optionally updating the model.

    Args:
        dataloader: yields `(train_seq, target_seq)` batches for 'melody_cnn'
            or `(piano_seq, past_seq, target_seq)` batches for 'cond_cnn'.
        model: called as `model(train_seq)` or `model(past_seq, piano_seq)`;
            must return `(output, latent)`.
        optimizer: used only when `is_train` is true.
        criterion: loss called as `criterion(output, target_seq)`.
        is_train: when true, back-propagate and step the optimizer; when
            false, gradients are disabled for the whole pass.
        model_type: 'melody_cnn' or 'cond_cnn'.

    Returns:
        The sum of the per-batch losses divided by the number of samples
        seen, multiplied by 100.

    Raises:
        ValueError: for an unknown `model_type` or a dataloader that yields
            no samples (both previously ended in a ZeroDivisionError).
    """
    if model_type not in ('melody_cnn', 'cond_cnn'):
        raise ValueError("Unknown model_type {!r}; expected 'melody_cnn' or 'cond_cnn'".format(model_type))

    training = bool(is_train)
    running_loss = 0
    n_obs = 0

    # No autograd graph is needed (or wanted) while evaluating
    with torch.set_grad_enabled(training):
        for batch in dataloader:
            if model_type == 'melody_cnn':
                train_seq, target_seq = batch
                train_seq = train_seq.to(device)
                target_seq = target_seq.to(device)
                output, latent = model(train_seq)
                batch_size = train_seq.size()[0]
            else:
                piano_seq, past_seq, target_seq = batch
                piano_seq = piano_seq.to(device)
                past_seq = past_seq.to(device)
                target_seq = target_seq.to(device)
                output, latent = model(past_seq, piano_seq)
                batch_size = piano_seq.size()[0]

            loss = criterion(output, target_seq)
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            running_loss += loss.item()
            n_obs += batch_size

    if n_obs == 0:
        raise ValueError('dataloader yielded no samples')

    # Return average loss for the input sequence
    return running_loss / n_obs * 100

# Overall training loop
def training_loop(model, optimizer, scheduler, criterion, train_dataloader, test_dataloader, model_type = 'cond_cnn', n_epochs = 50):
  """Train for `n_epochs` epochs, evaluating on the test loader after each one.

  Every epoch prints the scheduler's current learning rate(s), runs one
  training pass via run_epoch, steps the scheduler, runs one evaluation pass,
  and prints both losses with a timestamp.

  Returns:
    (train_losses, test_losses): two lists with one entry per epoch, holding
    the values returned by run_epoch.
  """
  train_losses, test_losses = [], []

  for epoch_idx in range(n_epochs):
    # Training pass
    model.train()
    print(scheduler.get_last_lr())
    train_epoch_loss = run_epoch(train_dataloader, model, optimizer, criterion, is_train = True, model_type = model_type)
    train_losses.append(train_epoch_loss)
    scheduler.step()

    # Evaluation pass
    model.eval()
    test_epoch_loss = run_epoch(test_dataloader, model, optimizer, criterion, is_train = False, model_type = model_type)
    test_losses.append(test_epoch_loss)

    print('Epoch {}, Train Loss: {}, Test Loss: {}, Time: {}'.format(epoch_idx + 1, train_epoch_loss, test_epoch_loss, datetime.now()))

  return train_losses, test_losses

**Executing New Melody CNN and Conditional CNNs**

In [None]:
# Melody-prediction data: 32-step input/target windows served in batches of 32
# (incomplete final batches are dropped).
# NOTE(review): the training set samples from every song in pianorolls_list,
# including songs 900-999 that the test set draws from, so the test loss is
# not measured on held-out songs - confirm that this is intended.
melody_train_dataset = MelodyDataset(
    pianorolls_list,
    dataset_length = 32 * 8000,
    seq_length = 32,
)
melody_train_loader = DataLoader(
    melody_train_dataset,
    batch_size = 32,
    drop_last = True,
)
melody_test_dataset = MelodyDataset(
    pianorolls_list[900:1000],
    dataset_length = 32 * 2000,
    seq_length = 32,
)
melody_test_loader = DataLoader(
    melody_test_dataset,
    batch_size = 32,
    drop_last = True,
)

In [None]:
n_epochs = 50
lr = 0.0005
lr_lambda = 0.98

# Create the output directory before any training starts, so that a missing
# folder cannot make torch.save fail after a model has finished training.
os.makedirs(os.path.join(root_dir, 'Saved Models'), exist_ok = True)


def _plot_losses(train_losses, test_losses):
  """Plot the per-epoch train and test losses of one training run."""
  plt.figure()
  plt.plot(train_losses, label = 'Train Loss')
  plt.plot(test_losses, label = 'Test Loss')
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.legend()
  plt.show()


# --- Melody model: previous piano window -> next piano window ---
melody_cnn = MelodyCNN(latent_size = 128).to(device)
optimizer = torch.optim.Adam(melody_cnn.parameters(), lr = lr)
# The decay factor is bound as a default argument so the schedule keeps using
# this value even if the global `lr_lambda` is reassigned later.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda epoch, decay = lr_lambda: decay ** epoch)
criterion = nn.MSELoss()
train_losses, test_losses = training_loop(melody_cnn, optimizer, scheduler, criterion, melody_train_loader, melody_test_loader, model_type = 'melody_cnn', n_epochs = n_epochs)

model_name = 'MelodyCNN_all'
save_path = os.path.join(root_dir, 'Saved Models', model_name)
torch.save(melody_cnn.state_dict(), save_path)

_plot_losses(train_losses, test_losses)

# --- One conditional model per accompanying instrument ---
# NOTE(review): the training sets sample from every song in pianorolls_list,
# including songs 900-999 used for testing - confirm that this is intended.
for instrument in ['guitar', 'bass', 'strings', 'drums']:
  print(instrument)
  cond_train_dataset = ConditionalDataset(pianorolls_list, dataset_length = 32 * 8000, seq_length = 32, instrument = instrument)
  cond_train_loader = DataLoader(cond_train_dataset, batch_size = 32, drop_last=True)
  cond_test_dataset = ConditionalDataset(pianorolls_list[900:1000], dataset_length = 32 * 1000, seq_length = 32, instrument = instrument)
  cond_test_loader = DataLoader(cond_test_dataset, batch_size = 32, drop_last=True)

  n_epochs = 50
  lr = 0.0005
  lr_lambda = 0.98

  cond_cnn = ConditionalCNN(latent_size = 128).to(device)
  optimizer = torch.optim.Adam(cond_cnn.parameters(), lr = lr)
  scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda epoch, decay = lr_lambda: decay ** epoch)
  criterion = nn.MSELoss()
  train_losses, test_losses = training_loop(cond_cnn, optimizer, scheduler, criterion, cond_train_loader, cond_test_loader, model_type = 'cond_cnn', n_epochs = n_epochs)

  model_name = 'CondCNN_all_{}'.format(instrument)
  save_path = os.path.join(root_dir, 'Saved Models', model_name)
  torch.save(cond_cnn.state_dict(), save_path)
  # Plot the losses over epochs
  _plot_losses(train_losses, test_losses)

In [None]:
# Train one conditional model per accompanying instrument on a disjoint split:
# songs 0-199 are used for testing, the songs from index 200 on for training.
# Create the output directory first, so that a missing folder cannot make
# torch.save fail after a model has finished training.
os.makedirs(os.path.join(root_dir, 'Saved Models'), exist_ok = True)

for instrument in ['guitar', 'bass', 'strings', 'drums']:
  print(instrument)
  cond_train_dataset = ConditionalDataset(pianorolls_list[200:1100], dataset_length = 32 * 4000, seq_length = 32, instrument = instrument)
  cond_train_loader = DataLoader(cond_train_dataset, batch_size = 32, drop_last=True)
  cond_test_dataset = ConditionalDataset(pianorolls_list[0:200], dataset_length = 32 * 1000, seq_length = 32, instrument = instrument)
  cond_test_loader = DataLoader(cond_test_dataset, batch_size = 32, drop_last=True)

  n_epochs = 30
  lr = 0.0005
  lr_lambda = 0.99

  cond_cnn = ConditionalCNN(latent_size = 64).to(device)
  optimizer = torch.optim.Adam(cond_cnn.parameters(), lr = lr)
  # The decay factor is bound as a default argument so the schedule keeps
  # using this value even if the global `lr_lambda` is reassigned later.
  scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda epoch, decay = lr_lambda: decay ** epoch)
  criterion = nn.MSELoss()
  train_losses, test_losses = training_loop(cond_cnn, optimizer, scheduler, criterion, cond_train_loader, cond_test_loader, model_type = 'cond_cnn', n_epochs = n_epochs)

  model_name = 'CondCNNv5_{}'.format(instrument)
  save_path = os.path.join(root_dir, 'Saved Models', model_name)
  torch.save(cond_cnn.state_dict(), save_path)
  # Plot the losses over epochs
  plt.figure()
  plt.plot(train_losses, label = 'Train Loss')
  plt.plot(test_losses, label = 'Test Loss')
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.legend()
  plt.show()

In [None]:
# Leftover scratch expression: it only displays 2 and changes no notebook state.
1+1

### Executing Old Melody CNN/RNN

In [None]:
# Code to train all NNs - model_type: cond_cnn, melody_rnn, melody_cnn

def train_epoch(dataloader, model, optimizer, criterion, model_type = 'cond_cnn'):
    """Run one optimisation pass over `dataloader` and return the scaled average loss.

    Arguments:
    dataloader - iterable yielding (input batch, target batch) tensor pairs
    model - network to train; how it is called depends on `model_type`
    optimizer - optimizer whose zero_grad() / step() are called once per batch
    criterion - loss callable applied as criterion(output, target)
    model_type - 'melody_rnn' (model(x, hidden) -> (output, hidden)),
                 'melody_cnn' (model(x) -> output) or
                 'cond_cnn' (model(x) -> (output, latent))

    Returns:
    The sum of the per-batch criterion values divided by the number of
    observations seen, multiplied by 100.

    Raises:
    ValueError - if `model_type` is not one of the three supported values
    """
    if model_type not in ('melody_rnn', 'melody_cnn', 'cond_cnn'):
      raise ValueError('Unknown model_type: {}'.format(model_type))

    running_loss = 0
    n_obs = 0
    for train_seq, target_seq in dataloader:
      train_seq = train_seq.to(device)
      target_seq = target_seq.to(device)
      optimizer.zero_grad()
      if model_type == 'melody_rnn':
        # Size the hidden state from the batch itself instead of assuming 32,
        # so loaders with another batch size (or a short last batch) also work
        hidden = model.init_hidden(batch_size = train_seq.size()[0])
        output, hidden = model(train_seq, hidden)
      elif model_type == 'melody_cnn':
        output = model(train_seq)
      else:
        # 'cond_cnn': the latent code is returned but not used for the loss
        output, latent = model(train_seq)
      loss = criterion(output, target_seq)
      loss.backward()
      # Clip gradients before the update (grad_clipping is defined elsewhere in the notebook)
      grad_clipping(model, 1)
      optimizer.step()

      running_loss += loss.item()
      n_obs += train_seq.size()[0]

    # Return average loss for the input sequence
    return running_loss / n_obs * 100

def test_epoch(dataloader, model, optimizer, criterion, model_type = 'cond_cnn'):
    
    running_loss = 0
    n_obs = 0
    for train_seq, target_seq in dataloader:
      train_seq = train_seq.to(device)
      target_seq = target_seq.to(device)
      if model_type == 'melody_rnn':
        hidden = model.init_hidden(batch_size = 32)
        output, hidden = model(train_seq, hidden)
      elif model_type == 'melody_cnn':
        output = model(train_seq)
      elif model_type == 'cond_cnn':
        output, latent = model(train_seq)
      loss = criterion(output, target_seq)
      loss = criterion(output, target_seq)
      running_loss += loss.item()
      n_obs += train_seq.size()[0]

    # Return average loss for the input sequence
    return running_loss / n_obs * 100

# Overall training loop
def training_loop(model, optimizer, scheduler, criterion, train_dataloader, test_dataloader, model_type = 'cond_cnn', n_epochs = None):
  """Train and evaluate `model` for `n_epochs` epochs.

  Arguments:
  model - network to train; switched between train() and eval() mode every epoch
  optimizer - optimizer forwarded to train_epoch / test_epoch
  scheduler - learning-rate scheduler; stepped once per epoch after training
  criterion - loss callable forwarded to train_epoch / test_epoch
  train_dataloader - batches used for the training pass
  test_dataloader - batches used for the evaluation pass
  model_type - 'cond_cnn', 'melody_rnn' or 'melody_cnn'
  n_epochs - number of epochs to run; when None, the notebook-level `n_epochs`
             variable is used so that older call sites keep working

  Returns:
  (train_losses, test_losses) - two lists holding one loss value per epoch
  """
  if n_epochs is None:
    # Backward compatibility: callers that omit n_epochs rely on the global of that name
    n_epochs = globals()['n_epochs']

  train_losses = []
  test_losses = []

  for epoch in range(1, n_epochs + 1):
    model.train()
    # Show the learning rate that is in effect for this epoch
    print(scheduler.get_last_lr())
    train_epoch_loss = train_epoch(train_dataloader, model, optimizer, criterion, model_type)
    train_losses.append(train_epoch_loss)
    scheduler.step()

    model.eval()
    test_epoch_loss = test_epoch(test_dataloader, model, optimizer, criterion, model_type)
    test_losses.append(test_epoch_loss)

    print('Epoch {}, Train Loss: {}, Test Loss: {}, Time: {}'.format(epoch, train_epoch_loss, test_epoch_loss, datetime.now()))

  return train_losses, test_losses

In [None]:
# Code to train all NNs - model_type: cond_cnn, melody_rnn, melody_cnn

def train_epoch(dataloader, model, optimizer, criterion, model_type = 'cond_cnn'):
    """Run one optimisation pass over `dataloader` and return the scaled average loss.

    Arguments:
    dataloader - iterable yielding (input batch, target batch) tensor pairs
    model - network to train; how it is called depends on `model_type`
    optimizer - optimizer whose zero_grad() / step() are called once per batch
    criterion - loss callable applied as criterion(output, target)
    model_type - 'melody_rnn' (model(x, hidden) -> (output, hidden)),
                 'melody_cnn' (model(x) -> output) or
                 'cond_cnn' (model(x) -> (output, latent))

    Returns:
    The sum of the per-batch criterion values divided by the number of
    observations seen, multiplied by 100.

    Raises:
    ValueError - if `model_type` is not one of the three supported values
    """
    if model_type not in ('melody_rnn', 'melody_cnn', 'cond_cnn'):
      raise ValueError('Unknown model_type: {}'.format(model_type))

    running_loss = 0
    n_obs = 0
    for train_seq, target_seq in dataloader:
      train_seq = train_seq.to(device)
      target_seq = target_seq.to(device)
      optimizer.zero_grad()
      if model_type == 'melody_rnn':
        # Size the hidden state from the batch itself instead of assuming 32,
        # so loaders with another batch size (or a short last batch) also work
        hidden = model.init_hidden(batch_size = train_seq.size()[0])
        output, hidden = model(train_seq, hidden)
      elif model_type == 'melody_cnn':
        output = model(train_seq)
      else:
        # 'cond_cnn': the latent code is returned but not used for the loss
        output, latent = model(train_seq)
      loss = criterion(output, target_seq)
      loss.backward()
      # Clip gradients before the update (grad_clipping is defined elsewhere in the notebook)
      grad_clipping(model, 1)
      optimizer.step()

      running_loss += loss.item()
      n_obs += train_seq.size()[0]

    # Return average loss for the input sequence
    return running_loss / n_obs * 100

def test_epoch(dataloader, model, optimizer, criterion, model_type = 'cond_cnn'):
    
    running_loss = 0
    n_obs = 0
    for train_seq, target_seq in dataloader:
      train_seq = train_seq.to(device)
      target_seq = target_seq.to(device)
      if model_type == 'melody_rnn':
        hidden = model.init_hidden(batch_size = 32)
        output, hidden = model(train_seq, hidden)
      elif model_type == 'melody_cnn':
        output = model(train_seq)
      elif model_type == 'cond_cnn':
        output, latent = model(train_seq)
      loss = criterion(output, target_seq)
      loss = criterion(output, target_seq)
      running_loss += loss.item()
      n_obs += train_seq.size()[0]

    # Return average loss for the input sequence
    return running_loss / n_obs * 100

# Overall training loop
def training_loop(model, optimizer, scheduler, criterion, train_dataloader, test_dataloader, model_type = 'cond_cnn', n_epochs = None):
  """Train and evaluate `model` for `n_epochs` epochs.

  Arguments:
  model - network to train; switched between train() and eval() mode every epoch
  optimizer - optimizer forwarded to train_epoch / test_epoch
  scheduler - learning-rate scheduler; stepped once per epoch after training
  criterion - loss callable forwarded to train_epoch / test_epoch
  train_dataloader - batches used for the training pass
  test_dataloader - batches used for the evaluation pass
  model_type - 'cond_cnn', 'melody_rnn' or 'melody_cnn'
  n_epochs - number of epochs to run; when None, the notebook-level `n_epochs`
             variable is used so that older call sites keep working

  Returns:
  (train_losses, test_losses) - two lists holding one loss value per epoch
  """
  if n_epochs is None:
    # Backward compatibility: callers that omit n_epochs rely on the global of that name
    n_epochs = globals()['n_epochs']

  train_losses = []
  test_losses = []

  for epoch in range(1, n_epochs + 1):
    model.train()
    # Show the learning rate that is in effect for this epoch
    print(scheduler.get_last_lr())
    train_epoch_loss = train_epoch(train_dataloader, model, optimizer, criterion, model_type)
    train_losses.append(train_epoch_loss)
    scheduler.step()

    model.eval()
    test_epoch_loss = test_epoch(test_dataloader, model, optimizer, criterion, model_type)
    test_losses.append(test_epoch_loss)

    print('Epoch {}, Train Loss: {}, Test Loss: {}, Time: {}'.format(epoch, train_epoch_loss, test_epoch_loss, datetime.now()))

  return train_losses, test_losses

In [None]:
# Melody datasets: songs 0:900 for training, songs 900:1000 held out for testing
melody_train_dataset = MelodyDataset(pianorolls_list[0:900], dataset_length = 32 * 8000, seq_length = 32)
melody_train_loader = DataLoader(melody_train_dataset, batch_size = 32, drop_last=True)
melody_test_dataset = MelodyDataset(pianorolls_list[900:1000], dataset_length = 32 * 2000, seq_length = 32)
# The test loader must wrap the held-out dataset; wrapping the training dataset
# would make the reported test loss a second training-set measurement
melody_test_loader = DataLoader(melody_test_dataset, batch_size = 32, drop_last=True)

In [None]:
# Train the MelodyCNN on the melody loaders and save its weights as 'Saved Models/MelodyCNN'.
# These hyperparameters are notebook-level names; the training_loop call below does
# not pass n_epochs, so the loop has to pick it up from the notebook namespace.
n_epochs = 30
lr = 0.0005
lr_lambda = 0.98

melody_cnn = MelodyCNN(seq_length = 32).to(device)

optimizer = torch.optim.Adam(melody_cnn.parameters(), lr = lr)
# LambdaLR scales the base lr by lr_lambda ** epoch
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda epoch: lr_lambda ** epoch)
criterion = nn.MSELoss()
train_losses, test_losses = training_loop(melody_cnn, optimizer, scheduler, criterion, melody_train_loader, melody_test_loader, model_type = 'melody_cnn')

# Persist only the weights (state_dict), not the full module object
model_name = 'MelodyCNN'
save_path = os.path.join(root_dir, 'Saved Models', model_name)
torch.save(melody_cnn.state_dict(), save_path)
# Plot the losses over epochs
plt.figure()
plt.plot(train_losses, label = 'Train Loss')
plt.plot(test_losses, label = 'Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Save the current melody_cnn weights to 'Saved Models/MelodyCNN'
# (same target path as the save at the end of the MelodyCNN training cell).
model_name = 'MelodyCNN'
save_path = os.path.join(root_dir, 'Saved Models', model_name)
torch.save(melody_cnn.state_dict(), save_path)

In [None]:
# Train the MelodyRNN on the melody loaders and save its weights as 'Saved Models/MelodyRNN_2'.
# n_epochs is a notebook-level name; the training_loop call below does not pass it.
n_epochs = 50
lr = 0.0005
lr_lambda = 0.98

# Create model, optimizer and loss function
melody_rnn = MelodyRNN(hidden_size = 64, batch_size = 32, n_layers = 1).to(device)
optimizer = torch.optim.Adam(melody_rnn.parameters(), lr = lr)
# LambdaLR scales the base lr by lr_lambda ** epoch
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda epoch: lr_lambda ** epoch)
criterion = nn.MSELoss()
train_losses, test_losses = training_loop(melody_rnn, optimizer, scheduler, criterion, melody_train_loader, melody_test_loader, model_type = 'melody_rnn')

# Persist only the weights (state_dict), not the full module object
model_name = 'MelodyRNN_2'
save_path = os.path.join(root_dir, 'Saved Models', model_name)
torch.save(melody_rnn.state_dict(), save_path)
# Plot the losses over epochs
plt.figure()
plt.plot(train_losses, label = 'Train Loss')
plt.plot(test_losses, label = 'Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Train a fresh MelodyCNN with the lr / lr_lambda / n_epochs values set earlier in this cell
melody_cnn = MelodyCNN(seq_length = 32).to(device)

optimizer = torch.optim.Adam(melody_cnn.parameters(), lr = lr)
# LambdaLR scales the base lr by lr_lambda ** epoch
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda epoch: lr_lambda ** epoch)
criterion = nn.MSELoss()
train_losses, test_losses = training_loop(melody_cnn, optimizer, scheduler, criterion, melody_train_loader, melody_test_loader, model_type = 'melody_cnn')

# Persist only the weights (state_dict), not the full module object
model_name = 'MelodyCNN'
save_path = os.path.join(root_dir, 'Saved Models', model_name)
torch.save(melody_cnn.state_dict(), save_path)
# Plot the losses over epochs
plt.figure()
plt.plot(train_losses, label = 'Train Loss')
plt.plot(test_losses, label = 'Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

**Executing Conditional RNN/CNN**

In [None]:
# Train one ConditionalCNN per accompaniment instrument and save each model's
# weights under 'Saved Models/CondCNN_<instrument>'.
# Songs 0:900 of pianorolls_list are used for training and songs 900:1000 for testing.
# NOTE(review): the loop variable `instrument` rebinds the `instrument` name imported
# from music21 at the top of the notebook - confirm nothing later needs the module.
for instrument in ['guitar', 'bass', 'strings', 'drums']:
  print(instrument)
  cond_train_dataset = ConditionalDataset(pianorolls_list[0:900], dataset_length = 32 * 4000, seq_length = 32, instrument = instrument)
  cond_train_loader = DataLoader(cond_train_dataset, batch_size = 32, drop_last=True)
  cond_test_dataset = ConditionalDataset(pianorolls_list[900:1000], dataset_length = 32 * 1000, seq_length = 32, instrument = instrument)
  cond_test_loader = DataLoader(cond_test_dataset, batch_size = 32, drop_last=True)

  # n_epochs is not passed to training_loop below, so the loop has to read it
  # from the notebook namespace
  n_epochs = 50
  lr = 0.0005
  lr_lambda = 0.98

  # A fresh model, optimizer and scheduler are created for every instrument
  cond_cnn = ConditionalCNN(latent_size = 64).to(device)
  optimizer = torch.optim.Adam(cond_cnn.parameters(), lr = lr)
  # LambdaLR scales the base lr by lr_lambda ** epoch
  scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda epoch: lr_lambda ** epoch)
  criterion = nn.MSELoss()
  train_losses, test_losses = training_loop(cond_cnn, optimizer, scheduler, criterion, cond_train_loader, cond_test_loader, model_type = 'cond_cnn')

  # Persist only the weights (state_dict), not the full module object
  model_name = 'CondCNN_{}'.format(instrument)
  save_path = os.path.join(root_dir, 'Saved Models', model_name)
  torch.save(cond_cnn.state_dict(), save_path)
  # Plot the losses over epochs
  plt.figure()
  plt.plot(train_losses, label = 'Train Loss')
  plt.plot(test_losses, label = 'Test Loss')
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  plt.legend()
  plt.show()

### Evaluating MelodyCNN/RNN to Generate New Music

In [None]:
# Load the trained melody model and the four conditional accompaniment models.
# Every model is switched to .eval(): generation runs on batches of size 1, and
# batchnorm in training mode on a batch of size 1 leads to errors.
# map_location = device lets checkpoints saved on a GPU load on a CPU-only runtime.

model_version = 4
# The melody checkpoint is one version behind the conditional checkpoints
model_name = 'MelodyCNNv{}'.format(model_version - 1)
save_path = os.path.join(root_dir, 'Saved Models', model_name)
melody_model = MelodyCNN(latent_size = 128).to(device)
melody_model.load_state_dict(torch.load(save_path, map_location = device))
melody_model.eval()

model_name = 'CondCNNv{}_guitar'.format(model_version)
save_path = os.path.join(root_dir, 'Saved Models', model_name)
guitar_model = ConditionalCNN(latent_size = 64).to(device)
guitar_model.load_state_dict(torch.load(save_path, map_location = device))
guitar_model.eval()

model_name = 'CondCNNv{}_bass'.format(model_version)
save_path = os.path.join(root_dir, 'Saved Models', model_name)
bass_model = ConditionalCNN(latent_size = 64).to(device)
bass_model.load_state_dict(torch.load(save_path, map_location = device))
bass_model.eval()

model_name = 'CondCNNv{}_strings'.format(model_version)
save_path = os.path.join(root_dir, 'Saved Models', model_name)
strings_model = ConditionalCNN(latent_size = 64).to(device)
strings_model.load_state_dict(torch.load(save_path, map_location = device))
strings_model.eval()

model_name = 'CondCNNv{}_drums'.format(model_version)
save_path = os.path.join(root_dir, 'Saved Models', model_name)
drums_model = ConditionalCNN(latent_size = 64).to(device)
drums_model.load_state_dict(torch.load(save_path, map_location = device))
drums_model.eval()

In [None]:
# Generator dataloader and dataset for the actual song generation (gives the full 5 tracks in chunks of length 32)
# batch_size = 1 so that each draw is a single seed chunk
gen_dataset = GenerationDataset(pianorolls_list[0:900], dataset_length = 32 * 10000, seq_length = 32)
gen_loader = DataLoader(gen_dataset, batch_size = 1, drop_last=True)

In [None]:
# Draw one (input, target) pair from the generation loader; `train` is the seed
# chunk that the generation cell squeezes and feeds to generate_new_music
train, test = next(iter(gen_loader))

In [None]:
def _normalize_and_threshold(pred, threshold):
  """Scale `pred` by its maximum and zero every entry that falls below `threshold`.

  When the maximum is not positive there is nothing to normalise against
  (dividing would produce NaN/inf or flip signs), so an all-zero tensor is returned.
  """
  peak = pred.max()
  if peak <= 0:
    return torch.zeros_like(pred)
  pred = pred / peak
  pred[pred < threshold] = 0.0
  return pred


def generate_new_music(input_sequence, models, threshold = 0.3, binarize = False):
  """Generate the next length-32 chunk for all 5 instruments from the previous chunk.

  Arguments:
  input_sequence - previous chunk, 5 x 32 x 128, tracks ordered
                   melody, guitar, bass, strings, drums
  models - 5-tuple of trained networks in the same track order; the melody model is
           called as model(prev) and each accompaniment model as
           model(prev, melody_prediction); all return (prediction, extra)
  threshold - fraction of each track's maximum output below which notes are set to 0
  binarize - if True, every remaining non-zero entry is set to 1 (max intensity)

  Returns:
  The five processed predictions concatenated along dim 0 in track order.
  Inference runs under torch.no_grad(), so the result carries no autograd graph.
  """
  melody_model, guitar_model, bass_model, strings_model, drums_model = models
  input_sequence = input_sequence.to(device)

  with torch.no_grad():
    # The melody is predicted first because every other track is conditioned on it
    melody_prev = input_sequence[0, :, :].unsqueeze(0)
    melody_pred, _ = melody_model(melody_prev)
    melody_pred = _normalize_and_threshold(melody_pred, threshold)
    melody_cond = melody_pred.unsqueeze(0)

    tracks = [melody_cond]
    accompaniment_models = (guitar_model, bass_model, strings_model, drums_model)
    for track_id, track_model in enumerate(accompaniment_models, start = 1):
      track_prev = input_sequence[track_id, :, :].unsqueeze(0)
      track_pred, _ = track_model(track_prev, melody_cond)
      track_pred = _normalize_and_threshold(track_pred, threshold)
      tracks.append(track_pred.unsqueeze(0))

    creation = torch.cat(tracks, dim = 0)

    if binarize == True:
      creation[creation > 0] = 1
  return creation

In [None]:
# Build a 128-step, 5-track piece: the first 32 steps are the seed chunk, and each
# following 32-step chunk is generated from the chunk right before it.
models = (melody_model, guitar_model, bass_model, strings_model, drums_model)
generated_track = torch.zeros((5, 128, 128))
creation = train.squeeze()
generated_track[:, 0:32, :] = creation
for chunk_start in range(32, 128, 32):
  creation = generate_new_music(creation, models, threshold = 0.4, binarize = False)
  generated_track[:, chunk_start:chunk_start + 32, :] = creation

In [None]:
# Scale the generated intensities back up by 127 (the only place this is undone)
generated_track_out = generated_track * 127
# One CPU numpy copy of all tracks; each instrument takes its own slice below
track_rolls = generated_track_out.detach().cpu().numpy()
# Wrap every slice in a pypianoroll track and bundle them into one multitrack
piano_track = pypianoroll.StandardTrack(name = 'Piano', program = 0, is_drum = False, pianoroll = track_rolls[0, :, :])
guitar_track = pypianoroll.StandardTrack(name = 'Guitar', program = 24, is_drum = False, pianoroll = track_rolls[1, :, :])
bass_track = pypianoroll.StandardTrack(name = 'Bass', program = 32, is_drum = False, pianoroll = track_rolls[2, :, :])
strings_track = pypianoroll.StandardTrack(name = 'Strings', program = 48, is_drum = False, pianoroll = track_rolls[3, :, :])
drums_track = pypianoroll.StandardTrack(name = 'Drums', is_drum = True, pianoroll = track_rolls[4, :, :])
all_tracks = [piano_track, guitar_track, bass_track, strings_track, drums_track]
generated_multitrack = pypianoroll.Multitrack(name = 'Generated', resolution = 2, tracks = all_tracks)
generated_multitrack.plot()

In [None]:
# Convert the generated multitrack to a PrettyMIDI object, synthesise it with
# fluidsynth and embed an audio player in the notebook output
generated_pm = pypianoroll.to_pretty_midi(generated_multitrack)
generated_midi_audio = generated_pm.fluidsynth()
# NOTE(review): rate = 44100 presumably matches the fluidsynth() default sample rate - confirm
IPython.display.Audio(generated_midi_audio, rate = 44100)

### Old Way of Generating with MelodyRNN / RNNs (requiring random drawing)

In [None]:
# Generating Melody: rebuild the MelodyRNN and load its saved weights
model_name = 'MelodyRNN_2'
save_path = os.path.join(root_dir, 'Saved Models', model_name)
model = MelodyRNN(hidden_size = 64, batch_size = 32, n_layers = 1).to(device)
# map_location = device lets a checkpoint saved on a GPU load on a CPU-only runtime
model.load_state_dict(torch.load(save_path, map_location = device))

In [None]:
# Draw one batch from the melody training loader; this rebinds `train`,
# which the priming cells below index into
train, target = next(iter(melody_train_loader))

In [None]:
# Scratch walk-through of a single sampling step with the loaded RNN.
# Takes sample 30 of the batch and timesteps 20:30 as the priming sequence
# (assumes `train` is batch x tracks x time x pitches - TODO confirm).
prime_seq = train[30, :, 20:30, :].to(device)
temperature = 8
hidden = model.init_hidden(batch_size = 1)
# Add a leading batch dimension of size 1
prime_seq = prime_seq.unsqueeze(0)
# Build up the hidden state
_, hidden = model(prime_seq, hidden)
# Last timestep of the priming sequence (NOTE: `input` shadows the Python builtin)
input = prime_seq[:, :, -1:, :]
# Make prediction - predicted is 128
scores, hidden = model(input, hidden)
# Normalise by the maximum score, then sharpen with the temperature before softmax
scores = scores / scores.max()
predicted_probs = F.softmax(scores * temperature, dim = 1)
# Drop very unlikely notes so they can never be sampled
predicted_probs[predicted_probs < 0.001] = 0.0
predicted_ids = torch.multinomial(predicted_probs, num_samples = 3)

In [None]:
# Display the sampled ids from the scratch sampling step
predicted_ids

In [None]:
# Display the first row of the sampled ids
predicted_ids[0]

In [None]:
# Code to evaluate the language model i.e. generate new music
# Old code that only generates a fixed number of notes per instrument at any time

def evaluate(net, prime_seq, predict_len = 100, temperature = 20):
    '''
    Generate `predict_len` further timesteps after a priming sequence, sampling
    two notes per step for the first track only (all other tracks stay silent).

    Arguments:
    net - recurrent model exposing init_hidden(batch_size) and
          net(x, hidden) -> (scores, hidden)
    prime_seq - priming sequence, n_tracks x seq_length x pitches; its first 128
                pitch columns are copied to the start of the output
    predict_len - number of timesteps to predict after the prime sequence
    temperature - multiplier applied to the max-normalised scores before the softmax

    Returns:
    predictions - 5 x (seq_length + predict_len) x 128 tensor holding the prime
                  sequence followed by the generated timesteps
    '''
    prime_len = prime_seq.size()[1]

    # Instantiate new tensor to store predicted sequences
    predictions = torch.zeros((5, predict_len + prime_len, 128)).to(device)

    # Set the start of the predicted seq to be the prime sequence
    predictions[:, 0:prime_len, :] = prime_seq[:, :, :128]

    curr_predict_id = prime_len

    # Generation needs no gradients; this also stops the hidden state from
    # accumulating an autograd graph across timesteps
    with torch.no_grad():
      hidden = net.init_hidden(batch_size = 1)

      # Build up the hidden state on the prime sequence, reshaped
      # from n_tracks x seq_length x pitches
      # to batch_size x n_tracks x seq_length x pitches
      _, hidden = net(prime_seq.unsqueeze(0), hidden)

      # First generation input is the last timestep of the prime sequence
      step_input = predictions[:, prime_len - 1, :]

      while curr_predict_id < predictions.size()[1]:
        # Forward pass of the trained NN - to get next predicted front
        step_input = step_input.unsqueeze(0).unsqueeze(2)
        scores, hidden = net(step_input, hidden)
        scores = scores / scores.max()

        predicted_probs = F.softmax(scores * temperature, dim = 1)
        # Drop very unlikely notes so they can never be sampled
        predicted_probs[predicted_probs < 0.001] = 0.0
        predicted_ids = torch.multinomial(predicted_probs, num_samples = 2)
        predicted = torch.zeros((5, 128)).to(device)
        # Only track 0 receives the sampled notes
        predicted[0, predicted_ids[0]] = 1
        step_input = predicted.clone()
        predictions[:, curr_predict_id, :] = predicted

        curr_predict_id += 1

    return predictions

In [None]:
# Prime sequence: sample 16 of the batch, timesteps 10:30, first 128 pitch columns
model.to(device)
prime_seq = train[16, :, 10:30, 0:128].to(device)
predictions = evaluate(model, prime_seq, predict_len = 100, temperature = 6)
# Unnormalize: scale by 127 and cast to int8 (127 is the largest int8 value)
predictions = (predictions * 127).type(torch.int8)

In [None]:
# NOTE(review): this runs before the cell below rebuilds piano_track from `predictions`,
# so on a top-to-bottom run it plots the piano_track left over from an earlier cell.
piano_track.plot()

In [None]:
# Convert predictions into the multitrack pianoroll
# Track order along dim 0 of `predictions`: piano, guitar, bass, strings, drums
piano_track = pypianoroll.StandardTrack(name = 'Piano', program = 0, is_drum = False, pianoroll = predictions[0, :, :].detach().cpu().numpy())
guitar_track = pypianoroll.StandardTrack(name = 'Guitar', program = 24, is_drum = False, pianoroll = predictions[1, :, :].detach().cpu().numpy())
bass_track = pypianoroll.StandardTrack(name = 'Bass', program = 32, is_drum = False, pianoroll = predictions[2, :, :].detach().cpu().numpy())
strings_track = pypianoroll.StandardTrack(name = 'Strings', program = 48, is_drum = False, pianoroll = predictions[3, :, :].detach().cpu().numpy())
drums_track = pypianoroll.StandardTrack(name = 'Drums', is_drum = True, pianoroll = predictions[4, :, :].detach().cpu().numpy())


generated_multitrack = pypianoroll.Multitrack(name = 'Generated', resolution = 2, tracks = [piano_track, guitar_track, bass_track, strings_track, drums_track])


# Reference values, presumably the repr of an original dataset multitrack - confirm:
#resolution=24, tempo=array(shape=(12000,), dtype=float64), downbeat=array(shape=(12000,), dtype=bool)
# Plot the generated multitrack
generated_multitrack.plot()

In [None]:
# Convert generated multitrack to pretty midi
generated_pm = pypianoroll.to_pretty_midi(generated_multitrack)
# Synthesise the MIDI with fluidsynth and embed an audio player in the output
generated_midi_audio = generated_pm.fluidsynth()
# NOTE(review): rate = 44100 presumably matches the fluidsynth() default sample rate - confirm
IPython.display.Audio(generated_midi_audio, rate = 44100)

In [None]:
# Code to evaluate the language model i.e. generate new music
# New code that takes in the encoding of how many notes at every time step

def evaluateWithNumber(net, prime_seq, predict_len = 100, temperature = 20):
    '''
    Generate `predict_len` further timesteps after a priming sequence, where the
    network also predicts how many notes each instrument plays at every step.

    Arguments:
    net - recurrent model exposing init_hidden(batch_size) and
          net(x, hidden) -> (scores, hidden), with scores reshapeable to 5 x 129
          (128 pitch scores plus one note-count feature per track)
    prime_seq - priming sequence, n_tracks x seq_length x 129; its first 128
                columns are copied to the start of the output
    predict_len - number of timesteps to predict after the prime sequence
    temperature - multiplier applied to the max-normalised scores before the softmax

    Returns:
    predictions - 5 x (seq_length + predict_len) x 128 tensor holding the prime
                  sequence followed by the generated timesteps
    predictions_n_notes - 5 x (seq_length + predict_len) tensor with the note-count
                          feature (notes / 2) of every generated timestep; the
                          prime positions stay 0
    '''
    prime_len = prime_seq.size()[1]

    # Instantiate new tensor to store predicted sequences: n_tracks x seq_length x pitches (128)
    predictions = torch.zeros((5, predict_len + prime_len, 128)).to(device)
    predictions_n_notes = torch.zeros((5, predict_len + prime_len))

    # Set the start of the predicted seq to be the prime sequence
    predictions[:, 0:prime_len, :] = prime_seq[:, :, :128]
    curr_predict_id = prime_len

    # Generation needs no gradients; this also stops the hidden state from
    # accumulating an autograd graph across timesteps
    with torch.no_grad():
      # Use the network passed in by the caller, not a notebook-level global
      hidden = net.init_hidden(batch_size = 1)

      prime_seq = prime_seq.unsqueeze(0)

      # Build up the hidden state
      _, hidden = net(prime_seq, hidden)

      # First generation input is the last timestep of the prime sequence: 1 x 5 x 1 x 129
      step_input = prime_seq[:, :, -1:, :]

      while curr_predict_id < predictions.size()[1]:

        scores, hidden = net(step_input, hidden)
        # scores arrive as 1 x 645 and are viewed as 5 tracks x 129 features
        scores = scores.view(5, 129)
        # The last feature encodes the note count; decode it into a whole number
        predicted_n_notes = (scores[:, -1] * 2 + 1).type(torch.int8)
        scores = scores[:, :-1]

        scores = scores / scores.max()
        predicted_probs = F.softmax(scores * temperature, dim = 1)
        # Drop very unlikely notes so they can never be sampled
        predicted_probs[predicted_probs < 0.001] = 0.0
        predicted = torch.zeros_like(scores)

        for i in range(5):
          instrument_predicted_n_notes = predicted_n_notes[i].item() #  Get number of predicted notes for that instrument
          if instrument_predicted_n_notes > 0:
            topk, indices = torch.topk(predicted_probs[i, :], instrument_predicted_n_notes * 2) # Twice the number of notes up for consideration (get the top and ignore the rest)
            # Halve the weight of the single most likely note before sampling
            topk[0] = topk[0] / 2
            instrument_predicted_indices_ids = torch.multinomial(topk, num_samples = instrument_predicted_n_notes) # Choose from the multinomial
            instrument_predicted_ids = torch.gather(indices, 0, instrument_predicted_indices_ids)
            predicted[i, instrument_predicted_ids] = 1 # Set the predicted notes ids to 1

        # Get number of notes for generated prediction
        new_predicted_n_notes = (predicted > 0).sum(axis = 1) / 2
        predictions_n_notes[:, curr_predict_id] = new_predicted_n_notes
        # Set next input to just generated prediction, with the note-count
        # feature appended as the 129th column
        step_input = torch.cat((predicted.clone(), new_predicted_n_notes.unsqueeze(1)), dim = 1).unsqueeze(0).unsqueeze(2)
        predictions[:, curr_predict_id, :] = predicted

        curr_predict_id += 1

    return predictions, predictions_n_notes

In [None]:
# Prime sequence: sample 14 of the batch, timesteps 30:80, all feature columns
# NOTE(review): evaluateWithNumber reshapes the model output to 5 x 129, so this
# presumably needs a batch/model trained with the extra note-count column - confirm
model.to(device)
prime_seq = train[14, :, 30:80, :].to(device)
predictions, predictions_n_notes = evaluateWithNumber(model, prime_seq, predict_len = 100, temperature = 10)
# Unnormalize: scale by 127 and cast to int8 (127 is the largest int8 value)
predictions = (predictions * 127).type(torch.int8)

In [None]:
# Prime sequence: sample 29 of the batch, timesteps 10:20, first 128 pitch columns
model.to(device)
prime_seq = train[29, :, 10:20, 0:128].to(device)
predictions = evaluate(model, prime_seq, predict_len = 200, temperature = 10)
# Unnormalize: scale by 127 and cast to int8 (127 is the largest int8 value)
predictions = (predictions * 127).type(torch.int8)

In [None]:
# Convert predictions into the multitrack pianoroll
# Track order along dim 0 of `predictions`: piano, guitar, bass, strings, drums
piano_track = pypianoroll.StandardTrack(name = 'Piano', program = 0, is_drum = False, pianoroll = predictions[0, :, :].detach().cpu().numpy())
guitar_track = pypianoroll.StandardTrack(name = 'Guitar', program = 24, is_drum = False, pianoroll = predictions[1, :, :].detach().cpu().numpy())
bass_track = pypianoroll.StandardTrack(name = 'Bass', program = 32, is_drum = False, pianoroll = predictions[2, :, :].detach().cpu().numpy())
strings_track = pypianoroll.StandardTrack(name = 'Strings', program = 48, is_drum = False, pianoroll = predictions[3, :, :].detach().cpu().numpy())
drums_track = pypianoroll.StandardTrack(name = 'Drums', is_drum = True, pianoroll = predictions[4, :, :].detach().cpu().numpy())


generated_multitrack = pypianoroll.Multitrack(name = 'Generated', resolution = 2, tracks = [piano_track, guitar_track, bass_track, strings_track, drums_track])


# Reference values, presumably the repr of an original dataset multitrack - confirm:
#resolution=24, tempo=array(shape=(12000,), dtype=float64), downbeat=array(shape=(12000,), dtype=bool)
# Plot the generated multitrack
generated_multitrack.plot()

In [None]:
# Convert generated multitrack to pretty midi
generated_pm = pypianoroll.to_pretty_midi(generated_multitrack)
# Synthesise the MIDI with fluidsynth and embed an audio player in the output
generated_midi_audio = generated_pm.fluidsynth()
# NOTE(review): rate = 44100 presumably matches the fluidsynth() default sample rate - confirm
IPython.display.Audio(generated_midi_audio, rate = 44100)