In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import IPython.display as ipd
import pyximport
pyximport.install()
%load_ext Cython

import sigkernel as ksig
from utils.midi import *
from utils.data import *
from model.generators import *

In [2]:
hist_len = 50
sample_len = 100
seq_dim = 4
scale = 20.
stride = 10

In [3]:
# path = './data/maestro-v3.0.0_midi/2018/'
path = './data/midi/'
dfs = get_dfs_from_midi(path, min_notes=50)

In [8]:
class MIDIDataset(Dataset):
    '''
    Dataset for dataframes with MIDI data: 5 columns (start time, end time, pitch, velocity, instrument) in this order
    '''
    def __init__(self, dfs: list[pd.DataFrame], sample_len: int, cols: list[int]=[0,1,2,3], scale: float=1., stride: int=1):

        self.seq_dim = len(cols)
        self.cols = cols
        assert 0 in cols and 1 in cols, 'start time and end time column must be included'
        assert 2 in cols, 'pitch column must be included'
        self.sample_len = sample_len
        self.scale = scale
        self.stride = stride

        self.tensors = []
        self.lens = []
        for df in dfs:
            if len(df) >= sample_len:
                rectilinear_path = rectilinear_transform(df, include_velocity=(3 in cols))
                tensor = torch.tensor(rectilinear_path, dtype=torch.float32, requires_grad=False)
                tensor[:,1:] = tensor[:,1:] / scale # scale pitch and velocity which are integers from 0 to 127
                self.tensors.append(tensor)
                self.lens.append(int((tensor.shape[0] - self.sample_len)/self.stride) + 1)
        self.lens = np.cumsum(self.lens)
        self.len = self.lens[-1]

In [9]:
dataset = MIDIDataset(dfs, sample_len, cols=[0,1,2], scale=scale, stride=stride)
dataloader = DataLoader(dataset, batch_size=50, shuffle=True, num_workers=0, drop_last=True)

TypeError: object of type 'MIDIDataset' has no len()

In [None]:
Fs = 22050
audio_data = midi_data.synthesize(fs=Fs)
ipd.Audio(audio_data, rate=Fs)

In [3]:
generator = TransformerMusic(seq_dim, sample_len, hist_len, scale=scale,
                             kernel_size=5, stride=1, n_channels=16, n_head=4, n_transformer_layers=1, hidden_size=128, activation='GELU')
generator = generator.cuda()
optimizer = torch.optim.Adam(generator.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5, verbose=True)

In [4]:
static_kernel = ksig.static.kernels.RationalQuadraticKernel(sigma=0.1)
kernel = ksig.kernels.SignatureKernel(n_levels=5, order=5, normalization=0, static_kernel=static_kernel, device_ids=None)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for epoch in range(30):
    losses = [] # due to legacy code, losses is actually the mmd
    for batch_num, X in enumerate(tqdm(dataloader)):
        X = X.to(device)

        output = generator(X)
        X_wo_hist = X[:, hist_len:, :]

        # compute loss
        optimizer.zero_grad()
        loss = ksig.tests.mmd_loss_no_compile(X_wo_hist, output, kernel)
        losses.append(loss.item())

        # backpropagate and update weights
        loss.backward()
        optimizer.step()

    # log epoch loss and plot generated samples
    epoch_loss = np.average(losses) # average batch mmd for epoch
    scheduler.step(epoch_loss)
    print(f'Epoch {epoch}, loss: {epoch_loss}')

In [None]:
for x in dataloader:
    x = x.to(device)
    output = generator(x)
    break

In [None]:
i = 0
x_np = x[i].cpu().detach().numpy()
x_np[:, 2:] = np.round(x_np[:, 2:] * scale)
output_np = output[i].cpu().detach().numpy()
output_np[:, 2:] = np.round(output_np[:, 2:] * scale)
df_x = pd.DataFrame(x_np, columns=['Start', 'End', 'Pitch', 'Velocity'])
df_output = pd.DataFrame(output_np, columns=['Start', 'End', 'Pitch', 'Velocity'])

In [None]:
df_input = df_x.iloc[hist_len:]
df_input.iloc[:,:2] = df_input.iloc[:,:2] - df_input.iloc[0,0]
df_input

In [None]:
df_output.iloc[:,:2] = df_output.iloc[:,:2] - df_output.iloc[0,0]
df_output

In [None]:
input_midi = df_to_midi(df_input)
output_midi = df_to_midi(df_output)

In [None]:
Fs = 22050
audio_data = input_midi.synthesize(fs=Fs)
ipd.Audio(audio_data, rate=Fs)

In [None]:
audio_data = output_midi.synthesize(fs=Fs)
ipd.Audio(audio_data, rate=Fs)