## Imports

In [1]:
import json
import glob
import itertools
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import OrderedDict

import librosa
from IPython.display import Audio as ipy_audio
from IPython.core.display import display

from quicktranscribe import tonic, pitch, wave, kde
from mogra import tonnetz
from mogra.datatypes import Swar, normalize_frequency, ratio_to_swar, SWAR_BOUNDARIES

In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch.utils.data import Dataset, DataLoader

## Tonnetz Graph

In [3]:
# Build the Tonnetz from a genus given by four factors of 3 and two factors of 5
genus_factors = [3] * 4 + [5] * 2
gs = tonnetz.EFGenus.from_list(genus_factors)
tn = tonnetz.Tonnetz(gs)

In [4]:
# Adjacency and equivalence matrices of the Tonnetz graph
adjac, equiv = tn.adjacency_matrix(), tn.equivalence_matrix()

Mock Bhoop Sample

In [5]:
# # Continuous Time Process (batch of 2)
# xx = np.array([
#     [0, 2, 2, 4, 4, 4, 4, 7, 7, 7, 4, 4, 2, 2, 0, 0, 9, 9, 9, 9, 7, 7, 0, 0, 4],
#     [7, 7, 4, 2, 0, 0, 0, 0, 2, 2, 4, 4, 7, 7, 9, 9, 9, 9, 7, 7, 4, 4, 2, 2, 0],
# ])
# # Labels (batch of 2)
# true_nodes = [(0,0), (1,0), (0,1), (-1,1), (-2,1)]
# true_node_indices = [tn.node_coordinates.index(ii) for ii in true_nodes]
# yy = np.zeros((len(tn.node_coordinates), 2), dtype=int)
# for ii in true_node_indices:
#     yy[ii, 0] = 1
#     yy[ii, 1] = 1

## Pitch Tracked Data

Util Functions

In [6]:
def read_sample_and_tonic(track_path, start_min=5, end_min=10):
    """Load a mono excerpt of a recording together with its tonic.

    Args:
        track_path: Path of the track without extension; ".ctonic.txt" and
            ".mp3" are appended to locate the tonic file and the audio file.
        start_min: Start of the excerpt in minutes.
        end_min: End of the excerpt in minutes.

    Returns:
        Tuple `(y_sample, sr, ctonic)`: the mono signal, the sampling rate
        returned by `wave.read_audio_section`, and the value returned by
        `tonic.read_tonic`.
    """
    # the audio reader is given whole seconds
    start_sec = int(start_min * 60)
    end_sec = int(end_min * 60)

    ctonic = tonic.read_tonic(track_path + ".ctonic.txt")
    y_stereo, sr = wave.read_audio_section(track_path + ".mp3", start_sec, end_sec)

    # transpose before mixing down to mono
    return librosa.to_mono(y_stereo.T), sr, ctonic

In [7]:
def preprocess(track):
    """Downsample a pitch track, drop undefined frames and quantise to integers.

    Args:
        track: 1-D numpy array of pitch values; NaN marks frames to discard.

    Returns:
        1-D integer numpy array: every 16th frame of `track`, with NaNs
        removed and the remaining values rounded to the nearest integer.
    """
    # keep every 16th frame
    decimated = track[::16]
    # discard frames without a pitch estimate
    is_valid = np.logical_not(np.isnan(decimated))
    voiced = decimated[is_valid]
    # quantise to the nearest integer
    return np.round(voiced).astype(int)

Read Data

In [8]:
# Directory (relative path) searched for the "*.mp3" recordings and their ".ctonic.txt" tonic files.
# The trailing slash matters: the glob pattern is built by plain string concatenation.
DATA_DIR = "concrete-demo/"

In [9]:
# Artist name used in the glob pattern that locates each raag's recording
artist = "AjoyChakrabarty"
# raag name -> list of (start_min, end_min) excerpts to pitch-track
raags_and_times = {
    # earlier setup: six separate 2-minute excerpts per raag
    # "Bhoop": [(4,6), (7,9), (10,12), (13,15), (16,18), (19,21)],
    # "Deshkar": [(4,6), (7,9), (10,12), (13,15), (16,18), (19,21)],
    "Bhoop": [(3,21)],
    "Deshkar": [(3,21)],
}
# TODO: convert to Tonnetz coordinates
# raag name -> ground-truth ratios (presumably frequency ratios relative to the tonic — confirm)
raag_gt_ratios = {
    "Bhoop": [1, 10/9, 5/4, 3/2, 5/3],
    "Deshkar": [1, 9/8, 81/64, 3/2,  27/16],
    "Yaman": [1, 9/8, 5/4, 45/32, 3/2, 27/16, 15/8],
}
# raag name -> ground-truth Tonnetz node coordinates; these are looked up in tn.node_coordinates
# to build the labels. NOTE(review): coordinates look like (power of 3, power of 5) — confirm.
# "Yaman" has ground truth here but no entry in raags_and_times.
raag_gt_nodes = {
    "Bhoop": [(0,0), (1,0), (0,1), (-1,1), (-2,1)],
    "Deshkar": [(0,0), (1,0), (2,0), (3,0), (4,0)],
    "Yaman": [(0,0), (1,0), (2,0), (1,1), (2,1), (0,1), (3,0)],
}

In [None]:
# Pitch-track every configured excerpt.
# seqs maps raag name -> list of preprocessed pitch tracks (one per excerpt).
seqs = OrderedDict((raag, []) for raag in raags_and_times)
for raag, excerpts in raags_and_times.items():
    pattern = DATA_DIR + f"*{raag}*{artist}.mp3"
    # glob returns files in arbitrary order; sort so the same file is picked on every run
    matches = sorted(glob.glob(pattern))
    if not matches:
        # fail with the offending pattern instead of a bare IndexError
        raise FileNotFoundError(f"no recording matches {pattern!r}")
    track_mp3 = matches[0]
    # strip the ".mp3" extension; the reader appends its own extensions
    track_path = track_mp3[:-len(".mp3")]
    for start_min, end_min in tqdm(excerpts, desc=raag):
        y_sample, sr, ctonic = read_sample_and_tonic(track_path, start_min, end_min)
        ftrack = pitch.track_pitch_pyin(y_sample, sr, ctonic)
        seqs[raag].append(preprocess(ftrack))

## TGNN

In [11]:
from mogra.tgnn import TemporalGNN, swarwise_loss

Parameters

In [12]:
# Model and batching hyper-parameters shared by the data preparation, dataset and model cells
num_nodes = len(tn.node_coordinates)  # Number of nodes in the Tonnetz graph
input_dim = 1  # Feature dimension (size of the last axis of the one-hot input)
hidden_dim = 16  # Hidden layer size
num_classes = 12  # Number of equivalence classes
num_time_steps = 25  # Number of time steps per training window
batch_size = 4  # Number of samples in each batch

Prepare Data

In [13]:
# Convert the graph matrices to long tensors.
# This cell rebinds `adjac`/`equiv`, so on a re-run the inputs are already tensors;
# torch.as_tensor accepts both arrays and tensors, whereas torch.tensor(tensor) emits
# a copy-construct UserWarning.
adjac = torch.as_tensor(adjac, dtype=torch.long)
equiv = torch.as_tensor(equiv, dtype=torch.long)

In [14]:
# Stack all sequences into one array of shape (num_samples, sample_length).
# Every sequence in `seqs` (not just two hard-coded raags) is truncated to the
# shortest one so the rows have equal length.
all_seqs = [seq for raag_seqs in seqs.values() for seq in raag_seqs]
min_length = min(len(seq) for seq in all_seqs)
xx = np.array([seq[:min_length] for seq in all_seqs])

# xx holds one node index per time step; expand it to a one-hot tensor of shape
# (num_samples, num_nodes, sample_length, input_dim).
# Allocating float32 directly avoids a float64 intermediate twice the size.
xseq = np.zeros((xx.shape[0], num_nodes, xx.shape[1], input_dim), dtype=np.float32)
sample_idx = np.arange(xx.shape[0])[:, None]  # broadcasts along the time axis
time_idx = np.arange(xx.shape[1])[None, :]    # broadcasts along the sample axis
# one vectorised assignment sets xseq[i, xx[i, j], j, 0] = 1 for every (i, j)
xseq[sample_idx, xx, time_idx, 0] = 1
xseq = torch.tensor(xseq, dtype=torch.float32)

In [15]:
def tn_indices(nodes):
    """Return a 0/1 vector over all Tonnetz nodes that marks the given nodes.

    Args:
        nodes: Iterable of node coordinates, each present in `tn.node_coordinates`.

    Returns:
        Integer numpy array of length `len(tn.node_coordinates)` with 1 at the
        position of every node in `nodes` and 0 elsewhere.

    Raises:
        ValueError: If a node is not in `tn.node_coordinates`.
    """
    tni = np.zeros(len(tn.node_coordinates), dtype=int)
    for node in nodes:
        tni[tn.node_coordinates.index(node)] = 1
    return tni

# one multi-hot label row per sequence, repeated for every excerpt of the raag
yy = np.array([tn_indices(raag_gt_nodes[raag]) for raag, values in seqs.items() for _ in values])

# we want to stratify yy labels into num_classes classes, with each equivalence class having one (or no) labels
yt = torch.tensor(yy, dtype=torch.long)
# -1 marks "no node of this equivalence class belongs to the raag"
yraag = torch.full((yt.shape[0], num_classes), -1, dtype=torch.long)
for ii, mask in enumerate(equiv.T):
    # keep only the labelled nodes that belong to equivalence class ii
    masked = yt * mask
    has_label = masked.sum(dim=1) > 0
    # index of the first labelled node in this class, for the samples that have one
    yraag[has_label, ii] = masked.argmax(dim=1)[has_label]

In [None]:
# Sanity check: show the shapes of the one-hot inputs and the per-class labels
print(xseq.shape, yraag.shape)

In [17]:
# Dataset:
# the input holds num_samples recordings, each of shape num_nodes x sample_length x input_dim.
# Each recording is cut into consecutive windows of num_time_steps frames, so that a DataLoader
# yields "x"s of batch_size x num_nodes x num_time_steps x input_dim and "y"s of batch_size x num_classes.

class SeqDataset(Dataset):
    """Fixed-length, non-overlapping windows over a batch of long sequences.

    Item `idx` is window `idx % seqs_per_sample` of sample `idx // seqs_per_sample`.
    Frames beyond the last full window of a sample are never returned.
    """

    def __init__(self, xseq, yraag, num_time_steps):
        """Store the data and work out how many full windows fit in a sample.

        Args:
            xseq: Tensor indexed as (sample, node, time, feature).
            yraag: Per-sample labels, indexed by sample along the first axis.
            num_time_steps: Window length along the time axis.
        """
        self.xseq, self.yraag = xseq, yraag
        self.num_time_steps = num_time_steps
        self.seqs_per_sample = xseq.shape[2] // num_time_steps

    def __len__(self):
        """Total number of windows across all samples."""
        num_samples = self.xseq.shape[0]
        return num_samples * self.seqs_per_sample

    def __getitem__(self, idx):
        """Return `(x, y)`: the idx-th window and the label of its parent sample."""
        sample_idx, window_idx = divmod(idx, self.seqs_per_sample)
        start = window_idx * self.num_time_steps
        stop = start + self.num_time_steps
        return self.xseq[sample_idx, :, start:stop, :], self.yraag[sample_idx]

dataset = SeqDataset(xseq, yraag, num_time_steps)

# 80/20 train/validation split. The generator is seeded so the split is the same
# on every run; an unseeded random_split gives a different split each time.
# NOTE(review): windows cut from the same recording land in both splits — confirm this is acceptable.
SPLIT_SEED = 0
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# shuffle only the training windows; keep validation order fixed
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Instantiate model and run

In [18]:
# Build the temporal GNN and the Adam optimiser that trains it
model = TemporalGNN(input_dim, hidden_dim, num_classes, num_time_steps)
learning_rate = 0.01
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Train for a fixed number of epochs, recording the mean train/validation loss per epoch.
NUM_EPOCHS = 10
train_losses = []
val_losses = []
for epoch in range(NUM_EPOCHS):
    # --- training pass ---
    model.train()
    train_loss = 0.0
    num_train_batches = 0
    for x, y in train_dataloader:
        # the loss is called with the fixed batch_size, so a trailing partial batch is skipped
        if x.shape[0] != batch_size:
            continue
        # reset gradients for every batch; zeroing only once per epoch makes each
        # step use the sum of the gradients of all earlier batches in the epoch
        optimizer.zero_grad()
        out = model(x, adjac, equiv)
        loss = swarwise_loss(out, y, batch_size)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        num_train_batches += 1
    # average over the batches actually used (skipped batches contribute no loss)
    train_losses.append(train_loss / max(num_train_batches, 1))

    # --- validation pass: no parameter updates, no autograd graph ---
    model.eval()
    val_loss = 0.0
    num_val_batches = 0
    with torch.no_grad():
        for x, y in val_dataloader:
            if x.shape[0] != batch_size:
                continue
            out = model(x, adjac, equiv)
            loss = swarwise_loss(out, y, batch_size)
            val_loss += loss.item()
            num_val_batches += 1
    val_losses.append(val_loss / max(num_val_batches, 1))
    

In [None]:
# Per-epoch training and validation loss curves on one set of axes
fig, ax = plt.subplots()
for curve, name in ((train_losses, "train"), (val_losses, "val")):
    ax.plot(curve, label=name)
ax.legend()
plt.show()

In [21]:
# len(train_dataloader)

Confusion Matrix

In [65]:
def get_preds(out_scores, threshold):
    """Turn model scores into one predicted index per equivalence class.

    The scores are viewed as `batch * 12` rows. For each row the argmax index
    is taken; rows whose maximum score is below `threshold` get -1 instead.

    Args:
        out_scores: Score tensor whose first dimension is the batch and whose
            element count is divisible by `batch * 12`.
        threshold: Minimum top score for a row to yield a prediction.

    Returns:
        Long tensor of shape (batch, 12) holding argmax indices or -1.
    """
    n_batch = out_scores.shape[0]
    # one row of scores per (sample, class) pair
    flat_scores = out_scores.view(n_batch * 12, -1)
    winners = flat_scores.argmax(dim=1)
    top_scores = flat_scores.max(dim=1).values
    # rows whose best score is too low count as "no prediction"
    winners[top_scores < threshold] = -1
    return winners.view(n_batch, 12)

In [69]:
# Gather flattened predictions (score threshold 0.5) and true labels over the validation set
labels, preds = [], []
for x, y in val_dataloader:
    out = model(x, adjac, equiv)
    batch_preds = get_preds(out, 0.5)
    preds.extend(batch_preds.flatten().detach().numpy())
    labels.extend(y.flatten().detach().numpy())

In [None]:
# Plot confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# NOTE(review): only every second label (-1, 1, 3, ..., 43) is tabulated — confirm this is intended
cm_labels = np.arange(-1, 45, 2)
cm = confusion_matrix(labels, preds, labels=cm_labels)

fig, ax = plt.subplots(figsize=(10,7))
# colour scale is capped at 15 so the dominant cells do not wash out the rest
sns.heatmap(cm, annot=True, fmt='d', vmax=15, xticklabels=False, yticklabels=False, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Truth')
plt.show()


**Open questions (TODO):** Why doesn't training on more data work? Why doesn't a different batch size work?

In [26]:
# x.shape, y.shape

In [23]:
# edges_b = adjac.unsqueeze(0).repeat(batch_size, 1, 1)
# node_features_t = x[:, :, 0, :]
# node_embeddings_t = F.relu(model.gcn1(node_features_t, edges_b))

In [24]:
# model.gcn1(node_features_t[:2,:,:], edges_b[:2,:,:])
# model.gcn1(node_features_t[:3,:,:], edges_b[:3,:,:])

In [25]:
# from torch_geometric.nn import GCNConv
# gcn1 = GCNConv(input_dim, hidden_dim)
# model.gcn1(node_features_t[:3,:,:], edges_b[:3,:,:])