In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import numpy as np
from chord_progressions.type_templates import get_template_from_template_str, TYPE_TEMPLATES
from chord_progressions.utils import is_circular_match
from torch.utils.data import DataLoader
from tqdm import tqdm

from rhythmic_relationships.data import PairDataset

## Define the vocabulary

In [2]:
# Add silence
silence_template = '000000000000'
TYPE_TEMPLATES.update({'silence': silence_template})

# Rename out-of-vocab token and use silence to fill it
if '' in TYPE_TEMPLATES:
    del TYPE_TEMPLATES['']
TYPE_TEMPLATES['oov'] = silence_template

vocab_size = len(TYPE_TEMPLATES)
print(f'Vocab size: {vocab_size}')

inv_type_templates = {v: k for k, v in TYPE_TEMPLATES.items()}
stoi = {s:i for i,s in enumerate(inv_type_templates)}
itos = {i:s for s,i in stoi.items()}
itot = {ix:i for ix, i in enumerate(list(TYPE_TEMPLATES))}

def get_type_from_template(template):
    for chord_type in list(TYPE_TEMPLATES):
        if is_circular_match(
            template,
            get_template_from_template_str(TYPE_TEMPLATES[chord_type]),
        ):
            return chord_type

    return "oov" # out of vocabulary

def pclist_to_i(pclist):
    """Gets a chord type index from a pitch class list
    e.g. pclist_to_i([1,0,0,0,0,0,0,0,0,0,0,0]) -> 1
    """
    chord_type = get_type_from_template(pclist)
    template_str = TYPE_TEMPLATES[chord_type]
    return stoi[template_str]

Vocab size: 193


## Build the dataset

In [3]:
dataset_config = {
    "dataset_name": "babyslakh_20_1bar_4res",
    "part_1": 'Guitar',
    "part_2": 'Piano',
    "repr_1": "chroma",
    "repr_2": "chroma",
}

data = PairDataset(**dataset_config)
loader = DataLoader(data, batch_size=1)

# How many chords types do we use to predict the next one?
# DO NOT CHANGE until the code for constructing the context is written more generally
context_length = 3

X, Y = [], []

silence_tensor = torch.tensor(list(map(int, silence_template))).reshape((1, 12)).float()

for x, y in tqdm(loader):
    # Binarize the chromas
    x = (x > 1).to(torch.int32)[0]
    y = (y > 1).to(torch.int32)[0]

    # fill context with silence
    context = [stoi[TYPE_TEMPLATES['silence']]] * context_length

    for xrow, yrow in zip(x, y):
        ixx = pclist_to_i(xrow.tolist())
        ixy = pclist_to_i(yrow.tolist())
        # print(','.join(itot[i] for i in context), '-->', itot[ixy])

        X.append(context)
        Y.append(ixy)

        # Hard-code context to length 3 for now
        # TODO: construct the context more generally
        if len(Y[-2:]) < 2:
            context = context[1:] = [Y[-1]] + [ixx]
            continue
        context = Y[-2:] + [ixx] # the previous 2 Ys and the current X

        # Generally, I think we want to hold an even amount of each, but if the context has an odd length, hold an extra y.
        #   e.g. for context length c
        #     n_ys = c // 2 if c % 2 == 0 else c // 2 + 1
        #     n_xs = c // 2


X = torch.tensor(X)
Y = torch.tensor(Y)
n_examples = X.nelement()
print(f'{n_examples=}')

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1412/1412 [00:19<00:00, 71.61it/s]

n_examples=67776





In [4]:
X.shape, X.dtype, Y.shape, Y.dtype

(torch.Size([22592, 3]), torch.int64, torch.Size([22592]), torch.int64)

In [27]:
# Bengio et al has a vocab size of 17k and they embed them in a 30-dimensional space
# In our case we have a vocab size of only 193, so we can use a much smaller space, even 2D
embedding_dims = 2
embedding_size = context_length * embedding_dims

g = torch.Generator().manual_seed(73709238413)

# Initialize an embeddings vector randomly
C = torch.randn((vocab_size, embedding_dims), generator=g)

num_neurons = 100
W1 = torch.randn((embedding_size, num_neurons), generator=g)
b1 = torch.randn(num_neurons, generator=g)
W2 = torch.randn((num_neurons, vocab_size), generator=g)
b2 = torch.randn(vocab_size, generator=g)
parameters = [C, W1, b1, W2, b2]
for p in parameters:
    p.requires_grad = True
print(f'num total params: {sum(p.nelement() for p in parameters)}')

num total params: 20579


In [29]:
# Forward pass
emb = C[X]
h = torch.tanh(emb.view(-1, embedding_size) @ W1 + b1)
logits = h @ W2 + b2
loss = F.cross_entropy(logits, Y)

# These three commented lines are equal to the single F.cross_entropy line above
# counts = logits.exp()
# prob = counts / counts.sum(1, keepdims=True)
# loss = -prob[torch.arange(X.shape[0]), Y].log().mean()

loss

tensor(29.9330, grad_fn=<NllLossBackward0>)