In [1]:
import ast
import random
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset, random_split
from sklearn.model_selection import KFold

In [15]:
import pandas as pd

# Load parts with durations
part1 = pd.read_csv("data_processed/data_part1.csv")
part2 = pd.read_csv("data_processed/data_part2.csv")
reconstructed_dataset = pd.read_csv("data_processed/reconstructed_ordered_events.csv")
# Merge them vertically
duration_df = pd.concat([part1, part2], ignore_index=True)


In [16]:
print("Sequences:", len(reconstructed_dataset))
print("Durations:", len(duration_df))

# Optional: check a few row lengths
for i in range(5):
    print(f"Seq {i} = {len((reconstructed_dataset['sequence'][i]))}, Dur = {len(eval(duration_df['durations'][i]))}")


Sequences: 2775
Durations: 2775
Seq 0 = 17989, Dur = 2429
Seq 1 = 17256, Dur = 1813
Seq 2 = 11722, Dur = 1248
Seq 3 = 13755, Dur = 1379
Seq 4 = 11598, Dur = 1357


In [18]:
combined_df = reconstructed_dataset.copy()
combined_df['durations'] = duration_df['durations']


In [19]:
combined_df.to_csv("data_processed/reconstructed_with_durations.csv", index=False)


In [24]:
class Vocabulary:
    def __init__(self, reconstructed_df):
        """
        Build vocabulary of unique (note/chord, duration) string tokens.
        """
        self.token_set = set()

        for i in range(len(reconstructed_df)):
            sequence = reconstructed_df['sequence'][i]
            durations = reconstructed_df['durations'][i]

            if isinstance(sequence, str):
                sequence = eval(sequence)
            if isinstance(durations, str):
                durations = eval(durations)

            assert len(sequence) == len(durations)

            for event, duration in zip(sequence, durations):
                if isinstance(event, list):
                    note_part = "-".join(map(str, sorted(event)))
                else:
                    note_part = str(event)
                token = f"{note_part}_{duration}"
                self.token_set.add(token)

        self.tokens = sorted(self.token_set)
        self.token_to_idx = {token: idx for idx, token in enumerate(self.tokens)}
        self.idx_to_token = {idx: token for idx, token in enumerate(self.tokens)}
        self.vocab_size = len(self.tokens)

    def encode_event(self, token):
        """Convert token string to index"""
        return self.token_to_idx[token]

    def decode_event(self, idx):
        """Convert index back to token string"""
        return self.idx_to_token[idx]

    def __len__(self):
        return self.vocab_size


In [20]:
import numpy as np
import torch
from torch.utils.data import Dataset

class MusicEventDataset(Dataset):
    def __init__(self, reconstructed_df, vocab, seq_length=50):
        """
        Constructs (input_seq, target_token) pairs from note+duration tokens.

        Args:
            reconstructed_df: DataFrame with 'sequence' and 'durations' columns
            vocab: Vocabulary object with encode_event(token: str) -> int
            seq_length: Length of input sequence
        """
        self.samples = []
        self.seq_length = seq_length
        self.vocab = vocab

        for i in range(len(reconstructed_df)):
            sequence = reconstructed_df['sequence'][i]
            durations = reconstructed_df['durations'][i]

            if isinstance(sequence, str):
                sequence = eval(sequence)
            if isinstance(durations, str):
                durations = eval(durations)

            assert len(sequence) == len(durations), "Length mismatch"

            n_events = len(sequence)
            if n_events <= seq_length:
                continue

            for j in range(n_events - seq_length):
                input_tokens = []
                for e, d in zip(sequence[j:j+seq_length], durations[j:j+seq_length]):
                    token = self.tokenize_combined(e, d)
                    input_tokens.append(self.vocab.encode_event(token))

                target_event = sequence[j + seq_length]
                target_duration = durations[j + seq_length]
                target_token = self.tokenize_combined(target_event, target_duration)
                target_encoded = self.vocab.encode_event(target_token)

                self.samples.append((input_tokens, target_encoded))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        input_seq, target_token = self.samples[idx]

        input_tensor = torch.tensor(input_seq, dtype=torch.long)
        target_tensor = torch.tensor(target_token, dtype=torch.long)

        return input_tensor, target_tensor

    def tokenize_combined(self, event, duration):
        """
        Convert event + duration into a consistent string token, e.g. "45-46_0.25"
        """
        if isinstance(event, list):
            note_part = "-".join(map(str, sorted(event)))
        else:
            note_part = str(event)
        return f"{note_part}_{duration}"


In [25]:
df = pd.read_csv("data_processed/reconstructed_with_durations.csv")

vocab = Vocabulary(df)

dataset = MusicEventDataset(df, vocab=vocab, seq_length=16)

x, y = dataset[0]

print("Input sequence shape:", x.shape)
print("Next event shape:", y.shape)
print("Input sequence (multi-hot vectors):", x)
print("Next event (multi-hot vector):", y)


Input sequence shape: torch.Size([16])
Next event shape: torch.Size([])
Input sequence (multi-hot vectors): tensor([423892,  55161,  55080, 154382, 154382, 154381, 124192, 154381, 154382,
         55086, 288403, 124192, 154381, 361715, 170947, 170947])
Next event (multi-hot vector): tensor(53144)


In [33]:
from tensorflow.keras import Model
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf


class VAE(Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = tf.keras.metrics.Mean(name="loss")
        self.reconstruction_loss_tracker = tf.keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss")

    def train_step(self, data):
        if isinstance(data, tuple):
            data = data[0]  # discard labels if passed as (x, y)

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)

            # Compute reconstruction loss
            reconstruction_loss = tf.reduce_sum(
                sparse_categorical_crossentropy(data, reconstruction), axis=1
            )

            # Compute KL divergence
            kl_loss = -0.5 * tf.reduce_sum(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1
            )

            total_loss = tf.reduce_mean(reconstruction_loss + kl_loss)

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)

        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

    @property
    def metrics(self):
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]


In [34]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

vocab_size = len(vocab)
embedding_dim = 128
seq_length = 16
latent_dim = 32

# Encoder
encoder_input = Input(shape=(seq_length,))
x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(encoder_input)  # shape: (batch, seq_length, embedding_dim)
x = Flatten()(x)
z_mean = Dense(latent_dim)(x)
z_log_var = Dense(latent_dim)(x)

def sampling(args):
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling)([z_mean, z_log_var])

encoder = Model(encoder_input, [z_mean, z_log_var, z])


In [35]:
from tensorflow.keras.layers import Reshape, TimeDistributed

decoder_input = Input(shape=(latent_dim,))
x = Dense(seq_length * embedding_dim, activation="relu")(decoder_input)
x = Reshape((seq_length, embedding_dim))(x)
decoder_output = TimeDistributed(Dense(vocab_size, activation="softmax"))(x)

decoder = Model(decoder_input, decoder_output)


In [36]:
from tensorflow.keras.layers import Reshape, TimeDistributed

decoder_input = Input(shape=(latent_dim,))
x = Dense(seq_length * embedding_dim, activation="relu")(decoder_input)
x = Reshape((seq_length, embedding_dim))(x)
decoder_output = TimeDistributed(Dense(vocab_size, activation="softmax"))(x)

decoder = Model(decoder_input, decoder_output)


In [38]:

vae = VAE(encoder, decoder)
vae.compile(optimizer="adam")
vae.fit(x_train, epochs=20, batch_size=64)


NameError: name 'x_train' is not defined