In [1]:
import ast
import random
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset, random_split
from sklearn.model_selection import KFold

## Import Dataset and Definition of Useful functions

### Import Dataset

In [4]:
def safe_parse_all_columns_df(df):
    """
    Parse the stringified list columns of a song DataFrame into Python objects.

    The columns 'notes', 'chords', 'velocities', 'durations', 'offsets' and
    'ordered_events' are converted with ``ast.literal_eval`` wherever a cell
    holds a string; cells that are not strings are left untouched.

    Args:
        df: DataFrame containing all six columns listed above.

    Returns:
        The same DataFrame object (it is modified in place).

    Raises:
        KeyError: if one of the expected columns is missing.
        ValueError, SyntaxError: if a string cell is not a valid Python literal.
    """
    list_columns = ('notes', 'chords', 'velocities', 'durations', 'offsets', 'ordered_events')

    def parse_cell(cell):
        # Only strings need parsing; already-parsed values pass through unchanged.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    for column in list_columns:
        df[column] = df[column].apply(parse_cell)
    return df

def load_dataframe_from_two_csvs(file1, file2):
    """
    Read two CSV parts, stack them (file1 rows first) and parse the list columns.

    The concatenated frame gets a fresh 0..n-1 index and is passed through
    ``safe_parse_all_columns_df`` before being returned.
    """
    parts = [pd.read_csv(path) for path in (file1, file2)]
    stacked = pd.concat(parts, ignore_index=True)
    return safe_parse_all_columns_df(stacked)

def save_dataframe_to_two_csvs(df, file1, file2):
    """
    Write the first ``len(df) // 2`` rows to ``file1`` and the remaining rows to ``file2``.

    Neither file contains the DataFrame index.
    """
    split_at = len(df) // 2
    first_rows, remaining_rows = df.iloc[:split_at], df.iloc[split_at:]
    first_rows.to_csv(file1, index=False)
    remaining_rows.to_csv(file2, index=False)

def load_dataframe_from_one_csv(file):
    """
    Read ``file`` with ``pandas.read_csv`` and return the resulting DataFrame as-is
    (no column parsing is applied).
    """
    return pd.read_csv(file)

def save_dataframe_to_one_csv(df, file):
    """
    Write ``df`` to ``file`` as CSV, including the index as the first column.
    """
    df.to_csv(path_or_buf=file, index=True)

def load_reconstructed_events(file):
    """
    Read the reconstructed-events CSV and turn its 'sequence' column into real lists.

    Every cell of 'sequence' is parsed with ``ast.literal_eval``. Inside the
    parsed list, nested lists (chords) become lists of ``int`` and every other
    element (single notes) becomes an ``int``.

    Raises:
        Whatever parsing/conversion raised for the offending cell (e.g.
        ValueError when the parsed value is not a list); the raw cell is
        printed before the exception is re-raised.
    """

    def to_int_event(item):
        # Chords are lists of pitches; anything else is treated as a single pitch.
        return [int(pitch) for pitch in item] if isinstance(item, list) else int(item)

    def safe_parse(seq_str):
        try:
            events = ast.literal_eval(seq_str)
            if not isinstance(events, list):
                raise ValueError("Parsed sequence is not a list")
            return [to_int_event(item) for item in events]
        except Exception as e:
            print(f"Error parsing sequence: {seq_str}")
            raise e

    events_df = pd.read_csv(file)
    events_df['sequence'] = events_df['sequence'].apply(safe_parse)
    return events_df

In [5]:
# Location of the pre-processed dataset, stored as two CSV parts.
root = 'data_processed/'
file1 = f'{root}data_part1.csv'
file2 = f'{root}data_part2.csv'

# Concatenate both parts and parse the stringified list columns.
df = load_dataframe_from_two_csvs(file1, file2)

### Useful functions

In [6]:
def parse_chord_to_list(chord):
    """
    Convert a comma-separated chord string (e.g. "60,64,67") to a list of integers.

    Tokens are stripped of surrounding whitespace before being checked, so
    "60, 64" yields [60, 64]. Tokens that are not made of digits only (empty
    tokens, signs, letters) are skipped.

    Args:
        chord: the chord as a string; any other type is treated as "no chord".

    Returns:
        List of ints parsed from the string, or [] when ``chord`` is not a string.
    """
    if not isinstance(chord, str):
        return []
    tokens = (token.strip() for token in chord.split(','))
    return [int(token) for token in tokens if token.isdigit()]

In [7]:
def reconstruct_ordered_events(df):
    """
    Reconstruct the ordered list of events (notes and chords) for each song.

    For every row, 'ordered_events' is walked in order: each 'n' consumes the
    next entry of that row's 'notes' and each 'c' consumes the next entry of
    that row's 'chords'. The chord entries are appended exactly as stored.

    Args:
        df: DataFrame with 'ordered_events', 'notes' and 'chords' columns whose
            cells are already parsed into Python sequences.

    Returns:
        DataFrame with a single 'sequence' column (one list per song) and an
        index named 'index'.

    Raises:
        ValueError: if 'ordered_events' contains a marker other than 'n' or 'c'.
        IndexError: if a row has more 'n'/'c' markers than notes/chords.
    """
    sequences = []

    # Iterate over values rather than labels so the function does not depend on
    # the frame having a default 0..n-1 index.
    for ordered_events, notes, chords in zip(df['ordered_events'], df['notes'], df['chords']):
        idx_note = 0
        idx_chord = 0
        reconstructed = []

        for element in ordered_events:
            if element == 'n':
                reconstructed.append(notes[idx_note])
                idx_note += 1
            elif element == 'c':
                reconstructed.append(chords[idx_chord])
                idx_chord += 1
            else:
                raise ValueError(f"Unknown event type: {element}")

        sequences.append(reconstructed)

    reconstructed_dataset = pd.DataFrame({'sequence': sequences})
    reconstructed_dataset.index.name = 'index'

    return reconstructed_dataset

In [8]:
# Rebuild the per-song event sequences from `df` and write them (index included)
# to 'reconstructed_ordered_events.csv' under `root`.
save_dataframe_to_one_csv(reconstruct_ordered_events(df), root + 'reconstructed_ordered_events.csv')

## Predict only Events (Notes and Chords)

### Creating the data: Fixed number of events 

Idea for creating the input sequences:
- we take subsets of the list of events representing each song 
- we take the next event of each subset as corresponding training output sequences

This is easy to implement and gives a consistent sequence length for batching, but it ignores the timing aspect.

In [9]:
class Vocabulary:
    """
    Pitch vocabulary built from reconstructed event sequences.

    Attributes:
        notes: sorted list of every distinct pitch seen (chords contribute
            their individual pitches).
        note_to_idx / idx_to_note: mappings between a pitch and its position
            in ``notes``.
        vocab_size: number of distinct pitches.
    """

    def __init__(self, reconstructed_df):
        """
        Build vocabulary of unique single notes only.

        Rows of ``reconstructed_df['sequence']`` are accessed by the labels
        0..len-1; each entry must be a list of events (pitch or list of pitches).
        """
        sequences = reconstructed_df['sequence']
        pitches = set()
        for row in range(len(reconstructed_df)):
            for event in sequences[row]:
                if isinstance(event, list):
                    pitches.update(event)
                else:
                    pitches.add(event)

        self.notes = sorted(pitches)
        self.note_to_idx = {}
        self.idx_to_note = {}
        for position, pitch in enumerate(self.notes):
            self.note_to_idx[pitch] = position
            self.idx_to_note[position] = pitch
        self.vocab_size = len(self.notes)

    def encode_event(self, event):
        """
        Encode an event as a float32 multi-hot vector of length ``vocab_size``.

        A list event sets one position per contained pitch; any other event
        sets a single position. Unknown pitches raise KeyError.
        """
        members = event if isinstance(event, list) else [event]
        vec = np.zeros(self.vocab_size, dtype=np.float32)
        for pitch in members:
            vec[self.note_to_idx[pitch]] = 1.0
        return vec

    def decode_event(self, vec, threshold=0.5):
        """
        Decode a multi-hot vector back to pitches.

        Positions with value >= ``threshold`` are active. Exactly one active
        position returns that pitch on its own; otherwise a list of pitches
        (possibly empty) is returned.
        """
        active = np.where(vec >= threshold)[0]
        decoded = [self.idx_to_note[position] for position in active]
        return decoded[0] if len(decoded) == 1 else decoded

    def __len__(self):
        """Number of distinct pitches in the vocabulary."""
        return self.vocab_size


Create Dataset object

In [10]:
class MusicEventDataset(Dataset):
    """Sliding-window dataset of (input events, next event) pairs, encoded lazily."""

    def __init__(self, reconstructed_df, vocab, seq_length=50):
        """
        Collect every (window, next event) pair of each song.

        Args:
            reconstructed_df: DataFrame whose 'sequence' column holds one list
                of events per row (rows are read by the labels 0..len-1).
            vocab: object exposing ``encode_event(event)``.
            seq_length: number of events in each input window; songs with at
                most this many events contribute no samples.
        """
        self.samples = []
        self.seq_length = seq_length
        self.vocab = vocab

        sequences = reconstructed_df['sequence']
        for row_index in range(len(reconstructed_df)):
            events = sequences[row_index]
            # A non-positive count makes range() empty, which skips short songs.
            window_count = len(events) - seq_length
            self.samples.extend(
                (events[start:start + seq_length], events[start + seq_length])
                for start in range(window_count)
            )

    def __len__(self):
        """Total number of windows across all songs."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Return ``(input_tensor, target_tensor)`` for sample ``idx``: the encoded
        window stacked row-wise and the encoded next event, both float32.
        """
        window, next_event = self.samples[idx]
        encode = self.vocab.encode_event

        stacked = np.array([encode(event) for event in window], dtype=np.float32)
        input_tensor = torch.tensor(stacked)
        target_tensor = torch.tensor(encode(next_event), dtype=torch.float32)

        return input_tensor, target_tensor


In [11]:
# Reload the reconstructed sequences from the CSV written earlier (parsed back into lists).
reconstructed_dataset = load_reconstructed_events(root + 'reconstructed_ordered_events.csv')

# Pitch vocabulary over every note/chord pitch in the dataset.
vocab = Vocabulary(reconstructed_dataset)

# Windows of 16 events, each paired with the event that follows.
dataset = MusicEventDataset(reconstructed_dataset, vocab=vocab, seq_length=16)

# Inspect the first sample as a sanity check.
x, y = dataset[0]

print("Input sequence shape:", x.shape)
print("Next event shape:", y.shape)
print("Input sequence (multi-hot vectors):", x)
print("Next event (multi-hot vector):", y)


Input sequence shape: torch.Size([16, 88])
Next event shape: torch.Size([88])
Input sequence (multi-hot vectors): tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Next event (multi-hot vector): tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])


In [12]:
class MusicEventDataset(Dataset):
    """
    Sliding-window dataset over events and their durations.

    Each sample pairs a window of ``seq_length`` (event, duration) steps with
    the event and duration that immediately follow it.
    """

    def __init__(self, reconstructed_df, vocab, seq_length=50):
        """
        Args:
            reconstructed_df: DataFrame with 'sequence' and 'durations' columns,
                one list per row and of equal length within a row (rows are read
                by the labels 0..len-1).
            vocab: object exposing ``encode_event(event)``.
            seq_length: number of steps per input window; songs with at most
                this many events contribute no samples.
        """
        self.samples = []
        self.seq_length = seq_length
        self.vocab = vocab

        all_sequences = reconstructed_df['sequence']
        all_durations = reconstructed_df['durations']
        for row_index in range(len(reconstructed_df)):
            events = all_sequences[row_index]
            event_durs = all_durations[row_index]  # durations list per song
            assert len(events) == len(event_durs)

            # A non-positive count makes range() empty, which skips short songs.
            for start in range(len(events) - seq_length):
                stop = start + seq_length
                self.samples.append(
                    (events[start:stop], event_durs[start:stop], events[stop], event_durs[stop])
                )

    def __len__(self):
        """Total number of windows across all songs."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Return ``(input_tensor, target_notes_tensor, target_dur_tensor)``.

        ``input_tensor`` holds one row per step: the encoded event followed by
        its duration. The targets are the encoded next event and its duration.
        """
        window, window_durs, next_event, next_dur = self.samples[idx]
        encode = self.vocab.encode_event

        note_rows = np.array([encode(event) for event in window], dtype=np.float32)
        dur_column = np.array(window_durs, dtype=np.float32).reshape(-1, 1)
        features = np.hstack((note_rows, dur_column))  # concat per timestep

        input_tensor = torch.tensor(features)
        target_notes_tensor = torch.tensor(encode(next_event), dtype=torch.float32)
        target_dur_tensor = torch.tensor(next_dur, dtype=torch.float32)

        return input_tensor, target_notes_tensor, target_dur_tensor


In [13]:
class MusicVAE(nn.Module):
    """
    GRU-based variational autoencoder over event sequences.

    Each timestep is a vector of ``input_dim_notes`` note activations followed
    by one duration value. The encoder GRU summarises the sequence into the
    parameters of a diagonal Gaussian; the decoder GRU unrolls from a latent
    sample, feeding its own (detached) output back in at every step.
    """

    def __init__(self, input_dim_notes, hidden_dim=256, latent_dim=64, seq_length=16):
        super().__init__()
        self.seq_length = seq_length
        self.input_dim_notes = input_dim_notes
        self.input_dim_total = input_dim_notes + 1  # +1 for duration

        # Encoder: sequence -> final hidden state -> (mu, logvar)
        self.encoder_rnn = nn.GRU(self.input_dim_total, hidden_dim, batch_first=True)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)

        # Decoder: latent -> initial hidden state -> per-step note logits and duration
        self.decoder_input = nn.Linear(latent_dim, hidden_dim)
        self.decoder_rnn = nn.GRU(self.input_dim_total, hidden_dim, batch_first=True)
        self.decoder_output_notes = nn.Linear(hidden_dim, input_dim_notes)
        self.decoder_output_dur = nn.Linear(hidden_dim, 1)

    def encode(self, x):
        """Return ``(mu, logvar)`` computed from the encoder GRU's final hidden state."""
        _, final_hidden = self.encoder_rnn(x)
        summary = final_hidden.squeeze(0)  # drop the leading layer axis of the GRU state
        return self.fc_mu(summary), self.fc_logvar(summary)

    def reparameterize(self, mu, logvar):
        """Draw ``mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(0.5 * logvar)``."""
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def decode(self, z, seq_length):
        """
        Unroll the decoder for ``seq_length`` steps starting from an all-zero input.

        Returns:
            ``(notes_out, durs_out)``: per-step note logits and durations, each
            concatenated along dim 1.
        """
        hidden = self.decoder_input(z).unsqueeze(0)
        step_input = torch.zeros((z.size(0), 1, self.input_dim_total), device=z.device)

        note_steps = []
        dur_steps = []
        for _ in range(seq_length):
            rnn_out, hidden = self.decoder_rnn(step_input, hidden)
            note_logits = self.decoder_output_notes(rnn_out)
            duration = self.decoder_output_dur(rnn_out)

            note_steps.append(note_logits)
            dur_steps.append(duration)

            # Next input = this step's prediction; detached, so no gradient
            # flows through the feedback path.
            step_input = torch.cat([note_logits.sigmoid(), duration], dim=2).detach()

        return torch.cat(note_steps, dim=1), torch.cat(dur_steps, dim=1)

    def forward(self, x):
        """Encode ``x``, sample a latent and decode ``self.seq_length`` steps.

        Returns ``(notes_out, durs_out, mu, logvar)``.
        """
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        notes_out, durs_out = self.decode(latent, self.seq_length)
        return notes_out, durs_out, mu, logvar


In [14]:
def vae_loss(notes_pred, durs_pred, notes_true, durs_true, mu, logvar):
    """
    Summed VAE objective: note reconstruction + duration reconstruction + KL term.

    Args:
        notes_pred: note logits (the sigmoid is applied inside the BCE loss).
        durs_pred, durs_true: predicted / target durations (compared with MSE).
        notes_true: multi-hot note targets, same shape as ``notes_pred``.
        mu, logvar: parameters of the approximate posterior.

    Returns:
        Scalar tensor; every term uses sum reduction (no averaging over the batch).
    """
    note_term = F.binary_cross_entropy_with_logits(notes_pred, notes_true, reduction='sum')
    duration_term = F.mse_loss(durs_pred, durs_true, reduction='sum')
    # KL divergence between N(mu, exp(logvar)) and the standard normal prior.
    kl_elements = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_term = -0.5 * torch.sum(kl_elements)
    return note_term + duration_term + kl_term


In [15]:
# Sanity check: encode the first event of the first song with the pitch vocabulary.
event = reconstructed_dataset['sequence'][0][0]
encoding = vocab.encode_event(event)
print("Event:", event)
print("Encoded:", encoding)


Event: 88
Encoded: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [18]:
import pandas as pd

# Load parts with durations
# (raw read: list columns such as 'durations' are still strings at this point)
part1 = pd.read_csv("data_processed/data_part1.csv")
part2 = pd.read_csv("data_processed/data_part2.csv")

# Merge them vertically
# ignore_index=True renumbers the rows 0..n-1 across both parts.
duration_df = pd.concat([part1, part2], ignore_index=True)


In [22]:
print("Sequences:", len(reconstructed_dataset))
print("Durations:", len(duration_df))

# Optional: check a few row lengths
# The 'durations' cells are still stringified lists here. They are parsed with
# ast.literal_eval (literals only) instead of eval, so CSV content is never executed.
# The row count is capped so the check also works on frames with fewer than 5 songs.
for i in range(min(5, len(reconstructed_dataset), len(duration_df))):
    print(f"Seq {i} = {len((reconstructed_dataset['sequence'][i]))}, Dur = {len(ast.literal_eval(duration_df['durations'][i]))}")


Sequences: 2775
Durations: 2775
Seq 0 = 2429, Dur = 2429
Seq 1 = 1813, Dur = 1813
Seq 2 = 1248, Dur = 1248
Seq 3 = 1379, Dur = 1379
Seq 4 = 1357, Dur = 1357


In [25]:
import pandas as pd
import ast

def safe_eval_list(x):
    """
    Best-effort ``ast.literal_eval``.

    Returns the parsed Python object, or ``None`` when parsing raises; in that
    case the offending value is printed.
    """
    try:
        parsed = ast.literal_eval(x)
    except Exception:
        print("❌ Error parsing:", x)
        parsed = None  # or [] if you want to skip it entirely
    return parsed

# Load CSVs
# NOTE(review): `reconstructed_df` is loaded and parsed here, but the cell that builds
# `combined_df` copies `reconstructed_dataset` instead — confirm which one is intended.
reconstructed_df = pd.read_csv("data_processed/reconstructed_ordered_events.csv")
part1 = pd.read_csv("data_processed/data_part1.csv")
part2 = pd.read_csv("data_processed/data_part2.csv")
duration_df = pd.concat([part1, part2], ignore_index=True)

# Apply safe parsing
# Cells that fail to parse become None (see safe_eval_list).
reconstructed_df['sequence'] = reconstructed_df['sequence'].apply(safe_eval_list)
duration_df['durations'] = duration_df['durations'].apply(safe_eval_list)


In [26]:
# Attach the per-song durations to a copy of the reconstructed sequences.
# Column assignment aligns on the index — assumes both frames list the songs in the
# same order with the same index; TODO confirm.
combined_df = reconstructed_dataset.copy()
combined_df['durations'] = duration_df['durations']


In [27]:
# Persist sequences + durations; index=False means the DataFrame index is not written.
combined_df.to_csv("data_processed/reconstructed_with_durations.csv", index=False)


In [2]:
# Display the merged frame. The recorded output is a NameError: this cell was run in a
# kernel where `combined_df` had not been defined, so it depends on the cell that builds it.
combined_df


NameError: name 'combined_df' is not defined

In [3]:
import pandas as pd
import ast

# Load the dataframe
# The 'sequence' and 'durations' columns are parsed from strings in the loop further down.
df = pd.read_csv("data_processed/reconstructed_with_durations.csv")

# Helper function to convert events to string tokens
def tokenize_event_duration(event, duration):
    """
    Build the token "<event>_<duration>".

    A chord (list) is rendered as its sorted pitches joined by "-"; any other
    event is rendered with ``str``.
    """
    pitches = sorted(event) if isinstance(event, list) else [event]
    event_token = "-".join(str(pitch) for pitch in pitches)
    return f"{event_token}_{duration}"

# Tokenize all sequences
# One list of "<event>_<duration>" tokens is built per song, in row order.
# NOTE: the recorded output of this cell is a KeyboardInterrupt, i.e. that run was
# stopped before finishing.
tokenized_sequences = []

for seq, durs in zip(df['sequence'], df['durations']):
    # Convert from string to Python object if needed
    if isinstance(seq, str):
        seq = ast.literal_eval(seq)
    if isinstance(durs, str):
        durs = ast.literal_eval(durs)
        
    assert len(seq) == len(durs), "Mismatch between sequence and durations length"
    
    tokens = [tokenize_event_duration(e, d) for e, d in zip(seq, durs)]
    tokenized_sequences.append(tokens)

# Add to dataframe
df['tokenized'] = tokenized_sequences

# Save or inspect
# Requires an 'index' column in the loaded CSV; only it and the tokens are written.
df[['index', 'tokenized']].to_csv("data_processed/tokenized_sequences.csv", index=False)


KeyboardInterrupt: 

In [4]:
import pandas as pd
import ast

# Reload sequences + durations; both list columns are still strings after read_csv.
df = pd.read_csv("data_processed/reconstructed_with_durations.csv")

def tokenize_event_duration(event, duration):
    """
    Return the string token "<event>_<duration>" for one step.

    Lists (chords) become their sorted members joined with "-"; everything
    else is converted with ``str``. (Same helper as defined in the previous
    tokenization cell.)
    """
    if not isinstance(event, list):
        return f"{str(event)}_{duration}"
    joined_pitches = "-".join(str(pitch) for pitch in sorted(event))
    return f"{joined_pitches}_{duration}"

# Count the distinct "<event>_<duration>" tokens over the whole dataset.
all_tokens = set()

for seq, durs in zip(df['sequence'], df['durations']):
    # Both columns hold stringified lists; parse them per row.
    seq = ast.literal_eval(seq)
    durs = ast.literal_eval(durs)
    tokens = [tokenize_event_duration(e, d) for e, d in zip(seq, durs)]
    all_tokens.update(tokens)

print("Number of unique tokens:", len(all_tokens))


Number of unique tokens: 430169


In [5]:
import pandas as pd
import ast
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load tokenized sequence CSV
df = pd.read_csv("data_processed/tokenized_sequences.csv")

# Parse the stringified lists
df["tokenized"] = df["tokenized"].apply(ast.literal_eval)

# Flatten and count token frequencies (optional)
all_tokens = [token for seq in df["tokenized"] for token in seq]
token_freq = Counter(all_tokens)

# Create vocab: token -> index
# NOTE: this rebinds `vocab`, which an earlier cell bound to a Vocabulary instance;
# from here on `vocab` is a plain dict. Ids follow the sorted order of the token strings.
vocab = {token: idx + 1 for idx, token in enumerate(sorted(token_freq))}  # Reserve 0 for padding
vocab["<PAD>"] = 0
inv_vocab = {i: t for t, i in vocab.items()}

# Encode sequences
encoded_sequences = [[vocab[token] for token in seq] for seq in df["tokenized"]]

# Pad sequences to fixed length
# One row of `seq_length` ids per song. With padding='post' / truncating='post' shorter
# songs are presumably padded at the end and longer ones cut at the end — confirm
# against the Keras pad_sequences documentation.
seq_length = 50
padded_sequences = pad_sequences(encoded_sequences, maxlen=seq_length, padding='post', truncating='post')

# Convert to numpy array
import numpy as np
x_train = np.array(padded_sequences)

# Save vocab if needed
import json
with open("data_processed/vocab.json", "w") as f:
    json.dump(vocab, f)


2025-05-31 13:38:07.431525: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [47]:
# Number of entries in the token vocabulary, including the "<PAD>" entry.
len(vocab)

430170

In [48]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Reshape, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

# NOTE: in the recorded run this cell failed with
# "ValueError: A KerasTensor cannot be used as input to a TensorFlow function".
# Presumably the failure comes from the VAE LOSS section at the bottom, which applies
# tf/K functions directly to symbolic model tensors — TODO confirm. The later
# `VAE(Model)` subclass computes the same loss inside `train_step` instead.

# Parameters
vocab_size = len(vocab)            # total number of unique token IDs
print(vocab_size)
embedding_dim = 128                # dimension of token embedding
seq_length = 50                    # length of input sequences
latent_dim = 32                    # size of latent space

# ======== ENCODER ========
# token ids -> embeddings -> flattened vector -> (z_mean, z_log_var)
encoder_inputs = Input(shape=(seq_length,), name="encoder_input")
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=seq_length)(encoder_inputs)
flatten = Flatten()(embedding_layer)

z_mean = Dense(latent_dim, name="z_mean")(flatten)
z_log_var = Dense(latent_dim, name="z_log_var")(flatten)

def sampling(args):
    """Reparameterisation trick: z = z_mean + exp(0.5 * z_log_var) * epsilon.

    Reads the notebook-global `latent_dim` for the width of the noise tensor.
    """
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,), name="z")([z_mean, z_log_var])

# ======== DECODER ========
# latent vector -> (seq_length, embedding_dim) -> per-position softmax over vocab_size
decoder_inputs = Input(shape=(latent_dim,), name="decoder_input")
decoder_dense = Dense(seq_length * embedding_dim, activation="relu")(decoder_inputs)
decoder_reshape = Reshape((seq_length, embedding_dim))(decoder_dense)
decoder_output = Dense(vocab_size, activation="softmax")(decoder_reshape)  # Predict token at each position

# ======== MODELS ========
encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")
decoder = Model(decoder_inputs, decoder_output, name="decoder")

# Index 2 of the encoder outputs is the sampled z.
vae_outputs = decoder(encoder(encoder_inputs)[2])
vae = Model(encoder_inputs, vae_outputs, name="vae")

# ======== VAE LOSS ========
# Reconstruction term: cross-entropy between the input ids and the predicted
# distributions, summed over axis 1.
reconstruction_loss = tf.keras.losses.sparse_categorical_crossentropy(encoder_inputs, vae_outputs)
reconstruction_loss = K.sum(reconstruction_loss, axis=1)

kl_loss = -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=1)

# NOTE: this rebinds the name `vae_loss`, which an earlier cell defined as the
# PyTorch loss function.
vae_loss = K.mean(reconstruction_loss + kl_loss)
vae.add_loss(vae_loss)
vae.compile(optimizer="adam")


430170




ValueError: A KerasTensor cannot be used as input to a TensorFlow function. A KerasTensor is a symbolic placeholder for a shape and dtype, used when constructing Keras Functional models or Keras Functions. You can only use it as input to a Keras layer or a Keras operation (from the namespaces `keras.layers` and `keras.operations`). You are likely doing something like:

```
x = Input(...)
...
tf_fn(x)  # Invalid.
```

What you should do instead is wrap `tf_fn` in a layer:

```
class MyLayer(Layer):
    def call(self, x):
        return tf_fn(x)

x = MyLayer()(x)
```


In [10]:
from tensorflow.keras import Model
from tensorflow.keras.losses import sparse_categorical_crossentropy
import tensorflow as tf


class VAE(Model):
    """Keras model wrapping an encoder and a decoder with a custom VAE training step.

    The loss (reconstruction cross-entropy + KL divergence) is computed inside
    `train_step`, so `compile` only needs an optimizer.
    """

    def __init__(self, encoder, decoder, **kwargs):
        """Store the two sub-models and create running-mean trackers for each loss term.

        `encoder` is called as `encoder(data) -> (z_mean, z_log_var, z)` and
        `decoder` as `decoder(z) -> reconstruction` (see `train_step`).
        """
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = tf.keras.metrics.Mean(name="loss")
        self.reconstruction_loss_tracker = tf.keras.metrics.Mean(name="reconstruction_loss")
        self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss")

    def train_step(self, data):
        """Run one optimisation step on a batch and return the tracked loss means."""
        if isinstance(data, tuple):
            data = data[0]  # discard labels if passed as (x, y)

        # The forward pass and loss must be computed inside the tape so gradients can be taken.
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)

            # Compute reconstruction loss
            # The input batch itself is the target; the cross-entropy is summed over axis 1.
            reconstruction_loss = tf.reduce_sum(
                sparse_categorical_crossentropy(data, reconstruction), axis=1
            )

            # Compute KL divergence
            kl_loss = -0.5 * tf.reduce_sum(
                1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1
            )

            # Mean over the batch of (reconstruction + KL).
            total_loss = tf.reduce_mean(reconstruction_loss + kl_loss)

        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))

        # total_loss is already reduced to a scalar; the other two are passed un-reduced.
        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)

        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

    @property
    def metrics(self):
        """The three loss trackers, exposed through the model's `metrics` property."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]


In [50]:
# Wrap the current `encoder` / `decoder` models in the custom-train-step VAE and train
# on the padded token ids. Only an optimizer is passed to compile; the loss is computed
# in VAE.train_step.
vae = VAE(encoder, decoder)
vae.compile(optimizer="adam")
vae.fit(x_train, epochs=20, batch_size=64)


Epoch 1/20


In [11]:
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K

# --- Model parameters
vocab_size = len(vocab)        # your built vocabulary
embedding_dim = 128
seq_length = 50
latent_dim = 32

# --- Encoder
# token ids -> embeddings -> flattened vector -> (z_mean, z_log_var)
# NOTE: `x` is reused as a scratch name for the intermediate layer outputs.
encoder_inputs = Input(shape=(seq_length,), name="encoder_input")
x = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=seq_length)(encoder_inputs)
x = Flatten()(x)
z_mean = Dense(latent_dim, name="z_mean")(x)
z_log_var = Dense(latent_dim, name="z_log_var")(x)

def sampling(args):
    """Reparameterisation trick: z = z_mean + exp(0.5 * z_log_var) * epsilon.

    Reads the notebook-global `latent_dim` for the width of the noise tensor.
    """
    z_mean, z_log_var = args
    epsilon = K.random_normal(shape=(K.shape(z_mean)[0], latent_dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling, name="z")([z_mean, z_log_var])

# The encoder exposes the distribution parameters and the sampled latent, in that order.
encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")




In [12]:
from tensorflow.keras.layers import Reshape, Dense, TimeDistributed, Softmax

# --- Decoder
# latent vector -> dense -> (seq_length, embedding_dim) -> softmax over vocab_size per position
decoder_inputs = Input(shape=(latent_dim,), name="decoder_input")
x = Dense(seq_length * embedding_dim, activation="relu")(decoder_inputs)
x = Reshape((seq_length, embedding_dim))(x)
x = TimeDistributed(Dense(vocab_size, activation="softmax"))(x)  # Predict token probabilities

decoder = Model(decoder_inputs, x, name="decoder")


In [13]:
# Rebuild the VAE from the current `encoder` / `decoder` and train for a single epoch.
vae = VAE(encoder, decoder)
vae.compile(optimizer="adam")
vae.fit(x_train, epochs=1, batch_size=64)


In [32]:
import pandas as pd
import ast  # for safely evaluating stringified Python literals

# Load the CSV
# NOTE: rebinds `df` to the sequences + durations frame.
df = pd.read_csv("data_processed/reconstructed_with_durations.csv")

# Convert string representations of lists into actual Python lists
df["sequence"] = df["sequence"].apply(ast.literal_eval)
df["durations"] = df["durations"].apply(ast.literal_eval)


In [33]:
# Extract unique MIDI notes (not full chords)
# Reads `df`, the frame loaded from reconstructed_with_durations.csv and parsed in the
# preceding cell, instead of `combined_df`. `combined_df` only exists if the earlier
# merge cell ran in the same kernel session (the notebook records a NameError for it
# after a restart).
all_notes = set()
for seq in df["sequence"]:
    for item in seq:
        if isinstance(item, list):
            # A chord contributes each of its pitches individually.
            all_notes.update(item)
        else:
            all_notes.add(item)

# Final sorted list of unique pitches
sorted_notes = sorted(all_notes)
note2idx = {note: i for i, note in enumerate(sorted_notes)}
vocab_size = len(note2idx)


In [34]:
import torch

def encode_event(note_or_chord, duration):
    """
    Encode one event as a multi-hot pitch vector followed by its duration.

    Uses the notebook-global ``note2idx`` and ``vocab_size``. Pitches missing
    from ``note2idx`` are silently ignored.

    Returns:
        1-D float tensor of length ``vocab_size + 1``.
    """
    pitches = note_or_chord if isinstance(note_or_chord, list) else [note_or_chord]

    note_vec = torch.zeros(vocab_size)
    for pitch in pitches:
        if pitch in note2idx:
            note_vec[note2idx[pitch]] = 1

    duration_tensor = torch.tensor([float(duration)], dtype=torch.float)
    return torch.cat([note_vec, duration_tensor])  # shape: (vocab_size + 1,)


In [35]:
from torch.utils.data import Dataset

class MusicEventDataset(Dataset):
    """
    Sliding-window dataset of (input window, next event) pairs with durations.

    Each usable song is encoded once, via the notebook-level ``encode_event``,
    into a single ``(n_events, feature_dim)`` tensor. Windows are sliced out of
    that tensor on demand in ``__getitem__`` rather than being stacked up front.
    Pre-building a separate ``(seq_length, feature_dim)`` tensor for every
    window multiplies the memory and encoding work by roughly ``seq_length``.

    Attributes:
        seq_length: number of events per input window.
        encoded_songs: list with one encoded tensor per song longer than ``seq_length``.
        samples: list of ``(song position in encoded_songs, window start)`` pairs.
    """

    def __init__(self, dataframe, seq_length=50):
        """
        Args:
            dataframe: DataFrame with 'sequence' and 'durations' columns holding,
                per row, two equally long lists.
            seq_length: number of events per input window; songs with at most
                this many events are skipped.

        Raises:
            AssertionError: if a row's notes and durations differ in length.
        """
        self.seq_length = seq_length
        self.encoded_songs = []
        self.samples = []

        for notes, durations in zip(dataframe["sequence"], dataframe["durations"]):
            assert len(notes) == len(durations), "Mismatch between notes and durations"
            if len(notes) <= seq_length:
                continue

            # Encode every event of the song exactly once.
            song = torch.stack([encode_event(note, dur) for note, dur in zip(notes, durations)])
            song_id = len(self.encoded_songs)
            self.encoded_songs.append(song)
            self.samples.extend((song_id, start) for start in range(len(notes) - seq_length))

    def __len__(self):
        """Total number of windows across all songs."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Return ``(input_window, target)``: a ``(seq_length, feature_dim)`` tensor
        and the ``(feature_dim,)`` encoding of the event that follows it.

        Both are views into the song tensor; do not modify them in place.
        """
        song_id, start = self.samples[idx]
        song = self.encoded_songs[song_id]
        stop = start + self.seq_length
        return song[start:stop], song[stop]


In [36]:
# Build the windowed dataset from `df`, the frame loaded from
# reconstructed_with_durations.csv and parsed a few cells above, rather than from
# `combined_df`, which only exists if the earlier merge cell ran in this kernel session.
dataset = MusicEventDataset(df, seq_length=50)

from torch.utils.data import DataLoader

train_loader = DataLoader(dataset, batch_size=64, shuffle=True)


KeyboardInterrupt: 