In [5]:
import os
import ast
import random
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset, random_split
from sklearn.model_selection import KFold

root = 'data_processed/'

# Generation with Recurrent Neural Networks

To establish a baseline of music generation that we can improve on, we use Recurrent Neural Networks. We formulate the problem as a next-note prediciton problem. This method is quite similar to  recurrence-based language models that are used in NLP.

The input is sequential, but unlike words in NLP, timing and dynamics (duration, velocity, offset) matter a lot in music. To be able to predict notes/chords + durations + offsets + velocities we might need multi-output heads (e.g., softmax for notes/chords/velocities, regression for durations/offsets).

## Import Dataset and Definition of Useful functions

### Import Dataset

In [6]:
def safe_parse_all_columns_df(df):
    """
    Parse all columns in a DataFrame to numeric, coercing errors.
    """
    df['notes'] = df['notes'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['chords'] = df['chords'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['velocities'] = df['velocities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['durations'] = df['durations'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['offsets'] = df['offsets'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    df['ordered_events'] = df['ordered_events'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    return df

def load_dataframe_from_two_csvs(file1, file2):
    """
    Load and concatenate two CSV files into a single pandas DataFrame.
    """
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)
    full_df = pd.concat([df1, df2], ignore_index=True)
    full_df = safe_parse_all_columns_df(full_df)

    return full_df

def save_dataframe_to_two_csvs(df, file1, file2):
    """
    Split a DataFrame in half and save it into two CSV files.
    """
    halfway = len(df) // 2
    df.iloc[:halfway].to_csv(file1, index=False)
    df.iloc[halfway:].to_csv(file2, index=False)

def load_dataframe_from_one_csv(file):
    """
    Load a DataFrame from a single CSV file.
    """
    df = pd.read_csv(file)
    
    return df

def save_dataframe_to_one_csv(df, file):
    """
    Save a DataFrame to a single CSV file.
    """
    df.to_csv(file, index=True)

def load_reconstructed_events(file):
    """
    Loads the reconstructed events CSV and safely parses the 'sequence' column,
    converting notes to integers and chords to lists of integers.
    """
    df = pd.read_csv(file)

    def safe_parse(seq_str):
        try:
            parsed = ast.literal_eval(seq_str)
            if not isinstance(parsed, list):
                raise ValueError("Parsed sequence is not a list")

            normalized = []
            for el in parsed:
                if isinstance(el, list):
                    normalized.append([int(x) for x in el])
                else:
                    normalized.append(int(el))
            return normalized

        except Exception as e:
            print(f"Error parsing sequence: {seq_str}")
            raise e

    df['sequence'] = df['sequence'].apply(safe_parse)
    return df

In [26]:
file1 = root + 'data_part1.csv'
file2 = root + 'data_part2.csv'

df = load_dataframe_from_two_csvs(file1, file2)

In [27]:
df_reconstructed = load_reconstructed_events('data_processed/reconstructed_with_durations.csv')

In [28]:
len(df['durations'][0])

2429

In [29]:
len(df_reconstructed['sequence'][0])

2429

### Useful functions

In [8]:
def parse_chord_to_list(chord):
    """
    Convert a chord string to a list of integers.
    """
    if isinstance(chord, str):
        print([int(x) for x in chord.split(',') if x.isdigit()])
        return [int(x) for x in chord.split(',') if x.isdigit()]
    return []

In [9]:
def reconstruct_ordered_events(df):
    """
    Reconstruct the ordered list of events (notes and chords) for each song.
    """
    sequences  = []

    for i in range(len(df)):
        idx_note = 0
        idx_chord = 0
        reconstructed = []

        for element in df['ordered_events'][i]:
            if element == 'n':
                reconstructed.append(df['notes'][i][idx_note])
                idx_note += 1
            elif element == 'c':
                parsed_chord = parse_chord_to_list(df['chords'][i][idx_chord])
                reconstructed.append(df['chords'][i][idx_chord])
                idx_chord += 1
            else:
                raise ValueError(f"Unknown event type: {e}")
        
        sequences.append(reconstructed)

    reconstructed_dataset = pd.DataFrame({'sequence': sequences})
    reconstructed_dataset.index.name = 'index'

    return reconstructed_dataset

In [10]:
save_dataframe_to_one_csv(reconstruct_ordered_events(df), root + 'reconstructed_ordered_events.csv')

## Predict only Events (Notes and Chords)

### Creating the data: Fixed number of events 

Idea for creating the input sequences:
- we take subsets of the list of events representing each song 
- we take the next event of each subset as corresponding training output sequences

This is easy to implement and we will have a consistent sequence lenght for batching, but we are ignoring the timing aspect.

In [30]:
class Vocabulary:
    def __init__(self, reconstructed_df):
        """
        Build vocabulary of unique single notes only.
        """
        self.notes = set()
        for i in range(len(reconstructed_df)):
            sequence = reconstructed_df['sequence'][i]
            for event in sequence:
                if isinstance(event, list):
                    for note in event:
                        self.notes.add(note)
                else:
                    self.notes.add(event)

        self.notes = sorted(self.notes)
        self.note_to_idx = {note: idx for idx, note in enumerate(self.notes)}
        self.idx_to_note = {idx: note for idx, note in enumerate(self.notes)}
        self.vocab_size = len(self.notes)

    def encode_event(self, event):
        """
        Encode an event as a multi-hot vector over single notes.
        """
        vec = np.zeros(self.vocab_size, dtype=np.float32)
        if isinstance(event, list):
            for note in event:
                vec[self.note_to_idx[note]] = 1.0
        else:
            vec[self.note_to_idx[event]] = 1.0
        return vec

    def decode_event(self, vec, threshold=0.5):
        """
        Decode multi-hot vector to list of notes.
        """
        indices = np.where(vec >= threshold)[0]
        notes = [self.idx_to_note[idx] for idx in indices]
        if len(notes) == 1:
            return notes[0]
        else:
            return notes

    def __len__(self):
        return self.vocab_size


Create Dataset object

In [12]:
class MusicEventDataset(Dataset):
    def __init__(self, reconstructed_df, vocab, seq_length=50):
        """
        Constructs all valid (input_seq, target_event) pairs from each song in the dataset.
d
        Args:
            reconstructed_df: DataFrame with 'sequence' column where each entry is a list of events
            vocab: Vocabulary object to encode events
            seq_length: Length of each training input sequence (target is the next event)
        """
        self.samples = []
        self.seq_length = seq_length
        self.vocab = vocab

        for row_index in range(len(reconstructed_df)):
            sequence = reconstructed_df['sequence'][row_index]
            n_events = len(sequence)

            if n_events <= seq_length:
                continue

            for i in range(n_events - seq_length):
                input_seq = sequence[i : i + seq_length]
                target_event = sequence[i + seq_length]
                self.samples.append((input_seq, target_event))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        input_seq, target_event = self.samples[idx]

        input_encoded = np.array([self.vocab.encode_event(event) for event in input_seq], dtype=np.float32)
        input_tensor = torch.tensor(input_encoded)

        target_encoded = self.vocab.encode_event(target_event)
        target_tensor = torch.tensor(target_encoded, dtype=torch.float32)

        return input_tensor, target_tensor


In [32]:
class MusicEventDataset(Dataset):
    def __init__(self, reconstructed_df, vocab, seq_length=50):
        """
        Constructs all valid (input_seq, input_dur_seq, target_event, target_dur) pairs.

        Args:
            reconstructed_df: DataFrame with 'sequence' and 'durations' columns
            vocab: Vocabulary object to encode events
            seq_length: Length of each training input sequence (target is the next event)
        """
        self.samples = []
        self.seq_length = seq_length
        self.vocab = vocab

        for row_index in range(len(reconstructed_df)):
            sequence = reconstructed_df['sequence'][row_index]
            durations = df['durations'][row_index]
            
            if isinstance(durations, str):
                durations = eval(durations)

            n_events = min(len(sequence), len(durations))
            if n_events <= seq_length:
                continue

            for i in range(n_events - seq_length):
                input_seq = sequence[i : i + seq_length]
                input_durs = durations[i : i + seq_length]
                target_event = sequence[i + seq_length]
                target_dur = durations[i + seq_length]

                self.samples.append((input_seq, input_durs, target_event, target_dur))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        input_seq, input_durs, target_event, target_dur = self.samples[idx]

        input_encoded = [self.vocab.encode_event(event) for event in input_seq]
        input_tensor = torch.tensor(input_encoded, dtype=torch.float32)

        dur_tensor = torch.tensor(input_durs, dtype=torch.float32).unsqueeze(-1)  

        target_encoded = self.vocab.encode_event(target_event)
        target_tensor = torch.tensor(target_encoded, dtype=torch.float32)

        target_dur_tensor = torch.tensor([target_dur], dtype=torch.float32)  
        return input_tensor, dur_tensor, target_tensor, target_dur_tensor


In [35]:
import torch
import numpy as np
from torch.utils.data import Dataset
import ast

class MusicEventDataset(Dataset):
    def __init__(self, reconstructed_df, vocab, seq_length=50):
        """
        Constructs all valid (input_seq, input_dur_seq, target_event, target_dur) pairs.

        Args:
            reconstructed_df: DataFrame with 'sequence' and 'durations' columns
            vocab: Vocabulary object to encode events
            seq_length: Length of each training input sequence (target is the next event)
        """
        self.samples = []
        self.seq_length = seq_length
        self.vocab = vocab

        for row_index in range(len(reconstructed_df)):
            sequence = reconstructed_df['sequence'][row_index]
            durations = df['durations'][row_index]

            if isinstance(durations, str):
                durations = ast.literal_eval(durations)

            n_events = min(len(sequence), len(durations))
            if n_events <= seq_length:
                continue

            for i in range(n_events - seq_length):
                input_seq = sequence[i:i + seq_length]
                input_durs = durations[i:i + seq_length]
                target_event = sequence[i + seq_length]
                target_dur = durations[i + seq_length]

                self.samples.append((input_seq, input_durs, target_event, target_dur))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        input_seq, input_durs, target_event, target_dur = self.samples[idx]

        input_encoded = [self.vocab.encode_event(event) for event in input_seq]
        input_tensor = torch.tensor(input_encoded, dtype=torch.float32)

        dur_tensor = torch.tensor(input_durs, dtype=torch.float32).unsqueeze(-1)  

        target_encoded = self.vocab.encode_event(target_event)
        target_tensor = torch.tensor(target_encoded, dtype=torch.float32)

        target_dur_tensor = torch.tensor([target_dur], dtype=torch.float32)  

        return input_tensor, dur_tensor, target_tensor, target_dur_tensor


In [69]:
reconstructed_dataset = load_reconstructed_events(root + 'reconstructed_ordered_events.csv')
vocab = Vocabulary(reconstructed_dataset)
dataset = MusicEventDataset(reconstructed_dataset, vocab=vocab, seq_length=16)




In [70]:
x, dur, y, target_dur = dataset[0]

print("Input sequence shape:", x.shape)
print("Duration sequence shape:", dur.shape)
print("Next event shape:", y.shape)
print("Target duration shape:", target_dur.shape)

print("Input sequence (multi-hot vectors):\n", x)
print("Duration sequence:\n", dur)
print("Next event (multi-hot vector):", y)
print("Target duration:", target_dur)


Input sequence shape: torch.Size([16, 88])
Duration sequence shape: torch.Size([16, 1])
Next event shape: torch.Size([88])
Target duration shape: torch.Size([1])
Input sequence (multi-hot vectors):
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Duration sequence:
 tensor([[0.2500],
        [2.5000],
        [0.2500],
        [0.3333],
        [0.3333],
        [0.2500],
        [0.3333],
        [0.2500],
        [0.3333],
        [0.7500],
        [1.5000],
        [0.3333],
        [0.2500],
        [1.0000],
        [0.3333],
        [0.3333]])
Next event (multi-hot vector): tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,

In [71]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim, seq_length):
        super(VAE, self).__init__()
        self.seq_length = seq_length
        self.latent_dim = latent_dim

        # Encoder for the input sequence
        self.encoder = nn.Sequential(
            nn.Linear(input_dim * seq_length, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
        )

        # Encoder for the duration sequence
        self.duration_encoder = nn.Sequential(
            nn.Linear(seq_length, 64),
            nn.ReLU(),
        )

        # Combined latent space
        self.fc_mu = nn.Linear(256 + 64, latent_dim)
        self.fc_var = nn.Linear(256 + 64, latent_dim)

        # Decoder for the next event
        self.event_decoder = nn.Sequential(
            nn.Linear(latent_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Linear(512, input_dim),
            nn.Sigmoid(),  # Using Sigmoid to output values between 0 and 1
        )

        # Decoder for the duration
        self.duration_decoder = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def encode(self, x, durs):
        h_seq = self.encoder(x.view(-1, self.seq_length * x.size(-1)))
        h_dur = self.duration_encoder(durs.view(-1, self.seq_length))
        h = torch.cat((h_seq, h_dur), dim=1)
        mu = self.fc_mu(h)
        log_var = self.fc_var(h)
        return mu, log_var

    def reparameterize(self, mu, log_var):
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        event = self.event_decoder(z)
        duration = self.duration_decoder(z)
        return event, duration

    def forward(self, x, durs):
        mu, log_var = self.encode(x, durs)
        z = self.reparameterize(mu, log_var)
        return self.decode(z), mu, log_var

input_dim = 88  
latent_dim = 20
seq_length = 16

vae = VAE(input_dim, latent_dim, seq_length).to(device)


In [74]:
from torch.utils.data import DataLoader

batch_size = 32
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [76]:
import torch.optim as optim

input_dim = 88  
latent_dim = 20
seq_length = 16
vae = VAE(input_dim, latent_dim, seq_length).to(device)

optimizer = optim.Adam(vae.parameters(), lr=1e-3)

def loss_function(recon_event, event, recon_dur, dur, mu, log_var):
    recon_loss_event = F.binary_cross_entropy(recon_event, event, reduction='sum')
    recon_loss_dur = F.mse_loss(recon_dur, dur, reduction='sum')
    KLD = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    return recon_loss_event + recon_loss_dur + KLD


In [77]:
num_epochs = 50

for epoch in range(num_epochs):
    vae.train()
    train_loss = 0

    for batch_idx, (input_seq, dur_seq, target_event, target_dur) in enumerate(data_loader):
        input_seq = input_seq.to(device)
        dur_seq = dur_seq.to(device)
        target_event = target_event.to(device)
        target_dur = target_dur.to(device)
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        recon_batch, mu, log_var = vae(input_seq, dur_seq)
        recon_event, recon_dur = recon_batch

        # Compute loss
        loss = loss_function(recon_event, target_event, recon_dur, target_dur, mu, log_var)

        # Backward pass and optimize
        loss.backward()
        train_loss += loss.item()
        optimizer.step()

    # Print training progress
    print(f'Epoch: {epoch+1}, Loss: {train_loss / len(data_loader.dataset)}')


Epoch: 1, Loss: 9.09855375184919
Epoch: 2, Loss: 9.068299964482913
Epoch: 3, Loss: 9.06272155853027
Epoch: 4, Loss: 9.059639148469826
Epoch: 5, Loss: 9.058641705199394
Epoch: 6, Loss: 9.056255766952983
Epoch: 7, Loss: 9.057195314729015
Epoch: 8, Loss: 9.05771583750041


KeyboardInterrupt: 

In [78]:
torch.save(vae.state_dict(), 'vae_model.pth')


In [79]:
vae = VAE(input_dim=88, latent_dim=20, seq_length=16).to(device)
vae.load_state_dict(torch.load('vae_model.pth'))
vae.eval()


VAE(
  (encoder): Sequential(
    (0): Linear(in_features=1408, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=256, bias=True)
    (3): ReLU()
  )
  (duration_encoder): Sequential(
    (0): Linear(in_features=16, out_features=64, bias=True)
    (1): ReLU()
  )
  (fc_mu): Linear(in_features=320, out_features=20, bias=True)
  (fc_var): Linear(in_features=320, out_features=20, bias=True)
  (event_decoder): Sequential(
    (0): Linear(in_features=20, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=88, bias=True)
    (5): Sigmoid()
  )
  (duration_decoder): Sequential(
    (0): Linear(in_features=20, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=1, bias=True)
  )
)

In [None]:
z = torch.randn(1, latent_dim).to(device)
recon_event, recon_dur = vae.decode(z)


In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Define the Variational Autoencoder (VAE) class
class VAE(nn.Module):   
    def __init__(self, input_dim, hidden_dim, latent_dim):
        """
        Initializes the VAE model.

        Args:
            input_dim (int): The dimensionality of the input musical note sequence (e.g., 16 notes * 2 features per note if representing note and duration).
            hidden_dim (int): The dimensionality of the hidden layers in the encoder and decoder.
            latent_dim (int): The dimensionality of the latent space.
        """
        super(VAE, self).__init__()

        # --- Encoder ---
        # The encoder maps the input musical sequence to a distribution in the latent space.
        # It consists of fully connected layers.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),  # ReLU activation for non-linearity
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )

        # Latent space mean (mu) and variance (log_var) layers
        # These layers output the parameters of the Gaussian distribution in the latent space.
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_log_var = nn.Linear(hidden_dim, latent_dim)

        # --- Decoder ---
        # The decoder reconstructs the musical sequence from a sample in the latent space.
        # It also consists of fully connected layers.
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()  # Sigmoid activation to ensure output values are between 0 and 1 (useful for normalized note/duration representations)
        )

    def encode(self, x):
        """
        Encodes the input data into the latent space.

        Args:
            x (torch.Tensor): The input musical note sequence.

        Returns:
            tuple: A tuple containing the mean (mu) and log-variance (log_var) of the latent distribution.
        """
        h = self.encoder(x)  # Pass input through the encoder layers
        mu = self.fc_mu(h)  # Calculate the mean of the latent distribution
        log_var = self.fc_log_var(h)  # Calculate the log-variance of the latent distribution
        return mu, log_var

    def reparameterize(self, mu, log_var):
        """
        Performs the reparameterization trick to sample from the latent space.
        This allows us to backpropagate through the sampling process.

        Args:
            mu (torch.Tensor): Mean of the latent distribution.
            log_var (torch.Tensor): Log-variance of the latent distribution.

        Returns:
            torch.Tensor: A sample from the latent distribution.
        """
        std = torch.exp(0.5 * log_var)  # Calculate standard deviation from log-variance
        eps = torch.randn_like(std)  # Sample random noise from a standard normal distribution
        return mu + eps * std  # Reparameterization trick: z = mu + epsilon * std

    def decode(self, z):
        """
        Decodes a latent space sample back into the original data space.

        Args:
            z (torch.Tensor): A sample from the latent space.

        Returns:
            torch.Tensor: The reconstructed musical note sequence.
        """
        return self.decoder(z)  # Pass the latent sample through the decoder layers

    def forward(self, x):
        """
        Performs a full forward pass through the VAE.

        Args:
            x (torch.Tensor): The input musical note sequence.

        Returns:
            tuple: A tuple containing the reconstructed output, mean (mu), and log-variance (log_var).
        """
        mu, log_var = self.encode(x)  # Encode the input
        z = self.reparameterize(mu, log_var)  # Sample from the latent space
        reconstruction = self.decode(z)  # Decode the sample
        return reconstruction, mu, log_var

# Define the training function
def train_vae(model, dataloader, epochs, learning_rate, device):
    """
    Trains the VAE model.

    Args:
        model (VAE): The VAE model to train.
        dataloader (torch.utils.data.DataLoader): DataLoader for the training data.
        epochs (int): Number of training epochs.
        learning_rate (float): Learning rate for the optimizer.
        device (torch.device): The device to train on (e.g., 'cuda' or 'cpu').
    """
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)  # Adam optimizer for training
    model.train()  # Set the model to training mode

    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, data in enumerate(dataloader):
            # Ensure data is on the correct device
            data = data.to(device)

            optimizer.zero_grad()  # Clear gradients from previous step

            # Forward pass
            reconstruction, mu, log_var = model(data)

            # --- VAE Loss Function Justification ---
            # The VAE loss function consists of two main components:
            # 1. Reconstruction Loss (L_recon):
            #    Measures how well the VAE reconstructs the input data. For musical notes and durations,
            #    which can often be normalized to a [0, 1] range (e.g., note MIDI values scaled, durations scaled),
            #    Binary Cross-Entropy (BCE) or Mean Squared Error (MSE) are common choices.
            #    Since our decoder uses a Sigmoid activation, indicating normalized outputs
            #    (e.g., if notes/durations are scaled to 0-1), BCE is often a good fit,
            #    especially if we can treat the output as probabilities or scaled values.
            #    If notes/durations are continuous and not necessarily binary, MSE might be more appropriate.
            #    Here, we assume scaled continuous values, so we'll use MSE for simplicity, but BCEWithLogitsLoss
            #    would be suitable if the output was interpreted as probabilities.
            reconstruction_loss = nn.functional.mse_loss(reconstruction, data, reduction='sum')
            # The 'sum' reduction is used because the original VAE paper sums loss over all dimensions
            # and then averages over the batch.

            # 2. Kullback-Leibler (KL) Divergence Loss (L_KL):
            #    Measures the difference between the approximate posterior q(z|x) (output of the encoder)
            #    and the prior p(z) (typically a standard normal distribution N(0, I)).
            #    This term acts as a regularizer, forcing the latent space to be well-structured
            #    and preventing the encoder from mapping all inputs to a single point.
            #    The analytical form for KL divergence between two Gaussian distributions is:
            #    $D_{KL}(N(\mu, \sigma^2) || N(0, I)) = 0.5 * \sum (1 + log(\sigma^2) - \mu^2 - \sigma^2)$
            #    Where $\sigma^2$ is the variance, and $log(\sigma^2)$ is log_var.
            kl_divergence = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

            # Total VAE Loss: Sum of reconstruction loss and KL divergence loss
            loss = reconstruction_loss + kl_divergence

            # Backpropagation
            loss.backward()  # Compute gradients
            optimizer.step()  # Update model parameters

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader.dataset)  # Average loss per sample
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

    print("Training finished!")


In [38]:
# Define a simple dataset for demonstration
class RandomDataset(Dataset):
    def __init__(self, num_samples, sample_size):
        self.num_samples = num_samples
        self.sample_size = sample_size

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return torch.rand(self.sample_size)

In [49]:
# Example parameters
num_samples = len(df)
sample_size = 16 * 88  # Example: 16 notes with 88 features each

dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Hyperparameters
input_dim = sample_size
hidden_dim = 512
latent_dim = 32
epochs = 10
learning_rate = 1e-3

# Initialize the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = VAE(input_dim, hidden_dim, latent_dim).to(device)


In [53]:
def train_vae(model, dataloader, epochs, learning_rate, device):
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, data in enumerate(dataloader):
            input_tensor, dur_tensor, target_tensor, target_dur_tensor = data

            input_tensor = input_tensor.to(device)
            dur_tensor = dur_tensor.to(device)
            target_tensor = target_tensor.to(device)
            target_dur_tensor = target_dur_tensor.to(device)

            optimizer.zero_grad()

            # Ensure the concatenated input tensor has the correct dimensions
            x = torch.cat((input_tensor, dur_tensor), dim=-1)
            x = x.view(x.size(0), -1)  # Flatten the input tensor if necessary

            reconstruction, mu, log_var = model(x)

            reconstruction_loss = nn.functional.mse_loss(reconstruction, x, reduction='sum')
            kl_divergence = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())

            loss = reconstruction_loss + kl_divergence

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader.dataset)
        print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}')

    print("Training finished!")


In [55]:
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()

        # Adjust the first linear layer to accept the correct input dimension
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),  # Make sure input_dim matches the concatenated input size
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU()
        )

        self.fc_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc_log_var = nn.Linear(hidden_dim, latent_dim)

        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()
        )

    # Rest of the VAE class remains the same


In [44]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x19eecbef0>

In [46]:
print(dataloader)

<torch.utils.data.dataloader.DataLoader object at 0x19eecbef0>


In [56]:
train_vae(model, dataloader, epochs, learning_rate, device)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x1424 and 1408x512)