In [2]:
from fcrbm_utilities import FCRBM, spectrogram_to_audio, process_audio_files, synthesize_audio

2025-08-05 10:38:58.634378: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
import os
import librosa
import numpy as np
import tensorflow as tf

In [4]:
# ingest and process audio samples

audio_sample_dir = 'data/fcrbm_audio_segments/'

# define FFT parameters
n_fft = 2048
hop = 512
sr = 44100

# process audio

spectrogram_data_np, style_data_np, visible_dim = process_audio_files(audio_sample_dir, n_fft, hop, sr)

# Prepare history data (the previous spectrogram frame)
history_data_np = np.roll(spectrogram_data_np, shift=1, axis=0)
history_data_np[0] = 0 # The first frame has no previous frame, so its history is zero.

# Convert to TensorFlow tensors
v_data = tf.constant(spectrogram_data_np, dtype=tf.float32)
u_data = tf.constant(history_data_np, dtype=tf.float32)
y_data = tf.constant(style_data_np, dtype=tf.float32)

# Print the shapes to verify everything is correct
print("Data successfully loaded and preprocessed! ✅")
print(f"Visible data (spectrogram frames) shape: {v_data.shape}")
print(f"History data (previous frames) shape: {u_data.shape}")
print(f"Style data (one-hot vectors) shape: {y_data.shape}")
print(f"Visible layer dimension: {visible_dim}")
print(f"Number of distinct styles (style_dim): {y_data.shape[1]}")

Data successfully loaded and preprocessed! ✅
Visible data (spectrogram frames) shape: (12128, 1025)
History data (previous frames) shape: (12128, 1025)
Style data (one-hot vectors) shape: (12128, 8)
Visible layer dimension: 1025
Number of distinct styles (style_dim): 8


In [18]:
# Assuming spectrogram_data_np is your preprocessed numpy array
spectrogram_mean = np.mean(spectrogram_data_np)
spectrogram_std = np.std(spectrogram_data_np)
v_data_normalized = (spectrogram_data_np - spectrogram_mean) / (spectrogram_std + 1e-8)

# Convert the normalized data to a TensorFlow tensor
v_data = tf.constant(v_data_normalized, dtype=tf.float32)

# Calculate mean and standard deviation for the history data
u_data_mean = np.mean(history_data_np)
u_data_std = np.std(history_data_np)
u_data_normalized = (history_data_np - u_data_mean) / (u_data_std + 1e-8)

u_data = tf.constant(u_data_normalized, dtype=tf.float32)


In [25]:
# match parameters defined in audio processing

visible_dim = int((n_fft / 2) + 1)
hidden_dim = 100
style_dim = 8 # number of segments we are trying to learn
history_dim = visible_dim # the autoregressive data

learning_rate = 0.0000001
epochs = 5
batch_size = 128

#instatiate FCRBM with correct parameters eventually

fcrbm = FCRBM(
    visible_dim=visible_dim,
    hidden_dim=hidden_dim,
    style_dim=style_dim,
    history_dim=visible_dim,
    k=3
)

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)


In [26]:
# v_data (spectrogram frames), u_data (history), y_data (style)

# create a TensorFlow Dataset from the tensors
# dataset = tf.data.Dataset.from_tensor_slices((v_data, u_data, y_data)).shuffle(buffer_size=1000).batch(batch_size)

In [27]:
# Create a repeatable dataset
dataset = tf.data.Dataset.from_tensor_slices((v_data, u_data, y_data)).shuffle(buffer_size=12128).batch(batch_size).repeat()

# Set the number of steps per epoch
steps_per_epoch = 12128 // batch_size

# Training loop
for epoch in range(epochs):
    epoch_loss = 0
    # Iterate for a fixed number of steps
    for step in range(steps_per_epoch):
        v_batch, u_batch, y_batch = next(iter(dataset)) # Manually get the next batch
        loss = fcrbm.train_step(v_batch, u_batch, y_batch, optimizer)
        epoch_loss += loss.numpy()
        
    print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / steps_per_epoch:.4f}")

Epoch 1/5, Loss: 1.1282
Epoch 2/5, Loss: 1.1099
Epoch 3/5, Loss: 1.1228
Epoch 4/5, Loss: 1.0971
Epoch 5/5, Loss: 1.0782


In [None]:
# data variables:
# fcrbm (the trained model)
# v_data, u_data, y_data (from your original training data)
# n_fft, hop_length, sr (your STFT parameters)

# get a seed from your training data; ensures the generated audio has a coherent start.
seed_index = 100 # Choose a random frame from your dataset... potentially doesn't have to be random, potentially targetted
u_seed = v_data[seed_index]

# choose a style from your dataset. for 9 styles, this would be a one-hot vector, e.g., [0, 0, 1, 0, 0, 0, 0, 0, 0].
style_index = 2 # generate in the style of the 3rd audio file.
style_dim = 9
y_seed = tf.one_hot(style_index, depth=style_dim, dtype=tf.float32)

# --- Generate a new audio waveform ---
print("Synthesizing new audio...")
new_audio_waveform = synthesize_audio(
    model=fcrbm,
    u_seed=u_seed,
    y_seed=y_seed,
    num_frames_to_generate=500, # Number of frames to generate
    n_fft=n_fft,
    hop_length=hop_length,
    sr=sr
)
print("Synthesis complete!")

# Save the generated audio to a file.
librosa.output.write_wav('generated_audio_style_2.wav', new_audio_waveform, sr)
print("Audio saved as 'generated_audio_style_2.wav'")