In [1]:
import torch
import numpy as np
import soundfile as sf
from utils import _latentspace
from utils._modeltraining import EmotionalVAE

In [4]:
# --- Load the pre-trained EmotionalVAE model ---
# Architecture hyperparameters. They must describe the same architecture that
# produced the checkpoint, otherwise load_state_dict() below will raise on
# missing/unexpected keys or mismatched tensor shapes.
latent_dim = 750
hidden_dims = [512, 256, 128]
condition_dim = 2
model = EmotionalVAE(latent_dim, hidden_dims, condition_dim)

# Use forward slashes so the path resolves on POSIX as well as on Windows
# (a backslash is only treated as a path separator on Windows).
model_path = 'vae_model/final_model.pt'

# SECURITY: weights_only=False allows torch.load to unpickle arbitrary Python
# objects, which can execute code embedded in the file. Only load checkpoints
# that come from a trusted source.
checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()  # put the module in evaluation mode before sampling
print("EmotionalVAE model loaded successfully.")

EmotionalVAE model loaded successfully.


In [5]:
# --- Generate latent tokens using sample() ---
# Pick the compute device first and move the model onto it so that every
# tensor created below can be placed on the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Number of samples to draw and an example emotional condition (valence, arousal)
num_samples = 5
condition = torch.tensor([0.7, 0.6])  # Adjust values as necessary
if condition.dim() == 1:
    # A single 1-D condition gets a leading batch dimension: [2] -> [1, 2]
    condition = condition.unsqueeze(0)
# One condition row per sample, explicitly placed on the model's device so a
# CUDA run does not rely on sample() moving the tensor itself.
condition = condition.repeat(num_samples, 1).to(device)

latent_tokens = model.sample(num_samples, condition, device)
# latent_tokens shape: [num_samples, quantizer_count, latent_dim]
print("Latent tokens generated.")

Latent tokens generated.


In [6]:
# --- Set up the EnCodec wrapper whose decoder will consume the latent tokens ---
# LatentRepresentationGenerator gives access to the EnCodec model and its decoder.
# The EnCodec model is assumed to be loaded inside the generator via
# _load_encodec_model().

# Tune these settings as required
lat_gen = _latentspace.LatentRepresentationGenerator(
    chunk_sizes=[5],
    device=device,
    encodec_bandwidth=6.0,
)

Using device: cpu
Loading EnCodec model...


  WeightNorm.apply(module, name, dim)


EnCodec model loaded successfully (target bandwidth: 6.0 kbps)


In [7]:
# Ensure the EnCodec model is loaded. Only trigger a load when the generator
# does not already hold a model, to avoid a redundant (and slow) reload when
# it was already loaded at construction time.
if getattr(lat_gen, "encodec_model", None) is None:
    lat_gen._load_encodec_model()
print("EnCodec decoder loaded successfully.")

# decode() is called with a list of (codes, scale) frames. The scale is the
# same for every sample, so it is built once outside the loop.
# NOTE(review): a scale of ones is presumably a no-op rescale -- confirm
# against the EnCodec decode implementation.
scale = torch.ones(1, 1).to(device)

decoded_audios = []
with torch.no_grad():
    for sample in latent_tokens:
        # Add a batch dimension, then round the sampled values to the nearest
        # integer and clamp them so they can be used as integer codes.
        # NOTE(review): the 0-255 range was marked as an example range in the
        # original notebook; confirm it against the EnCodec codebook size.
        codes_int = sample.unsqueeze(0).round().long().clamp(0, 255)
        decoded_audio = lat_gen.encodec_model.decode([(codes_int, scale)])[0]

        # Move decoded audio to CPU and convert to a NumPy array.
        decoded_audio_np = decoded_audio.cpu().numpy()[0]  # Assuming mono channel
        decoded_audios.append(decoded_audio_np)

Loading EnCodec model...
EnCodec model loaded successfully (target bandwidth: 6.0 kbps)
EnCodec decoder loaded successfully.


In [8]:
# --- Write every decoded waveform to its own WAV file ---
# The sample rate is read from the EnCodec model held by lat_gen.
sample_rate = lat_gen.encodec_model.sample_rate
for i in range(len(decoded_audios)):
    audio = decoded_audios[i]
    output_path = f"decoded_sample_{i}.wav"
    sf.write(output_path, audio, sample_rate)
    print(f"Audio sample saved: {output_path}")

# Closing status line
print("Full audio synthesis pipeline complete: Audio → EnCodec Encoder → Custom Autoencoder → New tokens → EnCodec Decoder → Audio")

Audio sample saved: decoded_sample_0.wav
Audio sample saved: decoded_sample_1.wav
Audio sample saved: decoded_sample_2.wav
Audio sample saved: decoded_sample_3.wav
Audio sample saved: decoded_sample_4.wav
Full audio synthesis pipeline complete: Audio → EnCodec Encoder → Custom Autoencoder → New tokens → EnCodec Decoder → Audio
