In [1]:
%run init_notebook.py

PROJECT ROOT is: c:\Users\Articuno\Desktop\TFG-info


In [None]:

%run init_notebook.py

import torch
import torch.nn as nn
import torchaudio

# Adjust these imports according to your project structure:
from src.models import AutoEncoder
from src.config import CONV_KERNEL_SIZE, CONV_STRIDE, CONV_PADDING
from src.utils.models import compute_output_size, compute_flattened_size

# Define parameters
input_height = 64
input_width = 128
latent_dim = 30
in_channels = 1
filters = [32, 64, 128]

# Instantiate the autoencoder model.
# Make sure your AutoEncoder class is defined to accept (input_height, input_width, latent_dim, in_channels, filters)
model = AutoEncoder(input_height, input_width, latent_dim, in_channels, filters)
print("AutoEncoder model:")
print(model)

# Create a dummy input tensor: shape [batch_size, in_channels, input_height, input_width]
batch_size = 4
dummy_input = torch.randn(batch_size, in_channels, input_height, input_width)

# Pass the dummy input through the autoencoder d
output = model(dummy_input)

# Print input and output shapes
print("Input shape:", dummy_input.shape)
print("Output shape:", output.shape)

# Check if the output shape matches the input shape
if dummy_input.shape == output.shape:
    print("Success: Output shape matches input shape.")
else:
    print("Mismatch: Adjust output_padding in your decoder layers if necessary.")


AutoEncoder model:
AutoEncoder(
  (encoder): Encoder(
    (encoder): Sequential(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (3): ReLU()
      (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (5): ReLU()
      (6): Flatten(start_dim=1, end_dim=-1)
      (7): Linear(in_features=16384, out_features=30, bias=True)
    )
  )
  (decoder): Decoder(
    (decoder): Sequential(
      (0): Linear(in_features=30, out_features=16384, bias=True)
      (1): Unflatten(dim=1, unflattened_size=(128, 8, 16))
      (2): ConvTranspose2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
      (3): ReLU()
      (4): ConvTranspose2d(128, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
      (5): ReLU()
      (6): ConvTranspose2d(64, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1),

In [1]:
%run init_notebook.py

import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio.transforms as T
import librosa
import soundfile as sf
import time
from tqdm import tqdm
from torch.utils.data import DataLoader, Subset
from torch.optim.lr_scheduler import ReduceLROnPlateau
from IPython.display import Audio, display

from src.dataset import NSynth   
from src.models import AutoEncoder  
from src.config import CONV_KERNEL_SIZE, CONV_STRIDE, CONV_PADDING
from src.utils.dataset import load_raw_waveform

# Mel spectogram definition
mel_transform = nn.Sequential(
    T.MelSpectrogram(
        sample_rate=16000,
        n_fft=1024,
        hop_length=501,
        n_mels=64,
        # power=2.0,      
        normalized=True
    ),
    # T.AmplitudeToDB(stype="power")  # convert power -> dB
)

# Datasets and DataLoaders
train_dataset = NSynth(partition='training', transform=mel_transform)
valid_dataset = NSynth(partition='validation', transform=mel_transform)
test_dataset  = NSynth(partition='testing', transform=mel_transform)

# Subset for quicker training
train_dataset = Subset(train_dataset, list(range(50000)))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=8, pin_memory=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=True, num_workers=8, pin_memory=True)
test_loader  = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Model, Optimizer, and Loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_height = 64
input_width  = 128
latent_dim   = 64
in_channels  = 1
filters      = [64, 128, 256]

model = AutoEncoder(input_height, input_width, latent_dim, in_channels, filters).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=5, verbose=True)
criterion = nn.MSELoss()

# Training Loop
num_epochs = 50
log_interval = 10

print(f"Starting training on {device}...")
for epoch in tqdm(range(num_epochs), desc="Training"):
    model.train()
    start_epoch_time = time.time()
    train_loss = 0.0
    
    # Only interested in mel_spec from (waveform, sample_rate, key, metadata)
    for i, (mel_spec, _, _, _) in enumerate(train_loader):
        mel_spec = mel_spec.to(device)  # shape [batch_size, 1, 64, 128]

        optimizer.zero_grad()
        output = model(mel_spec)
        loss = criterion(output, mel_spec)
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * mel_spec.size(0)

        # if (i + 1) % log_interval == 0:
        #     print(f"  [Epoch {epoch+1}, Step {i+1}/{len(train_loader)}] loss: {loss.item():.4f}")

    # Compute average epoch loss
    train_loss /= len(train_loader.dataset)
    epoch_time = time.time() - start_epoch_time

    # model.eval()
    val_loss = 0.0
    # with torch.no_grad():
    #     for (mel_spec, _, _, _) in valid_loader:
    #         mel_spec = mel_spec.to(device)  # shape [batch_size, 1, 64, 128]
    #         output = model(mel_spec)
    #         loss = criterion(output, mel_spec)
    #         val_loss += loss.item() * mel_spec.size(0)
    # val_loss /= len(valid_loader.dataset)

    print(f"Epoch {epoch+1}, train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, Time: {epoch_time:.2f}s")

print("Training complete.")

# --------------------------
# Testing, Inversion, and Audio Playback
# --------------------------
model.eval()

# Samples to compare
test_indices = [0, 1, 2, 3]

for idx in test_indices:
    print(f"\n=== Test sample index: {idx} ===")
    # (mel_spec, sample_rate, key, metadata) from dataset
    mel_spec, sample_rate, key, metadata = test_dataset[idx]

    # 1) Listen to the Original Audio with no transform applied
    raw_waveform, raw_sr = load_raw_waveform("testing", key)
    print(f"Key: {key}")
    print("Original audio:")
    display(Audio(raw_waveform.numpy(), rate=raw_sr))

    # 2) Reconstruct using the model
    mel_spec = mel_spec.unsqueeze(0).to(device)  # shape [1, 1, 64, 128]
    with torch.no_grad():
        reconstructed_mel = model(mel_spec)  # shape [1, 1, 64, 128]

    # 3) Convert the reconstructed mel to waveform
    recon_np = reconstructed_mel.squeeze().cpu().numpy()  # [64, 128]
    # recon_power = librosa.db_to_power(recon_np)  # dB -> power
    reconstructed_audio = librosa.feature.inverse.mel_to_audio(
        recon_np, sr=16000, n_fft=1024, hop_length=501
    )

    print("Reconstructed audio:")
    display(Audio(reconstructed_audio, rate=raw_sr))


PROJECT ROOT is: c:\Users\Articuno\Desktop\TFG-info
Starting training on cuda...


Training:   2%|▏         | 1/50 [00:32<26:09, 32.03s/it]

Epoch 1, train_loss=11.2641, val_loss=0.0000, Time: 32.03s


Training:   4%|▍         | 2/50 [01:03<25:35, 31.98s/it]

Epoch 2, train_loss=11.2165, val_loss=0.0000, Time: 31.95s


Training:   6%|▌         | 3/50 [01:35<24:59, 31.90s/it]

Epoch 3, train_loss=11.2050, val_loss=0.0000, Time: 31.80s


Training:   8%|▊         | 4/50 [02:07<24:28, 31.91s/it]

Epoch 4, train_loss=11.2011, val_loss=0.0000, Time: 31.94s


Training:  10%|█         | 5/50 [02:39<23:54, 31.88s/it]

Epoch 5, train_loss=11.1991, val_loss=0.0000, Time: 31.81s


Training:  12%|█▏        | 6/50 [03:11<23:24, 31.92s/it]

Epoch 6, train_loss=11.1974, val_loss=0.0000, Time: 31.98s


Training:  14%|█▍        | 7/50 [03:43<22:50, 31.88s/it]

Epoch 7, train_loss=11.1963, val_loss=0.0000, Time: 31.81s


Training:  16%|█▌        | 8/50 [04:15<22:20, 31.92s/it]

Epoch 8, train_loss=11.1955, val_loss=0.0000, Time: 31.98s


Training:  18%|█▊        | 9/50 [04:47<21:49, 31.93s/it]

Epoch 9, train_loss=11.1949, val_loss=0.0000, Time: 31.97s


Training:  20%|██        | 10/50 [05:19<21:15, 31.90s/it]

Epoch 10, train_loss=11.1943, val_loss=0.0000, Time: 31.81s


Training:  22%|██▏       | 11/50 [05:51<20:45, 31.94s/it]

Epoch 11, train_loss=11.1941, val_loss=0.0000, Time: 32.03s


Training:  24%|██▍       | 12/50 [06:23<20:13, 31.95s/it]

Epoch 12, train_loss=11.1936, val_loss=0.0000, Time: 31.95s


Training:  26%|██▌       | 13/50 [06:55<19:45, 32.03s/it]

Epoch 13, train_loss=11.1933, val_loss=0.0000, Time: 32.22s


Training:  28%|██▊       | 14/50 [07:27<19:12, 32.00s/it]

Epoch 14, train_loss=11.1931, val_loss=0.0000, Time: 31.94s


Training:  30%|███       | 15/50 [07:59<18:40, 32.00s/it]

Epoch 15, train_loss=11.1930, val_loss=0.0000, Time: 32.00s


Training:  32%|███▏      | 16/50 [08:31<18:05, 31.94s/it]

Epoch 16, train_loss=11.1927, val_loss=0.0000, Time: 31.80s


Training:  34%|███▍      | 17/50 [09:03<17:34, 31.94s/it]

Epoch 17, train_loss=11.1926, val_loss=0.0000, Time: 31.96s


Training:  36%|███▌      | 18/50 [09:34<17:02, 31.96s/it]

Epoch 18, train_loss=11.1924, val_loss=0.0000, Time: 31.98s


Training:  38%|███▊      | 19/50 [10:07<16:31, 31.98s/it]

Epoch 19, train_loss=11.1924, val_loss=0.0000, Time: 32.03s


Training:  40%|████      | 20/50 [10:38<15:59, 31.98s/it]

Epoch 20, train_loss=11.1922, val_loss=0.0000, Time: 31.97s


Training:  42%|████▏     | 21/50 [11:10<15:27, 31.98s/it]

Epoch 21, train_loss=11.1922, val_loss=0.0000, Time: 32.00s


Training:  44%|████▍     | 22/50 [11:42<14:55, 31.98s/it]

Epoch 22, train_loss=11.1920, val_loss=0.0000, Time: 31.99s


Training:  46%|████▌     | 23/50 [12:14<14:23, 31.99s/it]

Epoch 23, train_loss=11.1919, val_loss=0.0000, Time: 32.01s


Training:  48%|████▊     | 24/50 [12:46<13:51, 31.99s/it]

Epoch 24, train_loss=11.1920, val_loss=0.0000, Time: 31.97s


Training:  50%|█████     | 25/50 [13:18<13:19, 31.98s/it]

Epoch 25, train_loss=11.1918, val_loss=0.0000, Time: 31.95s


Training:  52%|█████▏    | 26/50 [13:50<12:47, 31.98s/it]

Epoch 26, train_loss=11.1919, val_loss=0.0000, Time: 32.00s


Training:  54%|█████▍    | 27/50 [14:22<12:15, 31.98s/it]

Epoch 27, train_loss=11.1917, val_loss=0.0000, Time: 31.97s


Training:  56%|█████▌    | 28/50 [14:54<11:42, 31.92s/it]

Epoch 28, train_loss=11.1916, val_loss=0.0000, Time: 31.80s


Training:  58%|█████▊    | 29/50 [15:26<11:10, 31.95s/it]

Epoch 29, train_loss=11.1916, val_loss=0.0000, Time: 32.00s


Training:  60%|██████    | 30/50 [15:58<10:39, 31.95s/it]

Epoch 30, train_loss=11.1915, val_loss=0.0000, Time: 31.97s


Training:  62%|██████▏   | 31/50 [16:30<10:06, 31.91s/it]

Epoch 31, train_loss=11.1915, val_loss=0.0000, Time: 31.80s


Training:  64%|██████▍   | 32/50 [17:02<09:33, 31.87s/it]

Epoch 32, train_loss=11.1915, val_loss=0.0000, Time: 31.76s


Training:  66%|██████▌   | 33/50 [17:34<09:01, 31.85s/it]

Epoch 33, train_loss=11.1914, val_loss=0.0000, Time: 31.80s


Training:  68%|██████▊   | 34/50 [18:05<08:29, 31.83s/it]

Epoch 34, train_loss=11.1913, val_loss=0.0000, Time: 31.80s


Training:  70%|███████   | 35/50 [18:37<07:57, 31.82s/it]

Epoch 35, train_loss=11.1913, val_loss=0.0000, Time: 31.80s


Training:  72%|███████▏  | 36/50 [19:09<07:25, 31.81s/it]

Epoch 36, train_loss=11.1913, val_loss=0.0000, Time: 31.80s


Training:  74%|███████▍  | 37/50 [19:41<06:52, 31.75s/it]

Epoch 37, train_loss=11.1912, val_loss=0.0000, Time: 31.59s


Training:  76%|███████▌  | 38/50 [20:12<06:20, 31.70s/it]

Epoch 38, train_loss=11.1912, val_loss=0.0000, Time: 31.58s


Training:  78%|███████▊  | 39/50 [20:44<05:48, 31.73s/it]

Epoch 39, train_loss=11.1912, val_loss=0.0000, Time: 31.80s


Training:  80%|████████  | 40/50 [21:16<05:17, 31.75s/it]

Epoch 40, train_loss=11.1911, val_loss=0.0000, Time: 31.80s


Training:  82%|████████▏ | 41/50 [21:47<04:45, 31.75s/it]

Epoch 41, train_loss=11.1911, val_loss=0.0000, Time: 31.77s


Training:  84%|████████▍ | 42/50 [22:19<04:14, 31.76s/it]

Epoch 42, train_loss=11.1911, val_loss=0.0000, Time: 31.78s


Training:  86%|████████▌ | 43/50 [22:51<03:42, 31.77s/it]

Epoch 43, train_loss=11.1911, val_loss=0.0000, Time: 31.80s


Training:  88%|████████▊ | 44/50 [23:23<03:10, 31.78s/it]

Epoch 44, train_loss=11.1911, val_loss=0.0000, Time: 31.80s


Training:  90%|█████████ | 45/50 [23:55<02:39, 31.84s/it]

Epoch 45, train_loss=11.1910, val_loss=0.0000, Time: 31.95s


Training:  92%|█████████▏| 46/50 [24:27<02:07, 31.82s/it]

Epoch 46, train_loss=11.1910, val_loss=0.0000, Time: 31.77s


Training:  94%|█████████▍| 47/50 [24:58<01:35, 31.82s/it]

Epoch 47, train_loss=11.1912, val_loss=0.0000, Time: 31.83s


Training:  96%|█████████▌| 48/50 [25:30<01:03, 31.81s/it]

Epoch 48, train_loss=11.1909, val_loss=0.0000, Time: 31.78s


Training:  98%|█████████▊| 49/50 [26:02<00:31, 31.81s/it]

Epoch 49, train_loss=11.1909, val_loss=0.0000, Time: 31.81s


Training: 100%|██████████| 50/50 [26:34<00:00, 31.89s/it]

Epoch 50, train_loss=11.1909, val_loss=0.0000, Time: 31.77s
Training complete.

=== Test sample index: 0 ===
Key: bass_synthetic_068-049-025
Original audio:





Reconstructed audio:



=== Test sample index: 1 ===
Key: keyboard_electronic_001-021-127
Original audio:


Reconstructed audio:



=== Test sample index: 2 ===
Key: guitar_acoustic_010-066-100
Original audio:


Reconstructed audio:



=== Test sample index: 3 ===
Key: reed_acoustic_037-068-127
Original audio:


Reconstructed audio:
