In [1]:
# Execute init_notebook.py through IPython's %run magic before any other cell.
# The captured output of this cell shows that the script reports the project root.
# NOTE(review): presumably this is also what makes the `src` package importable
# for the cells below — confirm against init_notebook.py.
%run init_notebook.py

PROJECT ROOT is: c:\Users\Articuno\Desktop\TFG-info


In [2]:

import torch
import torch.nn as nn
import torchaudio

# Adjust these imports according to your project structure:
from src.models import AutoEncoder
from src.config import CONV_KERNEL_SIZE, CONV_STRIDE, CONV_PADDING
from src.utils.models import compute_output_size, compute_flattened_size

# Architecture hyper-parameters used for this shape smoke test.
input_height = 64
input_width = 128
latent_dim = 30
in_channels = 1
filters = [32, 64, 128]

# Instantiate the autoencoder model.
# AutoEncoder is called positionally as
# (input_height, input_width, latent_dim, in_channels, filters).
model = AutoEncoder(input_height, input_width, latent_dim, in_channels, filters)
print("AutoEncoder model:")
print(model)

# Create a dummy input tensor: shape [batch_size, in_channels, input_height, input_width]
batch_size = 4
dummy_input = torch.randn(batch_size, in_channels, input_height, input_width)

# Pass the dummy input through the autoencoder.
# This cell only inspects shapes, so the forward pass runs in eval mode and
# without autograd: no computation graph is built or kept alive.
model.eval()
with torch.no_grad():
    output = model(dummy_input)

# Print input and output shapes
print("Input shape:", dummy_input.shape)
print("Output shape:", output.shape)

# The reconstruction must have exactly the same shape as the input.
if dummy_input.shape == output.shape:
    print("Success: Output shape matches input shape.")
else:
    print("Mismatch: Adjust output_padding in your decoder layers if necessary.")


AutoEncoder model:
AutoEncoder(
  (encoder): Encoder(
    (encoder): Sequential(
      (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (1): ReLU()
      (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (3): ReLU()
      (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (5): ReLU()
      (6): Flatten(start_dim=1, end_dim=-1)
      (7): Linear(in_features=16384, out_features=30, bias=True)
    )
  )
  (decoder): Decoder(
    (decoder): Sequential(
      (0): Linear(in_features=30, out_features=16384, bias=True)
      (1): Unflatten(dim=1, unflattened_size=(128, 8, 16))
      (2): ConvTranspose2d(128, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
      (3): ReLU()
      (4): ConvTranspose2d(128, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
      (5): ReLU()
      (6): ConvTranspose2d(64, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1),

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio.transforms as T
import librosa
import soundfile as sf
import time
from tqdm import tqdm
from torch.utils.data import DataLoader
from IPython.display import Audio

# Adjust these imports according to your project structure:
from src.dataset import NSynth      # Your NSynth dataset class
from src.models import AutoEncoder  # Your autoencoder model
from src.config import CONV_KERNEL_SIZE, CONV_STRIDE, CONV_PADDING

# --------------------------
# Mel spectrogram front-end
# --------------------------
# This transform is handed to the NSynth datasets below as their `transform`.
mel_transform = T.MelSpectrogram(
    n_mels=64,
    n_fft=1024,
    # 501 rather than a round 500: picked so that the mel spectrogram has
    # shape [1, 64, 128].
    hop_length=501,
    sample_rate=16000,
    # Flagged as important by the original author; keep it enabled.
    normalized=True,
)

# --------------------------
# Create Datasets and DataLoaders
# --------------------------
from torch.utils.data import Subset

train_dataset = NSynth(partition='training', transform=mel_transform)
test_dataset  = NSynth(partition='testing', transform=mel_transform)

# Number of leading training examples to keep for quick debugging runs.
# Set to None to train on the whole training partition.
DEBUG_TRAIN_SUBSET_SIZE = 100

if DEBUG_TRAIN_SUBSET_SIZE is not None:
    # Clamp to the dataset length so that a dataset smaller than the requested
    # subset cannot produce out-of-range indices.
    subset_size = min(DEBUG_TRAIN_SUBSET_SIZE, len(train_dataset))
    train_dataset = Subset(train_dataset, list(range(subset_size)))

# Training batches are shuffled; the test loader yields single examples in order.
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader  = DataLoader(test_dataset, batch_size=1, shuffle=False)

# --------------------------
# Model, Optimizer, and Loss
# --------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Autoencoder configuration for this run: a single-entry filter list and a
# 5-dimensional latent space on single-channel 64 x 128 inputs.
input_height = 64
input_width  = 128
latent_dim   = 5
in_channels  = 1
filters      = [8]

# Build the network first, then move its parameters to the selected device.
model = AutoEncoder(input_height, input_width, latent_dim, in_channels, filters)
model = model.to(device)

# Reconstruction objective: mean squared error between output and input.
criterion = nn.MSELoss()
# Adam with a learning rate of 1e-4.
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# --------------------------
# Training Loop
# --------------------------
num_epochs = 20
log_interval = 10  # log step info every 10 iterations

print("Starting training...")

for epoch in range(num_epochs):
    model.train()
    start_epoch_time = time.time()
    epoch_loss = 0.0

    # tqdm tracks progress within the epoch. Each batch is unpacked as a
    # 3-tuple of which only the middle element (the mel spectrogram) is used.
    for i, (_, mel_spec, _) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")):
        mel_spec = mel_spec.to(device)  # e.g., shape [batch, 1, 64, 128]

        # Standard optimisation step: the input is also the reconstruction target.
        optimizer.zero_grad()
        output = model(mel_spec)
        loss = criterion(output, mel_spec)
        loss.backward()
        optimizer.step()

        # Read the scalar once per step and reuse it for both the running sum
        # and the log line. The loss is a per-batch mean, so it is weighted by
        # the batch size before being accumulated.
        batch_loss = loss.item()
        epoch_loss += batch_loss * mel_spec.size(0)

        # Log through tqdm.write so the message does not break the progress
        # bar that is still being drawn.
        if (i + 1) % log_interval == 0:
            tqdm.write(f"  [Epoch {epoch+1}, Step {i+1}/{len(train_loader)}] loss: {batch_loss:.4f}")

    # Turn the size-weighted sum into a per-example average for the epoch.
    epoch_loss /= len(train_loader.dataset)
    epoch_time = time.time() - start_epoch_time
    print(f"Epoch {epoch+1}/{num_epochs}, Avg Loss: {epoch_loss:.4f}, Time: {epoch_time:.2f}s")

print("Training complete.")

# --------------------------
# Testing, Inversion, and Audio Playback
# --------------------------
model.eval()
with torch.no_grad():
    # Only the first test example is reconstructed. Fetching it explicitly
    # makes an empty loader fail here instead of with a NameError further down.
    _, mel_spec, sample_rate = next(iter(test_loader))
    mel_spec = mel_spec.to(device)  # shape [1, 1, 64, 128]
    reconstructed = model(mel_spec)

print("Input shape:", mel_spec.shape)
print("Reconstructed shape:", reconstructed.shape)

# Convert the reconstructed mel spectrogram to a numpy array (squeeze batch and channel dims)
reconstructed_np = reconstructed.squeeze().cpu().numpy()  # Expected shape: [64, 128]

# Inverse Mel Transformation:
sr = 16000       # Sample rate
n_fft = 1024     # FFT window size
hop_length = 501 # Must match mel_transform hop_length
n_mels = 64      # Number of mel bins

# librosa derives the number of mel bands from the array itself, so check the
# layout here rather than passing n_mels to the inversion call.
assert reconstructed_np.shape[0] == n_mels, (
    f"expected {n_mels} mel bands on axis 0, got shape {reconstructed_np.shape}"
)

# Convert mel spectrogram back to waveform.
# - n_mels is NOT passed: librosa already forwards the band count taken from
#   the input shape to its mel filter bank, so passing it again raises
#   "got multiple values for keyword argument 'n_mels'".
# - htk=True / norm=None select the same filter bank that torchaudio's
#   MelSpectrogram uses by default (HTK mel scale, no area normalisation);
#   librosa's own defaults (Slaney scale, slaney norm) would invert a
#   different filter bank than the one that produced the spectrogram.
# NOTE(review): mel_transform was built with normalized=True, which rescales
# the STFT; that scale is not undone here, so the absolute level of the
# output may differ from the source audio — confirm if loudness matters.
reconstructed_audio = librosa.feature.inverse.mel_to_audio(
    reconstructed_np,
    sr=sr,
    n_fft=n_fft,
    hop_length=hop_length,
    htk=True,
    norm=None,
)

# Save the audio file
sf.write("reconstructed.wav", reconstructed_audio, sr)
print("Reconstructed audio saved to 'reconstructed.wav'.")

# Play the audio in the notebook
Audio(reconstructed_audio, rate=sr)


Starting training...


Epoch 1/20:  40%|████      | 10/25 [08:41<13:12, 52.81s/it]

  [Epoch 1, Step 10/25] loss: 96.7195


Epoch 1/20:  80%|████████  | 20/25 [17:17<04:18, 51.61s/it]

  [Epoch 1, Step 20/25] loss: 2.7893


Epoch 1/20: 100%|██████████| 25/25 [21:09<00:00, 50.78s/it]


Epoch 1/20, Avg Loss: 17.3753, Time: 1269.48s


Epoch 2/20:  40%|████      | 10/25 [08:45<14:26, 57.78s/it]

  [Epoch 2, Step 10/25] loss: 12.5235


Epoch 2/20:  40%|████      | 10/25 [09:49<14:43, 58.93s/it]


OSError: [Errno 22] Invalid argument