In [1]:
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import scipy

device = torch.device("cuda")
print(device)

# Load source and target audio signals
# source_audio_path = "path/to/source_audio.wav"
# target_audio_path = "path/to/target_audio.wav"

target_audio_path = "./AMPM - ACCAPELLA REFERENCE.wav"
# full_song_path = "/Users/patrickmuller/Library/CloudStorage/OneDrive-Personal/Music/Projects/Remix/NOTD - AM_PM/NOTD - AMPM.wav"
source_audio_path = "./test_kernel.wav"

source_signal, sample_rate = librosa.load(source_audio_path, sr=None)
target_signal, _ = librosa.load(
    target_audio_path, sr=sample_rate
)  # Ensure same sample rate
print(source_signal.shape)
source_signal = source_signal[500000:1000000].to(device)
target_signal = target_signal[500000:1000000].to(device)

print(target_signal.shape)
# Apply STFT to both signals
n_fft = 1024
hop_length = 16

source_stft = librosa.stft(
    source_signal,
    n_fft=n_fft,
    hop_length=hop_length,
    window=scipy.signal.windows.kaiser(n_fft, beta=9),
)
target_stft = librosa.stft(
    target_signal,
    n_fft=n_fft,
    hop_length=hop_length,
    window=scipy.signal.windows.kaiser(n_fft, beta=9),
)

# Convert to magnitude and phase
source_magnitude, source_phase = np.abs(source_stft), np.angle(source_stft)
target_magnitude, target_phase = np.abs(target_stft), np.angle(target_stft)
print(f"source_phase{source_phase.shape}")
print(f"source_phase{source_phase.shape}")

# Prepare data
# Add channel dimension and convert to PyTorch tensors
source_magnitude_tensor = (
    torch.tensor(source_magnitude, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
).to(device)
target_magnitude_tensor = (
    torch.tensor(target_magnitude, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
).to(device)

cuda
(8068131,)


AttributeError: 'numpy.ndarray' object has no attribute 'to'

In [5]:
source_magnitude_tensor.shape

torch.Size([1, 1, 513, 31251])

In [71]:
import torch


def calculate_output_dim(input_tensor, kernel_size, stride, padding):
    """
    Calculate the output dimensions of a 2D convolution.

    Parameters:
    input_tensor (torch.Tensor): The input tensor of shape (batch_size, channels, height, width).
    kernel_size (tuple): The size of the convolution kernel as (kernel_height, kernel_width).
    stride (tuple): The stride of the convolution as (stride_height, stride_width).
    padding (tuple): The padding added to the input as (padding_height, padding_width).

    Returns:
    tuple: The output height and width as (output_height, output_width).
    """
    # Extract input dimensions
    input_height, input_width = input_tensor.shape[2], input_tensor.shape[3]

    # Extract kernel, stride, and padding values
    kernel_height, kernel_width = kernel_size
    stride_height, stride_width = stride
    padding_height, padding_width = padding

    # Calculate the output height and width
    output_height = (
        input_height - kernel_height + 2 * padding_height
    ) // stride_height + 1
    output_width = (input_width - kernel_width + 2 * padding_width) // stride_width + 1

    return output_height, output_width


# Example usage:
# Assume an input tensor of shape (batch_size, channels, height, width)
input_tensor = torch.randn(1, 1, 513, 31251)  # Example input with height and width 28

# Convolution parameters
kernel_size = (9, 9)
stride = (1, 1)
padding = (4, 4)

output_height, output_width = calculate_output_dim(
    input_tensor, kernel_size, stride, padding
)
k=51
p=25
# Convolution parameters
kernel_size = (k, k)
stride = (1, 1)
padding = (p, p)

output_height, output_width = calculate_output_dim(
    input_tensor, kernel_size, stride, padding
)

print(f"Output Height: {output_height}, Output Width: {output_width}")

Output Height: 513, Output Width: 31251


In [76]:
class SpectrogramMapper(nn.Module):
    def __init__(self):
        super(SpectrogramMapper, self).__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=101, padding=50)
        #self.conv2 = nn.Conv2d(8, 16, kernel_size=29, padding=14)
        #self.conv3 = nn.Conv2d(16, 32, kernel_size=15, padding=7)
        #self.conv4 = nn.Conv2d(32, 64, kernel_size=7, padding=3)
        self.conv5 = nn.Conv2d(8, 1, kernel_size=51, padding=25)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.conv1(x))
        #x = self.relu(self.conv2(x))
        #x = self.relu(self.conv3(x))
        #x = self.relu(self.conv4(x))
        x = self.conv5(x)
        return x


# Instantiate the model
model = SpectrogramMapper().to(device)

# Count the total number of parameters in the model
num_params = sum(p.numel() for p in model.parameters())

print(f"Total number of parameters in the model: {num_params}")

Total number of parameters in the model: 102425


In [78]:
# Training settings
num_epochs = 2
learning_rate = 0.01
criterion = nn.MSELoss()  # Mean squared error loss
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=1
)

# Training loop
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    output = model(source_magnitude_tensor)
    loss = criterion(output, target_magnitude_tensor)
    loss.backward()
    optimizer.step()
    scheduler.step(loss)

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")
    print(f"Current learning rate: {scheduler.get_last_lr()}")

Epoch [1/2], Loss: 8160699.5000
Current learning rate: [0.01]
Epoch [2/2], Loss: 7.9747
Current learning rate: [0.01]


In [50]:
# Transform the source spectrogram using the trained model
model.eval()
with torch.no_grad():
    transformed_magnitude_tensor = model(source_magnitude_tensor)

# Convert the transformed magnitude tensor back to numpy
transformed_magnitude = transformed_magnitude_tensor.squeeze().cpu().numpy()

In [None]:
transformed_magnitude

array([[0.08572994, 0.16976465, 0.19978063, ..., 1.9268178 , 1.3516812 ,
        0.69839084],
       [0.18532898, 0.31287467, 0.35332966, ..., 3.6235032 , 2.5378108 ,
        1.3028425 ],
       [0.22610924, 0.3701585 , 0.41909364, ..., 4.9550223 , 3.523865  ,
        1.8337532 ],
       ...,
       [0.23657455, 0.3827555 , 0.4302116 , ..., 0.4176586 , 0.34662616,
        0.19095562],
       [0.21314324, 0.3376507 , 0.3767636 , ..., 0.36463478, 0.30165017,
        0.1637716 ],
       [0.13902925, 0.20420882, 0.22610693, ..., 0.21619073, 0.17689365,
        0.09134401]], dtype=float32)

In [51]:
import numpy as np
import librosa
import librosa.display
import soundfile as sf
import matplotlib.pyplot as plt

# Assuming these variables are defined elsewhere in your code
# transformed_magnitude, source_phase, hop_length, sample_rate, source_signal

# Reconstruct the complex-valued spectrogram from the transformed magnitude and source phase
transformed_stft = transformed_magnitude * np.exp(1j * source_phase)

# Apply the Inverse Short-Time Fourier Transform (ISTFT)
reconstructed_signal = librosa.istft(transformed_stft, hop_length=hop_length)

# Optionally, save the reconstructed audio to a file
sf.write(
    "reconstructed_512_32_real_15_epochs_even_shorter.wav",
    reconstructed_signal,
    sample_rate,
)

# Plot original and transformed signals
# plt.figure(figsize=(12, 6))
# plt.subplot(2, 1, 1)
# plt.title("Original Source Audio Signal")
# librosa.display.waveshow(source_signal, sr=sample_rate)
# plt.subplot(2, 1, 2)
# plt.title("Transformed Audio Signal")
# librosa.display.waveshow(reconstructed_signal, sr=sample_rate)
# plt.tight_layout()
# plt.show()

In [52]:
from IPython.display import Audio

# Replace 'path_to_your_audio_file' with the actual file path
audio = Audio(filename="reconstructed_512_32_real_15_epochs_even_shorter.wav")
audio


In [None]:
n_fft = 128
hop_length = 64

source_stft = librosa.stft(source_signal, n_fft=n_fft, hop_length=hop_length)
target_stft = librosa.stft(target_signal, n_fft=n_fft, hop_length=hop_length)

reconstructed_source = librosa.istft(source_stft, hop_length=hop_length)
reconstructed_target = librosa.istft(target_stft, hop_length=hop_length)

sf.write("reconstruction_test_source.wav", reconstructed_source, sample_rate)
sf.write("reconstruction_test_target.wav", reconstructed_target, sample_rate)

In [22]:
import librosa
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import scipy
import soundfile as sf


device = torch.device("cuda")
print(device)

# Load source and target audio signals
# source_audio_path = "path/to/source_audio.wav"
# target_audio_path = "path/to/target_audio.wav"

target_audio_path = "./AMPM - ACCAPELLA REFERENCE.wav"
# full_song_path = "/Users/patrickmuller/Library/CloudStorage/OneDrive-Personal/Music/Projects/Remix/NOTD - AM_PM/NOTD - AMPM.wav"
source_audio_path = "./test_kernel.wav"

source_signal, sample_rate = librosa.load(source_audio_path, sr=None)
target_signal, _ = librosa.load(
    target_audio_path, sr=sample_rate
)  # Ensure same sample rate
print(source_signal.shape)
source_signal = source_signal[500000:1000000]
target_signal = target_signal[500000:1000000]

source_tensor = (
    torch.tensor(source_signal, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
).to(device)
target_tensor = (
    torch.tensor(target_signal, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
).to(device)

cuda
(8068131,)


In [23]:
import torch.nn as nn
import torch.nn.functional as F


class AudioFilterModel(nn.Module):
    def __init__(self):
        super(AudioFilterModel, self).__init__()
        self.conv1 = nn.Conv1d(
            in_channels=1, out_channels=3, kernel_size=256000, padding="same"
        )
        #self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=16000, padding="same")
        #self.conv3 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=1000, padding="same" )
        self.conv4 = nn.Conv1d(
            in_channels=3, out_channels=1, kernel_size=500, padding="same"
        )

    def forward(self, x):
        x = F.relu(self.conv1(x))
        #x = F.relu(self.conv2(x))
        #x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        return x


model = AudioFilterModel()
model.to(device)

AudioFilterModel(
  (conv1): Conv1d(1, 3, kernel_size=(256000,), stride=(1,), padding=same)
  (conv4): Conv1d(3, 1, kernel_size=(500,), stride=(1,), padding=same)
)

In [24]:
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Forward pass
    outputs = model(source_tensor)
    import torch.nn.functional as F

    loss = criterion(outputs, target_tensor)

    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    print(f"Epoch [{epoch}], Loss: {loss.item():.4f}")

Epoch [0], Loss: 0.1192
Epoch [1], Loss: 0.1210
Epoch [2], Loss: 0.1189


In [28]:
# Function to apply the filter to a new audio file
def apply_filter(model, waveform):
    model.eval()
    with torch.no_grad():
        filtered_waveform = model(waveform)
    return filtered_waveform

#full_song = torch.nn.functional.pad(full_song, (0, 3071318 - full_song.shape[1]))

filtered_waveform = apply_filter(model, source_tensor)

#filtered_waveform = rms_normalize(filtered_waveform)
# Save the filtered audio
sf.write(
    "reconstructed_512_32_real_15_epochs_even_shorter.wav",
    filtered_waveform.squeeze().cpu(),
    sample_rate,
)

In [29]:
from IPython.display import Audio

# Replace 'path_to_your_audio_file' with the actual file path
audio = Audio(filename="reconstructed_512_32_real_15_epochs_even_shorter.wav")
audio
