# Mounting Google Drive

In [None]:
# Mount Google Drive into the Colab runtime at /content/gdrive so that files
# stored on Drive can be read through ordinary file paths.
# NOTE(review): presumably this prompts for authorization the first time it runs
# in a session — confirm against the Colab documentation.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Importing Libraries

In [None]:
import torch
from torch import nn
import librosa
import numpy as np

# Checking GPU availability


In [None]:
# Select the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Report which device was picked.
print("running on the GPU" if device.type == 'cuda' else "running on the CPU")

running on the CPU


## Model Definition

In [None]:
"""LSTMNet Class (runs on CPU)"""

class LSTMNet(nn.Module):
    """Stacked LSTM that returns the per-time-step output of its last layer.

    Args:
        input_dim: Number of features per time step of the input.
        hidden_dim: Size of the LSTM hidden state; this is also the feature
            size of the returned sequence.
        num_layers: Number of stacked LSTM layers.
        output_dim: Accepted for interface compatibility but currently unused,
            because the fully-connected projection layer is disabled.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        # Kept on the instance because forward() needs them to size the initial states.
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # batch_first=True makes input/output tensors (batch_dim, seq_dim, feature_dim).
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)

        # NOTE: a final nn.Linear(hidden_dim, output_dim) projection is intentionally
        # not created here, so `output_dim` has no effect on the model.

    def forward(self, x):
        """Run the LSTM over `x` starting from all-zero hidden and cell states.

        Args:
            x: Input tensor laid out as (batch, seq, input_dim), matching the
                `batch_first=True` configuration of the LSTM.

        Returns:
            The LSTM output sequence, laid out as (batch, seq, hidden_dim).
        """
        # Build the initial states on the same device and with the same dtype as
        # the input. A bare torch.zeros(...) is always CPU float32, which breaks as
        # soon as the model/input is moved to a GPU or cast to another dtype.
        state_shape = (self.num_layers, x.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)
        c0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)

        # Fresh zero tensors carry no autograd history, so no detach() is needed to
        # stop gradients from flowing into a previous batch.
        out, _ = self.lstm(x, (h0, c0))
        return out
    

In [None]:
# Hyper-parameters: number of mel features per frame and number of stacked LSTM layers.
n_mels, n_layers = 80, 3

# The mel feature count is used for the input, hidden and output sizes alike.
rnn = LSTMNet(input_dim=n_mels, hidden_dim=n_mels, num_layers=n_layers, output_dim=n_mels)

# Count only the parameters that take part in training.
rnn_total_params = sum(
    weight.numel() for weight in filter(lambda w: w.requires_grad, rnn.parameters())
)
print("Total number of parameters for a {} layer LSTM with {} mel features: ".format(n_layers, n_mels))
print(" {}".format(rnn_total_params))

Total number of parameters for a 3 layer LSTM with 80 mel features: 
 155520


In [None]:
""" Generating some random data drawn from a Normal distribution (0,1) """

# A single batch entry of `seq_length` frames with `n_mels` standard-normal features each.
seq_length = 1000
# NOTE: the name `input` shadows Python's built-in input(); it is kept unchanged here
# because other cells may refer to it.
input = torch.randn((1, seq_length, n_mels))
print(input)

# Push the random sequence through the (untrained) LSTM built earlier.
output = rnn(input)
print(output)

# Compare the overall magnitude of what went in with what came out.
print("Norm of input data: {}".format(input.norm()))
print("Norm of output data: {}".format(output.norm()))

tensor([[[-0.7579, -0.3948, -1.3200,  ..., -0.7753, -1.8697,  0.5614],
         [-0.4584, -0.2034,  1.0261,  ...,  0.2085,  0.7370,  0.0159],
         [-2.0952, -0.1791,  0.9458,  ..., -0.8769,  0.0178,  0.5256],
         ...,
         [-0.3332, -0.1104, -0.1425,  ..., -1.2562, -0.0578,  0.2547],
         [ 0.4299, -1.8319, -0.9533,  ...,  1.5572, -1.2213,  0.0718],
         [-0.1717, -0.5625,  0.0332,  ...,  0.4196, -0.2139,  0.4226]]])
tensor([[[ 0.0274, -0.0128,  0.0087,  ..., -0.0384,  0.0252,  0.0296],
         [ 0.0419, -0.0151,  0.0125,  ..., -0.0509,  0.0351,  0.0431],
         [ 0.0498, -0.0125,  0.0141,  ..., -0.0565,  0.0362,  0.0488],
         ...,
         [ 0.0498, -0.0113,  0.0281,  ..., -0.0506,  0.0344,  0.0523],
         [ 0.0456, -0.0143,  0.0306,  ..., -0.0517,  0.0367,  0.0492],
         [ 0.0358, -0.0204,  0.0336,  ..., -0.0508,  0.0375,  0.0443]]],
       grad_fn=<TransposeBackward0>)
Norm of input data: 282.0531311035156
Norm of output data: 13.260701179504395


In [None]:
# Load up to 9 seconds of mono audio and compute its mel spectrogram.
audio_file = '/content/gdrive/MyDrive/Colab Notebooks/wavs/LJ001/LJ001-0011.wav'
y_c, sr_c = librosa.load(audio_file, mono=True, duration=9)

# Use the sample rate returned by librosa.load and the shared `n_mels` setting rather
# than repeating literal values, so the spectrogram always matches the loaded audio
# and the feature size expected by the LSTM below.
mel_spect_c = librosa.feature.melspectrogram(y=y_c, sr=sr_c, n_fft=1024, hop_length=256, n_mels=n_mels)
mel_spect_c_db = librosa.power_to_db(mel_spect_c, ref=np.max)
# Scale the dB values by 1/10 and convert to a float32 tensor laid out as (n_mels, n_frames).
mel_spect_c = torch.tensor(mel_spect_c_db / 10, dtype=torch.float32)

# Running the mel spectrogram through an LSTM (that has not been trained)
rnn_mel = LSTMNet(n_mels, n_mels, n_layers, n_mels)

print(mel_spect_c)
# The LSTM expects (batch, time, features): add a batch axis and swap the mel/time axes.
# Native tensor ops are used instead of np.transpose, which only works on a tensor
# through NumPy's fallback conversion path.
mel_spect_c_reshape = mel_spect_c.unsqueeze(0).permute(0, 2, 1)
mel_out = rnn_mel(mel_spect_c_reshape)
print(mel_out)

print("Norm of input data: {}".format(torch.norm(mel_spect_c_reshape)))
print("Norm of output data: {}".format(torch.norm(mel_out)))


loss = nn.MSELoss()
print("MSE loss towards input: {}".format(loss(mel_out, mel_spect_c_reshape)))


# Standardize over time: per mel band, subtract the mean and divide by the standard deviation.
# keepdim=True keeps the time axis so broadcasting stays correct for any batch size
# (without it, the statistics only line up because the batch size is 1).
mel_mean = mel_spect_c_reshape.mean(dim=1, keepdim=True)
mel_std = mel_spect_c_reshape.std(dim=1, keepdim=True)
# A band that is constant over time has zero spread; dividing by it would give NaN
# (0 / 0). Use 1 as the divisor for such bands so they normalize to 0 instead.
safe_std = torch.where(mel_std > 0, mel_std, torch.ones_like(mel_std))
mel_spect_c_norm = (mel_spect_c_reshape - mel_mean) / safe_std
print(mel_spect_c_norm)
# Drop the kept time axis again so the printed statistics have one row per batch entry.
print(mel_mean.squeeze(1))
print(mel_std.squeeze(1))
mel_norm_out = rnn_mel(mel_spect_c_norm)

print("Norm of input data (after normalization): {}".format(torch.norm(mel_spect_c_norm)))
print("Norm of output data (after normalization): {}".format(torch.norm(mel_norm_out)))


print("MSE loss towards input (after normalization): {}".format(loss(mel_norm_out, mel_spect_c_norm)))

tensor([[-7.5830, -6.9833, -7.0331,  ..., -8.0000, -8.0000, -8.0000],
        [-7.1796, -6.4630, -5.5040,  ..., -7.0143, -7.1549, -7.0855],
        [-6.6826, -5.4176, -5.0248,  ..., -6.3750, -6.8387, -7.0134],
        ...,
        [-8.0000, -8.0000, -7.4036,  ..., -8.0000, -8.0000, -8.0000],
        [-8.0000, -8.0000, -7.5023,  ..., -8.0000, -8.0000, -8.0000],
        [-8.0000, -8.0000, -7.7651,  ..., -8.0000, -8.0000, -8.0000]])
tensor([[[ 0.0368,  0.0118,  0.0504,  ..., -0.0063, -0.0060, -0.0067],
         [ 0.0523,  0.0148,  0.0828,  ..., -0.0120, -0.0137, -0.0102],
         [ 0.0537,  0.0147,  0.1015,  ..., -0.0160, -0.0206, -0.0131],
         ...,
         [ 0.0424,  0.0049,  0.1356,  ..., -0.0048, -0.0364, -0.0253],
         [ 0.0418,  0.0044,  0.1358,  ..., -0.0047, -0.0375, -0.0253],
         [ 0.0408,  0.0041,  0.1359,  ..., -0.0046, -0.0389, -0.0254]]],
       grad_fn=<TransposeBackward0>)
Norm of input data: 1060.8131103515625
Norm of output data: 9.159797668457031
MSE loss 