In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

In [8]:
class CNNClassifier(nn.Module):
    """Small 2-D CNN that maps an image-like input to class logits.

    Architecture: three ``Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2)`` blocks
    (32, 64 and 128 output channels), global average pooling, and a two-layer
    MLP head with dropout.

    Args:
        num_classes: Number of output logits.
        in_channels: Number of channels in the input tensor. Defaults to 1,
            the value that was previously hard-coded.
        dropout: Dropout probability used in the classifier head. Defaults to
            0.4, the value that was previously hard-coded.
    """

    def __init__(self, num_classes: int, in_channels: int = 1, dropout: float = 0.4):
        super().__init__()

        # Each block halves both spatial dimensions via MaxPool2d(2).
        self.conv_block1 = self._conv_block(in_channels, 32)
        self.conv_block2 = self._conv_block(32, 64)
        self.conv_block3 = self._conv_block(64, 128)

        # Collapsing to 1x1 makes the head independent of the input's H and W.
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))  # output: [batch, 128, 1, 1]

        self.classifier = nn.Sequential(
            nn.Flatten(),  # [batch, 128]
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, num_classes)
        )

    @staticmethod
    def _conv_block(in_channels: int, out_channels: int) -> nn.Sequential:
        """Build one 3x3 conv -> batch norm -> ReLU -> 2x2 max-pool block."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

    def forward(self, x):
        """Compute class logits.

        Args:
            x: Tensor of shape ``[batch, in_channels, H, W]``. H and W should be
                at least 8 so that three 2x poolings leave a non-empty map.

        Returns:
            Tensor of shape ``[batch, num_classes]`` holding unnormalised logits.
        """
        x = self.conv_block1(x)  # [batch, 32, H/2, W/2]
        x = self.conv_block2(x)  # [batch, 64, H/4, W/4]
        x = self.conv_block3(x)  # [batch, 128, H/8, W/8]
        x = self.global_pool(x)  # [batch, 128, 1, 1]
        logits = self.classifier(x)  # [batch, num_classes]
        return logits

In [None]:
# Hyperparameters, judging by their names; no visible cell reads them yet.
num_epochs, batch_size = 50, 128


In [15]:
from pathlib import Path

import librosa

# Sample rate (Hz) requested from librosa.load; the file is resampled to it.
SAMPLE_RATE = 44100

# Resolve the recording under the current user's home directory instead of
# hard-coding one machine's absolute path.
AUDIO_PATH = Path.home() / "Downloads" / "XC815854-MIXPRE-013.wav"

array, sr = librosa.load(AUDIO_PATH, sr=SAMPLE_RATE)

In [11]:
# Magnitude spectrogram of the whole recording, converted to decibels.
D = librosa.stft(array)
db = librosa.amplitude_to_db(np.abs(D))

# Prepend two singleton axes so the tensor is 4-D, as the model call expects.
# The shape is derived from `db` rather than hard-coded, so recordings of any
# length work.
# NOTE: `input` shadows the Python builtin; the name is kept because a later
# cell passes it to the model.
input = torch.tensor(db.reshape((1, 1, *db.shape)))

In [29]:
# Length of the analysis window in seconds and in samples. Uses the sample
# rate returned by librosa.load instead of repeating the literal 44100.
WINDOW_SECONDS = 20
window_size = int(WINDOW_SECONDS * sr)

n_samples = array.shape[0]
if n_samples < window_size:
    # Without this guard the slice below would be shorter than the Hann window
    # and the multiplication would fail with an opaque broadcasting error.
    raise ValueError(
        f"recording has {n_samples} samples, fewer than the "
        f"{window_size}-sample window; pad it first"
    )

# Centre crop with integer arithmetic so the slice is exactly window_size long.
lower = (n_samples - window_size) // 2
upper = lower + window_size

# Taper the crop with a Hann window before taking the STFT.
window = np.hanning(window_size)
windowed_input = array[lower:upper] * window
D = librosa.stft(windowed_input)
db = librosa.amplitude_to_db(np.abs(D))
print(db.shape)

(1025, 1723)


In [None]:
# Sanity check: centre-pad a 300000-sample dummy signal up to window_size.
a = np.arange(300000)
d = window_size - a.shape[0]  # total number of samples to add
before = d // 2
after = d - before  # receives the extra sample when d is odd
# The original line was `print().shape`, which fails because print() returns
# None; print the shape of the zero-padded array instead.
print(np.pad(a, (before, after)).shape)

(882000,)


In [None]:
def pad_or_truncate(array: np.ndarray, window_size: int) -> np.ndarray:
    """Force ``array`` to exactly ``window_size`` entries along its first axis.

    Shorter inputs are zero-padded symmetrically; when the padding is odd the
    extra zero goes at the end. Longer inputs are centre-cropped. Inputs that
    already have the requested length are returned unchanged (same object).

    Args:
        array: Input array; only the first axis is resized.
        window_size: Target length of the first axis.

    Returns:
        An array whose first axis has length ``window_size``. The dtype of the
        input is preserved in every branch.
    """
    array_size = array.shape[0]

    if array_size < window_size:
        d = window_size - array_size
        before = d // 2
        after = d - before
        # np.pad keeps the input dtype (concatenating with np.zeros would
        # upcast float32 audio to float64) and, unlike np.concat, is available
        # in NumPy releases before 2.0. Only the first axis is padded.
        pad_width = [(before, after)] + [(0, 0)] * (array.ndim - 1)
        return np.pad(array, pad_width)

    if array_size > window_size:
        # Integer arithmetic guarantees the slice is exactly window_size long.
        lower = (array_size - window_size) // 2
        return array[lower:lower + window_size]

    return array

In [13]:
# Smoke test: run the spectrogram tensor through a freshly initialised
# 10-class model and show the raw logits.
model = CNNClassifier(num_classes=10)
logits = model(input)
print(logits)

tensor([[-0.1790,  0.6011, -0.0431,  0.1033,  0.1780, -0.4846,  0.4245, -0.0985,
         -0.0012,  0.1388]], grad_fn=<AddmmBackward0>)
