In [5]:
import warnings
warnings.simplefilter("ignore")

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import DataLoader, Dataset

class SpeechCommandsDataset(Dataset):
    """Fixed-length, integer-labelled view over ``torchaudio.datasets.SPEECHCOMMANDS``.

    Each item is ``(waveform, label_index)``. Clips in the wrapped dataset can
    differ in length, which the default ``DataLoader`` collate function cannot
    stack into one tensor, so every waveform is zero-padded or truncated along
    its last dimension to ``num_samples``.

    Args:
        subset: Forwarded to ``SPEECHCOMMANDS`` (this notebook passes
            ``'training'`` and ``'testing'``).
        num_samples: Length of the last dimension of every returned waveform.
            Defaults to 16000. Pass ``None`` to return waveforms unchanged.

    Attributes:
        dataset: The wrapped ``SPEECHCOMMANDS`` instance.
        num_samples: See ``Args``.
        labels: Sorted list of the distinct label strings found in ``dataset``.
        label_to_index: Maps each label string to its position in ``labels``.
    """

    def __init__(self, subset, num_samples=16000):
        self.dataset = torchaudio.datasets.SPEECHCOMMANDS(root='./data/SpeechCommands', download=True, subset=subset)
        self.num_samples = num_samples
        self.labels = sorted(self._collect_labels())
        self.label_to_index = {label: i for i, label in enumerate(self.labels)}

    def _collect_labels(self):
        """Return the set of label strings present in the wrapped dataset."""
        get_metadata = getattr(self.dataset, "get_metadata", None)
        if get_metadata is not None:
            # Metadata lookups return the label without decoding the audio,
            # which avoids reading every file once just to build the vocabulary.
            return {get_metadata(i)[2] for i in range(len(self.dataset))}
        # Fallback when the dataset object offers no metadata accessor:
        # iterate over full samples (slow, but equivalent).
        return {sample[2] for sample in self.dataset}

    def __len__(self):
        """Number of clips in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return ``(waveform, label_index)`` for the clip at ``index``."""
        waveform, _sample_rate, label, *_rest = self.dataset[index]
        if self.num_samples is not None:
            length = waveform.size(-1)
            if length < self.num_samples:
                # Right-pad with zeros (silence) up to the target length.
                waveform = nn.functional.pad(waveform, (0, self.num_samples - length))
            elif length > self.num_samples:
                waveform = waveform[..., :self.num_samples]
        return waveform, self.label_to_index[label]

# Wrap the two Speech Commands splits used by this notebook.
train_dataset = SpeechCommandsDataset(subset="training")
test_dataset = SpeechCommandsDataset(subset="testing")

# Batches of 32 clips; only the training split is reshuffled on each pass.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
)

In [None]:
class PyramidBlock1D(nn.Module):
    """Two conv/LayerNorm/ReLU stages separated by a stride-2 max-pool.

    Pipeline: conv1 -> ln1 -> ReLU -> max-pool(2) -> conv2 -> ln2 -> ReLU -> dropout.
    Both convolutions use kernel size 3 with padding 1, so only the pooling
    changes the time-axis length: an input of shape
    ``(batch, in_channels, input_dim)`` yields
    ``(batch, out_channels, input_dim // 2)``.

    Args:
        in_channels: Channel count of the incoming tensor.
        out_channels: Channel count produced by both convolutions.
        input_dim: Time-axis length of the incoming tensor. The LayerNorm
            layers are sized from it, so inputs must have exactly this length.
    """

    def __init__(self, in_channels, out_channels, input_dim):
        super().__init__()
        pooled_dim = input_dim // 2  # time-axis length after the max-pool

        # First stage works at the incoming resolution.
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, padding=1)
        self.ln1 = nn.LayerNorm([out_channels, input_dim])

        # Second stage works at the pooled resolution.
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, padding=1)
        self.ln2 = nn.LayerNorm([out_channels, pooled_dim])

        self.dropout = nn.Dropout(0.1)
        self.pool = nn.MaxPool1d(2)

    def forward(self, x):
        """Apply the block to ``x`` and return the pooled feature map."""
        stage_one = nn.functional.relu(self.ln1(self.conv1(x)))
        halved = self.pool(stage_one)
        stage_two = nn.functional.relu(self.ln2(self.conv2(halved)))
        return self.dropout(stage_two)
    
class LaplacianNet1D(nn.Module):
    """Three stacked ``PyramidBlock1D`` stages fed by a four-level input pyramid.

    Each stage halves the time axis. Before stages 2 and 3, and before the
    classifier head, the running feature map is concatenated channel-wise with
    the next pyramid input, so the four inputs must have time-axis lengths
    ``L, L // 2, L // 4, L // 8`` where ``L = input_shape[1]``.

    ``forward`` adaptively average-pools any input whose length differs from
    the one its level requires. Passing four full-resolution copies (as the
    training loop in this notebook does) therefore works instead of failing
    inside ``torch.cat``; inputs that already have the right length are used
    untouched.

    Args:
        input_shape: ``(channels, length)`` of the full-resolution input.
        num_classes: Size of the output logit vector.
    """

    def __init__(self, input_shape, num_classes):
        super(LaplacianNet1D, self).__init__()
        channels, length = input_shape[0], input_shape[1]
        # Time-axis length required of each of the four inputs (see class docstring).
        self.level_lengths = (length, length // 2, length // 4, length // 8)
        # Flattened size of [stage-3 features (256 ch) ++ coarsest input] at length // 8.
        self.fc_input = (256 + channels) * (length // 8)
        self.pyramid_block1 = PyramidBlock1D(channels, 64, length)
        self.pyramid_block2 = PyramidBlock1D(64 + channels, 128, length // 2)
        self.pyramid_block3 = PyramidBlock1D(128 + channels, 256, length // 4)
        self.fc = nn.Linear(self.fc_input, 128)
        self.output = nn.Linear(128, num_classes)
        print('Using LaplacianNet1D')

    @staticmethod
    def _fit_length(x, length):
        """Return ``x`` with its last dimension average-pooled to ``length``."""
        if x.size(-1) == length:
            return x
        return nn.functional.adaptive_avg_pool1d(x, length)

    def forward(self, inputs):
        """Compute class logits.

        Args:
            inputs: Either a sequence of four tensors (one per pyramid level,
                finest first) or a single tensor, which is reused for every
                level.

        Returns:
            Tensor of shape ``(batch, num_classes)``.

        Raises:
            ValueError: If a sequence with a length other than four is given.
        """
        if torch.is_tensor(inputs):
            inputs = (inputs,) * 4
        if len(inputs) != 4:
            raise ValueError(f'expected 4 pyramid inputs, got {len(inputs)}')

        x_l1, x_l2, x_l3, x_l4 = (
            self._fit_length(level, length)
            for level, length in zip(inputs, self.level_lengths)
        )

        x1 = self.pyramid_block1(x_l1)
        x2 = self.pyramid_block2(torch.cat((x1, x_l2), dim=1))
        x3 = self.pyramid_block3(torch.cat((x2, x_l3), dim=1))
        x = torch.cat([x3, x_l4], dim=1)
        x = x.view(x.size(0), -1)
        x = nn.functional.relu(self.fc(x))
        return self.output(x)


In [None]:
def train(model, device, train_loader, optimizer, criterion, epoch=None, log_interval=10):
    """Run one optimisation pass over ``train_loader``.

    For every batch the model receives a list of four tensors: the batch itself
    plus copies average-pooled along the time axis by factors 2, 4 and 8.
    ``LaplacianNet1D`` concatenates its 2nd/3rd/4th input with feature maps
    that have been max-pooled one/two/three times, so same-length copies would
    fail in ``torch.cat``.
    NOTE(review): plain average pooling is used to obtain matching lengths;
    confirm whether a different pyramid construction was intended.

    Args:
        model: Module called as ``model([x1, x2, x3, x4])`` returning logits.
        device: Device the batch tensors are moved to.
        train_loader: Iterable of ``(data, target)`` batches exposing
            ``.dataset`` and ``len()`` (used only for progress logging).
            ``data`` is assumed to be ``(batch, channels, length)``.
        optimizer: Optimiser stepping ``model``'s parameters.
        criterion: Callable ``criterion(output, target)`` returning the loss.
        epoch: Epoch number shown in the progress log. When omitted, the
            module-level variable ``epoch`` is used if it exists (this is what
            earlier versions of this function relied on), else ``'?'``.
        log_interval: Print progress every this many batches; a falsy value
            disables logging.
    """
    if epoch is None:
        # Backward compatibility: the function used to read the driver loop's
        # global `epoch` directly, which raised NameError when called elsewhere.
        epoch = globals().get("epoch", "?")

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        # Four resolution levels: full, 1/2, 1/4 and 1/8 of the time axis.
        inputs = [data] + [nn.functional.avg_pool1d(data, factor) for factor in (2, 4, 8)]
        output = model(inputs)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if log_interval and batch_idx % log_interval == 0:
            print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
                  f'({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')

def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            inputs = [data] * 4
            output = model(inputs)
            test_loss += criterion(output, target).item()  # Sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # Get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    print(f'\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} '
          f'({100. * correct / len(test_loader.dataset):.0f}%)\n')

In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# (channels, samples): one channel and 16000 time steps, which the original
# author assumes to be one second of audio at 16 kHz.
input_shape = (1, 16000)
num_classes = len(train_dataset.labels)

model = LaplacianNet1D(input_shape=input_shape, num_classes=num_classes)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Ten passes, numbered 1..10, each followed by an evaluation on the test split.
# The loop variable must stay named `epoch`: train() picks that module-level
# name up for its progress log.
for epoch in range(1, 11):
    train(model, device, train_loader, optimizer, criterion)
    test(model, device, test_loader, criterion)