## Imports & device

In [135]:
!pip install torchaudio



In [136]:
import pandas as pd
import numpy as np
import random
import tarfile

from sklearn import preprocessing

import IPython.display as ipd

In [137]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
import torchaudio

In [138]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


## Load the CSV file

In [145]:
df = pd.read_csv("data/dataset.csv", sep=",", header=0)
df["label"] = df["label"].apply(lambda x: x == "beep")
df.head()

Unnamed: 0,split,label,file_path,class
0,test,False,data/sound-data-splits/test/cycle/Sound-2020-0...,1
1,test,False,data/sound-data-splits/test/cycle/Sound-2020-0...,1
2,test,False,data/sound-data-splits/test/cycle/Sound-2020-0...,1
3,test,False,data/sound-data-splits/test/cycle/Sound-2020-0...,1
4,test,False,data/sound-data-splits/test/cycle/Sound-2020-0...,1


In [146]:
le = preprocessing.LabelEncoder()
le.fit(df.label)
NUM_LABELS = len(le.classes_)

df["class"] = df["label"].apply(lambda x: le.transform([x])[0])
df[["label", "class"]].drop_duplicates()

Unnamed: 0,label,class
0,False,0
136,True,1


In [147]:
example_file = df[df.split == "train"].sample(n=1).iloc[0, :].file_path

print(example_file)
ipd.Audio(example_file)

data/sound-data-splits/train/spin/Sound-2020-02-15 18:10:47.295435.wav


## Data Loader

In [148]:
NUM_CHANNELS = 2
SAMPLE_RATE = 44100
NUM_SECONDS = 3

INPUT_SIZE = int(SAMPLE_RATE * NUM_SECONDS)
KERNEL_SIZE = int((SAMPLE_RATE / 100) * 2) # 20 milliseconds
STRIDE = int(KERNEL_SIZE / 2)

BATCH_SIZE = 32

In [149]:
def load(file_path):
    sound, sampling_rate = torchaudio.load(file_path, normalization=True)
    assert sampling_rate == SAMPLE_RATE

    if sound.shape[1] < INPUT_SIZE:
    	difference = INPUT_SIZE - sound.shape[1]
    	padding = torch.zeros(sound.shape[0], difference)
    	sound = torch.cat([sound, padding], 1)
    elif sound.shape[1] > INPUT_SIZE:
    	random_idx = random.randint(0, sound.shape[1] - INPUT_SIZE)
    	sound = sound.narrow(1, random_idx, INPUT_SIZE)

    # sound = torch.mean(sound, dim=0)
    assert sound.shape[1] == INPUT_SIZE
    return sound


class WashingMachineDataset(Dataset):
    
    def __init__(self, df):
    	self.entries = df[["class", "file_path"]].to_dict(orient="records")
        
    def __getitem__(self, index):
    	label = int(self.entries[index]["class"])
    	sound = load(self.entries[index]["file_path"])
    	
    	return sound, label

    def __len__(self):
    	return len(self.entries)

In [150]:
train_set = WashingMachineDataset(df[df.split == "train"])
val_set = WashingMachineDataset(df[df.split == "val"])
test_set = WashingMachineDataset(df[df.split == "test"])

print("Train set size: " + str(len(train_set)))
print("Validation set size: " + str(len(val_set)))
print("Test set size: " + str(len(test_set)))

Train set size: 1127
Validation set size: 139
Test set size: 146


In [151]:
kwargs = {'num_workers': 1, 'pin_memory': True} if device == 'cuda' else {} #needed for using datasets on gpu
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    **kwargs
)
val_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=BATCH_SIZE,
    shuffle=True,
    **kwargs
)
test_loader = torch.utils.data.DataLoader(
    val_set,
    batch_size=BATCH_SIZE,
    shuffle=False,
    **kwargs
)

## Network definition

In [152]:
class BeepNet(nn.Module):
    
    def __init__(self):
        super(BeepNet, self).__init__()
        self.main = nn.Sequential(
          nn.Conv1d(
              in_channels=NUM_CHANNELS,
              out_channels=2,
              kernel_size=KERNEL_SIZE,
              stride=STRIDE
          ),
          nn.BatchNorm1d(num_features=NUM_CHANNELS),
          nn.MaxPool1d(kernel_size=2),
        )
        self.classifier = nn.Sequential(
            nn.Linear(in_features=298, out_features=NUM_LABELS),
            nn.Softmax(dim=1)
        )
        
    def forward(self, x):
        batch_size = x.size(0)
        hidden = self.main(x)
        return self.classifier(hidden.view(batch_size, -1))

model = BeepNet()
model.to(device)
print(model)

BeepNet(
  (main): Sequential(
    (0): Conv1d(2, 2, kernel_size=(882,), stride=(441,))
    (1): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Linear(in_features=298, out_features=2, bias=True)
    (1): Softmax(dim=1)
  )
)


In [153]:
def train(model, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
      optimizer.zero_grad()
      data = data.to(device)
      data = data.requires_grad_() #set requires_grad to True for training

      # target = target.to(device)
      output = model(data)
      loss = cross_entropy(output, target)
      loss.backward()
      optimizer.step()
      if batch_idx % 10 == 0:
          print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss))

In [154]:
def test(model, epoch, data_loader):
    model.eval()
    correct = 0
    for data, target in data_loader:
        data = data.to(device)
        output = model(data)
        pred = output.argmax(1)
        correct += pred.eq(target).sum().item()
    print('\nTesting - Accuracy: {}/{} ({:.0f}%)\n'.format(
            correct, len(val_loader.dataset),
            100. * correct / len(val_loader.dataset)))

In [155]:
optimizer = optim.Adam(model.parameters(), lr = 0.01, weight_decay = 0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size = 20, gamma = 0.1)
cross_entropy = nn.CrossEntropyLoss()

for epoch in range(1, 3):    
    train(model, epoch)
    test(model, epoch, val_loader)
    scheduler.step()


Testing - Accuracy: 138/139 (99%)


Testing - Accuracy: 138/139 (99%)


Testing - Accuracy: 138/139 (99%)


Testing - Accuracy: 138/139 (99%)


Testing - Accuracy: 138/139 (99%)


Testing - Accuracy: 138/139 (99%)



In [156]:
test(model, epoch, test_loader)


Testing - Accuracy: 138/139 (99%)



In [160]:
example_file = df[
    (df.split == "test")
    & (df.label != True)
].sample(n=1).iloc[0, :].file_path

sound = load(example_file).unsqueeze(0)

print(example_file, sound.shape)
ipd.Audio(example_file)

data/sound-data-splits/test/spin/Sound-2020-02-15 20:23:19.968006.wav torch.Size([1, 2, 132300])


In [161]:
model.eval()
output = model(sound)
le.classes_[output.argmax(1)]

False

In [163]:
torch.save(model.state_dict(), "pytorch_model.pt")