# Training Environment Sound Classification with Convolutional Neural Networks

In [1]:
from pathlib import Path

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from urbandata import k_fold_urban_sound, UrbanSoundDataSet

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'{device=}')

device=device(type='cuda')


## Consts & Hyperparams

In [2]:
urban_root = Path('~/sound_datasets/urbansound8k').expanduser()
urban_metadata = urban_root / 'metadata/UrbanSound8K.csv'
urban_audio_path = urban_root / "audio/"

In [3]:
target_sample_rate = 22050
mel_kwargs = {
    'n_fft': 1024,
    'n_mels': 60
}

In [4]:
isShuffle = True
batch_size = 64
lr = 0.001
momentum = 0.9
EPOCHS=5

## Network Setup

In [5]:
tmp_ds = UrbanSoundDataSet(urban_audio_path, ['fold5/100032-3-0-0.wav'], sample_rate=target_sample_rate, mel_kwargs=mel_kwargs)
X_shape = tmp_ds.getXShape()
print(f'Shape of model input, X: {X_shape}')

X_channels, X_rows, X_cols = X_shape

Shape of model input, X: torch.Size([2, 60, 173])


In [6]:
class SCCN(torch.nn.Module):
    '''The Simple CNN'''
    def __init__(self):
        super().__init__()
        
        # Recreating ESC paper, but with some modernization
        self.model = nn.Sequential(
            nn.Conv2d(X_channels, 60, 5),
            nn.BatchNorm2d(60),
            nn.MaxPool2d(kernel_size=2),
            nn.ReLU(),
            nn.Conv2d(60, 120, kernel_size=5),
            nn.BatchNorm2d(120),
            nn.MaxPool2d(kernel_size=2),
            nn.ReLU(),
            nn.Flatten(),
            nn.Dropout(0.5),
            nn.Linear(120 * 12 * 40, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 10),
            nn.Softmax(dim=1)
        )
        
    def forward(self, X):
        return self.model(X)

## Training, with Cross Validation

In [7]:
folds = k_fold_urban_sound(urban_metadata)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8732 entries, 0 to 8731
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   slice_file_name  8732 non-null   object 
 1   fsID             8732 non-null   int64  
 2   start            8732 non-null   float64
 3   end              8732 non-null   float64
 4   salience         8732 non-null   int64  
 5   fold             8732 non-null   int64  
 6   classID          8732 non-null   int64  
 7   class            8732 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 545.9+ KB

Summarizing folds:
-----------------------------------------------------------
Training set size for fold 1 : 7859
Training set info: 

Total samples: 7859
Class            | Frequency  | Percentage
----------------------------------------
engine_idling    | 904        | 11.50%
dog_bark         | 900        | 11.45%
children_playing | 900        | 11.45%
air_conditio

-----------------------------------------------------------
Training set size for fold 6 : 7909
Training set info: 

Total samples: 7909
Class            | Frequency  | Percentage
----------------------------------------
jackhammer       | 932        | 11.78%
dog_bark         | 900        | 11.38%
children_playing | 900        | 11.38%
air_conditioner  | 900        | 11.38%
street_music     | 900        | 11.38%
drilling         | 900        | 11.38%
engine_idling    | 893        | 11.29%
siren            | 855        | 10.81%
car_horn         | 401        | 5.07%
gun_shot         | 328        | 4.15%
----------------------------------------

Duration statistics: 
count    7909.000000
mean        3.608685
std         0.970717
min         0.054517
25%         4.000000
50%         4.000000
75%         4.000000
max         4.000000
Name: duration, dtype: float64
Validation set size for fold 6 : 823
Validation set info: 

Total samples: 823
Class            | Frequency  | Percentage
------

-----------------------------------------------------------





In [8]:
model = SCCN().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
loss_fn = torch.nn.CrossEntropyLoss()

In [9]:
def train_one_epoch(epoch, dl):
    running_loss = 0.
    last_loss = 0.
    
    for batch_idx, batch in enumerate(dl):
        (Xs, ys) = batch['spectrogram'].to(device), batch['label'].to(device)
        
        optimizer.zero_grad()
        
        yhats = model(Xs)
        
        loss = loss_fn(yhats, ys)
        loss.backward()
        
        optimizer.step()
        
        running_loss += loss.item()
        if batch_idx % 100 == 99:
            last_loss = running_loss / 100 #batch loss
            print(f'\tbatch {batch_idx+1} loss: {last_loss}')
            running_loss=0
        
    return last_loss

In [None]:
fold_results = []
print(f'-----{len(folds)}-Fold Cross Validation-----')
for fold_idx, fold_bundle in enumerate(folds):
    print(f"Fold {fold_idx}:")
    train_ds = UrbanSoundDataSet(urban_audio_path, fold_bundle['train'], sample_rate=target_sample_rate, mel_kwargs=mel_kwargs)
    validation_ds = UrbanSoundDataSet(urban_audio_path, fold_bundle['validation'], sample_rate=target_sample_rate, mel_kwargs=mel_kwargs)
    print(f"\tSize of train, val datasets: {(len(train_ds), len(validation_ds))}")
    
    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=isShuffle)
    validation_dl = DataLoader(validation_ds, batch_size=batch_size, shuffle=isShuffle)
    
    for epoch in range(EPOCHS):
        print(f'Epoch {epoch+1}')
        
        model.train()
        avg_loss = train_one_epoch(epoch, train_dl)
        
        running_vloss = 0.
        model.eval()
        with torch.no_grad():
            for batch_idx, batch in enumerate(validation_dl):
                (vXs, vys) = batch['spectrogram'].to(device), batch['label'].to(device)
                
                vyhats = model(vXs)
                vloss = loss_fn(vyhats, vys)
                running_vloss += vloss
        avg_vloss = running_vloss / (batch_idx+1)
        
        print(f'LOSS train {avg_loss} val {avg_vloss}')


-----10-Fold Cross Validation-----
Fold 0:
	Size of train, val datasets: (7859, 873)
Epoch 1
	batch 100 loss: 2.2562784337997437
LOSS train 2.2562784337997437 val 2.239978790283203
Epoch 2
	batch 100 loss: 2.2017250943183897
LOSS train 2.2017250943183897 val 2.199753999710083
Epoch 3
	batch 100 loss: 2.1688004660606386
LOSS train 2.1688004660606386 val 2.1855928897857666
Epoch 4
	batch 100 loss: 2.1419480991363526
LOSS train 2.1419480991363526 val 2.1630935668945312
Epoch 5
	batch 100 loss: 2.1125372457504272
LOSS train 2.1125372457504272 val 2.1475577354431152
Fold 1:
	Size of train, val datasets: (7844, 888)
Epoch 1
	batch 100 loss: 2.0926465821266174
LOSS train 2.0926465821266174 val 2.085951566696167
Epoch 2
	batch 100 loss: 2.0693882799148557
LOSS train 2.0693882799148557 val 2.083510637283325
Epoch 3
	batch 100 loss: 2.0475182509422303
LOSS train 2.0475182509422303 val 2.0520262718200684
Epoch 4
	batch 100 loss: 2.035152292251587
LOSS train 2.035152292251587 val 2.058280229568481