<a href="https://colab.research.google.com/github/markwreinke/OrganStopClassifier/blob/main/TestOrganStopClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Start with the imports
import pandas as pd
import librosa
import numpy as np
import matplotlib.pyplot as plt
import librosa.display
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook as tqdm
import os
import pickle
import IPython

In [2]:
# Mount Google Drive and make the dataset folder the working directory, printing
# the working directory before and after so the change can be confirmed.
from google.colab import drive
drive.mount("/content/drive")

# Named DATASET_DIR instead of `dir` so the built-in dir() is not shadowed
# for the rest of the notebook session.
DATASET_DIR = "/content/drive/My Drive/OrganStopDataset"
print(os.getcwd())

# Only change directory when we are not already inside the dataset folder.
if os.getcwd() != DATASET_DIR:
  os.chdir(DATASET_DIR)

print(os.getcwd())

Mounted at /content/drive
/content
/content/drive/My Drive/OrganStopDataset


In [3]:
# Here we are going to convert a spectrogram into an image
def spec_to_image(spec, eps=1e-6):
  """Convert a spectrogram array into an 8-bit image-like array.

  The spectrogram is normalized with z-score normalization and then scaled
  with min-max scaling so that its values lie between 0 and 255.

  Args:
    spec: NumPy array holding the spectrogram values.
    eps: Small constant added to the standard deviation so the z-score
      division stays finite when the input has zero variance.

  Returns:
    A ``np.uint8`` array with the same shape as ``spec``. A constant input
    (no dynamic range at all) yields an all-zero array.
  """
  mean = spec.mean()
  std = spec.std()
  spec_norm = (spec - mean) / (std + eps)
  spec_min, spec_max = spec_norm.min(), spec_norm.max()
  value_range = spec_max - spec_min
  if value_range == 0:
    # A constant spectrogram would otherwise produce 0/0 = NaN here, and
    # casting NaN to uint8 is undefined; return a well-defined blank image.
    return np.zeros(spec.shape, dtype=np.uint8)
  spec_scaled = 255 * (spec_norm - spec_min) / value_range
  spec_scaled = spec_scaled.astype(np.uint8)
  return spec_scaled

In [4]:
# This function gives the mel spectrogram (in decibels) of the given audio file
def get_melspectrogram_db(file_path, sr=None, n_fft=2048, hop_length=512, n_mels=128, fmin=20, fmax=8300, top_db=80, num_sec = 1):
  """Load an audio file and return its mel spectrogram in decibels.

  The clip is forced to exactly ``num_sec`` seconds before the spectrogram is
  computed: a shorter clip is padded on both ends with reflected copies of
  itself, and a longer clip is truncated.

  Args:
    file_path: Path of the audio file, passed to ``librosa.load``.
    sr: Sampling rate passed to ``librosa.load``. ``None`` makes librosa keep
      the file's native sampling rate instead of resampling to its default.
    n_fft: Number of samples in each analysis window (2048 samples is about
      46 ms at 44.1 kHz).
    hop_length: Number of samples the window advances between frames.
    n_mels: Number of mel filters, i.e. the height of the spectrogram.
    fmin: Lowest frequency of the mel filter bank.
    fmax: Highest frequency of the mel filter bank.
    top_db: Threshold passed to ``librosa.power_to_db``.
    num_sec: Target clip length in seconds.

  Returns:
    The array returned by ``librosa.power_to_db`` for the mel spectrogram.
  """
  wav, sr = librosa.load(file_path, sr=sr)
  # int() keeps the length usable as a slice bound even for fractional num_sec.
  target_len = int(num_sec * sr)
  if wav.shape[0] < target_len:
    pad_each_side = int(np.ceil((target_len - wav.shape[0]) / 2))
    wav = np.pad(wav, pad_each_side, mode='reflect')
  # Truncate unconditionally: rounding the padding up can leave the padded clip
  # one sample longer than the target, and long clips must be cut anyway.
  wav = wav[:target_len]

  # The audio is passed as y=...; recent librosa releases accept it by keyword only.
  spec = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, fmin=fmin, fmax=fmax)

  # Librosa squares the magnitude of the spectrogram when constructing the mel
  # spectrogram, so power_to_db is used to convert power to decibels.
  # top_db is used to threshold the output.
  spec_db = librosa.power_to_db(spec, top_db=top_db)
  return spec_db

In [5]:
# This is us loading data into pytorch by building dataloaders to
# preprocess and load data.
# I augmented it by getting it to save the filename as well

# Torch DataLoaders support map-style and iterable-style datasets; this one is a map-style dataset because it implements the __getitem__() and __len__() protocols, and so represents a map from indices/keys to data samples.


class OrganStopSamples(Dataset):
  """Map-style dataset of spectrogram images built from a table of audio files.

  Every file listed in ``df`` is converted to a spectrogram image when the
  dataset is constructed, so all samples are held in memory afterwards.

  Attributes set by the constructor:
    df: The DataFrame that was passed in.
    data: List of spectrogram images, each with a new leading axis added.
    labels: List of integer class indices, parallel to ``data``.
    filenames: List of file paths, parallel to ``data``.
    categories: Sorted unique values of the ``out_col`` column.
    c2i: Dictionary mapping category -> index.
    i2c: Dictionary mapping index -> category.
  """

  def __init__(self, base, df, in_col, out_col):
    """Build the dataset.

    Args:
      base: Directory that the file names in ``in_col`` are joined onto.
      df: pandas DataFrame with one row per audio file.
      in_col: Name of the column holding the file names.
      out_col: Name of the column holding the category of each file.
    """
    self.df = df
    self.data = []
    self.labels = []
    self.filenames = []
    self.categories = sorted(df[out_col].unique())
    self.c2i = {category: i for i, category in enumerate(self.categories)}
    self.i2c = {i: category for i, category in enumerate(self.categories)}
    for ind in tqdm(range(len(df))):
      row = df.iloc[ind]
      file_path = os.path.join(base, row[in_col])
      # Take the mel spectrogram of the current file as an image and add a new
      # dimension to the front of it before storing it.
      self.data.append(spec_to_image(get_melspectrogram_db(file_path))[np.newaxis,...])
      # Look the label up through out_col (the column c2i was built from)
      # rather than a hard-coded column name.
      self.labels.append(self.c2i[row[out_col]])
      # Save the file path so a sample can be traced back to its file.
      self.filenames.append(file_path)

  def __len__(self):
    """Return the number of samples."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the ``(spectrogram image, class index)`` pair at ``idx``."""
    return self.data[idx], self.labels[idx]

  def __getFileName__(self, idx):
    """Return the file path of the sample at ``idx``."""
    return self.filenames[idx]

In [6]:
# This is pretty close to the ESC50 project's model; the main change is the number of output categories (num_cats defaults to 4 here rather than 50)

class OrganStopModel(nn.Module):
  """Convolutional classifier for single-channel spectrogram images.

  Eight 3x3 convolution + batch-norm + ReLU stages (conv1/bn1 ... conv8/bn8),
  with a 2x2 max-pool after stages 2, 4 and 6, followed by a 500-unit dense
  layer, dropout and the final classification layer.
  """

  def __init__(self, input_shape, batch_size=16, num_cats=4):
    """Create the layers.

    Args:
      input_shape: Shape of one input sample; indices 1 and 2 are read as the
        two spatial sizes used to size the first dense layer.
      batch_size: Accepted but not used by this class.
      num_cats: Number of output categories.
    """
    super().__init__()
    # (in_channels, out_channels) of conv1 ... conv8, in registration order.
    channel_plan = (
      (1, 32), (32, 32),
      (32, 64), (64, 64),
      (64, 128), (128, 128),
      (128, 256), (256, 256),
    )
    for stage, (c_in, c_out) in enumerate(channel_plan, start=1):
      setattr(self, "conv{}".format(stage), nn.Conv2d(c_in, c_out, kernel_size=3, stride=1, padding=1))
      setattr(self, "bn{}".format(stage), nn.BatchNorm2d(c_out))
    # Three 2x2 poolings halve each spatial size three times (floor division by 8).
    pooled_rows = input_shape[1] // 8
    pooled_cols = input_shape[2] // 8
    self.dense1 = nn.Linear(256 * pooled_rows * pooled_cols, 500)
    self.dropout = nn.Dropout(0.5)
    self.dense2 = nn.Linear(500, num_cats)

  def forward(self, x):
    """Run the network on a batch and return one score per category."""
    for stage in range(1, 9):
      conv = getattr(self, "conv{}".format(stage))
      norm = getattr(self, "bn{}".format(stage))
      x = F.relu(norm(conv(x)))
      # Down-sample after the 2nd, 4th and 6th convolution stage.
      if stage in (2, 4, 6):
        x = F.max_pool2d(x, kernel_size=2)
    # Flatten everything except the batch dimension for the dense layers.
    flat = x.view(x.size(0), -1)
    hidden = self.dropout(F.relu(self.dense1(flat)))
    return self.dense2(hidden)

In [7]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [8]:
# This loads up the saved model (that was already trained) onto the selected device.
# NOTE: torch.load unpickles the file, and unpickling can execute arbitrary code,
# so only load a model file that comes from a trusted source.
with open('organStopModel.pth', 'rb') as modelF:
  trained_model = torch.load(modelF, map_location = device)
# The `with` block has already closed modelF, so no explicit close() is needed.

# This is important to call to set dropout and batch normalization layers to evaluation mode
trained_model.eval()

OrganStopModel(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv6): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn6): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_runn

In [9]:

# Read the table describing the test clips and preview its first rows.
testDataFrame = pd.read_csv('stopTestDirectory.csv')
# Displayed explicitly: a bare `testDataFrame.head()` in the middle of a cell is
# evaluated and thrown away, so nothing would be shown.
IPython.display.display(testDataFrame.head())
directory = "/content/drive/My Drive/OrganStopDataset/TestData"
# Keep every row whose 'fold' value is not 6.
test = testDataFrame[testDataFrame['fold']!=6]

print(type(test))

# Build the in-memory dataset from the selected rows; file names are resolved
# relative to the 'TestData' folder.
testData = OrganStopSamples('TestData', test, 'filename', 'category_1')
testLoader = DataLoader(testData, batch_size = 16, shuffle = True)

<class 'pandas.core.frame.DataFrame'>


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=599.0), HTML(value='')))




In [10]:
# Save the test data's index -> category mapping (testData.i2c) to a pickle file.
# The `with` statement closes testDataF as soon as the dump has finished.
with open('organTestI2C.pkl', 'wb') as testDataF:
  pickle.dump(testData.i2c, testDataF)

In [11]:
# Load the index -> category mapping from 'organTestI2C.pkl' into testDataI2C.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so this
# is only safe for files that come from a trusted source.
with open('organTestI2C.pkl', 'rb') as openTestF:
  testDataI2C = pickle.load(openTestF)

In [12]:
# Classify every test clip one at a time, print the expected and predicted
# category, and remember the clips the model got wrong.
# Each entry of mislabeledSamples is (dataset index, predicted class index).
mislabeledSamples = []

# Inference only: with autograd disabled no computation graph is built for the
# forward passes, which saves memory and time.
with torch.no_grad():
  for i in range(len(testData)):
    data, label = testData[i]
    print("This clip should be a: " + str(testDataI2C[label]))
    spec_t = torch.tensor(data).to(device, dtype = torch.float32)
    # Add a leading batch dimension of size 1 before calling the model.
    pr = trained_model(spec_t.unsqueeze(0))
    # Index of the highest-scoring category, as a plain Python int.
    ind = pr.argmax(dim=1).item()
    print("The model says it is a: " + str(testDataI2C[ind]))
    print("The model filename is: " + str(testData.__getFileName__(i)))
    print("")

    # Compare class indices directly instead of their stringified names.
    if ind != label:
      mislabeledSamples.append((i, ind))


This clip should be a: Hautbois
The model says it is a: Hautbois
The model filename is: TestData/Hautbois8AT4051Left AT4051 WinLett-101.wav

This clip should be a: Hautbois
The model says it is a: Hautbois
The model filename is: TestData/Hautbois8AT4051Left AT4051 WinLett-102.wav

This clip should be a: Hautbois
The model says it is a: Hautbois
The model filename is: TestData/Hautbois8AT4051Left AT4051 WinLett-103.wav

This clip should be a: Hautbois
The model says it is a: Hautbois
The model filename is: TestData/Hautbois8AT4051Left AT4051 WinLett-104.wav

This clip should be a: Hautbois
The model says it is a: Hautbois
The model filename is: TestData/Hautbois8AT4051Left AT4051 WinLett-105.wav

This clip should be a: Hautbois
The model says it is a: Hautbois
The model filename is: TestData/Hautbois8AT4051Left AT4051 WinLett-106.wav

This clip should be a: Hautbois
The model says it is a: Hautbois
The model filename is: TestData/Hautbois8AT4051Left AT4051 WinLett-107.wav

This clip sho

In [13]:
# Report every misclassified clip: the predicted and the correct category, the
# file it came from, and an inline audio player for listening to it.
print("There were " + str(len(mislabeledSamples)) + " mislabled samples:")
for i, (wgInd, predictedInd) in enumerate(mislabeledSamples):
  data, label = testData[wgInd]
  clipPath = testData.__getFileName__(wgInd)
  print("i: " + str(i))
  print("The model predicted a: " + str(testDataI2C[predictedInd]))
  print("The correct label was: " + str(testDataI2C[label]))
  print("The filename was: " + str(clipPath))
  IPython.display.display(IPython.display.Audio(filename= clipPath))
  print("")

There were 9 mislabled samples:
i: 0
The model predicted a: Krummhorn8
The correct label was: Trumpette8
The filename was: TestData/Trumpette8ShureKSM44Right ShureKSM44 WinRight-101.wav



i: 1
The model predicted a: Krummhorn8
The correct label was: Trumpette8
The filename was: TestData/Trumpette8ShureKSM44Right ShureKSM44 WinRight-102.wav



i: 2
The model predicted a: Krummhorn8
The correct label was: Trumpette8
The filename was: TestData/Trumpette8ShureKSM44Right ShureKSM44 WinRight-106.wav



i: 3
The model predicted a: Krummhorn8
The correct label was: Trumpette8
The filename was: TestData/Trumpette8ShureKSM44Right ShureKSM44 WinRight-108.wav



i: 4
The model predicted a: Krummhorn8
The correct label was: Trumpette8
The filename was: TestData/Trumpette8ShureKSM44Right ShureKSM44 WinRight-110.wav



i: 5
The model predicted a: Krummhorn8
The correct label was: Trumpette8
The filename was: TestData/Trumpette8ShureKSM44Right ShureKSM44 WinRight-113.wav



i: 6
The model predicted a: Krummhorn8
The correct label was: Trumpette8
The filename was: TestData/Trumpette8ShureKSM44Right ShureKSM44 WinRight-117.wav



i: 7
The model predicted a: Krummhorn8
The correct label was: Trumpette8
The filename was: TestData/Trumpette8ShureKSM44Right ShureKSM44 WinRight-118.wav



i: 8
The model predicted a: Krummhorn8
The correct label was: Trumpette8
The filename was: TestData/Trumpette8ShureKSM44Right ShureKSM44 WinRight-119.wav





In [14]:
# Classify one hand-picked clip from the TestData folder and play it back.
# os.path.join builds the path instead of manual string concatenation.
filenameTest = os.path.join(os.getcwd(), "TestData", "Krummhorn8ShureKSM44Left-Live10 ShureKSM44Left-103.wav")
print(os.getcwd())
spec = spec_to_image(get_melspectrogram_db(filenameTest))
spec_t = torch.tensor(spec).to(device, dtype = torch.float32)
# Inference only, so autograd is disabled; the reshape adds two leading
# dimensions of size 1 in front of the spectrogram image.
with torch.no_grad():
  pr = trained_model(spec_t.reshape(1,1,*spec_t.shape))
# Index of the highest-scoring category, as a plain Python int.
ind = pr.argmax(dim=1).item()
print("ind: " + str(ind))
print("This is testDataI2C: " + str(testDataI2C))
print(testDataI2C[ind])
IPython.display.display(IPython.display.Audio(filename= filenameTest))

/content/drive/My Drive/OrganStopDataset
ind: 1
This is testDataI2C: {0: 'Hautbois', 1: 'Krummhorn8', 2: 'Principal8', 3: 'Trumpette8'}
Krummhorn8


In [15]:
# NOTE: both handles were opened by `with` statements in earlier cells, which
# already closed them when those blocks ended; calling close() on an
# already-closed file object does nothing, so these calls are redundant.
openTestF.close()
testDataF.close()