# K-Nearest Neighbours Model

## Imports

In [2]:
# Import the required libraries
import librosa
import matplotlib.pyplot as plt
import numpy as np
import librosa.display
import scipy.fftpack
import math
import soundfile as sf
import sklearn.preprocessing
import pandas as pd

from scipy.io import wavfile
from scipy.signal import find_peaks

from sklearn.neural_network import MLPClassifier

# Required to train and split the data
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

# Required to import the data
from os import listdir

# KNN Imports
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

# Data Processing
from sklearn import preprocessing
from scipy import stats

# Kfold 
from sklearn.model_selection import KFold

## Import Features

In [3]:
onset_strength_peak_rate = np.loadtxt('extracted_features/onset_strength_peak_rate_1000.csv', delimiter=',',dtype=np.double)
flux = np.loadtxt('extracted_features/norm_2_1000.csv', delimiter=',',dtype=np.double)
var_zc = np.loadtxt('extracted_features/var_zc_1000.csv', delimiter=',',dtype=np.double)
low_energy_frame_percentages =  np.loadtxt('extracted_features/low_energy_frame_percentages_1000.csv', delimiter=',',dtype=np.double)
all_labels = np.loadtxt('extracted_features/labels_zc1_1000.csv', delimiter=',')

print("Length of onset_strength_peak_rate : {}".format(len(onset_strength_peak_rate)))
print("Length of flux : {}".format(len(flux)))
print("Length of var_zc : {}".format(len(var_zc)))
print("Length of low_energy_frame_percentages : {}".format(len(low_energy_frame_percentages)))
print("Length of all_labels : {}".format(len(all_labels)))

Length of onset_strength_peak_rate : 1000
Length of flux : 1000
Length of var_zc : 1000
Length of low_energy_frame_percentages : 1000
Length of all_labels : 1000


## Remove Outliers, Normalize, and Combine Data

In [725]:
def removeOutliers(x, outlierConstant):
    a = np.array(x)
    # print(a.shape)
    upper_quartile = np.percentile(a, 75)
    lower_quartile = np.percentile(a, 25)
    IQR = (upper_quartile - lower_quartile) * outlierConstant
    quartileSet = (lower_quartile - IQR, upper_quartile + IQR)
    
    result = []
    removed = []
    
    for i,value in enumerate(a):
        if ((value >= quartileSet[0]) and (value <= quartileSet[1])): result.append(value)
        else: removed.append(i)
    
    return np.array(removed), np.array(result)

In [726]:
# Remove outliers caused by var_vc
removed, var_zc = removeOutliers(var_zc,1.5)
onset_strength_peak_rate = np.delete(onset_strength_peak_rate,removed)
low_energy_frame_percentages = np.delete(low_energy_frame_percentages,removed)
all_labels = np.delete(all_labels,removed)

# Remove outliers caused by onset_strength_peak_rate
removed, onset_strength_peak_rate = removeOutliers(onset_strength_peak_rate,1.5)
var_zc = np.delete(var_zc,removed)
low_energy_frame_percentages = np.delete(low_energy_frame_percentages,removed)
all_labels = np.delete(all_labels,removed)

# Remove outliers caused by low_energy_frame_percentages
removed, low_energy_frame_percentages = removeOutliers(low_energy_frame_percentages,1.5)
var_zc = np.delete(var_zc,removed)
onset_strength_peak_rate = np.delete(onset_strength_peak_rate,removed)
all_labels = np.delete(all_labels,removed)

# Normalise the Data
flux = flux / flux.max()
var_zc = var_zc / var_zc.max()
onset_strength_peak_rate = onset_strength_peak_rate / onset_strength_peak_rate.max()
low_energy_frame_percentages = low_energy_frame_percentages / low_energy_frame_percentages.max()

print("Length of onset_strength_peak_rate : {}".format(len(onset_strength_peak_rate)))
print("Length of var_zc : {}".format(len(var_zc)))
print("Length of low_energy_frame_percentages : {}".format(len(low_energy_frame_percentages)))
print("Length of all_labels : {}".format(len(all_labels)))

Length of onset_strength_peak_rate : 962
Length of var_zc : 962
Length of low_energy_frame_percentages : 962
Length of all_labels : 962


## Knn for Flux Only

In [132]:
# Get data
X = np.array([ [flux[i]] for i,x in enumerate(var_zc) ])
print("Length of combined data : {}".format(len(X)))

# Knn
n_neighbors  = 5
weights = 'distance'

clf = KNeighborsClassifier(n_neighbors, weights=weights)

splits = 5
kf = KFold(n_splits=splits,shuffle=True)

sum = 0
for train_indices, test_indices in kf.split(X):

    # Normalise the Data independently
    X_train = X[train_indices] 
    X_train = X_train / X_train.max() 
    X_test = X[test_indices] 
    X_test = X_test / X_test.max()

    clf.fit(X_train,all_labels[train_indices])
    z = clf.predict(X_test)
    correct_pred = 0
    for x,value in enumerate(z):
        if value == all_labels[test_indices][x]: correct_pred+=1
    sum += correct_pred/len(test_indices)*100
    # print(correct_pred/len(test_indices)*100)

print("Average Accuracy Score : {:3.2f} %".format(sum/splits))

Length of combined data : 1000
Average Accuracy Score : 90.70 %


## KNN For ZCR Only

In [133]:
# Get data
X = np.array([ [var_zc[i]] for i,x in enumerate(var_zc) ])
print("Length of combined data : {}".format(len(X)))

# Knn
n_neighbors  = 5
weights = 'distance'

clf = KNeighborsClassifier(n_neighbors, weights=weights)

splits = 5
kf = KFold(n_splits=splits,shuffle=True)

sum = 0
for train_indices, test_indices in kf.split(X):

    # Normalise the Data independently
    X_train = X[train_indices] 
    X_train = X_train / X_train.max() 
    X_test = X[test_indices] 
    X_test = X_test / X_test.max()

    clf.fit(X_train,all_labels[train_indices])
    z = clf.predict(X_test)
    correct_pred = 0
    for x,value in enumerate(z):
        if value == all_labels[test_indices][x]: correct_pred+=1
    sum += correct_pred/len(test_indices)*100
    # print(correct_pred/len(test_indices)*100)

print("Average Accuracy Score : {:3.2f} %".format(sum/splits))

Length of combined data : 1000
Average Accuracy Score : 75.50 %


## KNN For Low Energy Frames Only

In [137]:
# Get data
X = np.array([ [low_energy_frame_percentages[i]] for i,x in enumerate(var_zc) ])
print("Length of combined data : {}".format(len(X)))

# Knn
n_neighbors  = 5
weights = 'distance'

clf = KNeighborsClassifier(n_neighbors, weights=weights)

splits = 5
kf = KFold(n_splits=splits,shuffle=True)

sum = 0
for train_indices, test_indices in kf.split(X):

    # Normalise the Data independently
    X_train = X[train_indices] 
    # X_train = X_train / X_train.max() 
    X_test = X[test_indices] 
    # X_test = X_test / X_test.max()

    clf.fit(X_train,all_labels[train_indices])
    z = clf.predict(X_test)
    correct_pred = 0
    for x,value in enumerate(z):
        if value == all_labels[test_indices][x]: correct_pred+=1
    sum += correct_pred/len(test_indices)*100
    # print(correct_pred/len(test_indices)*100)

print("Average Accuracy Score : {:3.2f} %".format(sum/splits))

Length of combined data : 1000
Average Accuracy Score : 63.30 %


## KNN For All Features

In [9]:
X = np.array([ [flux[i],var_zc[i],low_energy_frame_percentages[i]] for i,x in enumerate(var_zc) ])

# Knn
n_neighbors  = 5
weights = 'distance'

clf = KNeighborsClassifier(n_neighbors, weights=weights)

splits = 5
kf = KFold(n_splits=splits,shuffle=True)

sum = 0
for train_indices, test_indices in kf.split(X):

    # Get Data
    X_train_flux = flux[train_indices]
    X_train_low_e = low_energy_frame_percentages[train_indices]
    X_train_var_zc = var_zc[train_indices]

    X_test_flux = flux[test_indices]
    X_test_low_e = low_energy_frame_percentages[test_indices]
    X_test_var_zc = var_zc[test_indices]

    # normalise data
    X_train_flux = X_train_flux / X_train_flux.max()
    X_train_low_e = X_train_low_e / X_train_low_e.max()
    X_train_var_zc = X_train_var_zc / X_train_var_zc.max()

    X_test_flux = X_test_flux / X_test_flux.max()
    X_test_low_e = X_test_low_e / X_test_low_e.max()
    X_test_var_zc = X_test_var_zc / X_test_var_zc.max()

    # Normalise the Data independently
    X_train = np.array([ [X_train_flux[i],X_train_var_zc[i],X_train_low_e[i]] for i,x in enumerate(train_indices) ])
    # X_train = X_train / X_train.max() 
    X_test = np.array([ [X_test_flux[i],X_test_var_zc[i],X_test_low_e[i]] for i,x in enumerate(test_indices) ])
    # X_test = X_test / X_test.max()

    clf.fit(X_train,all_labels[train_indices])
    z = clf.predict(X_test)
    correct_pred = 0
    for x,value in enumerate(z):
        if value == all_labels[test_indices][x]: correct_pred+=1
    sum += correct_pred/len(test_indices)*100
    # print(correct_pred/len(test_indices)*100)

print("Average Accuracy Score : {:3.2f} %".format(sum/splits))

Average Accuracy Score : 96.30 %
