In [115]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

In [116]:
data = pd.read_csv(r'..\Datasets\Sri Lankan Voice Recordings.csv', header = 0)
data.head()

Unnamed: 0,Subject Id,Jitter(local),"Jitter(local, absolute)",Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),"Shimmer (local, db)",Shimmer (apq3),Shimmer (apq5),...,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,Fraction of locally unvoiced frames,Number of voice breaks,Degree of voice breaks,UPDRS,status
0,1,0.01488,9e-05,0.009,0.00794,0.02699,0.08334,0.779,0.04517,0.04609,...,187.576,160,159,0.006065,0.000416,0.0,0,0.0,23,1
1,1,0.00728,3.8e-05,0.00353,0.00376,0.01059,0.05864,0.642,0.02058,0.0318,...,234.505,170,169,0.005181,0.000403,0.02247,0,0.0,23,1
2,1,0.0122,7.4e-05,0.00732,0.0067,0.02196,0.08719,0.875,0.04347,0.05166,...,211.442,1431,1427,0.006071,0.000474,0.10656,1,0.00178,23,1
3,2,0.00427,3.4e-05,0.00243,0.00249,0.00728,0.07428,0.694,0.04205,0.04311,...,129.205,345,344,0.008073,0.000143,0.0,0,0.0,8,1
4,2,0.00844,6.9e-05,0.00509,0.00489,0.01527,0.14053,1.239,0.06131,0.07015,...,126.788,585,582,0.008174,0.000123,0.00209,2,0.00768,8,1


In [117]:
data.shape

(120, 29)

In [119]:
X = data.iloc[:, 1:24].values
X

array([[1.488000e-02, 9.021300e-05, 9.000000e-03, ..., 1.590000e+02,
        6.064725e-03, 4.162760e-04],
       [7.280000e-03, 3.769800e-05, 3.530000e-03, ..., 1.690000e+02,
        5.181253e-03, 4.034940e-04],
       [1.220000e-02, 7.404100e-05, 7.320000e-03, ..., 1.427000e+03,
        6.070749e-03, 4.742890e-04],
       ...,
       [4.710000e-03, 3.383700e-05, 1.860000e-03, ..., 1.750000e+02,
        7.178780e-03, 1.298030e-04],
       [4.319000e-02, 3.171220e-04, 2.774000e-02, ..., 2.560000e+02,
        7.342534e-03, 3.119150e-04],
       [1.543000e-02, 1.033570e-04, 1.004000e-02, ..., 4.250000e+02,
        6.700038e-03, 1.082910e-04]])

In [120]:
y = data.iloc[:, -1].values
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [124]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=12, test_size = 0.05)

In [125]:
standard_X = StandardScaler()
X_train = standard_X.fit_transform(X_train)
X_test = standard_X.transform(X_test)

In [126]:
training_accuracy = []
test_accuracy = []
# try n_neighbors from 1 to 100
neighbors_settings = range(1, 40)
index = 0
max_accuracy = 0

for n_neighbors in neighbors_settings:
    # build the model
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, Y_train)
    # record training set accuracy
    training_accuracy.append(knn.score(X_train, Y_train))
    # record test set accuracy
    accuracy = knn.score(X_test, Y_test)
    test_accuracy.append(accuracy)
    if(accuracy > max_accuracy):
        max_accuracy = accuracy
        index = n_neighbors

print("Best Accuracy:", max_accuracy)
print("N_neighbors: ", index)

Best Accuracy: 0.8333333333333334
N_neighbors:  9


In [127]:
model = KNeighborsClassifier(n_neighbors=9)
model.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=9, p=2,
                     weights='uniform')

In [128]:
y_pred = model.predict(X_test)

In [129]:
cm = confusion_matrix(Y_test, y_pred) # Calulate Confusion matrix for test set.
print(cm)

[[2 1]
 [0 3]]


In [130]:
accuracies = cross_val_score(estimator = model, X = X_train, y = Y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 55.98 %
Standard Deviation: 12.12 %


In [131]:
data_test = pd.read_csv(r'../Datasets/Sri Lankan Dataset - Finalised.csv', header = 0)
data_test.head()

Unnamed: 0,Subject ID,Jitter(local),"Jitter(local, absolute)",Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),"Shimmer (local, db)",Shimmer (apq3),Shimmer (apq5),...,Minimum pitch,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,Fraction of locally unvoiced frames,Number of voice breaks,Degree of voice breaks,status
0,16,0.01468,8.6e-05,0.0051,0.00615,0.0153,0.0851,0.0814,0.03719,0.05185,...,82.429,244.408,94,93,0.00582,0.0019,0.0,0,0.0,1
1,16,0.001029,4.7e-05,0.00407,0.00574,0.0122,0.04188,0.557,0.01337,0.02081,...,187.378,242.159,113,112,0.00457,0.000297,0.00641,0,0.0,1
2,16,0.01335,5.6e-05,0.00403,0.00663,0.0121,0.05257,0.743,0.01663,0.02859,...,108.624,355.665,122,120,0.0042,0.000703,0.025,0,0.0,1
3,16,0.02164,0.000113,0.0113,0.012,0.03389,0.0625,0.0781,0.01745,0.0295,...,184.991,239.213,69,68,0.00524,0.000254,0.07759,0,0.0,1
4,16,0.0209,9.5e-05,0.00989,0.01107,0.02966,0.06128,0.838,0.02133,0.03216,...,178.835,273.413,87,86,0.00456,0.000433,0.01639,0,0.0,1


In [132]:
data_test.shape

(99, 28)

In [133]:
testing_X = data_test.iloc[:, 1:24].values
testing_X

array([[1.468e-02, 8.550e-05, 5.100e-03, ..., 9.300e+01, 5.820e-03,
        1.900e-03],
       [1.029e-03, 4.700e-05, 4.070e-03, ..., 1.120e+02, 4.570e-03,
        2.970e-04],
       [1.335e-02, 5.610e-05, 4.030e-03, ..., 1.200e+02, 4.200e-03,
        7.030e-04],
       ...,
       [6.290e-03, 6.700e-05, 3.600e-03, ..., 4.430e+02, 1.060e-02,
        1.460e-04],
       [3.880e-03, 4.110e-05, 2.000e-03, ..., 3.310e+02, 1.060e-02,
        1.600e-04],
       [8.550e-03, 7.950e-05, 4.150e-03, ..., 4.580e+02, 9.300e-03,
        2.520e-03]])

In [134]:
testing_X = standard_X.transform(testing_X)
testing_X

array([[ 0.85853335,  0.66915057,  0.16458618, ..., -0.90013491,
        -0.35002187,  2.00844864],
       [-1.0697394 , -0.17256105, -0.0771665 , ..., -0.8349714 ,
        -1.14072389, -0.22520751],
       [ 0.67066413,  0.02638897, -0.08655496, ..., -0.80753414,
        -1.37477169,  0.34052199],
       ...,
       [-0.32659655,  0.26469174, -0.18748083, ...,  0.30024554,
         2.67362267, -0.4356143 ],
       [-0.66702124, -0.30155062, -0.56301899, ..., -0.08387621,
         2.67362267, -0.41610638],
       [-0.00736013,  0.53797474, -0.05838959, ...,  0.35169041,
         1.85129256,  2.87237054]])

In [135]:
y_predict_test = model.predict(testing_X)

In [136]:
y_predict_test

array([1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0], dtype=int64)

In [137]:
total = 0
for i in y_predict_test:
    total = total + i
    
print(total)

62


In [138]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=12, test_size = 0.05)

In [139]:
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

In [140]:
from sklearn.model_selection import GridSearchCV
#create new a knn model
knn2 = KNeighborsClassifier()
#create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 50)}
#use gridsearch to test all values for n_neighbors
knn_gscv = GridSearchCV(knn2, param_grid, cv=10)
#fit model to data
knn_gscv.fit(X_train, Y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [141]:
#check top performing n_neighbors value
knn_gscv.best_params_

{'n_neighbors': 5}

In [142]:
knn_gscv.best_score_

0.6136363636363635

In [143]:
training_accuracy = []
test_accuracy = []
# try n_neighbors from 1 to 100
neighbors_settings = range(1, 100)
index = 0
max_accuracy = 0

for n_neighbors in neighbors_settings:
    # build the model
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(X_train, Y_train)
    # record training set accuracy
    training_accuracy.append(knn.score(X_train, Y_train))
    # record test set accuracy
    accuracy = knn.score(X_test, Y_test)
    test_accuracy.append(accuracy)
    if(accuracy > max_accuracy):
        max_accuracy = accuracy
        index = n_neighbors

print("Best Accuracy:", max_accuracy)
print("N_neighbors: ", index)

Best Accuracy: 0.8333333333333334
N_neighbors:  36


In [144]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [145]:
accuracies = cross_val_score(estimator = model, X = X_train, y = Y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 61.36 %
Standard Deviation: 8.11 %


In [146]:
testing_X = data_test.iloc[:, 1:24].values
testing_X

array([[1.468e-02, 8.550e-05, 5.100e-03, ..., 9.300e+01, 5.820e-03,
        1.900e-03],
       [1.029e-03, 4.700e-05, 4.070e-03, ..., 1.120e+02, 4.570e-03,
        2.970e-04],
       [1.335e-02, 5.610e-05, 4.030e-03, ..., 1.200e+02, 4.200e-03,
        7.030e-04],
       ...,
       [6.290e-03, 6.700e-05, 3.600e-03, ..., 4.430e+02, 1.060e-02,
        1.460e-04],
       [3.880e-03, 4.110e-05, 2.000e-03, ..., 3.310e+02, 1.060e-02,
        1.600e-04],
       [8.550e-03, 7.950e-05, 4.150e-03, ..., 4.580e+02, 9.300e-03,
        2.520e-03]])

In [147]:
testing_X = min_max_scaler.transform(testing_X)
testing_X

array([[ 0.30951804,  0.25434041,  0.16179193, ...,  0.03806228,
         0.38231429,  0.27616559],
       [-0.0210947 ,  0.13039755,  0.1236579 , ...,  0.05121107,
         0.21563227,  0.03617821],
       [ 0.27730685,  0.15969314,  0.12217697, ...,  0.0567474 ,
         0.16629439,  0.09696104],
       ...,
       [ 0.10632114,  0.19478345,  0.10625694, ...,  0.28027682,
         1.01970635,  0.01357179],
       [ 0.0479535 ,  0.11140371,  0.04701962, ...,  0.20276817,
         1.01970635,  0.01566775],
       [ 0.16105595,  0.23502464,  0.12661977, ...,  0.29065744,
         0.84635704,  0.36898665]])

In [148]:
y_testing_pred = model.predict(testing_X)

In [149]:
y_testing_pred

array([1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int64)

In [150]:
y_testing_pred.size

99

In [151]:
total = 0
for i in y_testing_pred:
    total = total + i
    
print(total)

51


In [152]:
from sklearn.model_selection import GridSearchCV
#create new a knn model
knn2 = KNeighborsClassifier()
#create a dictionary of all values we want to test for n_neighbors
param_grid = {‘n_neighbors’: np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
knn_gscv = GridSearchCV(knn2, param_grid, cv=5)
#fit model to data
knn_gscv.fit(X, y)

SyntaxError: invalid character in identifier (<ipython-input-152-90980b4d398a>, line 5)