### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os, sys

### Importing the dataset

In [84]:
# Load the training CSV (first row is the header) and preview the first five rows.
data = pd.read_csv(filepath_or_buffer=r'Parkinsons Train Data.csv', header=0)
data.head(n=5)

Unnamed: 0,Subject Id,Jitter(local),"Jitter(local, absolute)",Jitter (rap),Jitter (ppq5),Jitter (ddp),Shimmer (local),"Shimmer (local, db)",Shimmer (apq3),Shimmer (apq5),...,Minimum pitch,Maximum pitch,Number of pulses,Number of periods,Mean period,Standard deviation of period,Fraction of locally unvoiced frames,Number of voice breaks,Degree of voice breaks,Class information
0,1,1.488,9e-05,0.9,0.794,2.699,8.334,0.779,4.517,4.609,...,142.229,187.576,160,159,0.006065,0.000416,0.0,0,0.0,1
1,1,0.728,3.8e-05,0.353,0.376,1.059,5.864,0.642,2.058,3.18,...,159.515,234.505,170,169,0.005181,0.000403,2.247,0,0.0,1
2,1,1.22,7.4e-05,0.732,0.67,2.196,8.719,0.875,4.347,5.166,...,146.445,211.442,1431,1427,0.006071,0.000474,10.656,1,0.178,1
3,1,2.502,0.000123,1.156,1.634,3.469,13.513,1.273,5.263,8.771,...,182.713,220.23,94,92,0.00491,0.00032,0.0,0,0.0,1
4,1,3.509,0.000167,1.715,1.539,5.145,9.112,1.04,3.102,4.927,...,182.821,225.162,117,114,0.004757,0.00038,18.182,1,13.318,1


### Separating the X and Y variables

In [85]:
# Build the predictor matrix by dropping non-predictor columns *by name*.
# The previous positional slice (`[:, 1:]`) assumed the identifier was the
# first column; when the CSV also carries a saved index column ('Unnamed: 0',
# as in the preview above) that slice removed the index and left 'Subject Id'
# among the predictors. `errors='ignore'` keeps this working whether or not
# the optional 'Unnamed: 0' column is present.
features = data.drop(
    columns=['Unnamed: 0', 'Subject Id', 'Class information'],
    errors='ignore',
).values
# Target vector: the class column as a NumPy array.
labels = data.loc[:, 'Class information'].values

In [87]:
# Class balance check: (number of rows labelled 1, number of rows labelled 0).
tuple(labels[labels == cls].shape[0] for cls in (1, 0))

(520, 520)

In [86]:
# List every column name in file order, for reference when selecting features.
print(data.columns.to_numpy())

['Subject Id' 'Jitter(local)' 'Jitter(local, absolute)' 'Jitter (rap)'
 'Jitter (ppq5)' 'Jitter (ddp)' 'Shimmer (local)' 'Shimmer (local, db)'
 'Shimmer (apq3)' 'Shimmer (apq5)' 'Shimmer (apq11)' 'Shimmer (dda)' 'AC'
 'NDH' 'HTM' 'Median Pitch' 'Mean Pitch' 'Standard deviation'
 'Minimum pitch' 'Maximum pitch' 'Number of pulses' 'Number of periods'
 'Mean period' 'Standard deviation of period'
 'Fraction of locally unvoiced frames' 'Number of voice breaks'
 'Degree of voice breaks' 'Class information']


### Feature Selection

In [88]:
# Rank the features with recursive feature elimination (RFE) to identify those that contribute least to the model
from sklearn import datasets
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Recursive feature elimination wrapped around a default XGBoost classifier.
xgBoost = XGBClassifier()

# Fit in one step; `fit` returns the fitted selector itself.
rfe = RFE(xgBoost).fit(features, labels)

# First the boolean keep-mask, then the per-feature ranking (1 = selected).
print(rfe.support_, rfe.ranking_, sep='\n')

[False  True  True  True False  True False False  True  True False  True
 False  True  True False  True  True  True False False False False  True
 False False]
[ 2  1  1  1 10  1 11  8  1  1 14  1 12  1  1  3  1  1  1  7  5  9  4  1
 13  6]


### Function to train and evaluate an XGBoost classifier

In [89]:
def getXGBoost(X, y):
    """Fit a default XGBClassifier on a stratified 95/5 split and print accuracies.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix, one row per sample.
    y : array-like or Series
        Class labels aligned with the rows of ``X``. Also used to stratify
        the train/test split.

    Returns
    -------
    None
        The two accuracy figures are printed, not returned.
    """
    from xgboost import XGBClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # Stratify on the labels that were actually passed in. The previous
    # version read the notebook-global `data['Class information']`, so the
    # function silently depended on hidden state and broke for any other y.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, random_state=66, test_size=0.05, stratify=y
    )

    # Train
    model = XGBClassifier()
    model.fit(X_train, Y_train)

    # Evaluate on the held-out 5%. The predictions are passed to
    # accuracy_score directly; the old per-element round() was redundant.
    print("Accuracy for test set -> ", end = "")
    print(accuracy_score(Y_test, model.predict(X_test)))

    # NOTE: despite the label, this score is computed on the *full* X
    # (training rows plus the held-out rows), exactly as before.
    print("Accuracy for training set -> ", end = "")
    print(accuracy_score(y, model.predict(X)))

### Training the model with all the features

In [90]:
# Baseline: score XGBoost on the full `features` matrix against `labels`.
getXGBoost(X=features, y=labels)

Accuracy for test set -> 0.8653846153846154
Accuracy for training set -> 0.8923076923076924


### Training the model with RFE features

In [93]:
# The 13 columns marked True in the rfe.support_ output above, read against
# the column order printed by data.columns with 'Subject Id' excluded.
cols_RFE = ['Jitter(local, absolute)', 'Jitter (rap)', 'Jitter (ppq5)', 'Shimmer (local)',
        'Shimmer (apq5)', 'Shimmer (apq11)', 'AC', 'HTM', 'Median Pitch', 'Standard deviation',
        'Minimum pitch' , 'Maximum pitch', 'Fraction of locally unvoiced frames' ]
X = data[cols_RFE]
# `y` was previously first assigned in a later cell, so this cell raised
# NameError on a fresh kernel (Restart & Run All). Define it here.
y = data['Class information']
getXGBoost(X, y)

Accuracy for test set -> 0.8461538461538461
Accuracy for training set -> 0.885576923076923


### Training and finding the best models (Features chosen pairwise)

In [91]:
# Target column used by all of the pairwise runs below.
y = data.loc[:, 'Class information']

# Pairwise run 1: 'Jitter(local)' with 'Shimmer (local)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_1 = [
    'Jitter(local)', 'Shimmer (local)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_1]

In [8]:
# Score XGBoost on the cols_1 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7692307692307693
Accuracy for training set -> 0.8740384615384615


In [9]:
# Pairwise run 2: 'Jitter(local)' with 'Shimmer (local, db)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_2 = [
    'Jitter(local)', 'Shimmer (local, db)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_2]

In [10]:
# Score XGBoost on the cols_2 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7884615384615384
Accuracy for training set -> 0.864423076923077


In [11]:
# Pairwise run 3: 'Jitter(local)' with 'Shimmer (apq3)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_3 = [
    'Jitter(local)', 'Shimmer (apq3)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_3]

In [12]:
# Score XGBoost on the cols_3 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7692307692307693
Accuracy for training set -> 0.8634615384615385


In [13]:
# Pairwise run 4: 'Jitter(local)' with 'Shimmer (apq5)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_4 = [
    'Jitter(local)', 'Shimmer (apq5)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_4]

In [14]:
# Score XGBoost on the cols_4 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7307692307692307
Accuracy for training set -> 0.8721153846153846


In [15]:
# Pairwise run 5: 'Jitter(local)' with 'Shimmer (apq11)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_5 = [
    'Jitter(local)', 'Shimmer (apq11)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_5]

In [16]:
# Score XGBoost on the cols_5 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.8269230769230769
Accuracy for training set -> 0.8798076923076923


In [17]:
# Pairwise run 6: 'Jitter(local)' with 'Shimmer (dda)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_6 = [
    'Jitter(local)', 'Shimmer (dda)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_6]

In [18]:
# Score XGBoost on the cols_6 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7692307692307693
Accuracy for training set -> 0.8634615384615385


In [19]:
# Pairwise run 7: 'Jitter(local, absolute)' with 'Shimmer (local)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_7 = [
    'Jitter(local, absolute)', 'Shimmer (local)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_7]

In [20]:
# Score XGBoost on the cols_7 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.8076923076923077
Accuracy for training set -> 0.8807692307692307


In [21]:
# Pairwise run 8: 'Jitter(local, absolute)' with 'Shimmer (local, db)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_8 = [
    'Jitter(local, absolute)', 'Shimmer (local, db)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_8]

In [22]:
# Score XGBoost on the cols_8 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7307692307692307
Accuracy for training set -> 0.8663461538461539


In [23]:
# Pairwise run 9: 'Jitter(local, absolute)' with 'Shimmer (apq3)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_9 = [
    'Jitter(local, absolute)', 'Shimmer (apq3)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_9]

In [24]:
# Score XGBoost on the cols_9 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.75
Accuracy for training set -> 0.8663461538461539


In [25]:
# Pairwise run 10: 'Jitter(local, absolute)' with 'Shimmer (apq5)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_10 = [
    'Jitter(local, absolute)', 'Shimmer (apq5)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_10]

In [26]:
# Score XGBoost on the cols_10 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7692307692307693
Accuracy for training set -> 0.8682692307692308


In [27]:
# Pairwise run 11: 'Jitter(local, absolute)' with 'Shimmer (apq11)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_11 = [
    'Jitter(local, absolute)', 'Shimmer (apq11)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_11]

In [28]:
# Score XGBoost on the cols_11 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7884615384615384
Accuracy for training set -> 0.8721153846153846


In [29]:
# Pairwise run 12: 'Jitter(local, absolute)' with 'Shimmer (dda)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_12 = [
    'Jitter(local, absolute)', 'Shimmer (dda)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_12]

In [30]:
# Score XGBoost on the cols_12 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.75
Accuracy for training set -> 0.8663461538461539


In [31]:
# Pairwise run 13: 'Jitter (rap)' with 'Shimmer (local)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_13 = [
    'Jitter (rap)', 'Shimmer (local)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_13]

In [32]:
# Score XGBoost on the cols_13 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.8269230769230769
Accuracy for training set -> 0.8769230769230769


In [33]:
# Pairwise run 14: 'Jitter (rap)' with 'Shimmer (local, db)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_14 = [
    'Jitter (rap)', 'Shimmer (local, db)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_14]

In [34]:
# Score XGBoost on the cols_14 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.8846153846153846
Accuracy for training set -> 0.8673076923076923


In [35]:
# Pairwise run 15: 'Jitter (rap)' with 'Shimmer (apq3)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_15 = [
    'Jitter (rap)', 'Shimmer (apq3)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_15]

In [36]:
# Score XGBoost on the cols_15 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7884615384615384
Accuracy for training set -> 0.8682692307692308


In [37]:
# Pairwise run 16: 'Jitter (rap)' with 'Shimmer (apq5)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_16 = [
    'Jitter (rap)', 'Shimmer (apq5)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_16]

In [38]:
# Score XGBoost on the cols_16 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.75
Accuracy for training set -> 0.8692307692307693


In [39]:
# Pairwise run 17: 'Jitter (rap)' with 'Shimmer (apq11)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_17 = [
    'Jitter (rap)', 'Shimmer (apq11)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_17]

In [40]:
# Score XGBoost on the cols_17 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.8653846153846154
Accuracy for training set -> 0.8721153846153846


In [41]:
# Pairwise run 18: 'Jitter (rap)' with 'Shimmer (dda)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_18 = [
    'Jitter (rap)', 'Shimmer (dda)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_18]

In [42]:
# Score XGBoost on the cols_18 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7884615384615384
Accuracy for training set -> 0.8682692307692308


In [43]:
# Pairwise run 19: 'Jitter (ppq5)' with 'Shimmer (local)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_19 = [
    'Jitter (ppq5)', 'Shimmer (local)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_19]

In [44]:
# Score XGBoost on the cols_19 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7692307692307693
Accuracy for training set -> 0.8625


In [45]:
# Pairwise run 20: 'Jitter (ppq5)' with 'Shimmer (local, db)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_20 = [
    'Jitter (ppq5)', 'Shimmer (local, db)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_20]

In [46]:
# Score XGBoost on the cols_20 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.8076923076923077
Accuracy for training set -> 0.8615384615384616


In [47]:
# Pairwise run 21: 'Jitter (ppq5)' with 'Shimmer (apq3)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_21 = [
    'Jitter (ppq5)', 'Shimmer (apq3)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_21]

In [48]:
# Score XGBoost on the cols_21 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7884615384615384
Accuracy for training set -> 0.8625


In [49]:
# Pairwise run 22: 'Jitter (ppq5)' with 'Shimmer (apq5)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_22 = [
    'Jitter (ppq5)', 'Shimmer (apq5)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_22]

In [50]:
# Score XGBoost on the cols_22 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7884615384615384
Accuracy for training set -> 0.8721153846153846


In [51]:
# Pairwise run 23: 'Jitter (ppq5)' with 'Shimmer (apq11)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_23 = [
    'Jitter (ppq5)', 'Shimmer (apq11)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_23]

In [52]:
# Score XGBoost on the cols_23 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.8269230769230769
Accuracy for training set -> 0.875


In [53]:
# Pairwise run 24: 'Jitter (ppq5)' with 'Shimmer (dda)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_24 = [
    'Jitter (ppq5)', 'Shimmer (dda)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_24]

In [54]:
# Score XGBoost on the cols_24 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7884615384615384
Accuracy for training set -> 0.8625


In [55]:
# Pairwise run 25: 'Jitter (ddp)' with 'Shimmer (local)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_25 = [
    'Jitter (ddp)', 'Shimmer (local)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_25]
# Re-bind the target column.
y = data.loc[:, 'Class information']

In [56]:
# Score XGBoost on the cols_25 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.8269230769230769
Accuracy for training set -> 0.8673076923076923


In [57]:
# Pairwise run 26: 'Jitter (ddp)' with 'Shimmer (local, db)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_26 = [
    'Jitter (ddp)', 'Shimmer (local, db)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_26]

In [58]:
# Score XGBoost on the cols_26 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.8461538461538461
Accuracy for training set -> 0.8548076923076923


In [59]:
# Pairwise run 27: 'Jitter (ddp)' with 'Shimmer (apq3)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_27 = [
    'Jitter (ddp)', 'Shimmer (apq3)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_27]

In [60]:
# Score XGBoost on the cols_27 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7884615384615384
Accuracy for training set -> 0.8586538461538461


In [61]:
# Pairwise run 28: 'Jitter (ddp)' with 'Shimmer (apq5)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_28 = [
    'Jitter (ddp)', 'Shimmer (apq5)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_28]

In [62]:
# Score XGBoost on the cols_28 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7307692307692307
Accuracy for training set -> 0.8682692307692308


In [63]:
# Pairwise run 29: 'Jitter (ddp)' with 'Shimmer (apq11)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_29 = [
    'Jitter (ddp)', 'Shimmer (apq11)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_29]

In [64]:
# Score XGBoost on the cols_29 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.8461538461538461
Accuracy for training set -> 0.8653846153846154


In [65]:
# Pairwise run 30: 'Jitter (ddp)' with 'Shimmer (dda)'.
# The 15 columns after the pair are identical in every cols_N list.
cols_30 = [
    'Jitter (ddp)', 'Shimmer (dda)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_30]
# Re-bind the target column.
y = data.loc[:, 'Class information']

In [66]:
# Score XGBoost on the cols_30 subset currently held in X.
getXGBoost(X=X, y=y)

Accuracy for test set -> 0.7884615384615384
Accuracy for training set -> 0.8586538461538461
