In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [2]:
# Read the training CSV (first row is the header) and discard every row
# that contains a missing value, then report the table size and column names.
data = pd.read_csv(r'Parkinsons Train Data.csv', header=0).dropna()
print(data.shape)
print(data.columns.tolist())

(1040, 28)
['Subject Id', 'Jitter(local)', 'Jitter(local, absolute)', 'Jitter (rap)', 'Jitter (ppq5)', 'Jitter (ddp)', 'Shimmer (local)', 'Shimmer (local, db)', 'Shimmer (apq3)', 'Shimmer (apq5)', 'Shimmer (apq11)', 'Shimmer (dda)', 'AC', 'NDH', 'HTM', 'Median Pitch', 'Mean Pitch', 'Standard deviation', 'Minimum pitch', 'Maximum pitch', 'Number of pulses', 'Number of periods', 'Mean period', 'Standard deviation of period', 'Fraction of locally unvoiced frames', 'Number of voice breaks', 'Degree of voice breaks', 'Class information']


In [72]:
def getGBoosting(X, y, test_size=0.01, random_state=66):
    """Fit three gradient-boosting variants on one stratified split and print their accuracies.

    The data is split once, stratified on ``y``, and three
    ``GradientBoostingClassifier`` configurations are trained on that same
    split: the library defaults, ``max_depth=1`` and ``learning_rate=0.01``.
    For each configuration the training accuracy and then the test accuracy
    are printed.

    Parameters
    ----------
    X : feature table whose rows are aligned with ``y``.
    y : class labels, one per row of ``X``; also used as the stratification key.
    test_size : fraction (or count) of rows held out for testing. Defaults to
        the notebook's original value of 0.01.
    random_state : seed for the train/test split. Defaults to the notebook's
        original value of 66.

    Returns
    -------
    None. Results are only printed.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    # Stratify on the labels that were actually passed in. The previous code
    # stratified on the notebook-global `data['Class information']`, which only
    # worked while `y` happened to be that exact column.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size, stratify=y
    )

    # NOTE(review): with the default test_size=0.01 the hold-out set is very
    # small, so the printed test accuracies are coarse - consider a larger
    # split or cross-validation before comparing feature subsets.
    variants = (
        {},                       # library defaults
        {"max_depth": 1},         # decision stumps
        {"learning_rate": 0.01},  # slower learning rate
    )
    for overrides in variants:
        model = GradientBoostingClassifier(random_state=0, **overrides)
        model.fit(X_train, y_train)

        print("Accuracy on training set: {:.3f}".format(model.score(X_train, y_train)))
        print("Accuracy on test set: {:.3f}".format(model.score(X_test, y_test)))
    

In [73]:
# Target labels: the 'Class information' column of the loaded table.
y = data.loc[:, 'Class information']

In [74]:
# Full feature set: all 26 columns except 'Subject Id' and the
# 'Class information' label.
cols_full = [
    # jitter measures
    'Jitter(local)', 'Jitter(local, absolute)', 'Jitter (rap)',
    'Jitter (ppq5)', 'Jitter (ddp)',
    # shimmer measures
    'Shimmer (local)', 'Shimmer (local, db)', 'Shimmer (apq3)',
    'Shimmer (apq5)', 'Shimmer (apq11)', 'Shimmer (dda)',
    # remaining feature columns
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_full]

In [75]:
# Baseline: score the three gradient-boosting variants on the full feature set (cols_full).
getGBoosting(X, y)

Accuracy on training set: 0.915
Accuracy on test set: 0.818
Accuracy on training set: 0.714
Accuracy on test set: 0.727
Accuracy on training set: 0.755
Accuracy on test set: 0.727


### Selecting the columns for analysis (Choosing 1 from Jitter and 1 from Shimmer)

In [77]:
# Feature subset 1: jitter = 'Jitter(local)', shimmer = 'Shimmer (local)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_1 = [
    'Jitter(local)', 'Shimmer (local)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_1]

In [78]:
# Score the three gradient-boosting variants on feature subset cols_1.
getGBoosting(X, y)

Accuracy on training set: 0.899
Accuracy on test set: 0.818
Accuracy on training set: 0.714
Accuracy on test set: 0.727
Accuracy on training set: 0.738
Accuracy on test set: 0.727


In [79]:
# Feature subset 2: jitter = 'Jitter(local)', shimmer = 'Shimmer (local, db)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_2 = [
    'Jitter(local)', 'Shimmer (local, db)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_2]

In [80]:
# Score the three gradient-boosting variants on feature subset cols_2.
getGBoosting(X, y)

Accuracy on training set: 0.898
Accuracy on test set: 0.727
Accuracy on training set: 0.719
Accuracy on test set: 0.818
Accuracy on training set: 0.735
Accuracy on test set: 0.727


In [81]:
# Feature subset 3: jitter = 'Jitter(local)', shimmer = 'Shimmer (apq3)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_3 = [
    'Jitter(local)', 'Shimmer (apq3)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_3]

In [82]:
# Score the three gradient-boosting variants on feature subset cols_3.
getGBoosting(X, y)

Accuracy on training set: 0.894
Accuracy on test set: 0.727
Accuracy on training set: 0.703
Accuracy on test set: 0.636
Accuracy on training set: 0.727
Accuracy on test set: 0.727


In [83]:
# Feature subset 4: jitter = 'Jitter(local)', shimmer = 'Shimmer (apq5)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_4 = [
    'Jitter(local)', 'Shimmer (apq5)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_4]

In [84]:
# Score the three gradient-boosting variants on feature subset cols_4.
getGBoosting(X, y)

Accuracy on training set: 0.905
Accuracy on test set: 0.727
Accuracy on training set: 0.711
Accuracy on test set: 0.727
Accuracy on training set: 0.737
Accuracy on test set: 0.727


In [85]:
# Feature subset 5: jitter = 'Jitter(local)', shimmer = 'Shimmer (apq11)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_5 = [
    'Jitter(local)', 'Shimmer (apq11)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_5]

In [86]:
# Score the three gradient-boosting variants on feature subset cols_5.
getGBoosting(X, y)

Accuracy on training set: 0.899
Accuracy on test set: 0.818
Accuracy on training set: 0.714
Accuracy on test set: 0.727
Accuracy on training set: 0.748
Accuracy on test set: 0.727


In [87]:
# Feature subset 6: jitter = 'Jitter(local)', shimmer = 'Shimmer (dda)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_6 = [
    'Jitter(local)', 'Shimmer (dda)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_6]

In [88]:
# Score the three gradient-boosting variants on feature subset cols_6.
getGBoosting(X, y)

Accuracy on training set: 0.894
Accuracy on test set: 0.727
Accuracy on training set: 0.703
Accuracy on test set: 0.636
Accuracy on training set: 0.727
Accuracy on test set: 0.727


In [89]:
# Feature subset 7: jitter = 'Jitter(local, absolute)', shimmer = 'Shimmer (local)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_7 = [
    'Jitter(local, absolute)', 'Shimmer (local)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_7]

In [90]:
# Score the three gradient-boosting variants on feature subset cols_7.
getGBoosting(X, y)

Accuracy on training set: 0.903
Accuracy on test set: 0.818
Accuracy on training set: 0.720
Accuracy on test set: 0.818
Accuracy on training set: 0.763
Accuracy on test set: 0.818


In [91]:
# Feature subset 8: jitter = 'Jitter(local, absolute)', shimmer = 'Shimmer (local, db)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_8 = [
    'Jitter(local, absolute)', 'Shimmer (local, db)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_8]

In [92]:
# Score the three gradient-boosting variants on feature subset cols_8.
getGBoosting(X, y)

Accuracy on training set: 0.891
Accuracy on test set: 0.727
Accuracy on training set: 0.714
Accuracy on test set: 0.818
Accuracy on training set: 0.762
Accuracy on test set: 0.727


In [93]:
# Feature subset 9: jitter = 'Jitter(local, absolute)', shimmer = 'Shimmer (apq3)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_9 = [
    'Jitter(local, absolute)', 'Shimmer (apq3)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_9]

In [94]:
# Score the three gradient-boosting variants on feature subset cols_9.
getGBoosting(X, y)

Accuracy on training set: 0.908
Accuracy on test set: 0.636
Accuracy on training set: 0.700
Accuracy on test set: 0.818
Accuracy on training set: 0.755
Accuracy on test set: 0.818


In [95]:
# Feature subset 10: jitter = 'Jitter(local, absolute)', shimmer = 'Shimmer (apq5)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_10 = [
    'Jitter(local, absolute)', 'Shimmer (apq5)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_10]

In [96]:
# Score the three gradient-boosting variants on feature subset cols_10.
getGBoosting(X, y)

Accuracy on training set: 0.899
Accuracy on test set: 0.727
Accuracy on training set: 0.712
Accuracy on test set: 0.818
Accuracy on training set: 0.748
Accuracy on test set: 0.727


In [97]:
# Feature subset 11: jitter = 'Jitter(local, absolute)', shimmer = 'Shimmer (apq11)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_11 = [
    'Jitter(local, absolute)', 'Shimmer (apq11)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_11]

In [98]:
# Score the three gradient-boosting variants on feature subset cols_11.
getGBoosting(X, y)

Accuracy on training set: 0.902
Accuracy on test set: 0.818
Accuracy on training set: 0.726
Accuracy on test set: 0.818
Accuracy on training set: 0.745
Accuracy on test set: 0.727


In [99]:
# Feature subset 12: jitter = 'Jitter(local, absolute)', shimmer = 'Shimmer (dda)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_12 = [
    'Jitter(local, absolute)', 'Shimmer (dda)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_12]

In [100]:
# Score the three gradient-boosting variants on feature subset cols_12.
getGBoosting(X, y)

Accuracy on training set: 0.908
Accuracy on test set: 0.636
Accuracy on training set: 0.700
Accuracy on test set: 0.818
Accuracy on training set: 0.755
Accuracy on test set: 0.818


In [101]:
# Feature subset 13: jitter = 'Jitter (rap)', shimmer = 'Shimmer (local)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_13 = [
    'Jitter (rap)', 'Shimmer (local)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_13]

In [102]:
# Score the three gradient-boosting variants on feature subset cols_13.
getGBoosting(X, y)

Accuracy on training set: 0.910
Accuracy on test set: 0.818
Accuracy on training set: 0.701
Accuracy on test set: 0.818
Accuracy on training set: 0.730
Accuracy on test set: 0.818


In [103]:
# Feature subset 14: jitter = 'Jitter (rap)', shimmer = 'Shimmer (local, db)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_14 = [
    'Jitter (rap)', 'Shimmer (local, db)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_14]

In [104]:
# Score the three gradient-boosting variants on feature subset cols_14.
getGBoosting(X, y)

Accuracy on training set: 0.903
Accuracy on test set: 0.727
Accuracy on training set: 0.704
Accuracy on test set: 0.818
Accuracy on training set: 0.745
Accuracy on test set: 0.727


In [105]:
# Feature subset 15: jitter = 'Jitter (rap)', shimmer = 'Shimmer (apq3)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_15 = [
    'Jitter (rap)', 'Shimmer (apq3)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_15]

In [106]:
# Score the three gradient-boosting variants on feature subset cols_15.
getGBoosting(X, y)

Accuracy on training set: 0.902
Accuracy on test set: 0.727
Accuracy on training set: 0.705
Accuracy on test set: 0.727
Accuracy on training set: 0.737
Accuracy on test set: 0.818


In [107]:
# Feature subset 16: jitter = 'Jitter (rap)', shimmer = 'Shimmer (apq5)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_16 = [
    'Jitter (rap)', 'Shimmer (apq5)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_16]

In [108]:
# Score the three gradient-boosting variants on feature subset cols_16.
getGBoosting(X, y)

Accuracy on training set: 0.881
Accuracy on test set: 0.727
Accuracy on training set: 0.705
Accuracy on test set: 0.818
Accuracy on training set: 0.740
Accuracy on test set: 0.727


In [109]:
# Feature subset 17: jitter = 'Jitter (rap)', shimmer = 'Shimmer (apq11)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_17 = [
    'Jitter (rap)', 'Shimmer (apq11)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_17]

In [110]:
# Score the three gradient-boosting variants on feature subset cols_17.
getGBoosting(X, y)

Accuracy on training set: 0.902
Accuracy on test set: 0.727
Accuracy on training set: 0.711
Accuracy on test set: 0.727
Accuracy on training set: 0.745
Accuracy on test set: 0.727


In [111]:
# Feature subset 18: jitter = 'Jitter (rap)', shimmer = 'Shimmer (dda)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_18 = [
    'Jitter (rap)', 'Shimmer (dda)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_18]

In [112]:
# Score the three gradient-boosting variants on feature subset cols_18.
getGBoosting(X, y)

Accuracy on training set: 0.902
Accuracy on test set: 0.727
Accuracy on training set: 0.705
Accuracy on test set: 0.727
Accuracy on training set: 0.737
Accuracy on test set: 0.818


In [113]:
# Feature subset 19: jitter = 'Jitter (ppq5)', shimmer = 'Shimmer (local)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_19 = [
    'Jitter (ppq5)', 'Shimmer (local)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_19]

In [114]:
# Score the three gradient-boosting variants on feature subset cols_19.
getGBoosting(X, y)

Accuracy on training set: 0.896
Accuracy on test set: 0.727
Accuracy on training set: 0.712
Accuracy on test set: 0.727
Accuracy on training set: 0.731
Accuracy on test set: 0.727


In [115]:
# Feature subset 20: jitter = 'Jitter (ppq5)', shimmer = 'Shimmer (local, db)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_20 = [
    'Jitter (ppq5)', 'Shimmer (local, db)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_20]

In [116]:
# Score the three gradient-boosting variants on feature subset cols_20.
getGBoosting(X, y)

Accuracy on training set: 0.898
Accuracy on test set: 0.818
Accuracy on training set: 0.703
Accuracy on test set: 0.818
Accuracy on training set: 0.732
Accuracy on test set: 0.727


In [117]:
# Feature subset 21: jitter = 'Jitter (ppq5)', shimmer = 'Shimmer (apq3)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_21 = [
    'Jitter (ppq5)', 'Shimmer (apq3)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_21]

In [118]:
# Score the three gradient-boosting variants on feature subset cols_21.
getGBoosting(X, y)

Accuracy on training set: 0.894
Accuracy on test set: 0.636
Accuracy on training set: 0.701
Accuracy on test set: 0.727
Accuracy on training set: 0.729
Accuracy on test set: 0.727


In [119]:
# Feature subset 22: jitter = 'Jitter (ppq5)', shimmer = 'Shimmer (apq5)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_22 = [
    'Jitter (ppq5)', 'Shimmer (apq5)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_22]

In [120]:
# Score the three gradient-boosting variants on feature subset cols_22.
getGBoosting(X, y)

Accuracy on training set: 0.898
Accuracy on test set: 0.818
Accuracy on training set: 0.709
Accuracy on test set: 0.818
Accuracy on training set: 0.741
Accuracy on test set: 0.727


In [121]:
# Feature subset 23: jitter = 'Jitter (ppq5)', shimmer = 'Shimmer (apq11)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_23 = [
    'Jitter (ppq5)', 'Shimmer (apq11)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_23]

In [122]:
# Score the three gradient-boosting variants on feature subset cols_23.
getGBoosting(X, y)

Accuracy on training set: 0.899
Accuracy on test set: 0.818
Accuracy on training set: 0.718
Accuracy on test set: 0.818
Accuracy on training set: 0.743
Accuracy on test set: 0.727


In [123]:
# Feature subset 24: jitter = 'Jitter (ppq5)', shimmer = 'Shimmer (dda)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_24 = [
    'Jitter (ppq5)', 'Shimmer (dda)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_24]

In [124]:
# Score the three gradient-boosting variants on feature subset cols_24.
getGBoosting(X, y)

Accuracy on training set: 0.894
Accuracy on test set: 0.636
Accuracy on training set: 0.701
Accuracy on test set: 0.727
Accuracy on training set: 0.729
Accuracy on test set: 0.727


In [125]:
# Feature subset 25: jitter = 'Jitter (ddp)', shimmer = 'Shimmer (local)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_25 = [
    'Jitter (ddp)', 'Shimmer (local)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_25]
# Re-bind the target labels from the 'Class information' column.
y = data.loc[:, 'Class information']

In [126]:
# Score the three gradient-boosting variants on feature subset cols_25.
getGBoosting(X, y)

Accuracy on training set: 0.905
Accuracy on test set: 0.818
Accuracy on training set: 0.701
Accuracy on test set: 0.818
Accuracy on training set: 0.727
Accuracy on test set: 0.818


In [127]:
# Feature subset 26: jitter = 'Jitter (ddp)', shimmer = 'Shimmer (local, db)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_26 = [
    'Jitter (ddp)', 'Shimmer (local, db)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_26]

In [128]:
# Score the three gradient-boosting variants on feature subset cols_26.
getGBoosting(X, y)

Accuracy on training set: 0.902
Accuracy on test set: 0.727
Accuracy on training set: 0.704
Accuracy on test set: 0.818
Accuracy on training set: 0.745
Accuracy on test set: 0.727


In [129]:
# Feature subset 27: jitter = 'Jitter (ddp)', shimmer = 'Shimmer (apq3)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_27 = [
    'Jitter (ddp)', 'Shimmer (apq3)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_27]

In [130]:
# Score the three gradient-boosting variants on feature subset cols_27.
getGBoosting(X, y)

Accuracy on training set: 0.898
Accuracy on test set: 0.727
Accuracy on training set: 0.705
Accuracy on test set: 0.727
Accuracy on training set: 0.738
Accuracy on test set: 0.818


In [131]:
# Feature subset 28: jitter = 'Jitter (ddp)', shimmer = 'Shimmer (apq5)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_28 = [
    'Jitter (ddp)', 'Shimmer (apq5)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_28]

In [132]:
# Score the three gradient-boosting variants on feature subset cols_28.
getGBoosting(X, y)

Accuracy on training set: 0.898
Accuracy on test set: 0.727
Accuracy on training set: 0.705
Accuracy on test set: 0.818
Accuracy on training set: 0.740
Accuracy on test set: 0.727


In [136]:
# Feature subset 29: jitter = 'Jitter (ddp)', shimmer = 'Shimmer (apq11)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_29 = [
    'Jitter (ddp)', 'Shimmer (apq11)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_29]

In [137]:
# Score the three gradient-boosting variants on feature subset cols_29.
getGBoosting(X, y)

Accuracy on training set: 0.893
Accuracy on test set: 0.727
Accuracy on training set: 0.711
Accuracy on test set: 0.727
Accuracy on training set: 0.745
Accuracy on test set: 0.727


In [138]:
# Feature subset 30: jitter = 'Jitter (ddp)', shimmer = 'Shimmer (dda)',
# followed by the 15 feature columns that are neither jitter nor shimmer.
cols_30 = [
    'Jitter (ddp)', 'Shimmer (dda)',
    'AC', 'NDH', 'HTM',
    'Median Pitch', 'Mean Pitch', 'Standard deviation',
    'Minimum pitch', 'Maximum pitch',
    'Number of pulses', 'Number of periods',
    'Mean period', 'Standard deviation of period',
    'Fraction of locally unvoiced frames',
    'Number of voice breaks', 'Degree of voice breaks',
]
X = data.loc[:, cols_30]
# Re-bind the target labels from the 'Class information' column.
y = data.loc[:, 'Class information']

In [139]:
# Score the three gradient-boosting variants on feature subset cols_30.
getGBoosting(X, y)

Accuracy on training set: 0.898
Accuracy on test set: 0.727
Accuracy on training set: 0.705
Accuracy on test set: 0.727
Accuracy on training set: 0.738
Accuracy on test set: 0.818
