In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier

In [2]:
# loading the data from csv file to a Pandas DataFrame
parkinsons_data = pd.read_csv('..\dataset\parkinsons.csv')

In [3]:
# printing the first 5 rows of the dataframe
parkinsons_data.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.813031,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.0,20.651,1,0.429895,0.825288,-4.813031,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.0,20.644,1,0.434969,0.845379,-4.813031,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.0,19.649,1,0.417356,0.845379,-4.813031,0.234513,2.33218,0.410335


In [4]:
# number of rows and columns in the dataframe
parkinsons_data.shape

(8223, 24)

In [5]:
# getting more information about the dataset
parkinsons_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8223 entries, 0 to 8222
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              8223 non-null   object 
 1   MDVP:Fo(Hz)       8223 non-null   float64
 2   MDVP:Fhi(Hz)      8223 non-null   float64
 3   MDVP:Flo(Hz)      8223 non-null   float64
 4   MDVP:Jitter(%)    8223 non-null   float64
 5   MDVP:Jitter(Abs)  8223 non-null   float64
 6   MDVP:RAP          8223 non-null   float64
 7   MDVP:PPQ          8223 non-null   float64
 8   Jitter:DDP        8223 non-null   float64
 9   MDVP:Shimmer      8223 non-null   float64
 10  MDVP:Shimmer(dB)  8223 non-null   float64
 11  Shimmer:APQ3      8223 non-null   float64
 12  Shimmer:APQ5      8223 non-null   float64
 13  MDVP:APQ          8223 non-null   float64
 14  Shimmer:DDA       8223 non-null   float64
 15  NHR               8223 non-null   float64
 16  HNR               8223 non-null   float64


In [6]:
# checking for missing values in each column
parkinsons_data.isnull().sum()

name                0
MDVP:Fo(Hz)         0
MDVP:Fhi(Hz)        0
MDVP:Flo(Hz)        0
MDVP:Jitter(%)      0
MDVP:Jitter(Abs)    0
MDVP:RAP            0
MDVP:PPQ            0
Jitter:DDP          0
MDVP:Shimmer        0
MDVP:Shimmer(dB)    0
Shimmer:APQ3        0
Shimmer:APQ5        0
MDVP:APQ            0
Shimmer:DDA         0
NHR                 0
HNR                 0
status              0
RPDE                0
DFA                 0
spread1             0
spread2             0
D2                  0
PPE                 0
dtype: int64

In [7]:
# getting some statistical measures about the data
parkinsons_data.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
count,8223.0,8223.0,8223.0,8223.0,8223.0,8223.0,8223.0,8223.0,8223.0,8223.0,...,8223.0,8223.0,8223.0,8223.0,8223.0,8223.0,8223.0,8223.0,8223.0,8223.0
mean,2713.14074,193.540847,115.108255,0.014301,4.3e-05,0.284334,0.015445,0.565303,0.018633,0.280428,...,0.049998,0.022597,21.595973,0.762982,0.503349,0.751784,-5.718488,-0.072122,1.990103,0.205714
std,9033.320712,89.886232,43.064498,0.052139,3.4e-05,1.565576,1.091708,50.363387,0.020584,0.195021,...,0.035413,0.038473,4.68858,0.425279,0.102418,0.073744,1.099634,1.310305,1.81168,0.088115
min,88.333,102.145,65.476,0.00168,0.0,0.00068,0.00092,0.00204,0.0,0.085,...,0.01364,0.0,8.441,0.0,0.25657,0.574282,-7.964984,-7.964984,-7.964984,0.044539
25%,117.274,131.897,82.764,0.00339,2e-05,0.00171,0.00194,0.00504,0.0,0.149,...,0.02487,0.00495,19.055,1.0,0.427785,0.686579,-6.482096,0.158266,2.058658,0.13639
50%,148.79,172.86,102.874,0.00494,4e-05,0.00226,0.00263,0.00742,0.01608,0.216,...,0.03867,0.01049,22.219,1.0,0.50238,0.743937,-5.736781,0.210279,2.344876,0.199889
75%,197.569,219.29,135.041,0.00742,5e-05,0.003725,0.0039,0.01168,0.02719,0.35,...,0.06321,0.0222,25.032,1.0,0.59604,0.845379,-4.960234,0.269866,2.631793,0.249703
max,34567.0,592.03,239.17,0.345678,0.00026,9.0,99.0,4567.0,0.11908,1.302,...,0.16942,0.31482,33.047,1.0,0.685151,0.845379,-2.434031,0.450493,3.671155,0.527367


In [8]:
# distribution of target Variable
parkinsons_data['status'].value_counts()

status
1    6274
0    1949
Name: count, dtype: int64

In [9]:
parkinsons_data = parkinsons_data.select_dtypes(include=['int64', 'float64'])
parkinsons_data.groupby('status').mean()

Unnamed: 0_level_0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2718.57509,217.248181,142.693662,0.012281,2.4e-05,0.237406,0.002145,0.006001,0.011323,0.163199,...,0.013395,0.032046,0.010624,24.185711,0.449681,0.738083,-6.71606,-0.198728,1.707281,0.12777
1,2711.452574,186.176232,106.538927,0.014928,4.8e-05,0.298912,0.019576,0.739048,0.020904,0.316846,...,0.026979,0.055575,0.026317,20.791478,0.520021,0.756039,-5.408595,-0.032791,2.077961,0.229927


In [10]:
print(parkinsons_data.columns)

Index(['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'status', 'RPDE', 'DFA',
       'spread1', 'spread2', 'D2', 'PPE'],
      dtype='object')


In [11]:
X = parkinsons_data.drop(columns=['status'], axis=1)
Y = parkinsons_data['status']

In [12]:
print(X)

      MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
0         119.992       157.302        74.997         0.00784   
1         122.400       148.650       113.819         0.00968   
2         116.682       131.111       111.555         0.01050   
3         116.676       137.871       111.366         0.00997   
4         116.014       141.781       110.655         0.01284   
...           ...           ...           ...             ...   
8218      150.440       163.441       144.736         0.00396   
8219      148.462       161.078       141.998         0.00397   
8220      149.818       163.417       144.786         0.00336   
8221      117.226       123.925       106.656         0.00417   
8222      116.848       217.552        99.503         0.00531   

      MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
0              0.00007   0.00370   0.00554     0.01109       0.04374   
1              0.00008   0.00465   0.00696     0.01394       0.06134   
2  

In [13]:
print(Y)

0       1
1       1
2       1
3       1
4       1
       ..
8218    1
8219    1
8220    1
8221    0
8222    0
Name: status, Length: 8223, dtype: int64


In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [15]:
print(X.shape, X_train.shape, X_test.shape)

(8223, 22) (6578, 22) (1645, 22)


In [26]:
# Initialize the AdaBoost model
model = AdaBoostClassifier(random_state=42)

In [27]:
# Training the AdaBoost model with training data
model.fit(X_train, Y_train)

In [28]:
# Accuracy score on training data
adaboost_training_data_accuracy = accuracy_score(Y_train, model.predict(X_train))
print('AdaBoost Accuracy score on training data:', adaboost_training_data_accuracy)

AdaBoost Accuracy score on training data: 0.9884463362724232


In [29]:
# Accuracy score on test data
adaboost_test_data_accuracy = accuracy_score(Y_test, model.predict(X_test))
print('AdaBoost Accuracy score on test data:', adaboost_test_data_accuracy)


AdaBoost Accuracy score on test data: 0.9848024316109423


In [30]:
input_data = (140.341,159.774,67.021,0.00817,0.00001,0.0043,0.0044,0.01289,0.03198,0.313,0.0183,0.0181,0.02428,0.0549,0.02183,19.57,0.537264,0.720908,-5.40942,0.22685,2.359973,0.226156)

# changing input data to a numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the numpy array
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = model.predict(input_data_reshaped)
print(prediction)


if (prediction[0] == 0):
  print("The Person does not have Parkinsons Disease")

else:
  print("The Person has Parkinsons")

[1]
The Person has Parkinsons




In [31]:
import pickle

In [32]:
filename = 'parkinsons_prediction_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [33]:
# loading the saved model
loaded_model = pickle.load(open('parkinsons_prediction_model.sav', 'rb'))
     

In [34]:
for column in X.columns:
  print(column)

MDVP:Fo(Hz)
MDVP:Fhi(Hz)
MDVP:Flo(Hz)
MDVP:Jitter(%)
MDVP:Jitter(Abs)
MDVP:RAP
MDVP:PPQ
Jitter:DDP
MDVP:Shimmer
MDVP:Shimmer(dB)
Shimmer:APQ3
Shimmer:APQ5
MDVP:APQ
Shimmer:DDA
NHR
HNR
RPDE
DFA
spread1
spread2
D2
PPE
