In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import metrics
from sklearn import preprocessing

In [2]:
data = pd.read_csv('parkinsons2.csv')

In [3]:
data.head().transpose()

Unnamed: 0,0,1,2,3,4
MDVP:Fo(Hz),119.992,122.4,116.682,116.676,116.014
MDVP:Fhi(Hz),157.302,148.65,131.111,137.871,141.781
MDVP:Flo(Hz),74.997,113.819,111.555,111.366,110.655
MDVP:Jitter(%),0.00784,0.00968,0.0105,0.00997,0.01284
MDVP:Jitter(Abs),7e-05,8e-05,9e-05,9e-05,0.00011
MDVP:RAP,0.0037,0.00465,0.00544,0.00502,0.00655
MDVP:PPQ,0.00554,0.00696,0.00781,0.00698,0.00908
Jitter:DDP,0.01109,0.01394,0.01633,0.01505,0.01966
MDVP:Shimmer,0.04374,0.06134,0.05233,0.05492,0.06425
MDVP:Shimmer(dB),0.426,0.626,0.482,0.517,0.584


# Description of the columns:
MDVP:Fo(Hz) - Average vocal fundamental frequency

MDVP:Fhi(Hz) - Maximum vocal fundamental frequency

MDVP:Flo(Hz) - Minimum vocal fundamental frequency

MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP - Several measures of variation in fundamental frequency

MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA - Several measures of variation in amplitude

NHR,HNR - Two measures of ratio of noise to tonal components in the voice

RPDE,D2 - Two nonlinear dynamical complexity measures

DFA - Signal fractal scaling exponent

spread1,spread2,PPE - Three nonlinear measures of fundamental frequency variation

status - Health status of the subject (one) - Parkinson's, (zero) - healthy

In [4]:
data.columns

Index(['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:Jitter(Abs)', 'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP',
       'MDVP:Shimmer', 'MDVP:Shimmer(dB)', 'Shimmer:APQ3', 'Shimmer:APQ5',
       'MDVP:APQ', 'Shimmer:DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'spread1',
       'spread2', 'D2', 'PPE', 'status'],
      dtype='object')

In [5]:
data.shape

(195, 23)

### We observe that there are 195 rows and 23 columns in the given dataset.

In [6]:
#Since column names are big it will be easy to do plots and calculations if the column names are small
data.columns= ['Fo','Fhi','Flo','Jitter(%)','Jitter(Abs)','RAP','PPQ','DDP','Shimmer','Shimmer(dB)','APQ3','APQ5','APQ','DDA','NHR','HNR','RPDE','DFA','spread1','spread2','D2','PPE','status']

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195 entries, 0 to 194
Data columns (total 23 columns):
Fo             195 non-null float64
Fhi            195 non-null float64
Flo            195 non-null float64
Jitter(%)      195 non-null float64
Jitter(Abs)    195 non-null float64
RAP            195 non-null float64
PPQ            195 non-null float64
DDP            195 non-null float64
Shimmer        195 non-null float64
Shimmer(dB)    195 non-null float64
APQ3           195 non-null float64
APQ5           195 non-null float64
APQ            195 non-null float64
DDA            195 non-null float64
NHR            195 non-null float64
HNR            195 non-null float64
RPDE           195 non-null float64
DFA            195 non-null float64
spread1        195 non-null float64
spread2        195 non-null float64
D2             195 non-null float64
PPE            195 non-null float64
status         195 non-null int64
dtypes: float64(22), int64(1)
memory usage: 35.1 KB


From the above information, we observe that all the given features are continuous except 'status'(since given in the description).

In [8]:
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Fo,195.0,154.228641,41.390065,88.333,117.572,148.79,182.769,260.105
Fhi,195.0,197.104918,91.491548,102.145,134.8625,175.829,224.2055,592.03
Flo,195.0,116.324631,43.521413,65.476,84.291,104.315,140.0185,239.17
Jitter(%),195.0,0.00622,0.004848,0.00168,0.00346,0.00494,0.007365,0.03316
Jitter(Abs),195.0,4.4e-05,3.5e-05,7e-06,2e-05,3e-05,6e-05,0.00026
RAP,195.0,0.003306,0.002968,0.00068,0.00166,0.0025,0.003835,0.02144
PPQ,195.0,0.003446,0.002759,0.00092,0.00186,0.00269,0.003955,0.01958
DDP,195.0,0.00992,0.008903,0.00204,0.004985,0.00749,0.011505,0.06433
Shimmer,195.0,0.029709,0.018857,0.00954,0.016505,0.02297,0.037885,0.11908
Shimmer(dB),195.0,0.282251,0.194877,0.085,0.1485,0.221,0.35,1.302


In [9]:
data[data.isnull().any(axis=1)]

Unnamed: 0,Fo,Fhi,Flo,Jitter(%),Jitter(Abs),RAP,PPQ,DDP,Shimmer,Shimmer(dB),...,DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,status


From the above two cells,we observe that there are no missing values in the given dataset

In [10]:
data['status'].value_counts().sort_index()

0     48
1    147
Name: status, dtype: int64

Most are having Parkinson disease. The ratio is almost 1:3 in favor of status 1. So, the model's ability to predict status 1 will be better than predicting status 0.


In [11]:
array = data.values
X = array[:,0:22]
Y = array[:,22]

In [12]:
#Splitting the data into train and test in 70/30 ratio with random state as 2.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2)

# LogisticRegression

In [13]:
LR = LogisticRegression()
LR.fit(X_train, Y_train)
Y1_predict = LR.predict(X_test)
Y_acc = metrics.accuracy_score(Y_test,Y1_predict)
print("Accuracy of the model is {0:2f}".format(Y_acc*100))


Accuracy of the model is 81.355932


# RandomForestClassifier 

In [14]:
from sklearn.ensemble import RandomForestClassifier 

In [15]:
#Splitting the data into train and test in 70/30 ratio with random state as 2.
array = data.values
X = array[:,0:22]
Y = array[:,22]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2)

In [16]:
R = RandomForestClassifier()
R.fit(X_train, Y_train)
Y1_predict = R.predict(X_test)
Y_acc = metrics.accuracy_score(Y_test,Y1_predict)
print("Accuracy of the model is {0:2f}".format(Y_acc*100))

Accuracy of the model is 81.355932


# GaussianNB

In [17]:
from sklearn.naive_bayes import GaussianNB

In [18]:
array = data.values
X = array[:,0:22]
Y = array[:,22]
#Splitting the data into train and test in 70/30 ratio with random state as 2.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2)
G = GaussianNB()
G.fit(X_train, Y_train)
Y1_predict = G.predict(X_test)
Y_acc = metrics.accuracy_score(Y_test,Y1_predict)
print("Accuracy of the model is {0:2f}".format(Y_acc*100))

Accuracy of the model is 62.711864


# LinearDiscriminantAnalysis

In [19]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [20]:
array = data.values
X = array[:,0:22]
Y = array[:,22]
#Splitting the data into train and test in 70/30 ratio with random state as 2.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2)
LD = LinearDiscriminantAnalysis()
LD.fit(X_train, Y_train)
Y1_predict = LD.predict(X_test)
Y_acc = metrics.accuracy_score(Y_test,Y1_predict)
print("Accuracy of the model is {0:2f}".format(Y_acc*100))

Accuracy of the model is 81.355932


In [21]:
data.head()

Unnamed: 0,Fo,Fhi,Flo,Jitter(%),Jitter(Abs),RAP,PPQ,DDP,Shimmer,Shimmer(dB),...,DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,status
0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,...,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654,1
1,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,...,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674,1
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,...,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634,1
3,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,...,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975,1
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,...,0.1047,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335,1


In [22]:
p=LD.predict([[
119.992,157.302,74.997,0.00784,0.00007,0.0037,0.00554,0.01109,0.04374,0.426,0.02182,0.0313,0.02971,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
]])

In [23]:
print(p)

[ 1.]


In [24]:
data.tail()

Unnamed: 0,Fo,Fhi,Flo,Jitter(%),Jitter(Abs),RAP,PPQ,DDP,Shimmer,Shimmer(dB),...,DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,status
190,174.188,230.978,94.261,0.00459,3e-05,0.00263,0.00259,0.0079,0.04087,0.405,...,0.07008,0.02764,19.517,0.448439,0.657899,-6.538586,0.121952,2.657476,0.13305,0
191,209.516,253.017,89.488,0.00564,3e-05,0.00331,0.00292,0.00994,0.02751,0.263,...,0.04812,0.0181,19.147,0.431674,0.683244,-6.195325,0.129303,2.784312,0.168895,0
192,174.688,240.005,74.287,0.0136,8e-05,0.00624,0.00564,0.01873,0.02308,0.256,...,0.03804,0.10715,17.883,0.407567,0.655683,-6.787197,0.158453,2.679772,0.131728,0
193,198.764,396.961,74.904,0.0074,4e-05,0.0037,0.0039,0.01109,0.02296,0.241,...,0.03794,0.07223,19.02,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306,0
194,214.289,260.277,77.973,0.00567,3e-05,0.00295,0.00317,0.00885,0.01884,0.19,...,0.03078,0.04398,21.209,0.462803,0.664357,-5.724056,0.190667,2.555477,0.148569,0


In [25]:
pr=LD.predict([[198.764,396.961,74.904,0.0074,0.00004,0.0037,0.0039,0.01109,0.02296,0.241,0.01265,0.01321,0.01588,0.03794,0.07223,19.02,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306
]])

In [26]:
print(pr)

[ 0.]


In [27]:
B=([['MDVP:Fo(Hz)','MDVP:Fhi(Hz)','MDVP:Flo(Hz)','MDVP:Jitter(%)','MDVP:Jitter(Abs)','MDVP:RAP','MDVP:PPQ','Jitter:DDP','MDVP:Shimmer','MDVP:Shimmer(dB)','Shimmer:APQ3','Shimmer:APQ5','MDVP:APQ','Shimmer:DDA','NHR','HNR','RPDE','DFA','spread1','spread2','D2','PPE'
]])

In [None]:
R=int(1)
C=int(22)
matrix=[]
for i in range(R):
    d=[]
    for j in range(C):
        print(B[i][j],end=" ")
        d.append(float(input()))
        matrix.append(d)


MDVP:Fo(Hz) 

In [32]:
matrix

[[119.992,
  157.302,
  74.997,
  0.00784,
  7e-05,
  0.0037,
  0.00554,
  0.01109,
  0.04374,
  0.426,
  0.02182,
  0.0313,
  0.02971,
  0.06545,
  0.02211,
  21.033,
  0.414783,
  0.815285,
  -4.81303,
  0.266482,
  2.301442,
  0.284654],
 [119.992,
  157.302,
  74.997,
  0.00784,
  7e-05,
  0.0037,
  0.00554,
  0.01109,
  0.04374,
  0.426,
  0.02182,
  0.0313,
  0.02971,
  0.06545,
  0.02211,
  21.033,
  0.414783,
  0.815285,
  -4.81303,
  0.266482,
  2.301442,
  0.284654],
 [119.992,
  157.302,
  74.997,
  0.00784,
  7e-05,
  0.0037,
  0.00554,
  0.01109,
  0.04374,
  0.426,
  0.02182,
  0.0313,
  0.02971,
  0.06545,
  0.02211,
  21.033,
  0.414783,
  0.815285,
  -4.81303,
  0.266482,
  2.301442,
  0.284654],
 [119.992,
  157.302,
  74.997,
  0.00784,
  7e-05,
  0.0037,
  0.00554,
  0.01109,
  0.04374,
  0.426,
  0.02182,
  0.0313,
  0.02971,
  0.06545,
  0.02211,
  21.033,
  0.414783,
  0.815285,
  -4.81303,
  0.266482,
  2.301442,
  0.284654],
 [119.992,
  157.302,
  74.997,
  0.

In [33]:
a=LD.predict(matrix)
z=int(a[1])
if z==1 :
    print("person as parkison disease")
else:
     print("person don't have parkison disease")
        

person as parkison disease
