In [21]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score

In [22]:
df = pd.read_csv('parkinsons.csv')

In [23]:
# distribution of target variable
df['status'].value_counts()

1    147
0     48
Name: status, dtype: int64

In [24]:
# group the data based on the target variable
df.groupby('status').mean()
# there is distinct difference in healthy people and parkinsons people

  df.groupby('status').mean()


Unnamed: 0_level_0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,181.937771,223.63675,145.207292,0.003866,2.3e-05,0.001925,0.002056,0.005776,0.017615,0.162958,...,0.013305,0.028511,0.011483,24.67875,0.442552,0.695716,-6.759264,0.160292,2.154491,0.123017
1,145.180762,188.441463,106.893558,0.006989,5.1e-05,0.003757,0.0039,0.011273,0.033658,0.321204,...,0.0276,0.053027,0.029211,20.974048,0.516816,0.725408,-5.33342,0.248133,2.456058,0.233828


In [37]:
# separating features and target
# x = df.drop(columns=['name', 'status'], axis=1)
x = df.drop(columns=['name', 'status', 'spread1', 'spread2', 'PPE', 'RPDE', 'D2', 'NHR', 'DFA'], axis=1)
y = df['status']

In [38]:
x.head()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,HNR
0,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,0.02182,0.0313,0.02971,0.06545,21.033
1,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,0.03134,0.04518,0.04368,0.09403,19.085
2,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,0.03858,0.0359,0.0827,20.651
3,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,0.02924,0.04005,0.03772,0.08771,20.644
4,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,0.0349,0.04825,0.04465,0.1047,19.649


In [26]:
# splitting the data into training and testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)

In [27]:
# standardizing the data so that all features are in the same range
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

In [28]:
# we're not fitting the scaler on the test data because we don't want to leak any information from the test data
x_test = scaler.transform(x_test)

In [29]:
# Model training using svm
model = svm.SVC(kernel = 'linear')

In [30]:
# training the svm model with training data
model.fit(x_train, y_train)

In [31]:
# model evaluation
# accuracy score on training data
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [32]:
print(f'Training data accuracy: {training_data_accuracy}')

Training data accuracy: 0.8269230769230769


In [33]:
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
print(f'Test data accuracy: {test_data_accuracy}')

Test data accuracy: 0.8205128205128205


In [34]:
# Building a predictive system
def predict(input_data):
    # changing the input data to a numpy array
    input_data = np.asarray(input_data)
    # reshape the array as we are predicting for one instance
    input_data_reshape = input_data.reshape(1, -1)
    # standardizing the input data
    std_data = scaler.transform(input_data_reshape)
    prediction = model.predict(std_data)
    if prediction[0] == 0:
        return 'The person does not have Parkinsons disease'
    else:
        return 'The person has Parkinsons disease'

In [35]:
input_data = [119.99200,157.30200,74.99700,0.00784,0.00007,0.00370,0.00554,0.01109,0.04374,0.42600,0.02182,0.03130,0.02971,0.06545,0.02211,21.03300,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654]
print(predict(input_data))



ValueError: X has 22 features, but StandardScaler is expecting 15 features as input.