# Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import accuracy_score

# Data Loading & Exploration

In [17]:
# Load the Parkinson's dataset from CSV into a pandas DataFrame.
# NOTE(review): this is an absolute path (presumably a Colab upload location);
# adjust it when running the notebook elsewhere.
parkinsons_data = pd.read_csv("/content/parkinsons.csv")

# Basic overview of the data: dimensions and summary statistics.
print(parkinsons_data.shape)
print(parkinsons_data.describe())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None".
parkinsons_data.info()

# Count the missing values in each column.
print(parkinsons_data.isnull().sum())

(195, 24)
       MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
count   195.000000    195.000000    195.000000      195.000000   
mean    154.228641    197.104918    116.324631        0.006220   
std      41.390065     91.491548     43.521413        0.004848   
min      88.333000    102.145000     65.476000        0.001680   
25%     117.572000    134.862500     84.291000        0.003460   
50%     148.790000    175.829000    104.315000        0.004940   
75%     182.769000    224.205500    140.018500        0.007365   
max     260.105000    592.030000    239.170000        0.033160   

       MDVP:Jitter(Abs)    MDVP:RAP    MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
count        195.000000  195.000000  195.000000  195.000000    195.000000   
mean           0.000044    0.003306    0.003446    0.009920      0.029709   
std            0.000035    0.002968    0.002759    0.008903      0.018857   
min            0.000007    0.000680    0.000920    0.002040      0.009540   
25%       

In [18]:
# Class balance of the target variable (the "status" column).
target_column = parkinsons_data["status"]
target_column.value_counts()
# The classes are imbalanced: label 1 outnumbers label 0 roughly 3 to 1.

1    147
0     48
Name: status, dtype: int64

In [21]:
# Group the rows by the target label and take the per-class mean of each
# feature for a brief look at how the two classes differ.
# numeric_only=True restricts the aggregation to numeric columns explicitly
# (presumably this skips the "name" identifier column). Without it, recent
# pandas releases raise a TypeError when a non-numeric column is present
# instead of silently dropping it.
parkinsons_data.groupby("status").mean(numeric_only=True)

Unnamed: 0_level_0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,MDVP:APQ,Shimmer:DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,181.937771,223.63675,145.207292,0.003866,2.3e-05,0.001925,0.002056,0.005776,0.017615,0.162958,...,0.013305,0.028511,0.011483,24.67875,0.442552,0.695716,-6.759264,0.160292,2.154491,0.123017
1,145.180762,188.441463,106.893558,0.006989,5.1e-05,0.003757,0.0039,0.011273,0.033658,0.321204,...,0.0276,0.053027,0.029211,20.974048,0.516816,0.725408,-5.33342,0.248133,2.456058,0.233828


# Data preprocessing

In [32]:
# Separate the features from the target.
# "name" is dropped along with the target because it is not treated as a
# relevant feature; "status" is the label to predict.
# `columns=` already identifies the axis, so the redundant `axis=1` is omitted.
X = parkinsons_data.drop(columns=["name", "status"])
Y = parkinsons_data["status"]
print(Y.shape)

(195,)


# Train - Test split

In [31]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
# NOTE(review): the split is not stratified although the target classes are
# imbalanced - consider whether stratify=Y is wanted.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=2
)
# Report the shapes of the full set and of each split, features first.
print(*(features.shape for features in (X, X_train, X_test)))
print(*(labels.shape for labels in (Y, Y_train, Y_test)))

(195, 22) (156, 22) (39, 22)
(195,) (156,) (39,)


# Data standardization

In [37]:
# Learn the scaling statistics from the training split only.
scaler = StandardScaler().fit(X_train)
# Apply the training-derived scaling to both splits. The test split is never
# used for fitting; it exists for evaluation purposes only.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Training the machine learning model

In [42]:
# Support vector classifier with a linear kernel.
model = svm.SVC(kernel="linear")

# Train the classifier on the standardized training split.
model.fit(X=X_train, y=Y_train)

SVC(kernel='linear')

# Model Evaluation

In [51]:
# Predict the labels of both splits with the trained model.
X_train_prediction, X_test_prediction = model.predict(X_train), model.predict(X_test)

# Accuracy on the data the model was trained on.
train_accuracy = accuracy_score(y_true=Y_train, y_pred=X_train_prediction)
print(train_accuracy)

# Accuracy on the held-out test data.
test_accuracy = accuracy_score(y_true=Y_test, y_pred=X_test_prediction)
print(test_accuracy)

# The two scores are close to each other, which suggests the model is
# neither badly overfitted nor underfitted.
# NOTE(review): accuracy alone can look flattering on imbalanced classes.

0.8846153846153846
0.8717948717948718


# Building the predictive system

In [60]:
# Feature values for a single sample.
# NOTE(review): assumed to be listed in the same order as the columns of X.
input_data = (119.99200,157.30200,74.99700,0.00784,0.00007,
              0.00370,0.00554,0.01109,0.04374,0.42600,
              0.02182,0.03130,0.02971,0.06545,0.02211,
              21.03300,0.414783,0.815285,-4.813031,0.266482,
              2.301442,0.284654)

# Convert the sample to a NumPy array.
input_data_as_nparray = np.asarray(input_data)
# Reshape it to a single row and label it with the training feature names.
# The scaler was fitted on a DataFrame, so a bare array would make recent
# scikit-learn releases warn about missing feature names. The DataFrame
# constructor also rejects a sample whose length does not match the number
# of feature columns.
input_data_reshaped = pd.DataFrame(input_data_as_nparray.reshape(1, -1),
                                   columns=X.columns)
# Standardize the sample with the scaler fitted on the training split.
input_data_reshaped = scaler.transform(input_data_reshaped)

# Predict the label for the standardized sample.
prediction =  model.predict(input_data_reshaped)
print(prediction)

# Label 1 is reported as Parkinson's disease, anything else as healthy.
if prediction[0] == 1:
  print("Patient has parkinsons disease")
else:
  print("Patient doesn't have parkinsons disease")

[1]
Patient has parkinsons disease


