# Machine Learning Final Project: Parkinsons Data Analysis

## Part 1: Setup

Instead of looking a Racial Profiling in police stop data, I instead decided to look at a Parkinsons dataset, to see if i could create a neural network to accurately predict if someone has Parkinsons.

I first processed and scaled the dataset, and then tried different configurations of neural nets, using cross fold validation, until I found one that worked well. I also compared the neural net with other models we learned about.

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn import ensemble # for fitting our model
from sklearn import svm
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 100)

Using TensorFlow backend.


In [2]:
# Create a copy of raw_data to process
raw_data = pd.read_csv('parkinsons.csv')
data_processed = raw_data

data_processed.head()

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,MDVP:APQ,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,0.426,0.02182,0.0313,0.02971,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,0.626,0.03134,0.04518,0.04368,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,0.482,0.02757,0.03858,0.0359,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,0.517,0.02924,0.04005,0.03772,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,0.584,0.0349,0.04825,0.04465,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


## Part 2: Model Building and Validation

In [3]:
# Split and sample test and train data
X = data_processed[data_processed.columns.difference(['name','status'])].values
y = data_processed['status'].values

### Baseline Model

In [4]:
### Create a baseline model with which to test modifications to the net
def create_baseline():
    # create model
    model = Sequential()

    # The first layer of your network needs to know the input shape
#     print('Building model...')
    model = Sequential()
    model.add(Dense(22, input_dim=22, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

#     print('Compiling model...')
    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model

# Evaluate model using KFold Validation, and use binary_crossentropy loss function. 
# I chose to use accuracy as my scoring metric, because I wanted to minimize false positives
estimator = KerasClassifier(build_fn=create_baseline, epochs=25, batch_size=5, verbose=0)
# use the model 
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(estimator, X, y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 75.95% (3.07%)


### Baseline With Data Preparation

In [5]:
# Standardize data and then run model
def create_baseline():
    # create model
    model = Sequential()

    # The first layer of your network needs to know the input shape
#     print('Building model...')
    model = Sequential()
    model.add(Dense(22, input_dim=22, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

#     print('Compiling model...')
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    return model
estimators = []
# Add a data standardizing scaler to the pipeline
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp',KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=10, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(pipeline, X, y, cv=kfold, verbose=0)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 88.77% (5.67%)


### Model Tuning

In [6]:
# Try building a smaller neural net
def create_smaller():
    # create model
#     print('Building model...')
    model = Sequential()
    model.add(Dense(11, input_dim=22, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

#     print('Compiling model...')
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    return model
    
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp',KerasClassifier(build_fn=create_smaller, epochs=100, batch_size=10, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(pipeline, X, y, cv=kfold,verbose=0)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 86.69% (2.35%)


In [7]:
# Try building a bigger neural net-
def create_bigger():
    # create model
    #print('Building model...')
    model = Sequential()
    model.add(Dense(22, input_dim=22, kernel_initializer='normal', activation='relu'))
    model.add(Dense(11, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

    #print('Compiling model...')
    model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    return model
    
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp',KerasClassifier(build_fn=create_bigger, epochs=100, batch_size=10, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(pipeline, X, y, cv=kfold,verbose=0)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 88.70% (3.52%)


### Optimized Model

In [53]:
# Try building a bigger neural net-
def create_bigger_with_dropout():
    # create model
    #print('Building model...')
    model = Sequential()
    model.add(Dense(22, input_dim=22, kernel_initializer='normal', activation='relu'))
    model.add(Dense(11, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))

    #print('Compiling model...')
    model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
    
    return model
    
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp',KerasClassifier(build_fn=create_bigger, epochs=100, batch_size=10, verbose=0)))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(pipeline, X, y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 92.78% (2.61%)


## Comparison with other models

In [9]:
# Logistic Regression
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp',linear_model.LogisticRegression(max_iter=10000,tol=0.00001,solver='liblinear')))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(pipeline, X, y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 84.57% (4.70%)


In [8]:
# SVM
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp',svm.SVC(verbose=0,kernel='linear',gamma='auto')))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(pipeline, X, y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 85.13% (3.71%)


In [10]:
# Random Forest
estimators = []
estimators.append(('standardize', StandardScaler()))
estimators.append(('mlp',ensemble.RandomForestClassifier(verbose=0,n_estimators=1000,max_features='log2')))
pipeline = Pipeline(estimators)
kfold = StratifiedKFold(n_splits=5, shuffle=True)
results = cross_val_score(pipeline, X, y, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Results: 90.80% (2.51%)
