In [2]:
# Machine Learning for Alzheimer's Disease Classification and Prediction
# Author: Nathan Barkdull
# Smart Medical Information Learning and Evaluation Lab (SMILE)
# University of Florida
# 6/20/2021
########################################################################
# Updated 7/22/2021 by NB

"""
Predicts cardiovascular dementia risk in an eight-year longitudinal study, ARIC (Atherosclerosis Risk in Communities Study),
using an ensemble model.
"""

# Ensemble majority vote
########################################################################
# Machine Learning for Alzheimer's Disease Classification and Prediction
# Author: Nathan Barkdull
# Smart Medical Information Learning and Evaluation Lab (SMILE)
# University of Florida
# 6/20/2021
########################################################################
# Updated 7/25/2021 by NB

# Imports
import numpy             as np
import pandas            as pd 
import matplotlib.pyplot as plt 

from scipy                   import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import StandardScaler
from sklearn.svm             import SVC
from sklearn.ensemble        import RandomForestClassifier
from xgboost                 import XGBClassifier
from sklearn.metrics         import accuracy_score
from sklearn.pipeline        import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics         import classification_report
from sklearn.model_selection import validation_curve
from sklearn.ensemble        import VotingClassifier

In [None]:

# Function to remove columns with p value greater than variable pval
def pValueElimation(X, y, pval, verbose=0):
    """Keep only the features whose two-sample t-test p-value is at most ``pval``.

    Every column of ``X`` is tested on its own: the values in rows labelled 0
    are compared with the values in rows labelled 1 using
    ``scipy.stats.ttest_ind``. Columns whose p-value is greater than ``pval``
    are discarded.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one column per candidate feature.
    y : array-like
        Labels with the values 0 and 1, aligned with the rows of ``X`` by
        position (not by index label).
    pval : float
        Largest p-value a feature may have and still be kept.
    verbose : int, default 0
        When equal to 1, print the p-value of every column.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving columns of ``X`` in their original
        order.

    Notes
    -----
    A NaN p-value never compares greater than ``pval``, so a column whose
    test result is NaN is kept.
    """
    # Row positions of each class are the same for every column, so compute
    # them once instead of on every iteration.
    labels = np.asarray(y)
    rows_class0 = np.flatnonzero(labels == 0)
    rows_class1 = np.flatnonzero(labels == 1)

    keep = []
    for i in range(X.shape[1]):
        # Independent two-sample t-test of this feature between the classes.
        _, p = stats.ttest_ind(X.iloc[rows_class0, i].values,
                               X.iloc[rows_class1, i].values)
        if verbose == 1:
            print("The p-value of " + str(X.columns[i]) + " is: " + str(p))

        # Same rule as "drop when p > pval", expressed as the set to keep.
        if not p > pval:
            keep.append(i)

    # Select by position instead of dropping by label: dropping by label would
    # also remove any other column that happens to share the rejected name.
    return X.iloc[:, keep]

In [None]:
def runVotingModel(pval=0.2, data_path='ARIC_CVD_data.csv', random_state=None):
    """Train and evaluate a majority-vote ensemble (SVC + random forest + XGBoost).

    The data set is split 70/30 into train and test rows. Features are then
    selected with ``pValueElimation`` using the training rows only, each base
    model is tuned with a 10-fold ``GridSearchCV``, and the three tuned models
    are combined in a ``VotingClassifier``.

    Parameters
    ----------
    pval : float, default 0.2
        p-value threshold passed to ``pValueElimation``.
    data_path : str, default 'ARIC_CVD_data.csv'
        CSV file to load. It must contain the label column
        'CVD related dementia' and the identifier column 'ID_C'.
    random_state : int or None, default None
        Seed forwarded to the train/test split, the random forest and XGBoost.
        ``None`` keeps the original, unseeded behaviour.

    Returns
    -------
    float
        Accuracy of the ensemble on the held-out test rows.

    Side effects
    ------------
    Prints the shape of the selected training features, the classification
    report and the test accuracy.
    """
    label_col = 'CVD related dementia'

    # Load data and separate features from the label
    df = pd.read_csv(data_path)
    X = df.drop(columns=[label_col, 'ID_C'])
    y = df[label_col]

    # Split BEFORE selecting features. Running the t-tests on the full data
    # set would let the test labels influence which features are kept and
    # bias the reported accuracy (data leakage).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state)

    X_train = pValueElimation(X_train, y_train, pval, verbose=0)
    X_test = X_test[X_train.columns]

    print(X_train.shape)

    # models for majority voting
    pipe_svc = Pipeline([('scl', StandardScaler()),
                         ('clf', SVC())])

    svc_grid = {'clf__C': [0.1, 1, 10, 100],
                'clf__kernel': ['rbf', 'poly', 'linear', 'sigmoid'],
                'clf__degree': [1, 2, 3, 4],
                'clf__gamma': [1, 0.1, 0.01, 0.001]}

    grid1 = GridSearchCV(pipe_svc, svc_grid, refit=True, cv=10, verbose=0)

    pipe_RFC = Pipeline([('scl', StandardScaler()),
                         ('clf', RandomForestClassifier(random_state=random_state))])

    # 'sqrt' is what 'auto' meant for classifiers; 'auto' was removed from
    # RandomForestClassifier in scikit-learn 1.3.
    rfc_grid = {'clf__bootstrap': [True],
                'clf__max_depth': [5, 10, 15],
                'clf__max_features': ['sqrt'],
                'clf__min_samples_leaf': [2, 3],
                'clf__min_samples_split': [10, 15, 20],
                'clf__n_estimators': [800, 1000, 1200]}

    grid2 = GridSearchCV(pipe_RFC, rfc_grid, refit=True, cv=10, verbose=0)

    pipe_XGB = Pipeline([('scl', StandardScaler()),
                         ('clf', XGBClassifier(learning_rate=0.02,
                                               n_estimators=600,
                                               random_state=random_state))])

    xgb_grid = {'clf__min_child_weight': [10, 15],
                'clf__gamma': [5],
                'clf__subsample': [0.6, 0.8, 1.0],
                'clf__colsample_bytree': [1.0],
                'clf__max_depth': [3, 4, 5]}

    grid3 = GridSearchCV(pipe_XGB, xgb_grid, refit=True, cv=10, verbose=0)

    # Create voting classifier
    eclf = VotingClassifier(estimators=[('svc', grid1), ('rfc', grid2), ('xgb', grid3)])

    # Train and test the model
    eclf.fit(X_train, y_train)
    y_pred = eclf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    print(accuracy)

    return accuracy

In [None]:
# Run the voting-ensemble experiment with the p-value threshold set to 0.18;
# the returned test accuracy is displayed as the cell's output.
runVotingModel(pval=0.18)

In [3]:
## P values ##
# Load the full data set, print the t-test p-value of every feature
# (verbose=1) and report the shape of the frame left after elimination.
df = pd.read_csv('ARIC_CVD_data.csv')
y = df['CVD related dementia']
X = df.drop(columns=['CVD related dementia', 'ID_C'])

pval = 0.18
X_elim = pValueElimation(X=X, y=y, pval=pval, verbose=1)
print(X_elim.shape)

The p-value of REXA2A is: 0.008017490690484729

The p-value of REXA2B is: 0.15680098968701664

The p-value of REXA3A is: 0.5193096668054968

The p-value of REXA5A is: 0.8014966505958575

The p-value of REXA5C is: 0.7023445388529963

The p-value of REXA6A is: 0.7023445388529963

The p-value of RIPA2 is: 0.9608360110303903

The p-value of RIPA3 is: 0.9068096179342491

The p-value of RIPA4 is: 0.18031258316066984

The p-value of RIPA5 is: 0.6491058572612669

The p-value of RIPA6 is: 0.9973767408701945

The p-value of RIPA7 is: 0.7387017345749816

The p-value of RIPA8 is: 0.40200498392820516

The p-value of RIPA9 is: 0.24827493559775216

The p-value of RIPA10 is: 0.5360664604405619

The p-value of RIPA11 is: 0.6397856952131997

The p-value of RIPA12 is: 0.4891013893471361

The p-value of RIPA13 is: 0.6058435159668298

The p-value of RIPA14 is: 0.06526631870854448

The p-value of RIPA15 is: 0.6583419755982314

The p-value of RIPA16 is: 0.5630652174060875

The p-value of RIPA17 is: 0.3184613

In [87]:
# Vascular dementia results, SVM.
# The 50 accuracies are hard-coded; the runs that produced them are not part
# of this cell (presumably ten runs per p-value threshold -- TODO confirm).
# Each row of ten values below is averaged under one threshold label.
acc = np.array([
    0.5, 0.609375, 0.5625, 0.640625, 0.609375, 0.65625, 0.625, 0.640625, 0.59375, 0.5625,
    0.640625, 0.703125, 0.703125, 0.65625, 0.5625, 0.59375, 0.6875, 0.6875, 0.65625, 0.578125,
    0.640625, 0.625, 0.640625, 0.59375, 0.71875, 0.640625, 0.65625, 0.5625, 0.703125, 0.671875,
    0.671875, 0.59375, 0.625, 0.59375, 0.453125, 0.59375, 0.578125, 0.578125, 0.5625, 0.578125,
    0.46875, 0.5625, 0.546875, 0.625, 0.578125, 0.59375, 0.59375, 0.640625, 0.546875, 0.546875,
])

print("Performance of SVM classifier in predicting vascular dem diagnosis")
# One output line per threshold: label prefix followed by the mean of its ten values.
print(*(
    prefix + str(acc[rows].mean())
    for prefix, rows in (
        ("pval = 0.05 |  av. accuracy = ", slice(0, 10)),
        ("pval = 0.1  |  av. accuracy = ", slice(10, 20)),
        ("pval = 0.2  |  av. accuracy = ", slice(20, 30)),
        ("pval = 0.4  |  av. accuracy = ", slice(30, 40)),
        ("pval = 0.8  |  av. accuracy = ", slice(40, 50)),
    )
), sep="\n")

Performance of SVM classifier in predicting vascular dem diagnosis

pval = 0.05 |  av. accuracy = 0.6

pval = 0.1  |  av. accuracy = 0.646875

pval = 0.2  |  av. accuracy = 0.6453125

pval = 0.4  |  av. accuracy = 0.5828125

pval = 0.8  |  av. accuracy = 0.5703125


In [88]:
# Vascular dementia results, XGB.
# The 50 accuracies are hard-coded; the runs that produced them are not part
# of this cell (presumably ten runs per p-value threshold -- TODO confirm).
# NOTE: the printed header spells the model "XBG"; it is kept unchanged so the
# output matches what was recorded.
acc = np.array([
    0.625, 0.671875, 0.640625, 0.5625, 0.59375, 0.5625, 0.515625, 0.546875, 0.578125, 0.671875,
    0.5625, 0.5625, 0.640625, 0.625, 0.640625, 0.5625, 0.625, 0.6875, 0.625, 0.640625,
    0.6875, 0.59375, 0.46875, 0.75, 0.625, 0.640625, 0.703125, 0.6875, 0.65625, 0.546875,
    0.625, 0.625, 0.59375, 0.65625, 0.53125, 0.625, 0.546875, 0.65625, 0.65625, 0.546875,
    0.5625, 0.640625, 0.578125, 0.5625, 0.625, 0.5625, 0.5625, 0.578125, 0.65625, 0.640625,
])

print("Performance of XBG classifier in predicting vascular dem diagnosis")
# One output line per threshold: label prefix followed by the mean of its ten values.
print(*(
    prefix + str(acc[rows].mean())
    for prefix, rows in (
        ("pval = 0.05 |  av. accuracy = ", slice(0, 10)),
        ("pval = 0.1  |  av. accuracy = ", slice(10, 20)),
        ("pval = 0.2  |  av. accuracy = ", slice(20, 30)),
        ("pval = 0.4  |  av. accuracy = ", slice(30, 40)),
        ("pval = 0.8  |  av. accuracy = ", slice(40, 50)),
    )
), sep="\n")

Performance of XBG classifier in predicting vascular dem diagnosis

pval = 0.05 |  av. accuracy = 0.596875

pval = 0.1  |  av. accuracy = 0.6171875

pval = 0.2  |  av. accuracy = 0.6359375

pval = 0.4  |  av. accuracy = 0.60625

pval = 0.8  |  av. accuracy = 0.596875
