# Pipeline to perform t-test on performances different combinations

In [1]:
# Run this to use from colab environment
#!pip install -q --upgrade git+https://github.com/karinvangarderen/tm10007_project.git
#!pip install sklearn 

## Import Modules

In [11]:
# Import data module
from adni.load_data import load_data

# Import needed modules
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import KernelPCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

## Data loading and cleaning

Data loading

In [12]:
# Data loading 
data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')
df= pd.DataFrame(data)

# Reset index, add patient ID's as column
df.reset_index(inplace=True)
df = df.rename(columns = {'index':'ID'})

# Set AD to 1, CN to 0
df['label']= df['label'].replace(['AD'],1) 
df['label']= df['label'].replace(['CN'],0) 

# set seed

# display data frame
#display(df)

The number of samples: 855
The number of columns: 268


Data Cleaning

In [13]:
# Check wheter there is missing data (NaN)
df.notnull().values.any() # Geen missing data

# Als SD 0 dan feature weggooien
df_new = df.drop(df.std()[df.std() == 0].index.values, axis = 1)

print(f'The number of samples after cleaning + std: {len(df_new.index)}')
print(f'The number of columns after cleaning + std: {len(df_new.columns)}')

# Count number of duplicated patiient ID's
df.index.duplicated().sum() # ID's are indices in df
X = df.drop('ID', axis=1) # Drop patient ID)

The number of samples after cleaning + std: 855
The number of columns after cleaning + std: 262


  df_new = df.drop(df.std()[df.std() == 0].index.values, axis = 1)


## Data split in test, train and validation set 

Split data in test-set & train/validation-set

In [14]:
# Test / Train split: stratified op label --> nagaan of we dit ook willen
y = df['label'] # Define label y (output)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, stratify = X['label'])
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, stratify = X_train['label'])

# Test of het gelukt is 
# print(len(X_train))
# print(len(X_test))
# print(sum(X_test['label']=='AD'))
# print(sum(X_train['label']=='AD'))

# Drop labels and drop patient ID
X_train = X_train.drop('label', axis=1)
X_val = X_val.drop('label', axis=1)
X_test = X_test.drop('label', axis=1)
X = X.drop('label', axis=1)

## Pipeline PCA + KNN

1. Scaler
2. Feature extraction: PCA
3. Classifier: KNN 

In [15]:
# Define steps in pipeline
pca = PCA()
knn = KNeighborsClassifier()
scaler = StandardScaler()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('pca', pca), ('knn', knn)])

# Define parameters for gridsearch: depending on which classifier
param_grid = {
    "pca__n_components": [5, 15, 30, 45, 60],
    
}
# Perform Grid Search on pipe
search = RandomizedSearchCV(pipe, param_grid, n_iter = 20, cv = 5)
#search = GridSearchCV(pipe, param_grid, n_jobs=2)
search.fit(X_train, y_train)

# Print outcome Grid Search
print("Best parameter (CV score=%0.3f):" % search.best_score_)
best_params = search.best_params_
print(best_params) 

pipe_after_grid = Pipeline([('scaler', scaler), ('pca', PCA((best_params['pca__n_components']))), ('knn', knn)])

# Fit pipe_after_grid on data
bst = pipe_after_grid.fit(X_train, y_train)

#TEST PERFORMANCE
y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))



Best parameter (CV score=0.791):
{'pca__n_components': 60}
0.7829457364341085
0.8589743589743589
0.7976190476190477
0.762820512820513


## Pipeline LDA + KNN

1. Scaler
2. Feature extraction: LDA
3. Classifier: KNN 

In [28]:
# Define steps in pipeline
lda = LDA()
knn = KNeighborsClassifier() 
scaler = StandardScaler()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('lda', lda), ('knn', knn)])

# Define parameters for gridsearch: depending on which classifier
# param_grid = {
    
# }
# # Perform Grid Search on pipe
# search = RandomizedSearchCV(pipe, param_grid, n_iter = 20, cv = 5,
# scoring = 'roc_auc')
# search.fit(X_train, y_train)

# # Print outcome Grid Search
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# best_params = search.best_params_
# print(best_params) 

# pipe_after_grid = Pipeline([('scaler', scaler), ('lda', lda), ('svc', svc)])

# Fit pipe_after_grid on data
bst = pipe.fit(X_train, y_train)

#TEST PERFORMANCE
y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))

0.7596899224806202
0.782051282051282
0.8133333333333334
0.7537707390648567


## Pipeline kernel-PCA + KNN

1. Scaler
2. Feature extraction: kernel-PCA
3. Classifier: KNN 

In [17]:
# Define steps in pipeline
kpca = KernelPCA()
knn = KNeighborsClassifier()
scaler = StandardScaler()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('kpca', kpca), ('knn', knn)])

# Define parameters for gridsearch: depending on which classifier
param_grid = {
    "kpca__n_components": [5, 15, 30, 45, 60],
    "kpca__kernel": ['linear', 'poly', 'rbf', 'sigmoid', 'cosine'],
}
# Perform Grid Search on pipe
search = RandomizedSearchCV(pipe, param_grid, n_iter = 20, cv = 5)
#scoring = 'roc_auc')
search.fit(X_train, y_train)

# Print outcome Grid Search
print("Best parameter (CV score=%0.3f):" % search.best_score_)
best_params = search.best_params_
print(best_params) 
print(search._get_param_names)
pipe_after_grid = Pipeline([('scaler', scaler), ('kpca', KernelPCA((best_params['kpca__n_components']))), ('knn', knn)])

# Fit pipe_after_grid on data
bst = pipe_after_grid.fit(X_train, y_train)


#TEST PERFORMANCE
y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))

Best parameter (CV score=0.797):
{'kpca__n_components': 60, 'kpca__kernel': 'cosine'}
<bound method BaseEstimator._get_param_names of <class 'sklearn.model_selection._search.RandomizedSearchCV'>>
0.7829457364341085
0.8589743589743589
0.7976190476190477
0.762820512820513


## Pipeline PCA + RF

1. Scaler
2. Feature extraction: PCA
3. Classifier: RF 

In [18]:
# Define steps in pipeline
scaler = StandardScaler()
pca = PCA()
rf = RandomForestClassifier()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('pca', pca), ('rf', rf)])

# Define parameters for gridsearch: depending on which classifier
param_grid = {
    "pca__n_components": [5, 15, 30, 45, 60],
    
}
# Perform Grid Search on pipe
search = RandomizedSearchCV(pipe, param_grid, n_iter = 20, cv = 5)
#search = GridSearchCV(pipe, param_grid, n_jobs=2)
search.fit(X_train, y_train)

# Print outcome Grid Search
print("Best parameter (CV score=%0.3f):" % search.best_score_)
best_params = search.best_params_
print(best_params) 

pipe_after_grid = Pipeline([('scaler', scaler), ('pca', PCA((best_params['pca__n_components']))), ('rf', rf)])

# Fit pipe_after_grid on data
bst = pipe_after_grid.fit(X_train, y_train)


#TEST PERFORMANCE
y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))



Best parameter (CV score=0.801):
{'pca__n_components': 60}
0.7906976744186046
0.8717948717948718
0.8
0.7692307692307694


## Pipeline LDA + RF

1. Scaler
2. Feature extraction: LDA
3. Classifier: RF

In [19]:
# Define steps in pipeline
lda = LDA()
rf = RandomForestClassifier()
scaler = StandardScaler()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('lda', lda), ('rf', rf)])

# Define parameters for gridsearch: depending on which classifier
# param_grid = {
    
# }
# # Perform Grid Search on pipe
# search = RandomizedSearchCV(pipe, param_grid, n_iter = 20, cv = 5,
# scoring = 'roc_auc')
# search.fit(X_train, y_train)

# # Print outcome Grid Search
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# best_params = search.best_params_
# print(best_params) 

# pipe_after_grid = Pipeline([('scaler', scaler), ('lda', lda), ('svc', svc)])

# Fit pipe_after_grid on data
bst = pipe.fit(X_train, y_train)

#TEST PERFORMANCE
y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))

0.7596899224806202
0.7564102564102564
0.8309859154929577
0.7605580693815988


## Pipeline kernel-PCA + RF

1. Scaler
2. Feature extraction: kernel-PCA
3. Classifier: RF 

In [20]:
# Define steps in pipeline
kpca = KernelPCA()
rf = RandomForestClassifier()
scaler = StandardScaler()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('kpca', kpca), ('rf', rf)])

# Define parameters for gridsearch: depending on which classifier
param_grid = {
    "kpca__n_components": [5, 15, 30, 45, 60],
    "kpca__kernel": ['linear', 'poly', 'rbf', 'sigmoid', 'cosine'],
}
# Perform Grid Search on pipe
search = RandomizedSearchCV(pipe, param_grid, n_iter = 20, cv = 5)
#scoring = 'roc_auc')
search.fit(X_train, y_train)

# Print outcome Grid Search
print("Best parameter (CV score=%0.3f):" % search.best_score_)
best_params = search.best_params_
print(best_params) 
print(search._get_param_names)
pipe_after_grid = Pipeline([('scaler', scaler), ('kpca', KernelPCA((best_params['kpca__n_components']))), ('rf', rf)])

# Fit pipe_after_grid on data
bst = pipe_after_grid.fit(X_train, y_train)


#TEST PERFORMANCE
y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))

Best parameter (CV score=0.809):
{'kpca__n_components': 45, 'kpca__kernel': 'linear'}
<bound method BaseEstimator._get_param_names of <class 'sklearn.model_selection._search.RandomizedSearchCV'>>
0.8294573643410853
0.8974358974358975
0.8333333333333334
0.8114630467571644


# Pipeline with Gaussian Naive Bayes

## Pipeline PCA + NB

1. Scaler
2. Feature extraction: PCA
3. Classifier: NB 

In [21]:
# Define steps in pipeline
pca = PCA()
gnb = GaussianNB()
scaler = StandardScaler()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('pca', pca), ('gnb', gnb)])

# Define parameters for gridsearch: depending on which classifier
param_grid = {
    "pca__n_components": [5, 15, 30, 45, 60],
    
}
# Perform Grid Search on pipe
search = RandomizedSearchCV(pipe, param_grid, n_iter = 20, cv = 5)
#search = GridSearchCV(pipe, param_grid, n_jobs=2)
search.fit(X_train, y_train)

# Print outcome Grid Search
print("Best parameter (CV score=%0.3f):" % search.best_score_)
best_params = search.best_params_
print(best_params) 

pipe_after_grid = Pipeline([('scaler', scaler), ('pca', PCA((best_params['pca__n_components']))), ('gnb', gnb)])

# Fit pipe_after_grid on data
bst = pipe_after_grid.fit(X_train, y_train)


#TEST PERFORMANCE
y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))



Best parameter (CV score=0.805):
{'pca__n_components': 5}
0.751937984496124
0.8717948717948718
0.7555555555555555
0.720211161387632


## Pipeline LDA + NB

1. Scaler
2. Feature extraction: LDA
3. Classifier: NB 

In [23]:
# Define steps in pipeline
lda = LDA()
gnb = GaussianNB()
scaler = StandardScaler()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('lda', lda), ('gnb', gnb)])

# Define parameters for gridsearch: depending on which classifier
# param_grid = {
    
# }
# # Perform Grid Search on pipe
# search = RandomizedSearchCV(pipe, param_grid, n_iter = 20, cv = 5,
# scoring = 'roc_auc')
# search.fit(X_train, y_train)

# # Print outcome Grid Search
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# best_params = search.best_params_
# print(best_params) 

# pipe_after_grid = Pipeline([('scaler', scaler), ('lda', lda), ('svc', svc)])

# Fit pipe_after_grid on data
bst = pipe.fit(X_train, y_train)

#TEST PERFORMANCE
y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))

0.751937984496124
0.7692307692307693
0.8108108108108109
0.7473604826546003


## Pipeline kernel-PCA + NB

1. Scaler
2. Feature extraction: kernel-PCA
3. Classifier: NB 

In [24]:
# Define steps in pipeline
kpca = KernelPCA()
gnb = GaussianNB()
scaler = StandardScaler()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('kpca', kpca), ('gnb', gnb)])

# Define parameters for gridsearch: depending on which classifier
param_grid = {
    "kpca__n_components": [5, 15, 30, 45, 60],
    "kpca__kernel": ['linear', 'poly', 'rbf', 'sigmoid', 'cosine'],
}
# Perform Grid Search on pipe
search = RandomizedSearchCV(pipe, param_grid, n_iter = 20, cv = 5)
#scoring = 'roc_auc')
search.fit(X_train, y_train)

# Print outcome Grid Search
print("Best parameter (CV score=%0.3f):" % search.best_score_)
best_params = search.best_params_
print(best_params) 
print(search._get_param_names)
pipe_after_grid = Pipeline([('scaler', scaler), ('kpca', KernelPCA((best_params['kpca__n_components']))), ('gnb', gnb)])

# Fit pipe_after_grid on data
bst = pipe_after_grid.fit(X_train, y_train)

#TEST PERFORMANCE
y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))

Best parameter (CV score=0.832):
{'kpca__n_components': 45, 'kpca__kernel': 'cosine'}
<bound method BaseEstimator._get_param_names of <class 'sklearn.model_selection._search.RandomizedSearchCV'>>
0.7441860465116279
0.8846153846153846
0.7419354838709677
0.7070135746606334


# Pipeline with SVM

## Pipeline PCA + SVM

1. Scaler
2. Feature extraction: PCA
3. Classifier: SVM 

In [25]:
# Define steps in pipeline
pca = PCA()
svc = SVC()
scaler = StandardScaler()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('pca', pca), ('svc', svc)])

# Define parameters for gridsearch: depending on which classifier
param_grid = {
    "pca__n_components": [5, 15, 30, 45, 60],
    
}
# Perform Grid Search on pipe
search = GridSearchCV(pipe, param_grid, n_jobs=2)
search.fit(X_train, y_train)

# Print outcome Grid Search
print("Best parameter (CV score=%0.3f):" % search.best_score_)
best_params = search.best_params_
print(best_params) 

pipe_after_grid = Pipeline([('scaler', scaler), ('pca', PCA((best_params['pca__n_components']))), ('svc', svc)])

# Fit pipe_after_grid on data
bst = pipe_after_grid.fit(X_train, y_train)

y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))

Best parameter (CV score=0.818):
{'pca__n_components': 30}
0.8062015503875969
0.8717948717948718
0.8192771084337349
0.7888386123680241


# Pipeline LDA + SVM

1. Scaler
2. Feature extraction: LDA
3. Classifier: SVM 

In [26]:
# Define steps in pipeline
lda = LDA()
svc = SVC()
scaler = StandardScaler()

# Create pipeline with steps: scaler, PCA, classifier
pipe = Pipeline([('scaler', scaler), ('lda', lda), ('svc', svc)])

# Define parameters for gridsearch: depending on which classifier
# param_grid = {
    
# }
# # Perform Grid Search on pipe
# search = RandomizedSearchCV(pipe, param_grid, n_iter = 20, cv = 5,
# scoring = 'roc_auc')
# search.fit(X_train, y_train)

# # Print outcome Grid Search
# print("Best parameter (CV score=%0.3f):" % search.best_score_)
# best_params = search.best_params_
# print(best_params) 

# pipe_after_grid = Pipeline([('scaler', scaler), ('lda', lda), ('svc', svc)])

# Fit pipe_after_grid on data
bst = pipe.fit(X_train, y_train)

#TEST PERFORMANCE
y_pred = bst.predict(X_val)
y_val = (np.array(y_val))

print(bst.score(X_val, y_val))
print(recall_score(y_val, y_pred))
print(precision_score(y_val, y_pred))
print(roc_auc_score(y_val, y_pred))

0.751937984496124
0.7692307692307693
0.8108108108108109
0.7473604826546003
