# Testing presence of outliers

## Import Modules

In [1]:
# Import data module
from adni.load_data import load_data

# Import needed modules
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import KernelPCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

## Data loading and cleaning

Data loading

In [2]:
# Data loading 
data = load_data()
print(f'The number of samples: {len(data.index)}')
print(f'The number of columns: {len(data.columns)}')
df= pd.DataFrame(data)

# Reset index, add patient ID's as column
df.reset_index(inplace=True)
df = df.rename(columns = {'index':'ID'})

# Set AD to 1, CN to 0
df['label']= df['label'].replace(['AD'],1) 
df['label']= df['label'].replace(['CN'],0) 

# set seed

# display data frame
#display(df)

The number of samples: 855
The number of columns: 268


Data Cleaning

In [3]:
# Check wheter there is missing data (NaN)
df.notnull().values.any() # Geen missing data

# Als SD 0 dan feature weggooien
df_new = df.drop(df.std()[df.std() == 0].index.values, axis = 1)

print(f'The number of samples after cleaning + std: {len(df_new.index)}')
print(f'The number of columns after cleaning + std: {len(df_new.columns)}')

# Count number of duplicated patiient ID's
df.index.duplicated().sum() # ID's are indices in df
X = df.drop('ID', axis=1) # Drop patient ID)

The number of samples after cleaning + std: 855
The number of columns after cleaning + std: 262


  df_new = df.drop(df.std()[df.std() == 0].index.values, axis = 1)


## Data split in test, train and validation set 

Split data in test-set & train/validation-set

In [4]:
# Test / Train split: stratified op label --> nagaan of we dit ook willen
y = df['label'] # Define label y (output)

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, stratify = X['label'])
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, stratify = X_train['label'])

# Test of het gelukt is 
# print(len(X_train))
# print(len(X_test))
# print(sum(X_test['label']=='AD'))
# print(sum(X_train['label']=='AD'))

# Drop labels and drop patient ID
X_train = X_train.drop('label', axis=1)
#X_val = X_val.drop('label', axis=1)
X_test = X_test.drop('label', axis=1)
X = X.drop('label', axis=1)

# Check outliers

In [5]:
print(X_train.skew())
X_train.describe()

hf_energy                                             1.993195
hf_entropy                                            0.255744
hf_kurtosis                                           0.008959
hf_max                                                0.218898
hf_mean                                               0.030236
                                                        ...   
vf_Frangi_inner_peak_SR(1.0, 10.0)_SS2.0              3.905838
vf_Frangi_inner_quartile_range_SR(1.0, 10.0)_SS2.0    0.000000
vf_Frangi_inner_range_SR(1.0, 10.0)_SS2.0             0.000000
vf_Frangi_inner_skewness_SR(1.0, 10.0)_SS2.0          1.053378
vf_Frangi_inner_std_SR(1.0, 10.0)_SS2.0               0.000000
Length: 267, dtype: float64


Unnamed: 0,hf_energy,hf_entropy,hf_kurtosis,hf_max,hf_mean,hf_median,hf_min,hf_peak,hf_quartile_range,hf_range,...,"vf_Frangi_inner_kurtosis_SR(1.0, 10.0)_SS2.0","vf_Frangi_inner_max_SR(1.0, 10.0)_SS2.0","vf_Frangi_inner_mean_SR(1.0, 10.0)_SS2.0","vf_Frangi_inner_median_SR(1.0, 10.0)_SS2.0","vf_Frangi_inner_min_SR(1.0, 10.0)_SS2.0","vf_Frangi_inner_peak_SR(1.0, 10.0)_SS2.0","vf_Frangi_inner_quartile_range_SR(1.0, 10.0)_SS2.0","vf_Frangi_inner_range_SR(1.0, 10.0)_SS2.0","vf_Frangi_inner_skewness_SR(1.0, 10.0)_SS2.0","vf_Frangi_inner_std_SR(1.0, 10.0)_SS2.0"
count,641.0,641.0,641.0,641.0,641.0,641.0,641.0,641.0,641.0,641.0,...,641.0,641.0,641.0,641.0,641.0,641.0,641.0,641.0,641.0,641.0
mean,3169.121247,4.721676,1.749049,1.810302,1.134416,1.214417,0.076787,303.820593,0.508057,1.733515,...,1.93808,1.288356e-09,4.643172e-10,3.768559e-10,1.111773e-10,16.293292,3.797255e-10,1.177178e-09,1.159847,3.303358e-10
std,2808.675495,0.318351,1.853399,0.275969,0.300527,0.29009,0.410774,96.883067,0.296578,0.362175,...,5.528944,1.089748e-09,4.039252e-10,3.723624e-10,1.931836e-10,24.849134,3.507204e-10,1.049088e-09,1.040008,2.792191e-10
min,59.6424,4.044018,-1.556002,1.115581,0.159041,0.048083,-0.738934,21.0,0.205472,0.790367,...,-3.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-1.228925,0.0
25%,1304.153514,4.464182,0.222601,1.61338,0.94571,1.0478,-0.253151,229.0,0.312355,1.453964,...,-0.873328,6.116668e-10,1.877611e-10,1.043949e-10,1.723063e-21,2.0,1.602746e-10,4.780048e-10,0.381527,1.578897e-10
50%,2173.812465,4.678895,1.96954,1.800337,1.131071,1.211321,0.016415,305.0,0.377201,1.739557,...,0.614541,1.055589e-09,3.7823e-10,2.908824e-10,1.288314e-11,8.0,3.134697e-10,9.274076e-10,1.061919,2.767091e-10
75%,3884.882249,4.972635,3.040678,2.000854,1.326839,1.39043,0.3657,373.0,0.569189,1.98935,...,2.880313,1.652591e-09,6.335517e-10,5.293412e-10,1.60447e-10,20.0,5.0541e-10,1.563307e-09,1.725167,4.290602e-10
max,15722.331725,5.402261,7.842693,2.790874,2.131128,2.141425,1.373523,636.0,1.578203,2.669252,...,64.852716,8.2118e-09,3.071739e-09,2.844013e-09,1.641528e-09,236.0,2.751437e-09,7.857531e-09,6.723008,2.092181e-09
