In [36]:
import pandas as pd
# Load the pre-processed table; row 0 of the CSV supplies the column names.
df = pd.read_csv(filepath_or_buffer="OUTCOME_DIAGNOSIS_processed.csv", header=0)

- Columns consisting entirely of NAs were already dropped via `df.dropna(how='all',axis=1,inplace=True)`
- In addition, features with more than 30% NAs were dropped via
  ```
  perc = 30.0
  min_count =  int(((100-perc)/100)*df.shape[0] + 1)
  df = df.dropna( axis=1, thresh=min_count)
  ```

In [37]:
# Dimensions (rows, columns) of the table as loaded.
df.shape

(1648, 623)

In [38]:
# Remove, in place, every column that holds no observed value at all.
df.dropna(axis="columns", how="all", inplace=True)

In [39]:
# Maximum tolerated share of missing values per column, in percent.
perc = 30.0

# Minimum number of non-missing values a column needs in order to be kept.
# NOTE: because of the "+ 1", a column with exactly `perc`% missing is also
# dropped whenever (100 - perc)% of the row count is a whole number.
min_count = int(((100 - perc) / 100) * len(df) + 1)

df = df.dropna(axis="columns", thresh=min_count)

In [40]:
# Dimensions after the NaN-based column filters in the preceding cells.
df.shape

(1648, 623)

### Feature selection with the SelectKBest method
___
Pick the 60 best features out of all the variables included in the dataset.

In [41]:
# Rows with no value in the target column (ZCO) cannot be used; every other
# missing value is left in place for now. Renumber the rows afterwards.
df.dropna(axis="index", subset=["ZCO"], inplace=True)
df.reset_index(inplace=True, drop=True)

# ZCO is the target. It and the five other Z* columns are excluded from the
# feature matrix.
Y = df["ZCO"]
X = df.drop(columns=["ZCO", "ZME", "ZEX", "ZVS", "ZLA", "ZAS"])

In [43]:
import numpy as np
# Absolute pairwise correlations between all feature columns.
cor_matrix = X.corr().abs()

# Keep only the strict upper triangle so each pair is looked at once and the
# diagonal (self-correlation) is ignored; everything else becomes NaN.
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape, dtype=bool), k=1))

# A column is dropped when its absolute correlation with any earlier column
# exceeds 0.95 (NaN compares as False, so masked cells never trigger a drop).
to_drop = upper_tri.columns[(upper_tri > 0.95).any(axis=0)].tolist()

X.drop(columns=to_drop, inplace=True)

In [None]:
# Split the feature columns by value type so each group gets a suitable
# imputation. A column counts as discrete only when every observed
# (non-missing) value is a whole number. Testing the column *sum* instead would
# misclassify fractional columns whose values happen to add up to an integer
# (e.g. 0.5 + 1.5), and is sensitive to float rounding in the summation.
discrete = []
continues = []  # continuous columns (name kept unchanged for any later cells)

for col in X.columns:
    observed = X[col].dropna()
    if (observed % 1 == 0).all():
        discrete.append(col)
    else:
        continues.append(col)

# Impute discrete columns with their most frequent value and continuous columns
# with their mean. The guards skip an empty group: `mode().iloc[0]` raises
# IndexError when there is nothing to take the mode of.
if discrete and not X.empty:
    X.fillna(X[discrete].mode().iloc[0], inplace=True)
if continues:
    X.fillna(X[continues].mean(), inplace=True)

Unnamed: 0.1,Unnamed: 0,A0,A1b,A12b,B2,B6,B7,B10,B11,B12,...,frontaltestB1s,newexecutive1,SexC,EduC,MedDietScore,WEIGHT,HEIGHT,WAISTCIRCUMFERENCE,BMI,AgeC
0,0,1.0,1.0,4.0,4.0,1.0,12.0,0.0,2.0,0.0,...,20.0,1.0,1.0,8.0,34.0,77.0,165.0,101.0,28.28,76.0
1,1,1.0,0.0,10.0,5.0,1.0,0.0,0.0,7.0,2.0,...,18.0,1.0,1.0,6.0,38.0,80.9,157.0,115.0,32.82,83.0
2,2,1.0,0.0,10.0,0.0,3.0,0.0,6.0,3.0,2.0,...,20.0,1.0,2.0,7.0,38.0,63.4,147.0,98.0,29.34,87.0
3,4,1.0,0.0,23.0,1.0,1.0,0.0,0.0,3.0,1.0,...,18.0,1.0,1.0,3.0,38.0,84.6,184.0,104.0,24.99,78.0
4,5,1.0,0.0,23.0,4.0,1.0,0.0,0.0,3.0,1.0,...,20.0,1.0,2.0,4.0,43.0,85.8,165.0,109.0,31.52,73.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1612,1643,1.0,1.0,28.0,0.0,1.0,1.0,0.0,3.0,1.0,...,18.0,1.0,2.0,6.0,31.0,84.7,170.0,103.0,29.31,67.0
1613,1644,1.0,1.0,5.0,0.0,1.0,0.0,0.0,1.0,1.0,...,19.0,1.0,2.0,6.0,37.0,82.4,165.0,109.0,30.27,72.0
1614,1645,1.0,1.0,59.0,0.0,1.0,0.0,0.0,1.0,1.0,...,20.0,1.0,1.0,13.0,22.0,77.6,155.0,108.0,32.30,70.0
1615,1646,1.0,1.0,59.0,1.0,1.0,1.0,4.0,2.0,1.0,...,12.0,1.0,2.0,6.0,27.0,86.6,150.0,112.0,38.49,69.0


In [45]:
from sklearn.feature_selection import SelectKBest, f_regression

# Number of top-scoring features to keep.
no_of_features = 60

# Score every column of X against Y with f_regression and retain the
# `no_of_features` highest-scoring ones.
select_univariate = SelectKBest(score_func=f_regression, k=no_of_features)
select_univariate.fit(X, Y)

# Boolean mask over X's columns: True where the column was selected.
features_mask = select_univariate.get_support()

selected_columns = X.columns[features_mask]
selected_features = X.loc[:, selected_columns]

In [46]:
# Names of the selected feature columns, in their original column order.
list(selected_features.columns)

['B10',
 'F12',
 'F13',
 'F14',
 'F15',
 'G1',
 'G2',
 'G3',
 'G4',
 'G5',
 'G6',
 'G6_TOTAL',
 'G21',
 'minimental30',
 'MMSE28',
 'MMSE29',
 'll1cor1',
 'll2cor1',
 'll2sim1',
 'll3cor1',
 'll3sim1',
 'll4cor1',
 'll4sim1',
 'll5cor1',
 'll5sim1',
 'Sum1_5correct1st',
 'llkampilimathisis1',
 'll6cor1',
 'll6sim1',
 'll7cor1',
 'll7sim1',
 'll8cor1',
 'll9cor1',
 'll9sim1',
 'll10cor1',
 'llta1',
 'Recognrecalldiscr',
 'Encodingdefindex1',
 'figurecopy',
 'figureimmediate',
 'figuredelayed',
 'ST_D_2_immedpercentile1st',
 'jlo1stdiades',
 'jloolesmax20',
 'TMTA_tmtAtime1',
 'TMTA_tmtAerr1',
 'vfsem1stobj',
 'vfsemobjswich',
 'vfphon1stalphobj',
 'vfphaswi',
 'naming1st',
 'comphr1st',
 'repetition1st',
 'sentrepeatcorrect1',
 'sentrepeattargetr',
 'grafseqcorrect1',
 'frontaltestA1st',
 'frontaltestB1s',
 'EduC',
 'AgeC']