## Load Dataset

In [None]:
# Mount Google Drive inside the Colab VM; the next cell copies the dataset pickle from it.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Copy the dataset pickle from the mounted Drive into /content/. The loading cell opens it by its
# bare file name, so this assumes the kernel's working directory is /content -- TODO confirm.
%cp '/content/gdrive/MyDrive/ml_project_dataset.pkl' '/content/'

In [None]:
# pickle5 is a back-port of pickle protocol 5 for older Python versions; presumably the dataset was
# written with protocol 5 -- TODO confirm. The install is unpinned; pin a version for reproducibility.
!pip install pickle5

In [None]:
try:
  # pickle5 back-ports pickle protocol 5 to older interpreters; prefer it when it is installed.
  import pickle5 as pickle
except ImportError:
  # Fall back to the standard library, which reads protocol 5 natively on Python >= 3.8.
  import pickle
import numpy as np

# NOTE: unpickling can execute arbitrary code embedded in the file -- only load a pickle that
# comes from a trusted source.
with open('ml_project_dataset.pkl', "rb") as fh:
  data = pickle.load(fh)

In [None]:
import re
# Normalise the free-text label columns and encode them as integer category codes.
# NOTE: not idempotent -- after this cell both columns hold integer codes, so re-running it
# without reloading `data` fails on the `.str` accessor.

# Gender: a leading word starting with f/F becomes 'f', one starting with m/M becomes 'm'.
# Raw-string patterns avoid the invalid "\w" escape sequence in a normal string literal.
data['Gender'] = (
    data['Gender']
    .str.replace(r'^[fF]\w*', 'f', regex=True)
    .str.replace(r'^[mM]\w*', 'm', regex=True)
)

# Status: collapse any value starting with s/h/n to that single lowercase letter.
# Values that start with a digit are treated as corrupted: they are tagged with the
# 'cropted' sentinel, turned into NaN, and their rows are dropped.
# The column is assigned back explicitly instead of calling replace(..., inplace=True) on
# `data.Status`, because an in-place edit of that intermediate Series is not guaranteed to
# reach `data` (it does not under pandas Copy-on-Write).
data['Status'] = (
    data['Status']
    .str.replace(r'^[sS]+.*', 's', regex=True)
    .str.replace(r'^[hH]+.*', 'h', regex=True)
    .str.replace(r'^[nN]+.*', 'n', regex=True)
    .str.replace(r'^\d+.*', 'cropted', regex=True)
    .replace('cropted', np.nan)
)
data.dropna(subset=['Status'], inplace=True)

# Integer codes follow the sorted order of the remaining distinct labels.
data['Gender'] = data['Gender'].astype('category').cat.codes
data['Status'] = data['Status'].astype('category').cat.codes

In [None]:
# One working frame per feature family (f1..f5): the three label columns plus that family's
# vector column, copied so later edits do not touch `data`.
data_f1 = data.loc[:, ['Age', 'Gender', 'Status', 'f1']].copy()
data_f2 = data.loc[:, ['Age', 'Gender', 'Status', 'f2']].copy()
data_f3 = data.loc[:, ['Age', 'Gender', 'Status', 'f3']].copy()
data_f4 = data.loc[:, ['Age', 'Gender', 'Status', 'f4']].copy()
data_f5 = data.loc[:, ['Age', 'Gender', 'Status', 'f5']].copy()

In [None]:
# For every feature family: a vector whose first element is NaN counts as missing, so the cell
# is replaced by NaN, the row is dropped, and the surviving rows are renumbered from 0.
for _frame, _col in (
    (data_f1, 'f1'),
    (data_f2, 'f2'),
    (data_f3, 'f3'),
    (data_f4, 'f4'),
    (data_f5, 'f5'),
):
  _frame[_col] = _frame[_col].apply(lambda vec: np.nan if np.isnan(vec[0]) else vec)
  _frame.dropna(subset=[_col], inplace=True)
  _frame.reset_index(drop=True, inplace=True)

In [None]:
# NOTE(review): imbalanced-learn is not imported by any cell visible in this part of the notebook;
# presumably a later section needs it -- TODO confirm, and pin a version for reproducibility.
!pip install -U imbalanced-learn

## Ranking Dataset

In [None]:
import pandas as pd
import numpy as np

def make_dataset(X,feature_set):
  """Expand a column of 1-D vectors into a wide numeric DataFrame.

  Parameters
  ----------
  X : pandas.DataFrame
      Frame containing the column named by ``feature_set``. Only that column is read.
  feature_set : str
      Name of the column whose cells are 1-D array-likes.

  Returns
  -------
  pandas.DataFrame
      One row per row of ``X`` (fresh 0..n-1 index) and one float64 column per vector
      element, labelled ``0 .. width-1``. ``width`` is the length of the first vector;
      longer vectors are cut to that width. An empty frame is returned when ``X`` has
      no rows.

  Raises
  ------
  ValueError
      If a vector is shorter than the first one.
  """
  x = X[feature_set].values
  if len(x) == 0:
    # Nothing to expand; the width cannot be read from a first vector that does not exist.
    return pd.DataFrame()

  width = np.shape(x[0])[0]
  # Stack all vectors in one call instead of copying element by element in Python.
  u = np.stack([np.asarray(row, dtype=np.float64)[:width] for row in x])
  return pd.DataFrame(u, columns=np.arange(width))

In [None]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from warnings import filterwarnings
# NOTE(review): this silences every warning for the rest of the session, which can also hide
# genuine problems reported by the libraries used below -- consider narrowing the filter.
filterwarnings('ignore')
# Cut-off on the 0-100 normalized importance score; the analysis cells keep features above it.
importance = 40

def get_feature_importance(df,y_target):
  """Score each feature by |linear-regression coefficient| x feature standard deviation.

  An ordinary least-squares model is fitted on all columns of ``df`` at once. The score of a
  column is the absolute value of its coefficient multiplied by the column's sample standard
  deviation, rescaled so that the largest score equals 100.

  Parameters
  ----------
  df : pandas.DataFrame
      Numeric feature matrix, one column per feature.
  y_target : array-like
      Target values, one per row of ``df``.

  Returns
  -------
  pandas.Series
      ``importance_normalized`` indexed by ``df.columns``. All values are NaN when every
      raw score is zero (division by a zero maximum).
  """
  regressor = LinearRegression()
  regressor.fit(df,y_target)

  features = pd.DataFrame(regressor.coef_,df.columns,columns=['coefficient'])
  features['coefficient'] = features['coefficient'].abs()
  # Sample standard deviation of every column in one call; passed positionally (column order)
  # so it lines up with the coefficients regardless of index labels.
  features['stdev'] = df.std().to_numpy()
  features['importance'] = features['coefficient'] * features['stdev']
  features['importance_normalized'] = 100*features['importance'] / features['importance'].max()
  return features.importance_normalized

In [None]:
from sklearn.feature_selection import mutual_info_classif
import matplotlib.pyplot as plt

def feature_importance_mutual(x_features, y_target, discrete_features=False, decimals=1, random_state=0):
  """Estimate the mutual information between each feature column and a class label.

  Parameters
  ----------
  x_features : array-like of shape (n_samples, n_features)
      Feature matrix.
  y_target : array-like of shape (n_samples,)
      Class labels.
  discrete_features : bool, 'auto' or array-like, default False
      Forwarded to ``mutual_info_classif``. The default treats the columns as continuous.
      Declaring continuous columns as discrete turns every distinct value into its own
      category, so each feature receives (about) the same score -- the entropy of the
      labels -- and the ranking carries no information.
  decimals : int, default 1
      Number of decimals the scores are rounded to.
  random_state : int or None, default 0
      Seed forwarded to ``mutual_info_classif`` so repeated runs give the same scores.

  Returns
  -------
  numpy.ndarray
      Rounded mutual-information score per feature, in column order.
  """
  gain = mutual_info_classif(
      x_features,
      y_target,
      discrete_features=discrete_features,
      random_state=random_state,
  )
  return np.round(gain, decimals)

In [None]:
def find_best_pca_n(x, threshold=0.9):
  """Report how many principal components are needed to explain ``threshold`` of the variance.

  Parameters
  ----------
  x : array-like of shape (n_samples, n_features)
      Data to decompose with a full PCA.
  threshold : float, default 0.9
      Fraction of total variance the kept components must strictly exceed.

  Returns
  -------
  int
      Smallest number of leading components whose cumulative explained-variance ratio is
      greater than ``threshold``. If no prefix exceeds it, all components are reported.
      The same number is printed.
  """
  # Imported here because the notebook-level PCA import lives in a cell further down, so
  # the function does not depend on that cell having run first.
  from sklearn.decomposition import PCA

  pca = PCA()
  pca.fit(x)
  cumulative = np.cumsum(pca.explained_variance_ratio_)

  over = np.flatnonzero(cumulative > threshold)
  # `over[0]` is a 0-based position; the number of components up to and including it is +1.
  n_components = int(over[0]) + 1 if over.size else int(cumulative.size)

  print('PCA : Optimum Feature number is %d\n'%n_components)
  return n_components

## F1

### Linear importance

In [None]:
# Linear-model importance of each f1 component for Gender; report those above `importance`.
X1 = make_dataset(data_f1.drop(columns=['Age', 'Gender', 'Status']), 'f1')
x1_reduced = get_feature_importance(X1, data_f1['Gender'])
x1_selected = x1_reduced[x1_reduced > importance]
print(x1_selected)
print("Label : Gender")
print("number of all features: ", X1.shape[1])
print("number of selected features : ", x1_selected.shape[0])

1      40.662589
10     48.325035
14     70.568047
20     48.920700
26     51.094370
         ...    
500    49.581685
501    65.867406
503    44.494662
504    51.048309
509    84.286308
Name: importance_normalized, Length: 76, dtype: float64
Label : Gender
number of all features:  512
number of selected features :  76


In [None]:
# Linear-model importance of each f1 component for Age; report those above `importance`.
X1 = make_dataset(data_f1.drop(columns=['Age', 'Gender', 'Status']), 'f1')
x1_reduced = get_feature_importance(X1, data_f1['Age'])
x1_selected = x1_reduced[x1_reduced > importance]
print(x1_selected)
print("Label : Age")
print("number of all features: ", X1.shape[1])
print("number of selected features : ", x1_selected.shape[0])

6      48.803350
8      42.529093
11     60.778654
12     51.561685
14     76.921195
         ...    
491    62.772495
492    77.168452
496    50.461343
497    52.283782
507    44.867593
Name: importance_normalized, Length: 107, dtype: float64
Label : Age
number of all features:  512
number of selected features :  107


In [None]:
# Linear-model importance of each f1 component for Status; report those above `importance`.
X1 = make_dataset(data_f1.drop(columns=['Age', 'Gender', 'Status']), 'f1')
x1_reduced = get_feature_importance(X1, data_f1['Status'])
x1_selected = x1_reduced[x1_reduced > importance]
print(x1_selected)
print("Label : Status")
print("number of all features: ", X1.shape[1])
print("number of selected features : ", x1_selected.shape[0])

4      75.183874
6      43.933728
16     47.449897
17     50.130834
20     49.851995
         ...    
471    46.983496
472    43.700416
476    46.034371
481    45.115978
489    52.964279
Name: importance_normalized, Length: 95, dtype: float64
Label : Status
number of all features:  512
number of selected features :  95


### Mutual Information

In [None]:
# Mutual-information score of every f1 component against Status, then a tally of the scores.
X1 = make_dataset(data_f1.drop(columns=['Age','Gender','Status']),'f1')
gains = feature_importance_mutual(X1,data_f1.Status)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.016, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017, 1.017

{1.016: 6, 1.017: 506}

In [None]:
# Mutual-information score of every f1 component against Gender, then a tally of the scores.
X1 = make_dataset(data_f1.drop(columns=['Age','Gender','Status']),'f1')
gains = feature_importance_mutual(X1,data_f1.Gender)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.64, 0.6

{0.639: 3, 0.64: 509}

In [None]:
# Mutual-information score of every f1 component against Age, then a tally of the scores.
X1 = make_dataset(data_f1.drop(columns=['Age','Gender','Status']),'f1')
gains = feature_importance_mutual(X1,data_f1.Age)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.373, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.373, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374, 3.374

{3.373: 3, 3.374: 509}

### LDA - PCA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

# LDA projection of the f1 features supervised by Status, then the PCA component count
# for the same matrix (the PCA step does not use the label).
X1 = make_dataset(data_f1.drop(columns=['Age', 'Gender', 'Status']), 'f1')
clf = LinearDiscriminantAnalysis()
x1_reduced = clf.fit_transform(X1, data_f1['Status'])

print('Label : Status\nLDA features :', x1_reduced.shape)
find_best_pca_n(X1)

Label : Status
LDA features : (2242, 2)
PCA : Optimum Feature number is 34



In [None]:
# LDA projection of the f1 features supervised by Gender, then the PCA component count
# for the same matrix (the PCA step does not use the label).
X1 = make_dataset(data_f1.drop(columns=['Age', 'Gender', 'Status']), 'f1')
clf = LinearDiscriminantAnalysis()
x1_reduced = clf.fit_transform(X1, data_f1['Gender'])

print('Label : Gender\nLDA features :', x1_reduced.shape)
find_best_pca_n(X1)

Label : Gender
LDA features : (2242, 1)
PCA : Optimum Feature number is 34



In [None]:
# LDA projection of the f1 features supervised by Age, then the PCA component count
# for the same matrix (the PCA step does not use the label).
X1 = make_dataset(data_f1.drop(columns=['Age', 'Gender', 'Status']), 'f1')
clf = LinearDiscriminantAnalysis()
x1_reduced = clf.fit_transform(X1, data_f1['Age'])

print('Label : Age\nLDA features :', x1_reduced.shape)
find_best_pca_n(X1)

Label : Age
LDA features : (2242, 46)
PCA : Optimum Feature number is 34



## F2

### Linear importance

In [None]:
# Linear-model importance of each f2 component for Status; report those above `importance`.
X2 = make_dataset(data_f2.drop(columns=['Age', 'Gender', 'Status']), 'f2')
x2_reduced = get_feature_importance(X2, data_f2['Status'])
x2_selected = x2_reduced[x2_reduced > importance]
print(x2_selected)
print("Label : Status")
print("number of all features: ", X2.shape[1])
print("number of selected features : ", x2_selected.shape[0])

19       42.771318
99       45.772786
118      49.705941
235      42.501695
279      47.356652
281      57.414064
296      40.589908
435      45.478049
479      54.740663
487      96.201562
494      40.651712
525      79.901843
611      43.278133
647      53.906825
655      40.773030
680      40.932826
733      40.633748
765      60.076580
894      70.009682
937      74.221837
1019     42.767756
1045     51.106328
1206     42.786445
1246     41.888191
1272    100.000000
Name: importance_normalized, dtype: float64
Label : Status
number of all features:  1536
number of selected features :  25


In [None]:
# Linear-model importance of each f2 component for Gender; report those above `importance`.
X2 = make_dataset(data_f2.drop(columns=['Age', 'Gender', 'Status']), 'f2')
x2_reduced = get_feature_importance(X2, data_f2['Gender'])
x2_selected = x2_reduced[x2_reduced > importance]
print(x2_selected)
print("Label : Gender")
print("number of all features: ", X2.shape[1])
print("number of selected features : ", x2_selected.shape[0])

9        43.107276
118      43.143900
130      40.428187
172      47.068188
208      40.089903
209      42.544183
236      40.411353
310      45.731388
345      53.999840
395      91.210761
415      63.129689
464      56.274301
494      66.335289
506      44.493626
579      46.555035
647      41.949976
662      50.069940
682      49.212857
808      70.137080
809      48.032532
850      48.406977
852      43.694273
855     100.000000
869      44.616266
958      55.189305
1017     41.488404
1045     53.792421
1053     57.800565
1097     73.582526
1163     41.211611
1164     58.164578
1206     54.938813
1320     93.167294
1398     53.620148
Name: importance_normalized, dtype: float64
Label : Gender
number of all features:  1536
number of selected features :  34


In [None]:
# Linear-model importance of each f2 component for Age; report those above `importance`.
X2 = make_dataset(data_f2.drop(columns=['Age', 'Gender', 'Status']), 'f2')
x2_reduced = get_feature_importance(X2, data_f2['Age'])
x2_selected = x2_reduced[x2_reduced > importance]
print(x2_selected)
print("Label : Age")
print("number of all features: ", X2.shape[1])
print("number of selected features : ", x2_selected.shape[0])

19       47.624608
28       40.749263
73       52.560624
98       64.329901
130      78.443200
212      47.297545
279      62.828294
345      53.989500
395      57.990453
427      44.928349
482      58.983685
486      43.210920
494      52.616096
505      40.173026
506      56.545210
508      41.351704
518      65.349953
588      96.826420
623      64.237580
629      61.284301
636      48.366569
647      80.707749
655      56.833210
749      43.902808
765      46.978851
788      51.015536
824      59.244772
850      43.992198
863      53.281752
872      62.339834
890      62.334366
951      41.312186
1034     47.172461
1053     46.121717
1077     50.814073
1150     42.873956
1152     40.138196
1189    100.000000
1206     45.513065
1250     54.774162
1272     73.389774
1279     49.958768
1286     54.803167
1320     48.229315
1359     64.468089
1424     72.157790
1483     41.367267
1500     68.246939
Name: importance_normalized, dtype: float64
Label : Age
number of all features:  1536
nu

### Mutual Information

In [None]:
# Mutual-information score of every f2 component against Status, then a tally of the scores.
X2 = make_dataset(data_f2.drop(columns=['Age','Gender','Status']),'f2')
gains = feature_importance_mutual(X2,data_f2.Status)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031

{1.03: 6, 1.031: 1530}

In [None]:
# Mutual-information score of every f2 component against Gender, then a tally of the scores.
X2 = make_dataset(data_f2.drop(columns=['Age','Gender','Status']),'f2')
gains = feature_importance_mutual(X2,data_f2.Gender)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.618, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619

{0.618: 8, 0.619: 1528}

In [None]:
# Mutual-information score of every f2 component against Age, then a tally of the scores.
X2 = make_dataset(data_f2.drop(columns=['Age','Gender','Status']),'f2')
gains = feature_importance_mutual(X2,data_f2.Age)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.354, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355

{3.354: 16, 3.355: 1520}

### LDA - PCA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

# LDA projection of the f2 features supervised by Status, then the PCA component count
# for the same matrix (the PCA step does not use the label).
X2 = make_dataset(data_f2.drop(columns=['Age', 'Gender', 'Status']), 'f2')
clf = LinearDiscriminantAnalysis()
x2_reduced = clf.fit_transform(X2, data_f2['Status'])

print('Label : Status\nLDA features :', x2_reduced.shape)
find_best_pca_n(X2)

Label : Status
LDA features : (2283, 2)
PCA : Optimum Feature number is 28



In [None]:
# LDA projection of the f2 features supervised by Age, then the PCA component count
# for the same matrix (the PCA step does not use the label).
X2 = make_dataset(data_f2.drop(columns=['Age', 'Gender', 'Status']), 'f2')
clf = LinearDiscriminantAnalysis()
x2_reduced = clf.fit_transform(X2, data_f2['Age'])

print('Label : Age\nLDA features :', x2_reduced.shape)
find_best_pca_n(X2)

Label : Age
LDA features : (2283, 46)
PCA : Optimum Feature number is 28



In [None]:
# LDA projection of the f2 features supervised by Gender, then the PCA component count
# for the same matrix (the PCA step does not use the label).
X2 = make_dataset(data_f2.drop(columns=['Age', 'Gender', 'Status']), 'f2')
clf = LinearDiscriminantAnalysis()
x2_reduced = clf.fit_transform(X2, data_f2['Gender'])

print('Label : Gender\nLDA features :', x2_reduced.shape)
find_best_pca_n(X2)

Label : Gender
LDA features : (2283, 1)
PCA : Optimum Feature number is 28



## F3

### Linear importance

In [None]:
# Linear-model importance of each f3 component for Age; report those above `importance`.
X3 = make_dataset(data_f3.drop(columns=['Age', 'Gender', 'Status']), 'f3')
x3_reduced = get_feature_importance(X3, data_f3['Age'])
x3_selected = x3_reduced[x3_reduced > importance]
print(x3_selected)
print("Label : Age")
print("number of all features: ", X3.shape[1])
print("number of selected features : ", x3_selected.shape[0])

54       69.865301
79       77.662362
136      41.206079
209      49.288022
292      52.305740
300      68.890913
303      40.877035
307      44.141173
322      41.795982
350      76.137983
351      59.121589
358      51.159178
468      42.448353
504      43.776041
510      44.863409
579      55.392225
592      56.843318
615      72.002524
642      40.175018
679      41.871385
704      95.388409
708      42.960507
765      50.870214
974      57.891202
1045     58.878541
1055     79.258467
1094     68.681599
1111     46.893010
1187     44.352266
1199     52.468725
1245     50.329595
1255     40.925838
1303     47.760363
1339     66.048928
1351     75.421641
1373     49.468344
1501     90.008792
1513     45.906132
1548     51.490429
1591     48.291423
1716     42.990606
1726     46.822229
1832     43.428805
1842     73.931803
1850     43.292548
1863     60.231462
1894     79.044718
1912    100.000000
1921     56.549945
1953     51.104189
1958     83.264685
1999     61.967709
2055     44.

In [None]:
# Linear-model importance of each f3 component for Gender; report those above `importance`.
X3 = make_dataset(data_f3.drop(columns=['Age', 'Gender', 'Status']), 'f3')
x3_reduced = get_feature_importance(X3, data_f3['Gender'])
x3_selected = x3_reduced[x3_reduced > importance]
print(x3_selected)
print("Label : Gender")
print("number of all features: ", X3.shape[1])
print("number of selected features : ", x3_selected.shape[0])

91       79.391024
114      49.399559
253      42.064377
307      46.453180
321      45.901497
337      50.514899
342      45.196420
463      74.138048
482      44.322751
489      47.501237
510      69.342125
592      60.554105
616      72.031024
627      80.178028
639      45.316166
800      64.679179
825      43.396843
858      59.368678
907      48.111347
970      52.798432
980      47.680880
995      42.322096
1039     54.336649
1102     78.733079
1187     46.764684
1237     55.104565
1247     67.701675
1330     45.015784
1339     43.646039
1469     50.269603
1511     43.829031
1545     45.648616
1554     46.534441
1591     43.567140
1615     53.745926
1734     44.333219
1806     43.207394
1856     66.380817
1882     49.562456
2116     75.470051
2139     75.504331
2157     46.525157
2181     61.512471
2204     64.390094
2215    100.000000
2293     57.747588
Name: importance_normalized, dtype: float64
Label : Gender
number of all features:  2304
number of selected features :  46


In [None]:
# Linear-model importance of each f3 component for Status; report those above `importance`.
X3 = make_dataset(data_f3.drop(columns=['Age', 'Gender', 'Status']), 'f3')
x3_reduced = get_feature_importance(X3, data_f3['Status'])
x3_selected = x3_reduced[x3_reduced > importance]
print(x3_selected)
print("Label : Status")
print("number of all features: ", X3.shape[1])
print("number of selected features : ", x3_selected.shape[0])

94       56.496407
158      40.068855
178      57.962982
209      53.444372
297      81.374927
331      56.768407
350      82.192437
468      57.313971
538      54.386329
592      41.035154
594      46.077544
661      40.228730
763      51.590159
771      50.344487
800      60.129043
822      43.450002
855      44.300462
908      41.264367
914      63.124484
932      44.019114
1229     44.919515
1237     68.550490
1245     44.444360
1339    100.000000
1345     63.394138
1373     72.625054
1584     41.172464
1591     66.166444
1611     41.845179
1615     64.980121
1801     41.686816
1861     55.329944
1958     42.525410
2116     45.324351
2136     51.165010
2139     51.816987
2148     78.788281
Name: importance_normalized, dtype: float64
Label : Status
number of all features:  2304
number of selected features :  37


### Mutual Information

In [None]:
# Mutual-information score of every f3 component against Age, then a tally of the scores.
X3 = make_dataset(data_f3.drop(columns=['Age','Gender','Status']),'f3')
gains = feature_importance_mutual(X3,data_f3.Age)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[3.354, 3.355, 3.355, 3.355, 3.352, 3.355, 3.354, 3.355, 3.355, 3.353, 3.355, 3.355, 3.355, 3.354, 3.355, 3.355, 3.355, 3.355, 3.355, 3.354, 3.355, 3.355, 3.354, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.354, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.354, 3.355, 3.355, 3.355, 3.355, 3.354, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.307, 3.355, 3.354, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.354, 3.355, 3.355, 3.353, 3.355, 3.354, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.355, 3.354, 3.354, 3.355

{3.06: 1,
 3.2: 1,
 3.21: 1,
 3.29: 1,
 3.305: 1,
 3.306: 1,
 3.307: 1,
 3.333: 1,
 3.336: 1,
 3.337: 2,
 3.338: 1,
 3.339: 1,
 3.34: 1,
 3.342: 3,
 3.343: 1,
 3.346: 4,
 3.349: 2,
 3.35: 4,
 3.351: 3,
 3.352: 12,
 3.353: 43,
 3.354: 189,
 3.355: 2029}

In [None]:
# Mutual-information score of every f3 component against Gender, then a tally of the scores.
X3 = make_dataset(data_f3.drop(columns=['Age','Gender','Status']),'f3')
gains = feature_importance_mutual(X3,data_f3.Gender)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[0.618, 0.619, 0.619, 0.619, 0.618, 0.619, 0.619, 0.619, 0.619, 0.618, 0.619, 0.619, 0.619, 0.618, 0.619, 0.619, 0.619, 0.619, 0.619, 0.618, 0.619, 0.619, 0.618, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.618, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.591, 0.619, 0.618, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.617, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619, 0.619

{0.514: 1,
 0.556: 1,
 0.557: 1,
 0.59: 1,
 0.591: 1,
 0.592: 1,
 0.604: 1,
 0.609: 3,
 0.61: 1,
 0.611: 2,
 0.612: 1,
 0.613: 2,
 0.614: 2,
 0.615: 3,
 0.616: 4,
 0.617: 13,
 0.618: 103,
 0.619: 2163}

In [None]:
# Mutual-information score of every f3 component against Status, then a tally of the scores.
X3 = make_dataset(data_f3.drop(columns=['Age','Gender','Status']),'f3')
gains = feature_importance_mutual(X3,data_f3.Status)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[1.03, 1.031, 1.031, 1.031, 1.029, 1.031, 1.031, 1.031, 1.031, 1.03, 1.031, 1.031, 1.031, 1.029, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.03, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.03, 1.03, 1.031, 1.031, 1.031, 1.031, 1.03, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 0.999, 1.031, 1.03, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.03, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031, 1.031,

{0.836: 1,
 0.926: 1,
 0.942: 1,
 0.982: 1,
 0.992: 1,
 0.994: 1,
 0.999: 1,
 1.017: 1,
 1.018: 2,
 1.019: 1,
 1.02: 3,
 1.021: 1,
 1.022: 1,
 1.023: 2,
 1.024: 1,
 1.025: 3,
 1.026: 1,
 1.027: 3,
 1.028: 5,
 1.029: 26,
 1.03: 130,
 1.031: 2117}

### LDA - PCA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

# LDA projection of the f3 features supervised by Status, then the PCA component count
# for the same matrix (the PCA step does not use the label).
X3 = make_dataset(data_f3.drop(columns=['Age', 'Gender', 'Status']), 'f3')
clf = LinearDiscriminantAnalysis()
x3_reduced = clf.fit_transform(X3, data_f3['Status'])

print('Label : Status\nLDA features :', x3_reduced.shape)
find_best_pca_n(X3)

Label : Status
LDA features : (2283, 2)
PCA : Optimum Feature number is 12



In [None]:
# LDA projection of the f3 features supervised by Age, then the PCA component count
# for the same matrix (the PCA step does not use the label).
X3 = make_dataset(data_f3.drop(columns=['Age', 'Gender', 'Status']), 'f3')
clf = LinearDiscriminantAnalysis()
x3_reduced = clf.fit_transform(X3, data_f3['Age'])

print('Label : Age\nLDA features :', x3_reduced.shape)
find_best_pca_n(X3)

Label : Age
LDA features : (2283, 46)
PCA : Optimum Feature number is 12



In [None]:
# LDA projection of the f3 features supervised by Gender, then the PCA component count
# for the same matrix (the PCA step does not use the label).
X3 = make_dataset(data_f3.drop(columns=['Age', 'Gender', 'Status']), 'f3')
clf = LinearDiscriminantAnalysis()
x3_reduced = clf.fit_transform(X3, data_f3['Gender'])

print('Label : Gender\nLDA features :', x3_reduced.shape)
find_best_pca_n(X3)

Label : Gender
LDA features : (2283, 1)
PCA : Optimum Feature number is 12



## F4

### Linear importance

In [None]:
# Linear-model importance of each f4 component for Gender; report those above `importance`.
X4 = make_dataset(data_f4.drop(columns=['Age', 'Gender', 'Status']), 'f4')
x4_reduced = get_feature_importance(X4, data_f4['Gender'])
x4_selected = x4_reduced[x4_reduced > importance]
print(x4_selected)
print("Label : Gender")
print("number of all features: ", X4.shape[1])
print("number of selected features : ", x4_selected.shape[0])

5       46.872860
23      53.232659
81      53.899251
95      50.582726
114     52.049895
          ...    
2361    55.292128
2380    41.477080
2405    47.601690
2418    71.023680
2534    42.548469
Name: importance_normalized, Length: 83, dtype: float64
Label : Gender
number of all features:  2560
number of selected features :  83


In [None]:
# Linear-model importance of each f4 component for Status; report those above `importance`.
X4 = make_dataset(data_f4.drop(columns=['Age', 'Gender', 'Status']), 'f4')
x4_reduced = get_feature_importance(X4, data_f4['Status'])
x4_selected = x4_reduced[x4_reduced > importance]
print(x4_selected)
print("Label : Status")
print("number of all features: ", X4.shape[1])
print("number of selected features : ", x4_selected.shape[0])

5       46.872861
23      53.232659
81      53.899253
95      50.582729
114     52.049896
          ...    
2361    55.292127
2380    41.477081
2405    47.601691
2418    71.023682
2534    42.548470
Name: importance_normalized, Length: 83, dtype: float64
Label : Status
number of all features:  2560
number of selected features :  83


In [None]:
# Linear-model importance of each f4 component for Age; report those above `importance`.
X4 = make_dataset(data_f4.drop(columns=['Age', 'Gender', 'Status']), 'f4')
x4_reduced = get_feature_importance(X4, data_f4['Age'])
x4_selected = x4_reduced[x4_reduced > importance]
print(x4_selected)
print("Label : Age")
print("number of all features: ", X4.shape[1])
print("number of selected features : ", x4_selected.shape[0])

5       46.872859
23      53.232659
81      53.899251
95      50.582727
114     52.049893
          ...    
2361    55.292127
2380    41.477081
2405    47.601691
2418    71.023675
2534    42.548467
Name: importance_normalized, Length: 83, dtype: float64
Label : Age
number of all features:  2560
number of selected features :  83


### Mutual Information

In [None]:
# Mutual-information score of every f4 component against Age, then a tally of the scores.
X4 = make_dataset(data_f4.drop(columns=['Age','Gender','Status']),'f4')
gains = feature_importance_mutual(X4,data_f4.Age)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[2.0, 2.0, 3.0, 3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, 1.0, 3.0, 3.0, 0.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 3.0, 2.0, 1.0, 3.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 3.0, 2.0, 0.0, 3.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 2.0, 2.0, 0.0, 1.0, 3.0, 3.0, 3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0, 2.0, 1.0, 3.0, 0.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 1.0, 3.0, 2.0, 2.0, 1.0, 3.0, 3.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0, 1.0, 2.0, 2.0, 3.0, 3.0, 2.0, 2.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 3.0, 1.0, 3.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 2.0, 3.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0, 2.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0, 1.0, 3.0, 3.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0,

{0.0: 36, 1.0: 234, 2.0: 931, 3.0: 1359}

In [None]:
# Mutual-information score of every f4 component against Gender, then a tally of the scores.
X4 = make_dataset(data_f4.drop(columns=['Age','Gender','Status']),'f4')
gains = feature_importance_mutual(X4,data_f4.Gender)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0,

{0.0: 1587, 1.0: 973}

In [None]:
# Mutual-information score of every f4 component against Status, then a tally of the scores.
X4 = make_dataset(data_f4.drop(columns=['Age','Gender','Status']),'f4')
gains = feature_importance_mutual(X4,data_f4.Status)
print(list(gains))
# Single-pass tally; calling list.count for every element rescans the list each time (quadratic).
score_values, score_counts = np.unique(gains, return_counts=True)
my_dict = dict(zip(score_values.tolist(), score_counts.tolist()))
my_dict

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,

{0.0: 381, 1.0: 2179}

### LDA - PCA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

# LDA projection of the f4 features using Status as the class label, followed
# by the PCA component search on the same matrix.
# NOTE(review): make_dataset and find_best_pca_n live in earlier cells; the
# latter appears to print its own "PCA : ..." line -- confirm.
f4_feature_frame = data_f4.drop(columns=['Age','Gender','Status'])
X4 = make_dataset(f4_feature_frame,'f4')
status_labels = data_f4['Status']

clf = LinearDiscriminantAnalysis()
clf.fit(X4, status_labels)
x4_reduced = clf.transform(X4)

print('Label : Status')
print('LDA features :', x4_reduced.shape)
find_best_pca_n(X4)

Label : Status
LDA features : (2545, 2)
PCA : Optimum Feature number is 15



In [None]:
# LDA projection of the f4 features using Age as the class label, followed
# by the PCA component search on the same matrix.
# NOTE(review): make_dataset and find_best_pca_n live in earlier cells; the
# latter appears to print its own "PCA : ..." line -- confirm.
f4_feature_frame = data_f4.drop(columns=['Age','Gender','Status'])
X4 = make_dataset(f4_feature_frame,'f4')
age_labels = data_f4['Age']

clf = LinearDiscriminantAnalysis()
clf.fit(X4, age_labels)
x4_reduced = clf.transform(X4)

print('Label : Age')
print('LDA features :', x4_reduced.shape)
find_best_pca_n(X4)

Label : Age
LDA features : (2545, 46)
PCA : Optimum Feature number is 15



In [None]:
# LDA projection of the f4 features using Gender as the class label, followed
# by the PCA component search on the same matrix.
# NOTE(review): make_dataset and find_best_pca_n live in earlier cells; the
# latter appears to print its own "PCA : ..." line -- confirm.
f4_feature_frame = data_f4.drop(columns=['Age','Gender','Status'])
X4 = make_dataset(f4_feature_frame,'f4')
gender_labels = data_f4['Gender']

clf = LinearDiscriminantAnalysis()
clf.fit(X4, gender_labels)
x4_reduced = clf.transform(X4)

print('Label : Gender')
print('LDA features :', x4_reduced.shape)
find_best_pca_n(X4)

Label : Gender
LDA features : (2283, 1)
PCA : Optimum Feature number is 12



## F5

### Linear importance

In [None]:
# Rank the f5 features by linear importance for the Gender label and report
# those whose score exceeds the `importance` threshold.
# NOTE(review): `importance` and get_feature_importance are defined in earlier
# cells; the scores look like a normalised pandas Series -- confirm.
f5_feature_frame = data_f5.drop(columns=['Age','Gender','Status'])
X5 = make_dataset(f5_feature_frame,'f5')
x5_reduced = get_feature_importance(X5,data_f5['Gender'])

# Apply the threshold once and reuse the selection for both the listing and the count.
f5_selected = x5_reduced[x5_reduced>importance]
print(f5_selected)
print("Label : Gender")
print("number of all features: ", X5.shape[1])
print("number of selected features : ", f5_selected.shape[0])

11     100.000000
14      66.158807
23      46.880898
32      44.050996
35      87.024475
41      91.481471
44      87.055679
56      56.667346
74      52.060591
77      77.817557
80      49.234667
104     70.776079
107     46.455489
110     46.585136
113     41.896019
116     42.383765
125     74.701680
131     49.694342
134     70.901680
137     75.674657
143     69.195258
155     74.294450
Name: importance_normalized, dtype: float64
Label : Gender
number of all features:  204
number of selected features :  22


In [None]:
# Rank the f5 features by linear importance for the Status label and report
# those whose score exceeds the `importance` threshold.
# NOTE(review): `importance` and get_feature_importance are defined in earlier
# cells; the scores look like a normalised pandas Series -- confirm.
f5_feature_frame = data_f5.drop(columns=['Age','Gender','Status'])
X5 = make_dataset(f5_feature_frame,'f5')
x5_reduced = get_feature_importance(X5,data_f5['Status'])

# Apply the threshold once and reuse the selection for both the listing and the count.
f5_selected = x5_reduced[x5_reduced>importance]
print(f5_selected)
print("Label : Status")
print("number of all features: ", X5.shape[1])
print("number of selected features : ", f5_selected.shape[0])

2       52.120189
8       80.838325
14     100.000000
17      52.505512
41      47.513934
53      82.237525
62      60.717558
65      58.365530
80      43.399427
89      86.075975
92      57.671860
146     42.558951
182     44.506613
197     69.204788
200     61.749765
Name: importance_normalized, dtype: float64
Label : Status
number of all features:  204
number of selected features :  15


In [None]:
# Rank the f5 features by linear importance for the Age label and report
# those whose score exceeds the `importance` threshold.
# NOTE(review): `importance` and get_feature_importance are defined in earlier
# cells; the scores look like a normalised pandas Series -- confirm.
f5_feature_frame = data_f5.drop(columns=['Age','Gender','Status'])
X5 = make_dataset(f5_feature_frame,'f5')
x5_reduced = get_feature_importance(X5,data_f5['Age'])

# Apply the threshold once and reuse the selection for both the listing and the count.
f5_selected = x5_reduced[x5_reduced>importance]
print(f5_selected)
print("Label : Age")
print("number of all features: ", X5.shape[1])
print("number of selected features : ", f5_selected.shape[0])

8       62.442331
14      69.737892
17      50.676344
35      63.539591
53     100.000000
68      48.422690
71      55.406517
74      43.436874
98      49.154877
110     42.398705
134     57.389116
137     66.901990
140     47.704385
146     40.079302
158     42.660919
161     72.730652
179     40.494512
182     52.067255
188     40.328162
200     55.520192
203     49.751490
Name: importance_normalized, dtype: float64
Label : Age
number of all features:  204
number of selected features :  21


### Mutual Information

In [None]:
from collections import Counter

# Score every f5 feature against the Age label and summarise how many
# features share each score.
# NOTE(review): make_dataset and feature_importance_mutual are defined in earlier
# cells; presumably one (rounded) mutual-information score comes back per
# feature column -- confirm against their definitions.
X5 = make_dataset(data_f5.drop(columns=['Age','Gender','Status']),'f5')
gains = feature_importance_mutual(X5,data_f5.Age)
gain_values = list(gains)  # materialise once; reused for printing and counting
print(gain_values)
# Single-pass tally of score -> number of features. The previous comprehension
# rebuilt the list and rescanned it for every element (quadratic in the number
# of features). Converted back to a plain dict so the displayed value is unchanged.
my_dict = dict(Counter(gain_values))
my_dict

[2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.8, 3.3, 2.8, 2.8, 3.3, 2.8, 2.9, 3.3, 2.8, 2.8, 3.3, 2.8, 2.8, 3.3, 2.8, 2.8, 3.3, 2.9, 2.8, 3.3, 2.9, 2.9, 3.3, 2.9, 2.8, 3.3, 2.9, 2.8, 3.3, 2.9, 2.8, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.8, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.8, 2.8, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.8, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9, 3.3, 2.9, 2.9,

{2.8: 46, 2.9: 90, 3.3: 68}

In [None]:
from collections import Counter

# Score every f5 feature against the Gender label and summarise how many
# features share each score.
# NOTE(review): make_dataset and feature_importance_mutual are defined in earlier
# cells; presumably one (rounded) mutual-information score comes back per
# feature column -- confirm against their definitions.
X5 = make_dataset(data_f5.drop(columns=['Age','Gender','Status']),'f5')
gains = feature_importance_mutual(X5,data_f5.Gender)
gain_values = list(gains)  # materialise once; reused for printing and counting
print(gain_values)
# Single-pass tally of score -> number of features. The previous comprehension
# rebuilt the list and rescanned it for every element (quadratic in the number
# of features). Converted back to a plain dict so the displayed value is unchanged.
my_dict = dict(Counter(gain_values))
my_dict

[0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4, 0.6, 0.4, 0.4,

{0.4: 136, 0.6: 68}

In [None]:
from collections import Counter

# Score every f5 feature against the Status label and summarise how many
# features share each score.
# NOTE(review): make_dataset and feature_importance_mutual are defined in earlier
# cells; presumably one (rounded) mutual-information score comes back per
# feature column -- confirm against their definitions.
X5 = make_dataset(data_f5.drop(columns=['Age','Gender','Status']),'f5')
gains = feature_importance_mutual(X5,data_f5.Status)
gain_values = list(gains)  # materialise once; reused for printing and counting
print(gain_values)
# Single-pass tally of score -> number of features. The previous comprehension
# rebuilt the list and rescanned it for every element (quadratic in the number
# of features). Converted back to a plain dict so the displayed value is unchanged.
my_dict = dict(Counter(gain_values))
my_dict

[0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.8, 1.0, 0.7, 0.7, 1.0, 0.7, 0.8, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.8, 1.0, 0.7, 0.7, 1.0, 0.7, 0.8, 1.0, 0.7, 0.7, 1.0, 0.7, 0.8, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.8, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.8, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.8, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.7, 1.0, 0.7, 0.8,

{0.7: 126, 0.8: 10, 1.0: 68}

### LDA - PCA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA

# LDA projection of the f5 features using Status as the class label, followed
# by the PCA component search on the same matrix.
# NOTE(review): make_dataset and find_best_pca_n live in earlier cells; the
# latter appears to print its own "PCA : ..." line -- confirm.
f5_feature_frame = data_f5.drop(columns=['Age','Gender','Status'])
X5 = make_dataset(f5_feature_frame,'f5')
status_labels = data_f5['Status']

clf = LinearDiscriminantAnalysis()
clf.fit(X5, status_labels)
x5_reduced = clf.transform(X5)

print('Label : Status')
print('LDA features :', x5_reduced.shape)
find_best_pca_n(X5)

Label : Status
LDA features : (1668, 2)
PCA : Optimum Feature number is 1



In [None]:
# LDA projection of the f5 features using Gender as the class label, followed
# by the PCA component search on the same matrix.
# NOTE(review): make_dataset and find_best_pca_n live in earlier cells; the
# latter appears to print its own "PCA : ..." line -- confirm.
f5_feature_frame = data_f5.drop(columns=['Age','Gender','Status'])
X5 = make_dataset(f5_feature_frame,'f5')
gender_labels = data_f5['Gender']

clf = LinearDiscriminantAnalysis()
clf.fit(X5, gender_labels)
x5_reduced = clf.transform(X5)

print('Label : Gender')
print('LDA features :', x5_reduced.shape)
find_best_pca_n(X5)

Label : Gender
LDA features : (1668, 1)
PCA : Optimum Feature number is 1



In [None]:
# LDA projection of the f5 features using Age as the class label, followed
# by the PCA component search on the same matrix.
# NOTE(review): make_dataset and find_best_pca_n live in earlier cells; the
# latter appears to print its own "PCA : ..." line -- confirm.
f5_feature_frame = data_f5.drop(columns=['Age','Gender','Status'])
X5 = make_dataset(f5_feature_frame,'f5')
age_labels = data_f5['Age']

clf = LinearDiscriminantAnalysis()
clf.fit(X5, age_labels)
x5_reduced = clf.transform(X5)

print('Label : Age')
print('LDA features :', x5_reduced.shape)
find_best_pca_n(X5)

Label : Age
LDA features : (1668, 42)
PCA : Optimum Feature number is 1

