Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

- Pregnancies: Number of times pregnant
- Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test
- BloodPressure: Diastolic blood pressure (mm Hg)
- SkinThickness: Triceps skin fold thickness (mm)
- Insulin: 2-Hour serum insulin (mu U/ml)
- BMI: Body mass index (weight in kg/(height in m)^2)
- DiabetesPedigreeFunction: Diabetes pedigree function
- Age: Age (years)
- Outcome: Class variable (0 or 1)

0 means that a female has non-diabetic and 1 means that a female is diabetes.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os
import random
import string
import sys

# sampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler


from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import *
    
import pickle

import wandb

import warnings
warnings.filterwarnings('ignore')

# feature selection
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

# feature selection with library
from xverse.ensemble import VotingSelector
from shaphypetune import BoostSearch, BoostBoruta, BoostRFE, BoostRFA

In [2]:
df = pd.read_csv('diabetes.csv')

In [3]:
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [4]:
df.shape

(768, 9)

In [5]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
for d in df.columns:
    print(d)
    print(df[d].value_counts())

Pregnancies
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: Pregnancies, dtype: int64
Glucose
99     17
100    17
111    14
129    14
125    14
       ..
191     1
177     1
44      1
62      1
190     1
Name: Glucose, Length: 136, dtype: int64
BloodPressure
70     57
74     52
78     45
68     45
72     44
64     43
80     40
76     39
60     37
0      35
62     34
66     30
82     30
88     25
84     23
90     22
86     21
58     21
50     13
56     12
52     11
54     11
75      8
92      8
65      7
85      6
94      6
48      5
96      4
44      4
100     3
106     3
98      3
110     3
55      2
108     2
104     2
46      2
30      2
122     1
95      1
102     1
61      1
24      1
38      1
40      1
114     1
Name: BloodPressure, dtype: int64
SkinThickness
0     227
32     31
30     27
27     23
23     22
33     20
28     20
18     20
31     19
19   

In [8]:
df['Outcome'].value_counts() * 100 / len(df)

0    65.104167
1    34.895833
Name: Outcome, dtype: float64

In [9]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [10]:
df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction']] = \
df[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction']].replace(0, np.NaN)

In [11]:
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [12]:
msno.bar(df);

In [13]:
imputer = KNNImputer(n_neighbors=2)
df_ = pd.DataFrame(imputer.fit_transform(df),columns=df.columns)

In [14]:
def translateBMI(bmi):
    if bmi <= 18.5:
        return 'Underweight'
    elif bmi > 18.5 and bmi <= 24.9:
        return 'Normal'
    elif bmi > 24.9 and bmi <= 29.9:
        return 'Overweight'
    elif bmi > 29.9 and bmi <= 34.9:
        return 'Obesity 1'
    elif bmi > 34.9 and bmi <= 39.9:
        return 'Obesity 2'
    elif bmi > 39.9:
        return 'Obesity 3'

In [15]:
df_['Fat'] = df_['BMI'].apply(translateBMI)

In [16]:
sns.histplot(data=df_['Age'], kde=True);

In [17]:
sns.catplot(x='Fat', y='Age', data=df_, kind='box', height=10);

In [18]:
sns.scatterplot(x='Age', y='Glucose', hue='Outcome', data=df_);

In [19]:
df_.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Fat'],
      dtype='object')

In [20]:
df_['Outcome'].value_counts().plot.pie(autopct='%.2f');

In [21]:
df_ = pd.get_dummies(df_, columns=['Fat'])
df_

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Fat_Normal,Fat_Obesity 1,Fat_Obesity 2,Fat_Obesity 3,Fat_Overweight,Fat_Underweight
0,6.0,148.0,72.0,35.0,113.0,33.6,0.627,50.0,1.0,0,1,0,0,0,0
1,1.0,85.0,66.0,29.0,88.5,26.6,0.351,31.0,0.0,0,0,0,0,1,0
2,8.0,183.0,64.0,24.0,227.5,23.3,0.672,32.0,1.0,1,0,0,0,0,0
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0,0,0,0,0,1,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,0.0,0,1,0,0,0,0
764,2.0,122.0,70.0,27.0,202.5,36.8,0.340,27.0,0.0,0,0,1,0,0,0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,0.0,0,0,0,0,1,0
766,1.0,126.0,60.0,33.5,123.5,30.1,0.349,47.0,1.0,0,1,0,0,0,0


In [22]:
df_.columns = df_.columns.str.replace(' ', '_').str.lower()

In [23]:
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   pregnancies               768 non-null    float64
 1   glucose                   768 non-null    float64
 2   bloodpressure             768 non-null    float64
 3   skinthickness             768 non-null    float64
 4   insulin                   768 non-null    float64
 5   bmi                       768 non-null    float64
 6   diabetespedigreefunction  768 non-null    float64
 7   age                       768 non-null    float64
 8   outcome                   768 non-null    float64
 9   fat_normal                768 non-null    uint8  
 10  fat_obesity_1             768 non-null    uint8  
 11  fat_obesity_2             768 non-null    uint8  
 12  fat_obesity_3             768 non-null    uint8  
 13  fat_overweight            768 non-null    uint8  
 14  fat_underw

In [24]:
X = df_.drop(['outcome'], axis=1)
y = df_['outcome']

In [25]:
y.value_counts()

0.0    500
1.0    268
Name: outcome, dtype: int64

# Feature Selection

#### remove low variance

In [26]:
var = VarianceThreshold(threshold=0.3)
var = var.fit(X, y)

In [27]:
cols = var.get_support(indices=True)
cols

array([0, 1, 2, 3, 4, 5, 7])

In [28]:
features = X.columns[cols]
features

Index(['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin',
       'bmi', 'age'],
      dtype='object')

#### remove not correlation

In [29]:
plt.figure(figsize=(12,12))
cor = df_.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [30]:
cor_target = abs(cor['outcome'])

features = cor_target[cor_target>0.1]
features

pregnancies                 0.221898
glucose                     0.497174
bloodpressure               0.175754
skinthickness               0.264798
insulin                     0.304302
bmi                         0.311075
diabetespedigreefunction    0.173844
age                         0.238356
outcome                     1.000000
fat_normal                  0.230167
fat_obesity_1               0.132691
fat_obesity_3               0.166739
fat_overweight              0.146065
Name: outcome, dtype: float64

In [31]:
features.index

Index(['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin',
       'bmi', 'diabetespedigreefunction', 'age', 'outcome', 'fat_normal',
       'fat_obesity_1', 'fat_obesity_3', 'fat_overweight'],
      dtype='object')

#### K-Best fit

In [32]:
KBest = SelectKBest(score_func=chi2, k='all')
KBest = KBest.fit(X, y)

In [33]:
df_scores = pd.DataFrame({'features': X.columns, 'Chi2Score': KBest.scores_, 'pValue': KBest.pvalues_})
df_scores.plot.bar(x='features', y='Chi2Score');

In [34]:
k = 3
cols = df_scores.sort_values(by='Chi2Score', ascending=False).head(k).index
cols

Int64Index([4, 1, 7], dtype='int64')

In [35]:
features = X.columns[cols]
features

Index(['insulin', 'glucose', 'age'], dtype='object')

#### Select Percentile

In [36]:
SPercentile = SelectPercentile(score_func=chi2, percentile=80)
SPercentile = SPercentile.fit(X, y)

In [37]:
cols = SPercentile.get_support(indices=True)
cols

array([ 0,  1,  2,  3,  4,  5,  7,  8,  9, 11, 12])

In [38]:
features = X.columns[cols]
features

Index(['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin',
       'bmi', 'age', 'fat_normal', 'fat_obesity_1', 'fat_obesity_3',
       'fat_overweight'],
      dtype='object')

#### Feature importance score

In [39]:
forest = RandomForestClassifier(n_estimators=10, n_jobs=-1)
forest.fit(X, y)

RandomForestClassifier(n_estimators=10, n_jobs=-1)

In [40]:
df_scores = pd.DataFrame({'features': X.columns, 'importanceScore': forest.feature_importances_})
df_scores.plot.bar(x='features', y='importanceScore');

In [41]:
features = df_scores[df_scores['importanceScore']>0.1]
features

Unnamed: 0,features,importanceScore
1,glucose,0.217886
4,insulin,0.143848
5,bmi,0.138941
6,diabetespedigreefunction,0.108755
7,age,0.103279


In [42]:
cols = features.index
cols

Int64Index([1, 4, 5, 6, 7], dtype='int64')

In [43]:
features = X.columns[cols]
features

Index(['glucose', 'insulin', 'bmi', 'diabetespedigreefunction', 'age'], dtype='object')

#### Sequential Forward Selection (SFS)

In [44]:
len(X.columns)

14

In [45]:
estimator = RandomForestClassifier(n_estimators=10, n_jobs=-1)
#estimator = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)

sfs = SFS(estimator, k_features=(1, len(X.columns)), forward=True, floating=False, verbose=2, scoring='accuracy', cv=5)       
sfs.fit(X, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:   10.5s finished

[2022-12-25 00:42:27] Features: 1/14 -- score: 0.7071131482896188[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    8.5s finished

[2022-12-25 00:42:36] Features: 2/14 -- score: 0.7213564213564213[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    7.8s finished

[2022-12-25 00:42:44] Features: 3/14 -- score: 0.746031746031746[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   

SequentialFeatureSelector(estimator=RandomForestClassifier(n_estimators=10,
                                                           n_jobs=-1),
                          k_features=(1, 14), scoring='accuracy', verbose=2)

In [46]:
sfs.subsets_

{1: {'feature_idx': (1,),
  'cv_scores': array([0.68831169, 0.69480519, 0.67532468, 0.73856209, 0.73856209]),
  'avg_score': 0.7071131482896188,
  'feature_names': ('glucose',)},
 2: {'feature_idx': (1, 5),
  'cv_scores': array([0.73376623, 0.69480519, 0.73376623, 0.75163399, 0.69281046]),
  'avg_score': 0.7213564213564213,
  'feature_names': ('glucose', 'bmi')},
 3: {'feature_idx': (1, 3, 5),
  'cv_scores': array([0.80519481, 0.70779221, 0.77272727, 0.7254902 , 0.71895425]),
  'avg_score': 0.746031746031746,
  'feature_names': ('glucose', 'skinthickness', 'bmi')},
 4: {'feature_idx': (1, 3, 5, 7),
  'cv_scores': array([0.74025974, 0.7012987 , 0.75974026, 0.79738562, 0.71895425]),
  'avg_score': 0.7435277141159494,
  'feature_names': ('glucose', 'skinthickness', 'bmi', 'age')},
 5: {'feature_idx': (1, 3, 5, 7, 13),
  'cv_scores': array([0.75324675, 0.72727273, 0.77272727, 0.78431373, 0.73856209]),
  'avg_score': 0.7552245140480435,
  'feature_names': ('glucose',
   'skinthickness',
   

In [47]:
sfs.k_feature_names_

('pregnancies',
 'glucose',
 'skinthickness',
 'insulin',
 'bmi',
 'diabetespedigreefunction',
 'age',
 'fat_obesity_1',
 'fat_obesity_3',
 'fat_overweight',
 'fat_underweight')

In [48]:
pd.DataFrame.from_dict(sfs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
1,"(1,)","[0.6883116883116883, 0.6948051948051948, 0.675...",0.707113,"(glucose,)",0.033974,0.026433,0.013217
2,"(1, 5)","[0.7337662337662337, 0.6948051948051948, 0.733...",0.721356,"(glucose, bmi)",0.030113,0.023429,0.011714
3,"(1, 3, 5)","[0.8051948051948052, 0.7077922077922078, 0.772...",0.746032,"(glucose, skinthickness, bmi)",0.047505,0.03696,0.01848
4,"(1, 3, 5, 7)","[0.7402597402597403, 0.7012987012987013, 0.759...",0.743528,"(glucose, skinthickness, bmi, age)",0.042865,0.033351,0.016675
5,"(1, 3, 5, 7, 13)","[0.7532467532467533, 0.7272727272727273, 0.772...",0.755225,"(glucose, skinthickness, bmi, age, fat_underwe...",0.027055,0.021049,0.010525
6,"(1, 3, 5, 6, 7, 13)","[0.7337662337662337, 0.7207792207792207, 0.772...",0.757873,"(glucose, skinthickness, bmi, diabetespedigree...",0.037685,0.02932,0.01466
7,"(1, 3, 5, 6, 7, 12, 13)","[0.7727272727272727, 0.7077922077922078, 0.779...",0.764366,"(glucose, skinthickness, bmi, diabetespedigree...",0.036666,0.028527,0.014264
8,"(0, 1, 3, 5, 6, 7, 12, 13)","[0.7337662337662337, 0.7337662337662337, 0.772...",0.75132,"(pregnancies, glucose, skinthickness, bmi, dia...",0.028943,0.022518,0.011259
9,"(0, 1, 3, 5, 6, 7, 9, 12, 13)","[0.7337662337662337, 0.7597402597402597, 0.766...",0.759138,"(pregnancies, glucose, skinthickness, bmi, dia...",0.025036,0.019479,0.00974
10,"(0, 1, 3, 4, 5, 6, 7, 9, 12, 13)","[0.7402597402597403, 0.7792207792207793, 0.785...",0.757771,"(pregnancies, glucose, skinthickness, insulin,...",0.026197,0.020382,0.010191


In [49]:
plot_sfs(sfs.get_metric_dict(), kind='std_err');

In [50]:
cols = list(sfs.k_feature_idx_)
cols

[0, 1, 3, 4, 5, 6, 7, 9, 11, 12, 13]

In [51]:
features = X.columns[cols]
features

Index(['pregnancies', 'glucose', 'skinthickness', 'insulin', 'bmi',
       'diabetespedigreefunction', 'age', 'fat_obesity_1', 'fat_obesity_3',
       'fat_overweight', 'fat_underweight'],
      dtype='object')

#### Sequential Backward Selection (SBS)

In [52]:
estimator = RandomForestClassifier(n_estimators=10, n_jobs=-1)
#estimator = KNeighborsClassifier(n_neighbors=4, n_jobs=-1)

sbs = SFS(estimator, k_features=(1, len(X.columns)), forward=False, floating=False, verbose=2, scoring='accuracy', cv=5)       
sbs.fit(X, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    9.0s finished

[2022-12-25 00:43:36] Features: 13/1 -- score: 0.7656480774127832[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    8.3s finished

[2022-12-25 00:43:45] Features: 12/1 -- score: 0.7682454800101859[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    7.7s finished

[2022-12-25 00:43:52] Features: 11/1 -- score: 0.7565656565656566[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  

SequentialFeatureSelector(estimator=RandomForestClassifier(n_estimators=10,
                                                           n_jobs=-1),
                          forward=False, k_features=(1, 14), scoring='accuracy',
                          verbose=2)

In [53]:
sbs.subsets_

{14: {'feature_idx': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13),
  'cv_scores': array([0.74675325, 0.71428571, 0.72077922, 0.77777778, 0.73856209]),
  'avg_score': 0.7396316102198455,
  'feature_names': ('pregnancies',
   'glucose',
   'bloodpressure',
   'skinthickness',
   'insulin',
   'bmi',
   'diabetespedigreefunction',
   'age',
   'fat_normal',
   'fat_obesity_1',
   'fat_obesity_2',
   'fat_obesity_3',
   'fat_overweight',
   'fat_underweight')},
 13: {'feature_idx': (0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13),
  'cv_scores': array([0.74675325, 0.75324675, 0.77922078, 0.81045752, 0.73856209]),
  'avg_score': 0.7656480774127832,
  'feature_names': ('pregnancies',
   'glucose',
   'bloodpressure',
   'insulin',
   'bmi',
   'diabetespedigreefunction',
   'age',
   'fat_normal',
   'fat_obesity_1',
   'fat_obesity_2',
   'fat_obesity_3',
   'fat_overweight',
   'fat_underweight')},
 12: {'feature_idx': (0, 1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13),
  'cv_scores': array([0.77272727

In [54]:
cols = list(sbs.k_feature_idx_)
cols

[0, 1, 2, 4, 5, 7, 8, 11, 12, 13]

In [55]:
pd.DataFrame.from_dict(sbs.get_metric_dict()).T

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
14,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)","[0.7467532467532467, 0.7142857142857143, 0.720...",0.739632,"(pregnancies, glucose, bloodpressure, skinthic...",0.028768,0.022382,0.011191
13,"(0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13)","[0.7467532467532467, 0.7532467532467533, 0.779...",0.765648,"(pregnancies, glucose, bloodpressure, insulin,...",0.033696,0.026217,0.013108
12,"(0, 1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13)","[0.7727272727272727, 0.7337662337662337, 0.785...",0.768245,"(pregnancies, glucose, bloodpressure, insulin,...",0.041458,0.032256,0.016128
11,"(0, 1, 2, 4, 5, 7, 8, 9, 11, 12, 13)","[0.7532467532467533, 0.7207792207792207, 0.753...",0.756566,"(pregnancies, glucose, bloodpressure, insulin,...",0.034342,0.026719,0.01336
10,"(0, 1, 2, 4, 5, 7, 8, 11, 12, 13)","[0.7662337662337663, 0.7402597402597403, 0.766...",0.770885,"(pregnancies, glucose, bloodpressure, insulin,...",0.029011,0.022571,0.011286
9,"(0, 1, 2, 4, 5, 7, 8, 11, 13)","[0.7597402597402597, 0.7142857142857143, 0.766...",0.756549,"(pregnancies, glucose, bloodpressure, insulin,...",0.028202,0.021942,0.010971
8,"(0, 1, 2, 4, 5, 7, 11, 13)","[0.7857142857142857, 0.7272727272727273, 0.720...",0.75525,"(pregnancies, glucose, bloodpressure, insulin,...",0.03716,0.028912,0.014456
7,"(0, 1, 2, 4, 5, 7, 11)","[0.7532467532467533, 0.7337662337662337, 0.740...",0.747415,"(pregnancies, glucose, bloodpressure, insulin,...",0.026426,0.020561,0.01028
6,"(0, 1, 4, 5, 7, 11)","[0.7272727272727273, 0.7272727272727273, 0.740...",0.76053,"(pregnancies, glucose, insulin, bmi, age, fat_...",0.05062,0.039384,0.019692
5,"(1, 4, 5, 7, 11)","[0.7467532467532467, 0.6948051948051948, 0.720...",0.742271,"(glucose, insulin, bmi, age, fat_obesity_3)",0.046498,0.036177,0.018088


In [56]:
plot_sfs(sbs.get_metric_dict(), kind='std_err');

In [57]:
cols = list(sbs.k_feature_idx_)
cols

[0, 1, 2, 4, 5, 7, 8, 11, 12, 13]

In [58]:
features = X.columns[cols]
features

Index(['pregnancies', 'glucose', 'bloodpressure', 'insulin', 'bmi', 'age',
       'fat_normal', 'fat_obesity_3', 'fat_overweight', 'fat_underweight'],
      dtype='object')

###  Xverse

In [59]:
# Library: Xverse handles only binary target

In [60]:
clf = VotingSelector()
clf.fit(X, y)
print(clf.available_techniques)

['WOE', 'RF', 'RFE', 'ETC', 'CS', 'L_ONE']


In [61]:
clf.feature_importances_

Unnamed: 0,Variable_Name,Information_Value,Random_Forest,Recursive_Feature_Elimination,Extra_Trees,Chi_Square,L_One
0,glucose,1.006845,0.204139,0.0,0.204768,14.72489,0.007775
1,insulin,0.594699,0.143291,0.0,0.11026,6.693139,0.000761
2,bmi,0.451343,0.125736,0.0,0.092251,5.069542,0.0
3,age,0.415184,0.116718,0.0,0.124604,8.205691,-0.00041
4,fat_normal,0.355391,0.012147,-1.202419,0.030359,35.282529,-0.04849
5,skinthickness,0.339136,0.100033,0.0,0.093173,2.523053,0.004215
6,pregnancies,0.230562,0.07136,0.148075,0.097577,6.559982,0.044424
7,bloodpressure,0.136901,0.076018,0.0,0.076495,0.754455,-0.021835
8,diabetespedigreefunction,0.123829,0.108479,0.944327,0.103469,2.758584,0.0
9,fat_obesity_3,0.115389,0.011061,1.377338,0.015998,18.599522,0.0
