In [1]:
def get_selected_features(X, X_new):
    """Return the names of the columns of X whose data also appears in X_new.

    Every column of ``X_new`` is compared with every column of ``X`` using
    ``Series.equals``. For each matching pair the ``X`` column label is
    printed and collected. Results follow ``X_new`` column order first, then
    ``X`` column order.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame holding the full set of named columns.
    X_new : pandas.DataFrame
        Frame holding the columns to look up in ``X``.

    Returns
    -------
    list
        Labels from ``X.columns``, one entry per matching column pair.
    """
    matched_names = []
    for new_pos, _ in enumerate(X_new.columns):
        candidate = X_new.iloc[:, new_pos]
        for orig_pos, orig_name in enumerate(X.columns):
            # Skip non-matching columns early; only matches are reported.
            if not candidate.equals(X.iloc[:, orig_pos]):
                continue
            print(orig_name)
            matched_names.append(orig_name)
    return matched_names

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
def build_model(x, y, frac, random_state=None):
    """Fit a logistic regression on a train/test split and print its test accuracy.

    Parameters
    ----------
    x : array-like or DataFrame
        Feature data handed to ``train_test_split``.
    y : array-like or Series
        Target values handed to ``train_test_split``.
    frac : float or int
        Forwarded as ``test_size`` to ``train_test_split``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original behaviour (the split is not seeded). Pass the same integer
        to several calls so that different feature sets are scored on the
        same train/test partition.

    Returns
    -------
    None
        The accuracy on the held-out part is printed, not returned.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=frac, random_state=random_state
    )
    model = LogisticRegression(solver='liblinear').fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print('Test score: ', accuracy_score(y_test, y_pred))

In [4]:
# Load the diabetes CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../datasets/diabetes.csv')
# Preview the first rows to check the columns and value ranges.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Predictors: every column except 'Outcome'. Target: the 'Outcome' column.
X = df.drop('Outcome', axis=1)
Y = df['Outcome']

In [6]:
from sklearn.feature_selection import RFE

In [7]:
# Depth-limited decision tree used as the estimator for feature elimination.
# random_state is fixed because scikit-learn permutes the candidate features
# at each split, so an unseeded tree can produce a different ranking per run.
model = DecisionTreeClassifier(max_depth=4, random_state=42)

In [None]:
# Recursive feature elimination with the estimator in `model`, keeping 4 features.
fit = RFE(model, n_features_to_select=4).fit(X, Y)

In [9]:
# Report the RFE result: how many features were kept, the boolean mask over
# the input columns, and the rank assigned to each column.
print('Number of features: ', fit.n_features_)
print('Selected features: ', fit.support_)
print('Feature ranking: ', fit.ranking_)

Number of features:  4
Selected features:  [False  True False False  True  True False  True]
Feature ranking:  [5 1 4 3 1 1 2 1]


In [10]:
# Line up each column name of X with its RFE rank and selection flag so the
# result can be read as a table.
feature_rank = pd.DataFrame({'Feature':X.columns,
                             'Rank':fit.ranking_,
                             'Selected':fit.support_})
feature_rank

Unnamed: 0,Feature,Rank,Selected
0,Pregnancies,5,False
1,Glucose,1,True
2,BloodPressure,4,False
3,SkinThickness,3,False
4,Insulin,1,True
5,BMI,1,True
6,DiabetesPedigreeFunction,2,False
7,Age,1,True


In [11]:
# Order the table by ascending rank. Note that this rebinds `feature_rank`
# to the sorted frame.
feature_rank = feature_rank.sort_values(by='Rank', ascending=True)
feature_rank

In [12]:
# Display the current contents of the ranking table.
feature_rank

Unnamed: 0,Feature,Rank,Selected
1,Glucose,1,True
4,Insulin,1,True
5,BMI,1,True
7,Age,1,True
6,DiabetesPedigreeFunction,2,False
3,SkinThickness,3,False
2,BloodPressure,4,False
0,Pregnancies,5,False


In [13]:
# Keep only the rows RFE marked as selected. The 'Selected' column was built
# from fit.support_ and already holds booleans, so it is used directly as the
# row mask instead of being compared with True.
RFE_selected_features = feature_rank[feature_rank['Selected']]
RFE_selected_features

In [14]:
# Display the rows of the ranking table that RFE selected.
RFE_selected_features

Unnamed: 0,Feature,Rank,Selected
1,Glucose,1,True
4,Insulin,1,True
5,BMI,1,True
7,Age,1,True


In [15]:
# Slice X down to the columns named in the 'Feature' column of the selected rows.
RFE_features = X[RFE_selected_features['Feature']]

In [16]:
# Preview the reduced feature frame.
RFE_features.head()

Unnamed: 0,Glucose,Insulin,BMI,Age
0,148,0,33.6,50
1,85,0,26.6,31
2,183,0,23.3,32
3,89,94,28.1,21
4,137,168,43.1,33


In [18]:
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Small random forest used to score candidate feature subsets. The seed is
# fixed so the forest is built the same way on every run; an unseeded forest
# can make the sequential selector settle on different features each time.
# Note that this rebinds `model`.
model = RandomForestClassifier(n_estimators=10, random_state=42)
# Forward selection (forward=True) up to 4 features, scored by accuracy with cv=4.
feature_selector = SequentialFeatureSelector(model, k_features=4, forward=True, scoring='accuracy', cv=4)

In [21]:
# Fit the selector. X is passed as a NumPy array, so the chosen features are
# identified by column position (read later through k_feature_idx_).
features = feature_selector.fit(np.array(X), Y)

In [25]:
# Map the selected column positions back to X's column names.
forward_features = list(X.columns[list(features.k_feature_idx_)])

In [28]:
# Show the names picked by forward selection.
forward_features

['Glucose', 'SkinThickness', 'BMI', 'Age']

In [29]:
# Replace the list of selected names with the matching columns of X.
# list(...) keeps this cell safe to re-run: once `forward_features` is already
# a DataFrame, iterating it yields its column labels, so the same columns are
# selected again instead of X being indexed with a DataFrame.
forward_features = X[list(forward_features)]

In [31]:
# Preview the first three rows of the forward-selected columns.
forward_features.head(3)

Unnamed: 0,Glucose,SkinThickness,BMI,Age
0,148,35,33.6,50
1,85,29,26.6,31
2,183,0,23.3,32


In [34]:
# Same selector settings as the forward run but with forward=False (backward
# selection). This rebinds `feature_selector` and `features`, so the
# forward-selection objects are no longer reachable under those names.
feature_selector = SequentialFeatureSelector(model, k_features=4, forward=False, scoring='accuracy', cv=4)
features = feature_selector.fit(np.array(X), Y)

In [35]:
# Map the selected column positions back to X's column names ...
backward_features = list(X.columns[list(features.k_feature_idx_)])
# ... then rebind the name to the matching columns of X and preview three rows.
backward_features = X[backward_features]
backward_features.head(3)

Unnamed: 0,Glucose,BloodPressure,BMI,Age
0,148,72,33.6,50
1,85,66,26.6,31
2,183,64,23.3,32


In [36]:
# Baseline: train on every column of X, with test_size 0.2.
# NOTE(review): this call passes no seed for the train/test split, so the
# score comes from its own split and may change between runs.
build_model(X, Y, 0.2)

Test score:  0.8181818181818182


In [37]:
# Score a model trained only on the forward-selected columns, with test_size 0.2.
# NOTE(review): this call passes no seed for the train/test split, so the
# score is not measured on the same rows as other build_model calls.
build_model(forward_features, Y, 0.2)

Test score:  0.8051948051948052


In [38]:
# Score a model trained only on the backward-selected columns, with test_size 0.2.
# NOTE(review): this call passes no seed for the train/test split, so the
# score is not measured on the same rows as other build_model calls.
build_model(backward_features, Y, 0.2)

Test score:  0.7142857142857143
