In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [26]:
# Load the raw Santander dataset from the working directory.
data = pd.read_csv(filepath_or_buffer="santander.csv")

In [27]:
# (rows, columns) of the loaded dataset
data.shape

(76020, 371)

In [28]:
# "TARGET" is the label column; every other column is a predictor.
# 30% of the rows are held out for testing, with a fixed seed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(['TARGET'], axis='columns'),
    data['TARGET'],
    test_size=0.3,
    random_state=123,
)

In [29]:
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    For every pair of columns, the one that appears later in ``dataset`` is
    flagged when the absolute value of their correlation coefficient (as
    computed by ``dataset.corr()``) is strictly greater than ``threshold``.
    The earliest column of a correlated group is therefore never flagged.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose pairwise column correlations are examined.
    threshold : float
        Cut-off applied to the absolute correlation coefficient.

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    # Only the magnitude matters. Comparisons against NaN evaluate to False,
    # so a pair with an undefined correlation never flags a column.
    exceeds = np.abs(corr_matrix.values) > threshold
    # Keep the entries strictly below the diagonal (row i vs. columns j < i);
    # a row with any hit means column i duplicates an earlier column.
    flagged = np.tril(exceeds, k=-1).any(axis=1)
    return set(corr_matrix.columns[flagged])

# Flag every training-set feature whose absolute correlation with an
# earlier feature exceeds 0.8.
corr_features = correlation(x_train, threshold=0.8)
print('correlated features: ', len(corr_features))

correlated features:  202


In [31]:
# Remove the correlated features from both splits.
# Re-assigning (instead of dropping in place) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for columns that are already gone.
x_train = x_train.drop(labels=list(corr_features), axis=1, errors='ignore')
x_test = x_test.drop(labels=list(corr_features), axis=1, errors='ignore')

x_train.shape, x_test.shape

((53214, 168), (22806, 168))

#### Important

In all feature selection procedures, it is good practice to select the features by examining only the training set. This avoids overfitting: information from the test set must not influence which features are kept.

In [32]:
# Linear models benefit from feature scaling.
# The scaler is fitted on the training split only, with missing values
# replaced by 0 beforehand.
scaler = StandardScaler().fit(x_train.fillna(0))
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [33]:
# Model fitting and feature selection are done together in one step.
#
# First a Logistic Regression model is specified with the Lasso (l1)
# penalty. The solver is named explicitly: 'liblinear' supports l1, whereas
# scikit-learn releases whose default solver is 'lbfgs' reject
# penalty='l1' when no solver is given.
#
# SelectFromModel then keeps, in theory, the features whose coefficients
# are non-zero.

sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
sel_.fit(scaler.transform(x_train.fillna(0)), y_train)

SelectFromModel(estimator=LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
        norm_order=1, prefit=False, threshold=None)

In [34]:
# this command lets me visualise those features that were kept:
# it returns a boolean mask with one entry per input feature
# (True = kept, False = discarded)
sel_.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False,  True,  True,  True,
       False,  True,  True,  True, False,  True,  True,  True, False,
        True,  True,  True, False, False, False, False,  True, False,
        True,  True,  True,  True,  True, False,  True, False, False,
        True,  True,  True,  True, False,  True,  True,  True,  True,
       False, False, False, False,  True, False, False, False,  True,
        True,  True,  True,  True,  True, False, False,  True, False,
       False, False, False,  True, False,  True, False,  True,  True,
        True, False,  True,  True,  True, False, False, False, False,
       False,  True, False, False, False, False,  True,  True, False,
       False, False,  True,  True, False,  True, False, False,  True,
       False, False,  True,  True, False, False, False, False, False,
       False,  True,  True,  True, False, False, False,  True,  True,
       False, False,

In [35]:
# Names of the features retained by the selector
selected_feat = x_train.columns[sel_.get_support()]

# Summary: how many features went in, how many survived, and how many
# had their coefficient driven to exactly zero by the l1 penalty
print('total features: {}'.format(len(x_train.columns)))
print('selected features: {}'.format(selected_feat.size))
print('features with coefficients shrank to zero: {}'.format(
    (sel_.estimator_.coef_ == 0).sum()))

total features: 168
selected features: 91
features with coefficients shrank to zero: 77


In [36]:
# The removed features are the ones whose coefficient is exactly zero.
# np.ravel flattens the comparison result into a 1-D boolean mask
# (same elements, same dtype) so it can index the column names directly.
removed_feats = x_train.columns[np.ravel(sel_.estimator_.coef_ == 0)]
removed_feats

Index(['ind_var2_0', 'ind_var2', 'ind_var6', 'ind_var13_medio_0',
       'ind_var18_0', 'ind_var27_0', 'ind_var28_0', 'ind_var28', 'ind_var27',
       'ind_var31_0', 'ind_var41', 'ind_var46_0', 'ind_var46',
       'num_op_var40_hace3', 'num_var27_0', 'num_var28_0', 'num_var28',
       'num_var27', 'num_var41', 'num_var46_0', 'num_var46', 'saldo_var17',
       'saldo_var20', 'saldo_var28', 'saldo_var27', 'saldo_var30',
       'saldo_var33', 'saldo_var41', 'saldo_var46',
       'delta_imp_aport_var33_1y3', 'delta_imp_reemb_var33_1y3',
       'delta_imp_trasp_var17_in_1y3', 'delta_imp_trasp_var17_out_1y3',
       'delta_imp_trasp_var33_in_1y3', 'delta_imp_trasp_var33_out_1y3',
       'delta_num_reemb_var33_1y3', 'delta_num_trasp_var33_out_1y3',
       'imp_amort_var18_hace3', 'imp_amort_var34_hace3',
       'imp_aport_var17_ult1', 'imp_aport_var33_hace3', 'imp_aport_var33_ult1',
       'imp_compra_var44_hace3', 'imp_reemb_var13_hace3',
       'imp_reemb_var17_hace3', 'imp_reemb_var33_hace

In [37]:
# Apply the fitted selector to both splits so that only the retained
# features remain (missing values are replaced by 0 first).
x_train_selected, x_test_selected = (
    sel_.transform(split.fillna(0)) for split in (x_train, x_test)
)

x_train_selected.shape, x_test_selected.shape

((53214, 91), (22806, 91))

### Select features by random-forest-derived importance

In [40]:
# select features using the importance derived from
# random forests

# random_state pins the forest so the selected feature set is reproducible;
# missing values are replaced by 0, as in the Lasso selection earlier in
# this notebook
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=400, random_state=123))
sel_.fit(x_train.fillna(0), y_train)

# names of the features retained by the selector
# NOTE: a random forest exposes importances, not coefficients; with no
# explicit threshold SelectFromModel is documented to use the mean
# importance as the cut-off -- confirm against the installed version
rf_features = x_train.columns[sel_.get_support()]

# keep only the retained features and wrap the result as a DataFrame again
# (the output of sklearn is a numpy array); the original row index is
# preserved so the frames stay aligned with y_train / y_test
x_train_rf = pd.DataFrame(sel_.transform(x_train.fillna(0)),
                          columns=rf_features, index=x_train.index)
x_test_rf = pd.DataFrame(sel_.transform(x_test.fillna(0)),
                         columns=rf_features, index=x_test.index)

In [None]:
# shapes of the two random-forest-selected frames
x_train_rf.shape, x_test_rf.shape