In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from feature_engine.selection import DropConstantFeatures, DropDuplicateFeatures

In [41]:
# Load the dataset from a CSV file resolved relative to the working directory.
data = pd.read_csv("dataset_1.csv")

In [42]:
# Sanity check of the loaded frame's dimensions as (rows, columns).
data.shape

(50000, 301)

In [43]:
# Feature columns follow the naming scheme var_1 ... var_300.
feature_cols = list(map("var_{}".format, range(1, 301)))

In [44]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data[feature_cols],
    data["target"],
    test_size=0.2,
    random_state=42,
)

In [45]:
# tol=1: only features where a single value covers every training row are flagged.
# The selector is fit on the training split only.
sel = DropConstantFeatures(
    tol=1,
    variables=None,
    missing_values="raise",
)
sel.fit(X_train)

DropConstantFeatures(variables=['var_1', 'var_2', 'var_3', 'var_4', 'var_5',
                                'var_6', 'var_7', 'var_8', 'var_9', 'var_10',
                                'var_11', 'var_12', 'var_13', 'var_14',
                                'var_15', 'var_16', 'var_17', 'var_18',
                                'var_19', 'var_20', 'var_21', 'var_22',
                                'var_23', 'var_24', 'var_25', 'var_26',
                                'var_27', 'var_28', 'var_29', 'var_30', ...])

In [46]:
# Columns the fitted selector flagged as constant; transform() removes these.
sel.features_to_drop_

['var_14',
 'var_23',
 'var_33',
 'var_34',
 'var_44',
 'var_61',
 'var_66',
 'var_67',
 'var_69',
 'var_73',
 'var_80',
 'var_81',
 'var_87',
 'var_89',
 'var_92',
 'var_97',
 'var_99',
 'var_112',
 'var_113',
 'var_120',
 'var_122',
 'var_127',
 'var_129',
 'var_135',
 'var_158',
 'var_167',
 'var_171',
 'var_178',
 'var_180',
 'var_182',
 'var_183',
 'var_195',
 'var_196',
 'var_201',
 'var_212',
 'var_215',
 'var_225',
 'var_227',
 'var_248',
 'var_287',
 'var_294',
 'var_297']

In [47]:
# Report how many constant columns were flagged.
n_constant = len(sel.features_to_drop_)
print("features to remove: {}".format(n_constant))

features to remove: 42


In [48]:
# Spot-check the first flagged column: it should hold a single distinct value.
first_constant = sel.features_to_drop_[0]
X_train[first_constant].unique()

array([0])

In [49]:
# Apply the fitted selector to both splits (this overwrites X_train / X_test).
X_train, X_test = sel.transform(X_train), sel.transform(X_test)

# Both splits must end up with the same number of columns.
X_train.shape, X_test.shape

((40000, 258), (10000, 258))

In [50]:
# Rebuild the split from the full feature set: the previous transform overwrote
# X_train / X_test, and the next selector should start from all 300 columns again.
# Same test_size and random_state as before, so the rows in each split are unchanged.
X_train, X_test, y_train, y_test = train_test_split(data[feature_cols], data["target"], test_size=0.2, random_state=42)

In [51]:
%%time
# tol: percentage of observations that contain only one value in order to consider this feature quasi-constant
sel = DropConstantFeatures(tol=0.998, variables=None, missing_values="raise")
sel.fit(X_train)

CPU times: user 525 ms, sys: 28.1 ms, total: 554 ms
Wall time: 554 ms


DropConstantFeatures(tol=0.998,
                     variables=['var_1', 'var_2', 'var_3', 'var_4', 'var_5',
                                'var_6', 'var_7', 'var_8', 'var_9', 'var_10',
                                'var_11', 'var_12', 'var_13', 'var_14',
                                'var_15', 'var_16', 'var_17', 'var_18',
                                'var_19', 'var_20', 'var_21', 'var_22',
                                'var_23', 'var_24', 'var_25', 'var_26',
                                'var_27', 'var_28', 'var_29', 'var_30', ...])

In [52]:
# Report how many constant and quasi-constant columns were flagged.
n_quasi_constant = len(sel.features_to_drop_)
print("features to remove: {}".format(n_quasi_constant))

features to remove: 139


The number of features flagged is the same with both methods (pandas and feature_engine).

In [53]:
# Share of training rows taken by each distinct value of the first flagged column.
first_quasi_constant = sel.features_to_drop_[0]
X_train[first_quasi_constant].value_counts() / len(X_train)

0    0.99945
3    0.00030
6    0.00020
9    0.00005
Name: var_1, dtype: float64

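Here we can see that 99.945% of the training observations have a value of 0 for `var_1`, which is above the 0.998 threshold, so the feature is treated as quasi-constant.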

In [54]:
# Drop the flagged columns from both splits; the following cells keep working
# on these reduced frames.
X_train, X_test = (sel.transform(split) for split in (X_train, X_test))

X_train.shape, X_test.shape

((40000, 161), (10000, 161))

In [55]:
%%time
sel_dup = DropDuplicateFeatures(variables=None, missing_values="raise")
sel_dup.fit(X_train)

CPU times: user 943 ms, sys: 20.8 ms, total: 963 ms
Wall time: 969 ms


DropDuplicateFeatures(missing_values='raise',
                      variables=['var_4', 'var_5', 'var_8', 'var_13', 'var_15',
                                 'var_17', 'var_18', 'var_19', 'var_21',
                                 'var_22', 'var_25', 'var_26', 'var_27',
                                 'var_29', 'var_30', 'var_31', 'var_35',
                                 'var_37', 'var_38', 'var_41', 'var_46',
                                 'var_47', 'var_49', 'var_50', 'var_51',
                                 'var_52', 'var_54', 'var_55', 'var_56',
                                 'var_57', ...])

In [56]:
# Number of columns flagged as duplicates of another column.
len(sel_dup.features_to_drop_)

6

In [57]:
# Groups of duplicated columns found during fit, one set per group.
sel_dup.duplicated_feature_sets_

[{'var_148', 'var_37'},
 {'var_199', 'var_84'},
 {'var_143', 'var_296'},
 {'var_177', 'var_250'},
 {'var_226', 'var_232'},
 {'var_229', 'var_269'}]

In [58]:
# Remove the duplicated columns from both splits (overwrites X_train / X_test).
X_train, X_test = sel_dup.transform(X_train), sel_dup.transform(X_test)

X_train.shape, X_test.shape

((40000, 155), (10000, 155))

## Combining duplicated and quasi-constant feature elimination
## Sklearn Pipeline

In [59]:
from sklearn.pipeline import Pipeline

In [62]:
# Reset X_train / X_test to the untouched 300 feature columns so the pipeline is
# fit on the same input the step-by-step run started from (earlier cells
# overwrote both splits with their reduced versions).
X_train, X_test, y_train, y_test = train_test_split(data[feature_cols], data["target"], test_size=0.2, random_state=42)

In [64]:
%%time
pipe = Pipeline([
    ("constant", DropConstantFeatures(tol=0.998)),
    ("duplicated", DropDuplicateFeatures())
])
pipe.fit(X_train)

CPU times: user 1.29 s, sys: 60.3 ms, total: 1.35 s
Wall time: 1.35 s


Pipeline(steps=[('constant',
                 DropConstantFeatures(tol=0.998,
                                      variables=['var_1', 'var_2', 'var_3',
                                                 'var_4', 'var_5', 'var_6',
                                                 'var_7', 'var_8', 'var_9',
                                                 'var_10', 'var_11', 'var_12',
                                                 'var_13', 'var_14', 'var_15',
                                                 'var_16', 'var_17', 'var_18',
                                                 'var_19', 'var_20', 'var_21',
                                                 'var_22', 'var_23', 'var_24',
                                                 'var_25', 'var_26', 'var_27',
                                                 'var_28', 'var_29', 'var_30', ...])),
                ('duplicated',
                 DropDuplicateFeatures(variables=['var_4', 'var_5', 'var_8',
                         

In [65]:
# Run both fitted steps over each split in a single call per split.
X_train, X_test = (pipe.transform(split) for split in (X_train, X_test))

X_train.shape, X_test.shape

((40000, 155), (10000, 155))

In [70]:
# Inspect the fitted "constant" step: how many columns it flagged for removal.
len(pipe.named_steps["constant"].features_to_drop_)

139

In [71]:
# Inspect the fitted "duplicated" step: which columns it flagged for removal.
pipe.named_steps["duplicated"].features_to_drop_

{'var_148', 'var_199', 'var_232', 'var_250', 'var_269', 'var_296'}