### Feature selection in the presence of constant features

In [1]:
#!pip3 install feature-engine
import pandas as pd

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold
from feature_engine.selection import DropConstantFeatures

In [12]:
# create a toy dataset with redundant and constant features

X,y = make_classification(
        n_samples = 1000,
        n_features = 10,
        n_classes = 2,
        random_state = 42
)

X = pd.DataFrame(X)
y = pd.Series(y)

# Add constant feature
X[[0,5,9]] = 1

X.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,-0.066449,0.986768,-0.358079,0.997266,1,-1.615679,-1.210161,-0.628077,1
1,1,-0.566395,-1.008614,0.831617,-1.176962,1,1.752375,-0.984534,0.363896,1
2,1,-0.432774,-0.457649,0.793818,-0.268646,1,1.239086,-0.246383,-1.058145,1
3,1,2.023606,1.688159,0.0068,-1.607661,1,-2.619427,-0.357445,-1.473127,1
4,1,-0.711303,-0.220778,0.117124,1.536061,1,0.348645,-0.939156,0.175915,1


In [13]:
# split dataset into train and test

X_train,X_test,y_train,y_test = train_test_split(
        X, y, test_size = 0.3, random_state = 42
)

X_train.shape, X_test.shape

((700, 10), (300, 10))

### VarianceThreshold from Scikit-learn

Only works with numerical variables. Categorical variables need to be encoded first.

In [15]:
# To remove constant feature

selector = VarianceThreshold(threshold = 0)

# fit finds the features with zero variance
selector.fit(X_train)

In [17]:
# get_support is a boolean vector which flags the features to keep

# Number of selected features(non-constant)
selector.get_support(), sum(selector.get_support())

(array([False,  True,  True,  True,  True, False,  True,  True,  True,
        False]),
 7)

In [18]:
# the constant features

constant = X_train.columns[~selector.get_support()]
constant

Int64Index([0, 5, 9], dtype='int64')

In [20]:
# drop constant features

X_train_t = selector.transform(X_train)
X_test_t = selector.transform(X_test)

X_train_t.shape, X_test_t.shape

((700, 7), (300, 7))

In [21]:
# Sklearn returns numpy arrays. Convert to dataframe

X_train_t = pd.DataFrame(X_train_t, columns = selector.get_feature_names_out())
X_test_t = pd.DataFrame(X_test_t, columns = selector.get_feature_names_out())

# show result
X_train_t.head()

Unnamed: 0,x1,x2,x3,x4,x6,x7,x8
0,0.913772,-0.149724,-1.54573,-1.092164,0.521719,0.574071,-0.666738
1,-0.745942,0.603095,-0.661541,-1.239562,-1.237353,-0.614539,0.30463
2,0.860183,0.663881,0.81026,-0.458877,-0.812271,0.725081,-1.17953
3,0.193449,-1.019683,1.309865,0.023081,1.375817,-0.184551,1.458419
4,-0.320668,0.620582,1.795211,0.534506,-0.821929,0.497743,-0.93002


### Drop Constant Features using Feature-Engine

Works with numerical and categorical variables
- https://feature-engine.readthedocs.io/en/latest/index.html

In [22]:
# To remove constant features

selector = DropConstantFeatures(tol = 1)

# fit finds the features with only 1 value
selector.fit(X_train)

In [23]:
# the constant features
selector.features_to_drop_


[0, 5, 9]

In [24]:
# drop constant features

X_train_t = selector.transform(X_train)
X_test_t = selector.transform(X_test)

X_train_t.shape, X_test_t.shape

((700, 7), (300, 7))

In [25]:
# the result is already a dataframe
X_train_t.head()

Unnamed: 0,1,2,3,4,6,7,8
541,0.913772,-0.149724,-1.54573,-1.092164,0.521719,0.574071,-0.666738
440,-0.745942,0.603095,-0.661541,-1.239562,-1.237353,-0.614539,0.30463
482,0.860183,0.663881,0.81026,-0.458877,-0.812271,0.725081,-1.17953
422,0.193449,-1.019683,1.309865,0.023081,1.375817,-0.184551,1.458419
778,-0.320668,0.620582,1.795211,0.534506,-0.821929,0.497743,-0.93002


### Pandas.std()

Using the variable standard deviation. Only works with numerical variables

In [26]:
# find constant features

constant_features = [
    
    col for col in X_train.columns if X_train[col].std() == 0
]

len(constant_features)

3

In [27]:
# drop these columns from the train and test sets

X_train_t = X_train.drop(labels = constant_features, axis = 1)
X_test_t =  X_test.drop(labels = constant_features, axis = 1)

X_train_t.shape, X_test_t.shape

((700, 7), (300, 7))

### Pandas.nunique()

Using the number of unique values. Works with numerical and categorical variables


In [30]:
# the nuinque() method from pandas returns the number of different values in a variable

constant_features = [
    col for col in X_train.columns if X_train[col].nunique() == 1
]

len(constant_features)

3

In [31]:
# drop these columns from the train and test sets

X_train_t = X_train.drop(labels = constant_features, axis = 1)
X_test_t =  X_test.drop(labels = constant_features, axis = 1)

X_train_t.shape, X_test_t.shape

((700, 7), (300, 7))