### Quasi-constant features

Quasi-constant features are those that show the same value for the large majority of the observations.

In [3]:
import pandas as pd
import random

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold
from feature_engine.selection import DropConstantFeatures

In [6]:
# build a synthetic binary-classification dataset and then overwrite three
# of its columns so that they become quasi-constant

X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=42,
)
X, y = pd.DataFrame(X), pd.Series(y)

# seed the stdlib generator so the same 990 rows are picked on every run
random.seed(10)
rows_to_overwrite = random.sample(range(1000), 990)

# columns 5, 7 and 9 now hold the value 1 in 990 of the 1000 rows
X.iloc[rows_to_overwrite, [5, 7, 9]] = 1

X.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.964799,-0.066449,0.986768,-0.358079,0.997266,1.0,-1.615679,1.0,-0.628077,1.0
1,-0.916511,-0.566395,-1.008614,0.831617,-1.176962,1.0,1.752375,1.0,0.363896,1.0
2,-0.109484,-0.432774,-0.457649,0.793818,-0.268646,1.0,1.239086,1.0,-1.058145,1.0
3,1.750412,2.023606,1.688159,0.0068,-1.607661,1.0,-2.619427,1.0,-1.473127,1.0
4,-0.224726,-0.711303,-0.220778,0.117124,1.536061,1.0,0.348645,1.0,0.175915,1.0


In [7]:
# example of a quasi-constant feature: column 5 was overwritten with the
# value 1 in 990 of the 1000 rows, so that single value dominates the counts

X[5].value_counts()

 1.000000    990
 0.885231      1
 0.295512      1
-0.712221      1
-0.660834      1
-0.948348      1
 0.739675      1
-1.257868      1
-1.147691      1
-1.499181      1
-0.872989      1
Name: 5, dtype: int64

In [8]:
# hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# (rows, columns) of each partition
X_train.shape, X_test.shape

((700, 10), (300, 10))

### VarianceThreshold from Scikit-learn
- Only works with numerical variables. Categorical variables need to be encoded first.

In [9]:
# To remove quasi-constant features: a non-zero threshold drops every
# low-variance feature, not only the strictly constant ones

selector = VarianceThreshold(threshold = 0.2)

# fit computes the variance of each feature on the train set and flags
# the features whose variance falls below the threshold
selector.fit(X_train)

In [10]:
# get_support returns a boolean vector with True for every feature that is kept

# number of features retained (i.e. not flagged as quasi-constant)
selector.get_support().sum()

7

In [11]:
# the quasi-constant features: the columns the fitted selector does NOT keep

keep_mask = selector.get_support()
constant = X_train.columns[~keep_mask]
constant

Int64Index([5, 7, 9], dtype='int64')

In [12]:
# drop the flagged quasi-constant features from both partitions

X_train_t, X_test_t = (
    selector.transform(partition) for partition in (X_train, X_test)
)

# remaining (rows, columns) of each partition
X_train_t.shape, X_test_t.shape

((700, 7), (300, 7))

In [13]:
# scikit-learn transformers return numpy arrays, so wrap them in dataframes again

# names of the retained features as reported by the selector, computed once
kept_names = selector.get_feature_names_out()

X_train_t = pd.DataFrame(X_train_t, columns=kept_names)
X_test_t = pd.DataFrame(X_test_t, columns=kept_names)

# show result
X_train_t.head()

Unnamed: 0,x0,x1,x2,x3,x4,x6,x8
0,0.044467,0.913772,-0.149724,-1.54573,-1.092164,0.521719,-0.666738
1,0.417228,-0.745942,0.603095,-0.661541,-1.239562,-1.237353,0.30463
2,0.838689,0.860183,0.663881,0.81026,-0.458877,-0.812271,-1.17953
3,-1.1997,0.193449,-1.019683,1.309865,0.023081,1.375817,1.458419
4,0.740765,-0.320668,0.620582,1.795211,0.534506,-0.821929,-0.93002


In [14]:
# To remove quasi-constant features: tol is the share of observations that
# must show the same value for a feature to be considered quasi-constant.
# NOTE: this rebinds `selector`, replacing the VarianceThreshold fitted earlier.

selector = DropConstantFeatures(tol = 0.95)

# fit finds the features that are dominated by a single value according to tol
selector.fit(X_train)

In [15]:
# the quasi-constant features that the fitted selector will drop
selector.features_to_drop_

[5, 7, 9]

In [16]:
# drop the quasi-constant features from both partitions

X_train_t, X_test_t = map(selector.transform, (X_train, X_test))

# remaining (rows, columns) of each partition
X_train_t.shape, X_test_t.shape

((700, 7), (300, 7))

In [17]:
# the result is already a dataframe, so no conversion step is needed here
X_train_t.head()

Unnamed: 0,0,1,2,3,4,6,8
541,0.044467,0.913772,-0.149724,-1.54573,-1.092164,0.521719,-0.666738
440,0.417228,-0.745942,0.603095,-0.661541,-1.239562,-1.237353,0.30463
482,0.838689,0.860183,0.663881,0.81026,-0.458877,-0.812271,-1.17953
422,-1.1997,0.193449,-1.019683,1.309865,0.023081,1.375817,1.458419
778,0.740765,-0.320668,0.620582,1.795211,0.534506,-0.821929,-0.93002


### Implementation using pandas

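The logic of `DropConstantFeatures` can also be implemented directly with pandas: for each feature, compute the fraction of observations that share its most frequent value, and flag the feature as quasi-constant when that fraction exceeds the chosen tolerance.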

In [25]:
# share of observations that a feature's most frequent value must exceed
# for the feature to be flagged as quasi-constant
tolerance = 0.95

# labels of the features flagged as quasi-constant
quasi_constant = []

# iterate over every feature
for feature in X_train.columns:
    # fraction of observations that share the most frequent value.
    # value_counts(normalize=True) returns relative frequencies, so the
    # maximum is the predominant share; no explicit sort is required.
    # For a column without any non-missing value the result is NaN, which
    # fails the comparison below instead of raising an IndexError.
    predominant = X_train[feature].value_counts(normalize=True).max()

    # flag the feature when a single value dominates it
    if predominant > tolerance:
        quasi_constant.append(feature)

len(quasi_constant)

3

In [27]:
# display the column labels of the quasi-constant features
quasi_constant

[5, 7, 9]