# Constant Features 

Constant features are those that show the same value, just one value, for all the observations of the dataset. In other words, the same value for all the rows of the dataset. These features provide no information that allows a machine learning model to discriminate or predict a target.

Identifying and removing constant features is an easy first step towards feature selection and more easily interpretable machine learning models.


To identify constant features, we can use the VarianceThreshold from Scikit-learn, or we can code it ourselves. If using the VarianceThreshold, all our variables need to be numerical. If we do it manually however, we can apply the code to both numerical and categorical variables.




In [58]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold

In [59]:
# Here I will take 2 different datasets 

In [60]:
df = pd.read_csv('../data/dataset_1.csv')

In [61]:
df.head(2)

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,...,var_287,var_288,var_289,var_290,var_291,var_292,var_293,var_294,var_295,var_296,var_297,var_298,var_299,var_300,target
0,0,0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,...,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0.0,0
1,0,0,0.0,3.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0,3,...,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0.0,0


In [62]:
df.tail(2)

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,...,var_287,var_288,var_289,var_290,var_291,var_292,var_293,var_294,var_295,var_296,var_297,var_298,var_299,var_300,target
49998,0,0,0.0,2.76,0.0,0,0,0,0,0,0.0,0.0,0.0,0,0,...,0,0.0,0,0.0,0,0.0,0,0,0,0,0,0,0.0,0.0,0
49999,0,0,0.0,5.64,0.0,0,0,0,0,0,0.0,0.0,0.0,0,3,...,0,0.0,0,0.0,0,2.85,0,0,0,0,0,0,0.0,0.0,0


In [None]:
df.shape
# We have 50K observations & 301 columns (300 features plus the target)
# We need to do feature selection in order to work with important features only

In [95]:
df.dtypes.value_counts()

int64      174
float64    127
dtype: int64

In [96]:
#df.info(verbose = True)

 It's always good practice to split the dataset into train & test sets and to do feature selection using only the training set.<br> 
 This way we can avoid overfitting

In [64]:
# Hold out 30% of the rows as the test set; random_state pins the shuffle
# so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['target']),  # predictors: every column except the target
    df['target'],                 # response
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

# Using VarianceThreshold from Scikit-learn

 The VarianceThreshold from sklearn provides a simple baseline approach to feature selection.<br> 
 It removes all features whose variance doesn’t meet a certain threshold. <br>
 By default, it removes all zero-variance features, i.e., features that have the same value in all samples.

In [65]:
sel = VarianceThreshold(threshold=0)
sel.fit(X_train) # Finds the features whose variance is zero on the training set, i.e. the constant ones

VarianceThreshold(threshold=0)

In [66]:
# get_support() returns a boolean mask with True for every retained feature,
# so summing the mask counts the features that are not constant

sel.get_support().sum()



266

In [67]:
# Getting true for those columns that are not constant
sel.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False, False,
        True,  True,  True,  True,  True, False,  True, False,  True,
        True, False,  True,  True,  True,  True, False,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False, False,  True,  True,  True,  True,
        True,  True, False,  True, False,  True,  True,  True,  True,
       False,  True,

In [68]:
# Number of constant features

len(X_train.columns) - sum(sel.get_support())

34

In [69]:
X_train.shape[1] - sum(sel.get_support())

34

In [70]:
# Tilde (~) inverts the boolean mask, so True now marks the constant features
~sel.get_support()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False,  True, False,  True, False,
       False,  True, False, False, False, False,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True,  True, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
        True, False,

In [71]:
# Getting only the constant features
constant = X_train.columns[~sel.get_support()]
len(constant)

34

In [72]:
# We can look at the name of the features that are constant
constant

Index(['var_23', 'var_33', 'var_44', 'var_61', 'var_80', 'var_81', 'var_87',
       'var_89', 'var_92', 'var_97', 'var_99', 'var_112', 'var_113', 'var_120',
       'var_122', 'var_127', 'var_135', 'var_158', 'var_167', 'var_170',
       'var_171', 'var_178', 'var_180', 'var_182', 'var_195', 'var_196',
       'var_201', 'var_212', 'var_215', 'var_225', 'var_227', 'var_248',
       'var_294', 'var_297'],
      dtype='object')

In [73]:
# Looking at the values of each of these constants

In [74]:
X_train['var_23'].unique()

array([0], dtype=int64)

In [75]:
for col in constant:
    print(col, X_train[col].unique())
# All have value 0

var_23 [0]
var_33 [0]
var_44 [0]
var_61 [0]
var_80 [0]
var_81 [0]
var_87 [0]
var_89 [0.]
var_92 [0]
var_97 [0]
var_99 [0]
var_112 [0]
var_113 [0]
var_120 [0]
var_122 [0]
var_127 [0]
var_135 [0]
var_158 [0]
var_167 [0]
var_170 [0]
var_171 [0]
var_178 [0.]
var_180 [0.]
var_182 [0]
var_195 [0]
var_196 [0]
var_201 [0]
var_212 [0]
var_215 [0]
var_225 [0]
var_227 [0.]
var_248 [0]
var_294 [0]
var_297 [0]


In [76]:
# We will use the transform method of the VarianceThreshold to reduce the training & testing sets to their non-constant features

In [77]:
# Getting the non-constant feature names 
feat_names = X_train.columns[sel.get_support()]

In [78]:
# The same fitted selector is applied to both sets, so train and test
# end up with the same retained columns
X_train = sel.transform(X_train)
X_test = sel.transform(X_test)

X_train.shape, X_test.shape

((35000, 266), (15000, 266))

In [79]:
# The transformed X_train & X_test are numpy arrays, so the column names are gone
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [80]:
# Converting both sets back into dataframes, restoring the column names
# that the numpy arrays do not carry. X_test is converted as well so that
# train and test keep the same type and column labels.
# NOTE(review): the new frames get a fresh 0..n-1 index, so presumably they
# no longer share row labels with y_train / y_test — confirm before
# aligning anything on the index.
X_train = pd.DataFrame(X_train, columns=feat_names)
X_test = pd.DataFrame(X_test, columns=feat_names)

In [81]:
X_train.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,...,var_284,var_285,var_286,var_287,var_288,var_289,var_290,var_291,var_292,var_293,var_295,var_296,var_298,var_299,var_300
0,0.0,0.0,0.0,2.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,2.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,2.79,85435.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,5.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Without Using VarianceThreshold package to remove constants from the datasets
#### The standard-deviation check used here only works with numerical data

In [113]:
# Re-create the train & test split from the original dataframe, because the
# X_train / X_test above have already been transformed

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['target']),  # every column except the target
    df['target'],                 # the target
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((35000, 300), (15000, 300))

In [114]:
# All the features here are numeric, so we keep the names of the features
# for which the standard deviation is zero

# X_train.std()==0
# X_train['var_1'].std(), X_train['var_297'].std()

constant_feat = [name for name, values in X_train.items() if values.std() == 0]
len(constant_feat)

34

In [115]:
# Looking at the name of the constant features
constant_feat

['var_23',
 'var_33',
 'var_44',
 'var_61',
 'var_80',
 'var_81',
 'var_87',
 'var_89',
 'var_92',
 'var_97',
 'var_99',
 'var_112',
 'var_113',
 'var_120',
 'var_122',
 'var_127',
 'var_135',
 'var_158',
 'var_167',
 'var_170',
 'var_171',
 'var_178',
 'var_180',
 'var_182',
 'var_195',
 'var_196',
 'var_201',
 'var_212',
 'var_215',
 'var_225',
 'var_227',
 'var_248',
 'var_294',
 'var_297']

In [116]:
# Remove the constant features from both sets, showing the shapes before
# and after the drop
print(X_train.shape, X_test.shape)
print('Dropping Constant Features')
X_train = X_train.drop(columns=constant_feat)
X_test = X_test.drop(columns=constant_feat)

X_train.shape, X_test.shape

(35000, 300) (15000, 300)
Dropping Constant Features


((35000, 266), (15000, 266))