# Constant features 

* A constant feature shows only one value across all observations
* It has no discriminative power
* Removing constant features is the first step in any feature selection

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Load the dataset from a CSV file at a path relative to the working directory
df = pd.read_csv('../datasets/dataset_1.csv')

In [3]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,...,var_292,var_293,var_294,var_295,var_296,var_297,var_298,var_299,var_300,target
0,0,0,0.0,0.0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
1,0,0,0.0,3.0,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
2,0,0,0.0,5.88,0.0,0,0,0,0,0,...,0.0,0,0,3,0,0,0,0.0,67772.7216,0
3,0,0,0.0,14.1,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0
4,0,0,0.0,5.76,0.0,0,0,0,0,0,...,0.0,0,0,0,0,0,0,0.0,0.0,0


In [4]:
# Separate the 'target' column from the features and hold out 30% of the rows
# for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.3,
    random_state=0,
)

# Method 1: VarianceThreshold

* The fitted selector stores which features are constant
* Cons: only works for numerical variables

In [5]:
# threshold=0 targets features with zero variance, i.e. a single value in every row
sel = VarianceThreshold(threshold=0)
# Fit on the training split only, so the test rows do not influence the selection
sel.fit(X_train)

VarianceThreshold(threshold=0)

In [6]:
# get_support() returns a boolean mask that is True for every retained feature,
# so inverting it selects the features the selector will remove.
retained_mask = sel.get_support()
constant_features = X_train.columns[~retained_mask]

constant_features

Index(['var_23', 'var_33', 'var_44', 'var_61', 'var_80', 'var_81', 'var_87',
       'var_89', 'var_92', 'var_97', 'var_99', 'var_112', 'var_113', 'var_120',
       'var_122', 'var_127', 'var_135', 'var_158', 'var_167', 'var_170',
       'var_171', 'var_178', 'var_180', 'var_182', 'var_195', 'var_196',
       'var_201', 'var_212', 'var_215', 'var_225', 'var_227', 'var_248',
       'var_294', 'var_297'],
      dtype='object')

In [7]:
# Apply the fitted selector to the training features
# NOTE(review): scikit-learn transformers return a NumPy array by default, so
# column names are presumably lost here — confirm before relying on them
X_train_filter = sel.transform(X_train)

In [8]:
# Compare the feature count before and after filtering
X_train.shape, X_train_filter.shape

((35000, 300), (35000, 266))

# Method 2: std()

* Quick 
* Does not store the constant features
* Cons: only works for numerical variables

In [9]:
# Collect the columns whose standard deviation on the training split is exactly 0.
# NOTE(review): an exact == 0 check on a float std may miss a constant column
# if rounding leaves a tiny non-zero residue — confirm on float-valued data.
constant_features = []
for col in X_train.columns:
    if X_train[col].std() == 0:
        constant_features.append(col)

In [10]:
# Number of constant features found with std()
len(constant_features)

34

In [11]:
# Drop the constant columns; drop() returns a new frame and leaves X_train intact
X_train_filter2 = X_train.drop(columns=constant_features)

In [12]:
# Shape after dropping the constant columns found with std()
X_train_filter2.shape

(35000, 266)

# Method 3: nunique()

* Quick 
* Numerical & categorical variables 
* Cons: Does not store the constant features

In [13]:
# Count the distinct values of every column, then keep the names of the columns
# that hold exactly one. nunique() skips NaN by default, so missing entries do
# not count as an extra value.
unique_counts = X_train.nunique()
constant_features = unique_counts[unique_counts == 1].index.tolist()

In [14]:
# Number of constant features found with nunique()
len(constant_features)

34

In [15]:
# Drop the constant columns; drop() returns a new frame and leaves X_train intact
X_train_filter3 = X_train.drop(columns=constant_features)

In [16]:
# Shape after dropping the constant columns found with nunique()
X_train_filter3.shape

(35000, 266)