# Quasi-constant features 

* A quasi-constant feature has a single value shared by the great majority (roughly 95-99%) of the observations.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Read the whole dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='../datasets/dataset_1.csv')

In [3]:
# Split into train/test: every column except 'target' is a feature,
# 30% of the rows are held out, and the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.3,
    random_state=0,
)

# Method 1: VarianceThreshold

* the fitted selector stores which features are quasi-constant (retrievable via `get_support()`)
* cons: works only with numerical variables

In [4]:
# Build a variance-based selector (threshold 0.01) and fit it on the
# training set only; the fit call is left as the last expression so the
# cell displays the fitted estimator.
sel = VarianceThreshold(threshold=0.01)
sel.fit(X=X_train)

VarianceThreshold(threshold=0.01)

In [5]:
# get_support() marks the columns the selector keeps; negating that mask
# gives the columns that will be removed.
quasi_constant = X_train.columns[np.logical_not(sel.get_support())]
quasi_constant

Index(['var_1', 'var_2', 'var_7', 'var_9', 'var_10', 'var_19', 'var_23',
       'var_28', 'var_33', 'var_36', 'var_43', 'var_44', 'var_45', 'var_53',
       'var_56', 'var_59', 'var_61', 'var_66', 'var_67', 'var_69', 'var_71',
       'var_80', 'var_81', 'var_87', 'var_89', 'var_92', 'var_97', 'var_99',
       'var_104', 'var_106', 'var_112', 'var_113', 'var_116', 'var_120',
       'var_122', 'var_127', 'var_133', 'var_135', 'var_137', 'var_141',
       'var_146', 'var_158', 'var_167', 'var_170', 'var_171', 'var_177',
       'var_178', 'var_180', 'var_182', 'var_187', 'var_189', 'var_194',
       'var_195', 'var_196', 'var_197', 'var_198', 'var_201', 'var_202',
       'var_212', 'var_215', 'var_218', 'var_219', 'var_223', 'var_225',
       'var_227', 'var_233', 'var_234', 'var_235', 'var_245', 'var_247',
       'var_248', 'var_249', 'var_250', 'var_251', 'var_256', 'var_260',
       'var_267', 'var_274', 'var_282', 'var_285', 'var_287', 'var_289',
       'var_294', 'var_297', 'var_298']

In [6]:
# Fraction of training rows taken by each distinct value of the first
# feature flagged for removal.
X_train[quasi_constant[0]].value_counts().div(len(X_train))

0    0.999629
3    0.000200
6    0.000143
9    0.000029
Name: var_1, dtype: float64

In [7]:
# Apply the fitted selector to drop the flagged columns from the training set.
# NOTE(review): presumably this returns a NumPy array rather than a DataFrame
# (scikit-learn's default output), so column names are lost — confirm.
X_train_filter = sel.transform(X=X_train)

In [8]:
# (rows, columns) before and after filtering
(X_train.shape, X_train_filter.shape)

((35000, 300), (35000, 215))

# Method 2: value_counts()

* quick 
* works for both numerical and categorical variables
* cons: no fitted selector object is produced; the quasi-constant features are only collected in a plain Python list

In [9]:
# A feature is treated as quasi-constant when its most frequent value covers
# more than this share of the training rows.
QUASI_CONSTANT_THRESHOLD = 0.998

quasi_constant_feat = []

for feature in X_train.columns:
    # Share of training rows taken by the most frequent value.
    # value_counts() ignores missing values, so for a column that is entirely
    # missing .max() yields NaN (instead of the IndexError that .values[0]
    # would raise); the comparison below is then False and the column is
    # simply not flagged.
    predominant = X_train[feature].value_counts().max() / len(X_train)

    if predominant > QUASI_CONSTANT_THRESHOLD:
        quasi_constant_feat.append(feature)

# Drop every flagged feature; X_train itself is left untouched.
X_train_filter = X_train.drop(labels=quasi_constant_feat, axis=1)

In [10]:
# Show the flagged feature names as a compact NumPy array.
np.asarray(quasi_constant_feat)

array(['var_1', 'var_2', 'var_3', 'var_6', 'var_7', 'var_9', 'var_10',
       'var_11', 'var_12', 'var_14', 'var_16', 'var_20', 'var_23',
       'var_24', 'var_28', 'var_32', 'var_33', 'var_34', 'var_36',
       'var_39', 'var_40', 'var_42', 'var_43', 'var_44', 'var_45',
       'var_48', 'var_53', 'var_56', 'var_59', 'var_60', 'var_61',
       'var_65', 'var_66', 'var_67', 'var_69', 'var_71', 'var_72',
       'var_73', 'var_77', 'var_78', 'var_80', 'var_81', 'var_87',
       'var_89', 'var_90', 'var_92', 'var_95', 'var_97', 'var_98',
       'var_99', 'var_102', 'var_104', 'var_106', 'var_111', 'var_112',
       'var_113', 'var_115', 'var_116', 'var_120', 'var_122', 'var_124',
       'var_125', 'var_126', 'var_127', 'var_129', 'var_130', 'var_133',
       'var_135', 'var_136', 'var_138', 'var_141', 'var_142', 'var_146',
       'var_149', 'var_150', 'var_151', 'var_153', 'var_158', 'var_159',
       'var_167', 'var_170', 'var_171', 'var_178', 'var_180', 'var_182',
       'var_183', 'var_

In [11]:
# Check the composition of the first variable flagged by the value_counts()
# approach. Index into quasi_constant_feat (this section's list) rather than
# quasi_constant, which holds the VarianceThreshold result from Method 1.
X_train[quasi_constant_feat[0]].value_counts() / len(X_train)

0    0.999629
3    0.000200
6    0.000143
9    0.000029
Name: var_1, dtype: float64

In [12]:
# (rows, columns) before and after filtering; this approach removed
# features more aggressively
(X_train.shape, X_train_filter.shape)

((35000, 300), (35000, 158))