### Quasi-constant features
Quasi-constant features are those that show the same value for the great majority of the observations in the dataset. In general, these features provide little, if any, information that allows a machine learning model to discriminate or predict a target. But there can be exceptions, so you should be careful when removing this type of feature.

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.feature_selection import VarianceThreshold

In [2]:
# Load the full dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../featdata/home_credit.csv')

In [3]:
# (rows, columns) of the raw frame, target column included
df.shape

(307511, 361)

In [5]:
# Preview the first two rows to sanity-check the load
df.head(2)

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,client_installments_AMT_PAYMENT_min_sum,DAYS_BIRTH,AMT_CREDIT,AMT_ANNUITY,DAYS_EMPLOYED,bureau_DAYS_CREDIT_ENDDATE_max,bureau_DAYS_CREDIT_max,DAYS_ID_PUBLISH,bureau_AMT_CREDIT_MAX_OVERDUE_mean,bureau_DAYS_ENDDATE_FACT_max,bureau_AMT_CREDIT_SUM_DEBT_mean,OWN_CAR_AGE,...,previous_loans_CHANNEL_TYPE_Regional / Local_count,ORGANIZATION_TYPE_Transport: type 3,client_cash_NAME_CONTRACT_STATUS_Returned to the store_count_norm_max,ORGANIZATION_TYPE_School,previous_loans_NAME_TYPE_SUITE_Other_B_count_norm,bureau_CNT_CREDIT_PROLONG_mean,client_credit_AMT_DRAWINGS_CURRENT_max_max,client_bureau_balance_MONTHS_BALANCE_max_sum,client_cash_NAME_CONTRACT_STATUS_Active_count_min,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,OCCUPATION_TYPE_High skill tech staff,previous_loans_NAME_PAYMENT_TYPE_XNA_count,FLAG_PHONE,TARGET,SK_ID_CURR
0,0.083037,0.262949,0.139376,175783.725,-9461,406597.5,24700.5,-637,780.0,-103.0,-2120,1681.029,-36.0,49156.2,,...,0.0,0,0.0,0,0.0,0.0,,-124.0,19.0,1,0,1.0,1,1,100002
1,0.311267,0.622246,,1154108.295,-16765,1293502.5,35698.5,-1188,1216.0,-606.0,-291,0.0,-540.0,0.0,,...,0.0,0,0.0,1,0.0,0.0,,0.0,7.0,0,0,1.0,1,0,100003


In [6]:
# Split into train and test sets: 30% of the rows are held out for testing and
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['TARGET']),  # predictors: every column except the target
    df['TARGET'],                 # the target on its own
    random_state=0,
    test_size=0.3,
)

X_train.shape, X_test.shape

((215257, 360), (92254, 360))

## Remove quasi-constant features
### Using the VarianceThreshold from sklearn
The VarianceThreshold from sklearn provides a simple baseline approach to feature selection. It removes all features whose variance does not meet a certain threshold. By default, it removes all zero-variance features.

Here, we will change the default threshold to remove quasi-constant features or, more precisely, features with low variance:

In [7]:
# Features with a training-set variance lower than this threshold will be removed.
# The default is to keep all features with non-zero variance, i.e. to remove only
# the features that have the same value in all samples.
# NOTE: 0.01 is an absolute variance (in the feature's own squared units), not
# "1 percent" of anything, so the outcome depends on how each feature is scaled.
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_train) # fit learns the per-feature variances from the training set only

VarianceThreshold(threshold=0.01)

In [8]:
# get_support() returns a boolean mask with True for every feature the selector
# keeps, so counting its True entries gives the number of features that are
# NOT flagged as quasi-constant.
sel.get_support().sum()

318

In [9]:
# Invert the support mask to select only the columns the selector rejects,
# i.e. the quasi-constant (low-variance) features.
removed_mask = np.logical_not(sel.get_support())
quasi_constant = X_train.columns[removed_mask]
len(quasi_constant)

42

In [10]:
# Names of the features flagged as low-variance
quasi_constant

Index(['REGION_POPULATION_RELATIVE', 'YEARS_BEGINEXPLUATATION_AVG',
       'BASEMENTAREA_AVG', 'LANDAREA_AVG',
       'bureau_CREDIT_TYPE_Mortgage_count_norm',
       'previous_loans_PRODUCT_COMBINATION_Cash X-Sell: high_count_norm',
       'NONLIVINGAREA_AVG',
       'client_cash_NAME_CONTRACT_STATUS_Active_count_norm_mean',
       'previous_loans_PRODUCT_COMBINATION_Cash Street: low_count_norm',
       'COMMONAREA_AVG', 'previous_loans_RATE_DOWN_PAYMENT_mean',
       'previous_loans_RATE_DOWN_PAYMENT_min', 'ENTRANCES_AVG',
       'FLAG_DOCUMENT_18', 'bureau_CREDIT_TYPE_Car loan_count_norm',
       'bureau_CREDIT_TYPE_Microloan_count_norm',
       'previous_loans_CHANNEL_TYPE_Channel of corporate sales_count_norm',
       'client_bureau_balance_STATUS_1_count_norm_mean',
       'client_cash_NAME_CONTRACT_STATUS_Completed_count_norm_mean',
       'previous_loans_CODE_REJECT_REASON_SCO_count_norm',
       'NONLIVINGAPARTMENTS_AVG',
       'client_cash_NAME_CONTRACT_STATUS_Signed_count_n

In [11]:
# dtype of the elements stored in quasi_constant (the column labels)
quasi_constant.dtype

dtype('O')

In [12]:
# Container type produced by indexing X_train.columns with a boolean mask
type(quasi_constant)

pandas.core.indexes.base.Index

In [13]:
# Peek at the first rows of the flagged columns to see what low variance looks like
X_train[quasi_constant].head()

Unnamed: 0,REGION_POPULATION_RELATIVE,YEARS_BEGINEXPLUATATION_AVG,BASEMENTAREA_AVG,LANDAREA_AVG,bureau_CREDIT_TYPE_Mortgage_count_norm,previous_loans_PRODUCT_COMBINATION_Cash X-Sell: high_count_norm,NONLIVINGAREA_AVG,client_cash_NAME_CONTRACT_STATUS_Active_count_norm_mean,previous_loans_PRODUCT_COMBINATION_Cash Street: low_count_norm,COMMONAREA_AVG,previous_loans_RATE_DOWN_PAYMENT_mean,previous_loans_RATE_DOWN_PAYMENT_min,ENTRANCES_AVG,FLAG_DOCUMENT_18,bureau_CREDIT_TYPE_Car loan_count_norm,...,bureau_CREDIT_ACTIVE_Sold_count_norm,previous_loans_CODE_REJECT_REASON_SCOFR_count_norm,client_cash_NAME_CONTRACT_STATUS_Active_count_norm_max,previous_loans_PRODUCT_COMBINATION_Cash Street: middle_count_norm,previous_loans_CODE_REJECT_REASON_VERIF_count_norm,previous_loans_CODE_REJECT_REASON_LIMIT_count_norm,FLAG_DOCUMENT_13,previous_loans_CODE_REJECT_REASON_CLIENT_count_norm,previous_loans_NAME_GOODS_CATEGORY_Jewelry_count_norm,client_cash_NAME_CONTRACT_STATUS_Signed_count_norm_mean,previous_loans_PRODUCT_COMBINATION_POS industry without interest_count_norm,ORGANIZATION_TYPE_Transport: type 3,client_cash_NAME_CONTRACT_STATUS_Returned to the store_count_norm_max,previous_loans_NAME_TYPE_SUITE_Other_B_count_norm,bureau_CNT_CREDIT_PROLONG_mean
125587,0.002042,0.9876,0.1,0.0,0.0,0.0,0.1764,0.833333,0.0,0.0,0.531385,0.531385,0.2069,0,0.0,...,0.0,0.0,0.833333,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
51500,0.026392,0.9881,,,0.0,0.0,0.069,0.944444,0.0,,0.0,0.0,0.2759,0,0.0,...,0.0,0.0,1.0,0.166667,0.0,0.166667,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
276439,0.018634,0.9791,,,0.0,0.0,0.0329,0.952381,0.0,,0.031775,0.0,0.0345,0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
244558,0.072508,0.9816,0.1346,0.0,0.0,0.0,0.0006,0.833333,0.0,0.058,0.0,0.0,0.1983,0,0.0,...,0.0,0.0,0.833333,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0
98003,0.018801,,,,0.0,0.0,,0.8125,0.0,,0.105643,0.100771,,0,0.0,...,0.0,0.0,0.888889,0.0,0.0,0.0,0,0.0,0.0,0.03125,0.0,0,0.0,0.0,0.0


In [15]:
# Training-set variance of one flagged feature, for comparison with the 0.01 threshold
X_train['REGION_POPULATION_RELATIVE'].var()

0.00019157966053260026

We can then remove the quasi-constant features utilizing the transform() method from the VarianceThreshold. Remember that this returns a NumPy array without feature names, so if we want a dataframe we need to reconstitute it.

In [16]:
# Capture the names of the retained features now, while X_train is still a
# DataFrame: transform() hands back a NumPy array that carries no column names.

feat_names = X_train.columns[sel.get_support()]

In [17]:
# Drop the quasi-constant features from both splits using the selector fitted
# on the training data. X_train / X_test are rebound to the transformed output,
# so re-run the split cell before running this cell a second time.
X_train, X_test = sel.transform(X_train), sel.transform(X_test)

X_train.shape, X_test.shape

((215257, 318), (92254, 318))

By removing constant and quasi-constant features, we reduced the feature space from 360 to 318 features. That is, 42 features were removed from this dataset, or roughly 12% of the original feature set.

In [18]:
# Transform the arrays back into DataFrames.
# transform() returned bare arrays, so both the column names and the row labels
# were lost. Column names come from feat_names; row labels are restored from
# y_train / y_test, which were produced by the same split and are in the same
# row order. Without index=..., pandas would assign a fresh 0..n-1 index and the
# features would no longer align with their targets on index.

X_train = pd.DataFrame(X_train, columns=feat_names, index=y_train.index)
X_test = pd.DataFrame(X_test, columns=feat_names, index=y_test.index)

X_test.head()

Unnamed: 0,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,client_installments_AMT_PAYMENT_min_sum,DAYS_BIRTH,AMT_CREDIT,AMT_ANNUITY,DAYS_EMPLOYED,bureau_DAYS_CREDIT_ENDDATE_max,bureau_DAYS_CREDIT_max,DAYS_ID_PUBLISH,bureau_AMT_CREDIT_MAX_OVERDUE_mean,bureau_DAYS_ENDDATE_FACT_max,bureau_AMT_CREDIT_SUM_DEBT_mean,OWN_CAR_AGE,...,client_credit_AMT_BALANCE_min_sum,previous_loans_NAME_CONTRACT_STATUS_Refused_count,previous_loans_PRODUCT_COMBINATION_Card Street_count_norm,client_installments_NUM_INSTALMENT_NUMBER_max_min,client_credit_AMT_PAYMENT_CURRENT_min_sum,previous_loans_CHANNEL_TYPE_Regional / Local_count,ORGANIZATION_TYPE_School,client_credit_AMT_DRAWINGS_CURRENT_max_max,client_bureau_balance_MONTHS_BALANCE_max_sum,client_cash_NAME_CONTRACT_STATUS_Active_count_min,WEEKDAY_APPR_PROCESS_START_WEDNESDAY,OCCUPATION_TYPE_High skill tech staff,previous_loans_NAME_PAYMENT_TYPE_XNA_count,FLAG_PHONE,SK_ID_CURR
0,0.484936,0.18271,0.145543,584.595,-21774.0,417024.0,20191.5,-2125.0,1673.0,-153.0,-4653.0,0.0,-214.0,117669.0,,...,,2.0,0.0,10.0,,0.0,0.0,,-7.0,19.0,0.0,0.0,4.0,1.0,149741.0
1,0.624574,0.669053,0.680139,89071.2,-18499.0,640080.0,31261.5,-2055.0,9690.0,-265.0,-2028.0,22057.5,-1417.0,195927.8,,...,0.0,1.0,0.0,5.0,0.0,0.0,0.0,450000.0,0.0,5.0,0.0,0.0,1.0,0.0,363290.0
2,,0.454279,0.173527,,-18084.0,900000.0,45000.0,-4159.0,6612.0,-94.0,-1616.0,491.814,-86.0,1059146.0,4.0,...,,,,,,,0.0,,0.0,,0.0,0.0,,0.0,436006.0
3,,0.688145,0.440058,270419.805,-12319.0,1125000.0,33025.5,-2532.0,31162.0,-215.0,-4215.0,0.0,-75.0,40611.38,6.0,...,0.0,0.0,0.0,11.0,0.0,1.0,0.0,302710.005,0.0,11.0,0.0,0.0,1.0,0.0,377703.0
4,,0.426409,0.081726,379392.48,-10151.0,835380.0,42840.0,-2516.0,1766.0,-223.0,-2834.0,12411.0,-367.0,405062.2,6.0,...,0.0,1.0,0.0,3.0,0.0,0.0,0.0,67500.0,0.0,3.0,0.0,0.0,2.0,1.0,188624.0


# Without Using the Package

First, I will separate the dataset into train and test and remove the constant features again. Then, I will provide an alternative method to find out quasi-constant features.

This method, as opposed to the VarianceThreshold, can be used for both numerical and categorical variables.

In [25]:
# Start again from a fresh train/test split (same 30% hold-out and seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['TARGET']),
    df['TARGET'],
    test_size=0.3,
    random_state=0)

# Remove constant features.
# A feature is constant when the training set holds at most one distinct
# non-missing value for it. Counting distinct values (instead of testing
# std() == 0) avoids an exact floating-point comparison, also catches columns
# that are entirely missing, and works for categorical columns as well as
# numerical ones.
distinct_counts = X_train.nunique()
constant_features = distinct_counts[distinct_counts <= 1].index.tolist()

# Rebind rather than drop in place: the constant columns are identified on the
# training set only and then removed from both splits.
X_train = X_train.drop(columns=constant_features)
X_test = X_test.drop(columns=constant_features)

X_train.shape, X_test.shape

((215257, 360), (92254, 360))

In [26]:
# Manual equivalent of VarianceThreshold(threshold=0.01).
# ddof=0 gives the population variance, and "<=" mirrors the selector, which
# only keeps features whose variance is strictly greater than the threshold.
# (pandas' default, ddof=1, is the sample variance and would differ slightly.)
# Missing values are skipped when computing the variances.
feature_variances = X_train.var(ddof=0)
quasi_features = feature_variances[feature_variances <= 0.01].index.tolist()

In [27]:
# Number of features flagged by the manual variance check
len(quasi_features)

42

In [29]:
# Names of the features flagged by the manual variance check
quasi_features

['REGION_POPULATION_RELATIVE',
 'YEARS_BEGINEXPLUATATION_AVG',
 'BASEMENTAREA_AVG',
 'LANDAREA_AVG',
 'bureau_CREDIT_TYPE_Mortgage_count_norm',
 'previous_loans_PRODUCT_COMBINATION_Cash X-Sell: high_count_norm',
 'NONLIVINGAREA_AVG',
 'client_cash_NAME_CONTRACT_STATUS_Active_count_norm_mean',
 'previous_loans_PRODUCT_COMBINATION_Cash Street: low_count_norm',
 'COMMONAREA_AVG',
 'previous_loans_RATE_DOWN_PAYMENT_mean',
 'previous_loans_RATE_DOWN_PAYMENT_min',
 'ENTRANCES_AVG',
 'FLAG_DOCUMENT_18',
 'bureau_CREDIT_TYPE_Car loan_count_norm',
 'bureau_CREDIT_TYPE_Microloan_count_norm',
 'previous_loans_CHANNEL_TYPE_Channel of corporate sales_count_norm',
 'client_bureau_balance_STATUS_1_count_norm_mean',
 'client_cash_NAME_CONTRACT_STATUS_Completed_count_norm_mean',
 'previous_loans_CODE_REJECT_REASON_SCO_count_norm',
 'NONLIVINGAPARTMENTS_AVG',
 'client_cash_NAME_CONTRACT_STATUS_Signed_count_norm_max',
 'previous_loans_NAME_CASH_LOAN_PURPOSE_Other_count_norm',
 'previous_loans_NAME_GOODS_

In [30]:
# Share of training rows that carry the single most frequent value of this
# feature: the largest value count divided by the total number of rows.
# A truly quasi-constant feature would score close to 1 here.
region_counts = X_train['REGION_POPULATION_RELATIVE'].value_counts()
region_counts.max() / len(X_train)

0.05320616751139336

'''
# create an empty list
quasi_constant_feat = []

# iterate over every feature
for feature in X_train.columns:

    # find the predominant value, that is the value that is shared
    # by most observations
    predominant = (X_train[feature].value_counts() / np.float64(
        len(X_train))).sort_values(ascending=False).values[0]

    # evaluate the predominant value: do more than 99.8% of the observations
    # show this one value?
    if predominant > 0.998:
        
        # if yes, add the variable to the list
        quasi_constant_feat.append(feature)

len(quasi_constant_feat)
'''