In [68]:
from functools import partial

import numpy as np
import pandas as pd
from sklearn.feature_selection import f_classif

In [81]:
# Read the full training set from the working directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column,
# presumably a saved row index; consider index_col=0 if it is not needed — confirm.
data = pd.read_csv(filepath_or_buffer="train.csv")

In [82]:
# (rows, columns) of the raw training frame.
data.shape

(200000, 202)

In [83]:
# Preview the first rows to check the column layout.
data.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [84]:
# Names of the 200 anonymised predictor columns: var_0 ... var_199.
feature_cols = list(map("var_{}".format, range(200)))

In [85]:
# Per-column count of missing values, shown as a plain array.
data.isna().sum().values

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

In [86]:
# Split the frame into the predictor matrix and the label column.
features, target = data[feature_cols], data["target"]

In [87]:
# Distinct dtypes present among the predictor columns.
np.unique(features.dtypes.tolist())

array([dtype('float64')], dtype=object)

All variables are numeric.

In [127]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

## Remove constant features

Constant features are features that show the same value for all observations.

In [89]:
from sklearn.feature_selection import VarianceThreshold

In [90]:
# Fit a zero-variance filter on the training split only, then echo the fitted selector.
sel = VarianceThreshold(threshold=0).fit(X_train)
sel

VarianceThreshold(threshold=0)

In [91]:
# Count the columns the selector's support mask marks as dropped.
print("features to remove: {}".format((~sel.get_support()).sum()))

features to remove: 0


Thus, there are no features with constant values.

## Remove quasi-constant

Quasi-constant features are features that exhibit the same value for the majority of observations in the dataset.

In [133]:
# Fit a low-variance filter (threshold 0.1) on the training split, then echo the fitted selector.
sel = VarianceThreshold(threshold=0.1).fit(X_train)
sel

VarianceThreshold(threshold=0.1)

In [134]:
# Count the columns the selector's support mask marks as dropped.
print("features to remove: {}".format((~sel.get_support()).sum()))

features to remove: 10


In [135]:
# Names of the columns rejected by the selector, in their original order.
quasi_cte_features = X_train.columns[~sel.get_support()].tolist()

In [137]:
# Training-set variance of each rejected column, to confirm they fall under the threshold.
X_train.loc[:, quasi_cte_features].var()

var_12     0.036098
var_25     0.081501
var_43     0.096004
var_68     0.000052
var_71     0.070965
var_91     0.023363
var_103    0.034280
var_108    0.029282
var_148    0.039639
var_161    0.047553
dtype: float64

In [138]:
# Apply the fitted selector to both splits so they keep the same columns.
# NOTE(review): this rebinds X_train/X_test to the reduced data, so the cell
# is presumably not safe to run twice without re-running the train/test split.
X_train, X_test = sel.transform(X_train), sel.transform(X_test)

## Univariate statistical tests

Methods like the F-test estimate the linear dependency between two random variables (feature and target). These methods assume that the variables follow a Gaussian distribution. For the chi2 test, features must be non-negative.

Mutual-information-based methods can capture any kind of statistical dependency, but because they are non-parametric they require more samples for accurate estimation.

The scoring function depends on task:

    For classification tasks: chi2, f_classif, mutual_info_classif
    For regression tasks: f_regression, mutual_info_regression

If you use sparse data (i.e. data represented as sparse matrices), chi2, mutual_info_regression, mutual_info_classif will deal with the data without making it dense.

In [145]:
from sklearn.feature_selection import SelectKBest, chi2, SelectPercentile, mutual_info_classif

# Score every feature by its estimated mutual information with the target and
# keep the 100 highest-scoring ones. mutual_info_classif adds small random
# noise to continuous features, so the seed is pinned (same value as the
# train/test split) to make the selected columns reproducible across runs.
mi_score = partial(mutual_info_classif, random_state=42)
X_new = SelectKBest(mi_score, k=100).fit_transform(X_train, y_train)

In [143]:
# Keep the top 10% of features by univariate score.
# chi2 only accepts non-negative inputs and raises
# "ValueError: Input X must be non-negative." on these features, which contain
# negative values. The ANOVA F-test (f_classif) has no sign restriction.
# The result gets its own name so the SelectKBest output in X_new is not clobbered.
X_new_percentile = SelectPercentile(f_classif, percentile=10).fit_transform(X_train, y_train)