In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_selection import VarianceThreshold

In [3]:
# Read the pre-engineered train and test feature matrices from CSV.
# NOTE(review): these are absolute, user-specific paths; the notebook only
# runs on a machine where they exist -- consider a configurable data directory.
X_train = pd.read_csv(
    filepath_or_buffer='/Users/patrycjapiechowicz/CYBER/xtrain.csv',
)
X_test = pd.read_csv(
    filepath_or_buffer='/Users/patrycjapiechowicz/CYBER/xtest.csv',
)

# cell output: (rows, columns) of each frame
(X_train.shape, X_test.shape)

((18000, 181), (2000, 181))

#### Removing quasi-constant features

In [4]:
# Build a selector that removes features whose training-set variance is at or
# below the threshold. For a 0/1 feature the variance is p * (1 - p), so a
# threshold of 0.001 roughly corresponds to one value covering ~99.9% of rows.
sel = VarianceThreshold(
    threshold=0.001)  # 0.001 indicates ~99.9% of observations (binary feature), approximately

sel.fit(X_train)  # fit finds the features with low variance

VarianceThreshold(threshold=0.001)

In [5]:
# List the features that the fitted selector flags as quasi-constant.
# get_support() returns a boolean mask that is True for the features the
# selector keeps, so the inverted mask selects the removed (low-variance) ones.
# The mask is evaluated once and the resulting list is reused for both the
# count and the display, instead of rebuilding the kept-column index for every
# column name in two separate comprehensions.
# (Equivalent to the name-membership test as long as column names are unique.)
low_variance_feats = X_train.columns[~sel.get_support()].tolist()

# number of quasi-constant features
print(len(low_variance_feats))

# cell output: their names
low_variance_feats

8


['header_optional_minor_operating_system_version',
 'header_optional_minor_subsystem_version',
 'header_coff_characteristics_system',
 'header_optional_dll_characteristics_wdm_driver',
 'datadirectories_GLOBAL_PTR_size',
 'datadirectories_GLOBAL_PTR_virtual_address',
 'section_high_entropy_ratio_na',
 'section_has_non_standard_sections_ratio_na']

We can see that 8 columns / variables are almost constant, i.e. their variance is at or below the 0.001 threshold. For a binary variable this means that it shows predominantly one value for ~99.9% of the observations of the training set.

In [6]:
# Fraction of training rows showing each distinct value of this feature.
# Plain division by len() replaces np.float(...): the np.float alias was
# removed in NumPy 1.24, and Series / int is already true (float) division.
X_train['header_optional_minor_subsystem_version'].value_counts() / len(X_train)

0.000000    0.852667
0.019608    0.127000
0.039216    0.017111
0.196078    0.002167
0.058824    0.000722
0.980392    0.000222
1.000000    0.000111
Name: header_optional_minor_subsystem_version, dtype: float64

In [7]:
# Fraction of training rows showing each distinct value of this feature.
# Plain division by len() replaces np.float(...): the np.float alias was
# removed in NumPy 1.24, and Series / int is already true (float) division.
X_train['datadirectories_GLOBAL_PTR_virtual_address'].value_counts() / len(X_train)

0.000000    0.999833
0.001247    0.000056
0.001261    0.000056
1.000000    0.000056
Name: datadirectories_GLOBAL_PTR_virtual_address, dtype: float64

In [8]:
# Fraction of training rows showing each distinct value of this feature.
# Plain division by len() replaces np.float(...): the np.float alias was
# removed in NumPy 1.24, and Series / int is already true (float) division.
X_train['section_has_non_standard_sections_ratio_na'].value_counts() / len(X_train)

0.0    0.999944
1.0    0.000056
Name: section_has_non_standard_sections_ratio_na, dtype: float64

In [9]:
# Collect every feature whose single most frequent value covers more than
# 99.9% of the training rows.
quasi_constant_feat = []
for feature in X_train.columns:

    # share of rows taken by the most frequent value.
    # - built-in true division replaces np.float(...), an alias that was
    #   removed in NumPy 1.24 (the old code raises AttributeError there)
    # - .max() replaces sort_values(ascending=False).values[0]: same value,
    #   no sort, and an all-missing column gives NaN (never flagged) instead
    #   of an IndexError on the empty counts
    predominant = X_train[feature].value_counts().max() / len(X_train)

    # flag the feature when one value dominates the column
    if predominant > 0.999:
        quasi_constant_feat.append(feature)

# cell output: how many features were flagged
len(quasi_constant_feat)

6

In [10]:
# cell output: names of the features collected in quasi_constant_feat
quasi_constant_feat

['header_coff_characteristics_system',
 'header_optional_dll_characteristics_wdm_driver',
 'datadirectories_GLOBAL_PTR_size',
 'datadirectories_GLOBAL_PTR_virtual_address',
 'section_high_entropy_ratio_na',
 'section_has_non_standard_sections_ratio_na']

In [11]:
# Fraction of training rows showing each distinct value of this feature.
# Plain division by len() replaces np.float(...): the np.float alias was
# removed in NumPy 1.24, and Series / int is already true (float) division.
X_train['header_coff_characteristics_system'].value_counts() / len(X_train)

0.0    0.999111
1.0    0.000889
Name: header_coff_characteristics_system, dtype: float64

In [12]:
# Fraction of training rows showing each distinct value of this feature.
# Plain division by len() replaces np.float(...): the np.float alias was
# removed in NumPy 1.24, and Series / int is already true (float) division.
X_train['header_optional_dll_characteristics_wdm_driver'].value_counts() / len(X_train)

0.0    0.999722
1.0    0.000278
Name: header_optional_dll_characteristics_wdm_driver, dtype: float64

In [13]:
# Fraction of training rows showing each distinct value of this feature.
# Plain division by len() replaces np.float(...): the np.float alias was
# removed in NumPy 1.24, and Series / int is already true (float) division.
X_train['datadirectories_GLOBAL_PTR_size'].value_counts() / len(X_train)

0.0    0.999944
1.0    0.000056
Name: datadirectories_GLOBAL_PTR_size, dtype: float64

In [14]:
# Fraction of training rows showing each distinct value of this feature.
# Plain division by len() replaces np.float(...): the np.float alias was
# removed in NumPy 1.24, and Series / int is already true (float) division.
X_train['datadirectories_GLOBAL_PTR_virtual_address'].value_counts() / len(X_train)

0.000000    0.999833
0.001247    0.000056
0.001261    0.000056
1.000000    0.000056
Name: datadirectories_GLOBAL_PTR_virtual_address, dtype: float64

In [15]:
# Fraction of training rows showing each distinct value of this feature.
# Plain division by len() replaces np.float(...): the np.float alias was
# removed in NumPy 1.24, and Series / int is already true (float) division.
X_train['section_high_entropy_ratio_na'].value_counts() / len(X_train)

0.0    0.999944
1.0    0.000056
Name: section_high_entropy_ratio_na, dtype: float64

In [16]:
# Fraction of training rows showing each distinct value of this feature.
# Plain division by len() replaces np.float(...): the np.float alias was
# removed in NumPy 1.24, and Series / int is already true (float) division.
X_train['section_has_non_standard_sections_ratio_na'].value_counts() / len(X_train)

0.0    0.999944
1.0    0.000056
Name: section_has_non_standard_sections_ratio_na, dtype: float64

These 6 features show the value 0 for more than 99.9% of the observations.