In [1]:
import pandas as pd
pd.options.display.max_rows = 999
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(style='darkgrid')
cmap = sns.diverging_palette(230, 20, as_cmap=True)
palette = sns.diverging_palette(220, 20)

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from category_encoders import TargetEncoder

import pickle

import random

import warnings
warnings.simplefilter('ignore', FutureWarning)

In [2]:
# Load the interim training set produced earlier in the project.
# NOTE(review): unpickling can execute arbitrary code - only read pickle files
# that were written by this project.
df = pd.read_pickle('../data/interim/train_clean.pickle')
# Preview the first rows
df.head()

Unnamed: 0,ProductName,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections,AvSigVersion_encoded,AppVersion_encoded,EngineVersion_encoded,OsVer_encoded,Census_OSBranch_release_encoded
0,win8defender,7.0,0,53447.0,1.0,1.0,1,29,128035.0,18.0,...,0,0,0,10.0,0,273,18,15100,10.0,rs4_release
1,win8defender,7.0,0,53447.0,1.0,1.0,1,93,1482.0,18.0,...,0,0,0,8.0,0,Other,13,14600,10.0,rs4_release
2,win8defender,7.0,0,53447.0,1.0,1.0,1,86,153579.0,18.0,...,0,0,0,3.0,0,273,18,15100,10.0,rs4_release
3,win8defender,7.0,0,53447.0,1.0,1.0,1,88,20710.0,,...,0,0,0,3.0,1,273,18,15100,10.0,rs4_release
4,win8defender,7.0,0,53447.0,1.0,1.0,1,18,37376.0,,...,0,0,0,1.0,1,273,18,15100,10.0,rs4_release


In [3]:
# Summarise the index, column names and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8921478 entries, 0 to 8921482
Data columns (total 71 columns):
 #   Column                                             Dtype   
---  ------                                             -----   
 0   ProductName                                        category
 1   RtpStateBitfield                                   category
 2   IsSxsPassiveMode                                   Int64   
 3   AVProductStatesIdentifier                          category
 4   AVProductsInstalled                                category
 5   AVProductsEnabled                                  category
 6   HasTpm                                             Int64   
 7   CountryIdentifier                                  category
 8   CityIdentifier                                     category
 9   OrganizationIdentifier                             category
 10  GeoNameIdentifier                                  category
 11  LocaleEnglishNameIdentifier          

In [4]:
# (number of rows, number of columns)
df.shape

(8921478, 71)

In [5]:
# Split the frame into the target (y) and every remaining column (X)
y = df['HasDetections']
X = df.drop(columns='HasDetections')

In [6]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)
# One shape per line: X_train, X_test, y_train, y_test
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(7137182, 70)
(1784296, 70)
(7137182,)
(1784296,)


In [7]:
# Mean of the target in each split, to compare the positive rate between train and test
y_train.mean(), y_test.mean()

(0.49977540155204114, 0.4998621304985271)

In [8]:
# Inspect the dtype of the target in each split
y_train.dtype, y_test.dtype

(Int64Dtype(), Int64Dtype())

In [9]:
# Cast both targets to the plain NumPy integer dtype
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [10]:
# Preview the training features
X_train.head()

Unnamed: 0,ProductName,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,AvSigVersion_encoded,AppVersion_encoded,EngineVersion_encoded,OsVer_encoded,Census_OSBranch_release_encoded
503304,win8defender,7.0,0,53447.0,1.0,1.0,1,214,58607.0,50.0,...,1,0,0,0,1.0,273,18,15100,10.0,rs1_release
8650206,win8defender,7.0,0,53447.0,1.0,1.0,1,50,96484.0,18.0,...,0,0,0,0,3.0,Other,12,Other,10.0,rs2_release
3828682,win8defender,7.0,0,53447.0,1.0,1.0,1,207,104722.0,27.0,...,0,1,0,1,13.0,273,18,15100,10.0,rs3_release
3470806,win8defender,7.0,0,53447.0,1.0,1.0,1,29,16280.0,27.0,...,0,0,0,0,10.0,273,18,15100,10.0,rs1_release
3642014,win8defender,7.0,0,53447.0,1.0,1.0,1,57,56452.0,18.0,...,0,0,0,0,10.0,273,18,15100,10.0,rs4_release


In [11]:
# Preview the test features
X_test.head()

Unnamed: 0,ProductName,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Census_IsTouchEnabled,Census_IsPenCapable,Census_IsAlwaysOnAlwaysConnectedCapable,Wdft_IsGamer,Wdft_RegionIdentifier,AvSigVersion_encoded,AppVersion_encoded,EngineVersion_encoded,OsVer_encoded,Census_OSBranch_release_encoded
2153865,win8defender,7.0,0,53447.0,1.0,1.0,1,53,140908.0,27.0,...,0,0,0,0,15.0,273,18,15100,10.0,rs4_release
5346868,win8defender,7.0,0,53447.0,1.0,1.0,1,43,160178.0,27.0,...,1,1,1,1,7.0,273,18,15100,10.0,rs4_release
7839864,win8defender,7.0,0,7945.0,2.0,1.0,1,207,106198.0,27.0,...,0,0,0,1,13.0,273,18,15100,10.0,rs4_release
5330352,win8defender,7.0,0,53447.0,1.0,1.0,1,57,117751.0,18.0,...,0,0,0,1,10.0,273,18,15100,10.0,rs4_release
7870216,win8defender,7.0,0,53447.0,1.0,1.0,1,89,37026.0,27.0,...,0,1,0,0,1.0,273,18,15100,10.0,rs3_release


In [12]:
# Preview the training target after the dtype cast
y_train.head()

503304     1
8650206    1
3828682    0
3470806    1
3642014    1
Name: HasDetections, dtype: int32

In [13]:
# Preview the test target after the dtype cast
y_test.head()

2153865    0
5346868    1
7839864    0
5330352    0
7870216    0
Name: HasDetections, dtype: int32

In [14]:
# Persist the four splits next to the other interim data
for split, destination in (
    (X_train, '../data/interim/X_train.pickle'),
    (X_test, '../data/interim/X_test.pickle'),
    (y_train, '../data/interim/y_train.pickle'),
    (y_test, '../data/interim/y_test.pickle'),
):
    split.to_pickle(destination)

Missing-value imputation, one-hot encoding and target encoding will be performed with scikit-learn's `ColumnTransformer`.

In [15]:
# Group the feature columns by kind: ID-coded categorical, plain categorical,
# binary and numerical
id_col = [
    'AVProductStatesIdentifier',
    'CountryIdentifier',
    'CityIdentifier',
    'OrganizationIdentifier',
    'GeoNameIdentifier',
    'LocaleEnglishNameIdentifier',
    'IeVerIdentifier',
    'Census_OEMNameIdentifier',
    'Census_OEMModelIdentifier',
    'Census_ProcessorManufacturerIdentifier',
    'Census_ProcessorModelIdentifier',
    'Census_OSInstallLanguageIdentifier',
    'Census_OSUILocaleIdentifier',
    'Census_FirmwareManufacturerIdentifier',
    'Census_FirmwareVersionIdentifier',
    'Wdft_RegionIdentifier',
]
# Category-typed columns that are not listed as identifiers
cat_col = [name for name in X_train.select_dtypes('category').columns
           if name not in id_col]
# Int64-typed columns are treated as the binary group
binary_col = list(X_train.select_dtypes('Int64').columns)
# Everything not claimed by one of the three groups above is numerical
num_col = [name for name in X_train.columns
           if not (name in cat_col or name in id_col or name in binary_col)]

# Group sizes, in the order: categorical, ID, binary, numerical
tuple(map(len, (cat_col, id_col, binary_col, num_col)))

(33, 16, 14, 7)

In [16]:
# Total number of columns across the four groups
sum(map(len, (cat_col, id_col, binary_col, num_col)))

70

In [17]:
# Numerical features: fill missing values with the column median
numerical_transformer = SimpleImputer(strategy='median')

# Binary features: fill missing values with the most frequent value
binary_transformer = SimpleImputer(strategy='most_frequent')

# Categorical features stored as numerical IDs: target encoding
id_transformer = Pipeline([('encoding', TargetEncoder())])

# Remaining categorical features: impute the mode, then one-hot encode;
# categories unseen at fit time are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, num_col),
    ('bin', binary_transformer, binary_col),
    ('id', id_transformer, id_col),
    ('cat', categorical_transformer, cat_col),
])

Let's apply the preprocessing steps to see the final dataset that'll be used for training.

In [18]:
# Fit the preprocessor on the training split and transform it in one pass;
# y_train is passed along because the 'id' branch uses target encoding
X_train_transformed = preprocessor.fit_transform(X_train, y_train)

In [19]:
# Show the summary repr of the transformed training matrix
X_train_transformed

<7137182x511 sparse matrix of type '<class 'numpy.float64'>'
	with 421447160 stored elements in Compressed Sparse Row format>

In [20]:
# Rebuild the column names of the transformed matrix in the same order the
# ColumnTransformer emits them: numerical, binary, ID, then one-hot columns
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out and fall back to the old method on older releases
if hasattr(onehot_encoder, 'get_feature_names_out'):
    onehot_col = onehot_encoder.get_feature_names_out(cat_col)
else:
    onehot_col = onehot_encoder.get_feature_names(input_features=cat_col)
columns_names = num_col + binary_col + id_col + onehot_col.tolist()
len(columns_names)

511

In [21]:
# Wrap the sparse matrix in a sparse-backed DataFrame labelled with the rebuilt column names
X_train_transformed_df = pd.DataFrame.sparse.from_spmatrix(X_train_transformed, columns=columns_names)
# Preview the first transformed rows
X_train_transformed_df.head()

Unnamed: 0,Census_ProcessorCoreCount,Census_PrimaryDiskTotalCapacity,Census_SystemVolumeTotalCapacity,Census_TotalPhysicalRAM,Census_InternalPrimaryDiagonalDisplaySizeInInches,Census_InternalPrimaryDisplayResolutionHorizontal,Census_InternalPrimaryDisplayResolutionVertical,IsSxsPassiveMode,HasTpm,AutoSampleOptIn,...,EngineVersion_encoded_Other,OsVer_encoded_10.0,OsVer_encoded_6.1,OsVer_encoded_6.3,Census_OSBranch_release_encoded_Other,Census_OSBranch_release_encoded_rs1_release,Census_OSBranch_release_encoded_rs2_release,Census_OSBranch_release_encoded_rs3_release,Census_OSBranch_release_encoded_rs4_release,Census_OSBranch_release_encoded_th
0,4.0,476940.0,244414.0,4096.0,11.6,1366.0,768.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,4.0,476940.0,475573.0,4096.0,13.9,1366.0,768.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2.0,262144.0,261000.0,2048.0,17.700001,3274.0,2126.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,4.0,953869.0,940518.0,8192.0,15.5,1366.0,768.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4.0,476940.0,66385.0,12288.0,13.9,1366.0,768.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [22]:
# double check null values
# Only the explicitly stored entries of the sparse matrix can be NaN (implicit
# zeros cannot), so checking them avoids densifying the whole frame via .values
np.isnan(X_train_transformed.data).any()

False

In [23]:
# double check negative values
# Only the explicitly stored entries of the sparse matrix can be negative
# (implicit zeros cannot), so checking them avoids densifying the whole frame
(X_train_transformed.data < 0).any()

False

Note that these steps will be repeated in the modeling notebook, since some imputation strategies may change according to our needs and some iteration may be necessary depending on the modeling results. Furthermore, the preprocessor will be included in a pipeline together with feature selection and a machine learning model. A randomized search with cross-validation will be performed on this pipeline in order to tune the hyperparameters and select the final model.