In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold
from category_encoders import TargetEncoder

import xgboost as xgb
from xgboost import XGBClassifier

import pickle

import sys

if not sys.warnoptions:
    import warnings
    warnings.filterwarnings(action='ignore', category=UserWarning)

In [2]:
df = pd.read_pickle('../data/interim/train_clean_roll_up.pickle')

In [3]:
df = df.sample(frac=0.05)

In [4]:
df.head()

Unnamed: 0,ProductName,RtpStateBitfield,IsSxsPassiveMode,AVProductStatesIdentifier,AVProductsInstalled,AVProductsEnabled,HasTpm,CountryIdentifier,CityIdentifier,OrganizationIdentifier,...,Wdft_IsGamer,Wdft_RegionIdentifier,HasDetections,AvSigVersion_encoded,AppVersion_encoded,EngineVersion_encoded,OsVer_encoded,Census_OSBranch_release_encoded,IsProtected_encoded,RAM_bins
3089904,win8defender,7.0,0,7945.0,2.0,1.0,1,164,77794.0,27.0,...,0,15.0,1,275,13,15200,10.0,rs4_release,2.0,4-8 GB
6664928,win8defender,0.0,1,55336.0,2.0,1.0,1,120,120697.0,,...,1,3.0,0,273,18,15100,10.0,rs1_release,2.0,4-8 GB
2960027,win8defender,7.0,0,53447.0,1.0,1.0,1,16,131181.0,,...,0,3.0,1,275,18,15200,10.0,rs4_release,2.0,8-16 GB
2804278,win8defender,7.0,0,53447.0,1.0,1.0,1,29,143155.0,18.0,...,1,10.0,0,275,18,15200,10.0,rs2_release,2.0,4-8 GB
7241421,win8defender,7.0,0,53447.0,1.0,1.0,1,53,136975.0,,...,0,15.0,0,273,18,15100,10.0,rs3_release,2.0,4-8 GB


In [5]:
df.shape

(419767, 72)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 419767 entries, 3089904 to 7152232
Data columns (total 72 columns):
 #   Column                                             Non-Null Count   Dtype   
---  ------                                             --------------   -----   
 0   ProductName                                        419767 non-null  category
 1   RtpStateBitfield                                   418233 non-null  category
 2   IsSxsPassiveMode                                   419767 non-null  Int64   
 3   AVProductStatesIdentifier                          418285 non-null  category
 4   AVProductsInstalled                                418285 non-null  category
 5   AVProductsEnabled                                  418285 non-null  category
 6   HasTpm                                             419767 non-null  Int64   
 7   CountryIdentifier                                  419767 non-null  category
 8   CityIdentifier                                     404510

In [7]:
X = df.loc[:, df.columns != 'HasDetections']
y = df['HasDetections']

In [8]:
#Seperate train and test data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size = 0.2,
                                                   random_state = 10)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(335813, 71)
(83954, 71)
(335813,)
(83954,)


In [9]:
y_train.mean(), y_test.mean()

(0.49764005562619673, 0.49690306596469497)

In [10]:
y_train.dtype, y_test.dtype

(Int64Dtype(), Int64Dtype())

In [11]:
y_train = y_train.astype('int')
y_test = y_test.astype('int')

In [12]:
target_enc_col = ['AVProductStatesIdentifier',
                  'CountryIdentifier',
                  'CityIdentifier',
                  'OrganizationIdentifier',
                  'GeoNameIdentifier',
                  'LocaleEnglishNameIdentifier',
                  'IeVerIdentifier',
                  'Census_OEMNameIdentifier',
                  'Census_OEMModelIdentifier',
                  'Census_ProcessorManufacturerIdentifier',
                  'Census_ProcessorModelIdentifier',
                  'Census_OSInstallLanguageIdentifier',
                  'Census_OSUILocaleIdentifier',
                  'Census_FirmwareManufacturerIdentifier',
                  'Census_FirmwareVersionIdentifier',
                  'Wdft_RegionIdentifier',
                  'Census_OSBuildRevision']
cat_col = [col for col in X_train.select_dtypes('category').columns.tolist() 
           if col not in target_enc_col]
binary_col = X_train.select_dtypes('Int64').columns.tolist()
num_col = [col for col in X_train.columns 
           if col not in cat_col
           if col not in target_enc_col
           if col not in binary_col]

len(cat_col), len(target_enc_col), len(binary_col), len(num_col) 

(33, 17, 14, 7)

In [13]:
len(cat_col) + len(target_enc_col) + len(binary_col) + len(num_col)

71

In [14]:
[X_train[col].dtype for col in cat_col]

[CategoricalDtype(categories=['mse', 'win8defender', 'mseprerelease', 'windowsintune',
                   'fep', 'scep'],
 , ordered=False),
 CategoricalDtype(categories=[0.0, 1.0, 3.0, 5.0, 7.0, 8.0, 35.0], ordered=False),
 CategoricalDtype(categories=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0], ordered=False),
 CategoricalDtype(categories=[0.0, 1.0, 2.0, 3.0, 4.0, 5.0], ordered=False),
 CategoricalDtype(categories=['windows10', 'windows2016', 'windows7', 'windows8'], ordered=False),
 CategoricalDtype(categories=['arm64', 'x64', 'x86'], ordered=False),
 CategoricalDtype(categories=['10240', '10586', '14393', '15063', '16299', '17134', '7601',
                   '9600', 'Other'],
 , ordered=False),
 CategoricalDtype(categories=['256', '768', 'Other'], ordered=False),
 CategoricalDtype(categories=['prers5', 'rs1', 'rs2', 'rs3', 'rs4', 'th1', 'th2',
                   'windows7', 'windows8.1'],
 , ordered=False),
 CategoricalDtype(categories=['Cloud', 'Education', 'Enterprise', 'Enterprise 

In [15]:
for col in cat_col:
    X_train[col] = X_train[col][X_train[col].notnull()].astype('str')
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col][X_test[col].notnull()].astype('str')
    X_test[col] = X_test[col].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col][X_train[col].notnull()].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[col] = X_train[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[col] = X_test[col][X_test[col].notnull()].astype('str')
A value is trying to be set o

In [16]:
[X_train[col].dtype for col in cat_col]

[CategoricalDtype(categories=['mse', 'mseprerelease', 'scep', 'win8defender'], ordered=False),
 CategoricalDtype(categories=['0.0', '1.0', '3.0', '5.0', '7.0', '8.0'], ordered=False),
 CategoricalDtype(categories=['1.0', '2.0', '3.0', '4.0', '5.0', '6.0'], ordered=False),
 CategoricalDtype(categories=['0.0', '1.0', '2.0', '3.0', '4.0', '5.0'], ordered=False),
 CategoricalDtype(categories=['windows10', 'windows2016', 'windows7', 'windows8'], ordered=False),
 CategoricalDtype(categories=['arm64', 'x64', 'x86'], ordered=False),
 CategoricalDtype(categories=['10240', '10586', '14393', '15063', '16299', '17134', '7601',
                   '9600', 'Other'],
 , ordered=False),
 CategoricalDtype(categories=['256', '768', 'Other'], ordered=False),
 CategoricalDtype(categories=['prers5', 'rs1', 'rs2', 'rs3', 'rs4', 'th1', 'th2',
                   'windows7', 'windows8.1'],
 , ordered=False),
 CategoricalDtype(categories=['Cloud', 'Education', 'Enterprise', 'Enterprise LTSB',
                   

In [17]:
#for col in target_enc_col:
#    X_train[col] = X_train[col][X_train[col].notnull()].astype('str')
#    X_train[col] = X_train[col].astype('category')
#    X_test[col] = X_test[col][X_test[col].notnull()].astype('str')
#    X_test[col] = X_test[col].astype('category')

In [18]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer()

# Preprocessing for binary data
binary_transformer = SimpleImputer()

# Preprocessing for categorical data encoded as numerical ID's
id_transformer = SimpleImputer() #TargetEncoder(smoothing=10000)

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer()),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_col),
        ('bin', binary_transformer, binary_col),
        ('id', id_transformer, target_enc_col),
        ('cat', categorical_transformer, cat_col)
    ])

In [19]:
# Define model
xgb_clf = xgb.XGBClassifier(objective = 'binary:logistic', 
                            eval_metric='logloss', 
                            use_label_encoder =False,
                            verbosity=0)
VarThresh = VarianceThreshold()
selector = SelectKBest(f_classif)

# Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                            ('VarThresh', VarThresh),
                            ('selector', selector),
                            ('classifier', xgb_clf)
                            ])


parameters ={'preprocessor__num__strategy': ['mean', 'median', 'constant'],
              'preprocessor__bin__strategy': ['most_frequent', 'constant'],
              'preprocessor__cat_num__strategy': ['most_frequent', 'constant'],
              'preprocessor__cat__imputer__strategy': ['most_frequent', 'constant'],
              'classifier__learning_rate': [0.05, 0.1, 0.3, 0.05],
              'classifier__gamma' : [0, 0.01, 0.1],
              'classifier__max_depth': range(1, 11, 2),
              'classifier__colsample_bytree': [0.3, 0.6, 0.8, 1.0],
              'classifier__subsample': [0.2, 0.4, 0.5, 0.6, 0.7],
              'classifier__reg_alpha': [0, 0.5, 1, 1.5],
              'classifier__reg_lambda': [0, 0.5, 1, 1.5],
              'classifier__min_child_weight': [1, 3, 5, 7],
              'classifier__n_estimators': [500, 1000, 2000, 3000]}


# Grid search
search = RandomizedSearchCV(estimator=pipeline, 
                             param_distributions=parameters,
                             n_iter = 200,
                             cv=3,
                             scoring = 'roc_auc',
                             return_train_score=True,
                             random_state=42,
                             #n_jobs = -1,
                             verbose=3)

# Preprocessing of training data, fit model 
search.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


ValueError: Invalid parameter cat_num for estimator ColumnTransformer(transformers=[('num', SimpleImputer(),
                                 ['Census_ProcessorCoreCount',
                                  'Census_PrimaryDiskTotalCapacity',
                                  'Census_SystemVolumeTotalCapacity',
                                  'Census_TotalPhysicalRAM',
                                  'Census_InternalPrimaryDiagonalDisplaySizeInInches',
                                  'Census_InternalPrimaryDisplayResolutionHorizontal',
                                  'Census_InternalPrimaryDisplayResolutionVertical']),
                                ('bin', SimpleIm...
                                  'Census_PowerPlatformRoleName',
                                  'Census_InternalBatteryNumberOfCharges',
                                  'Census_OSArchitecture',
                                  'Census_OSEdition_encoded',
                                  'Census_OSSkuName_encoded',
                                  'Census_OSInstallTypeName',
                                  'Census_OSWUAutoUpdateOptionsName',
                                  'Census_GenuineStateName',
                                  'Census_ActivationChannel',
                                  'Census_FlightRing', 'AvSigVersion_encoded',
                                  'AppVersion_encoded', 'EngineVersion_encoded',
                                  'OsVer_encoded', ...])]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
# optimized model
opt_xgb_clf = search.best_estimator_
search.best_params_

In [None]:
from sklearn.metrics import roc_auc_score

# Predictions
y_pred_train = opt_xgb_clf.predict(X_train)
y_pred_test = opt_xgb_clf.predict(X_test)

print('Train Set roc auc:', roc_auc_score(y_train, y_pred_train))
print('Test Set roc auc:', roc_auc_score(y_test, y_pred_test))

In [None]:
# classification report
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred_test))

In [None]:
# plot ROC_AUC
import matplotlib.pyplot as plt 
import matplotlib as mpl 
%matplotlib inline
import seaborn as sns
sns.set()
sns.set(font_scale=1.5)

from sklearn import metrics

y_pred_proba_test = opt_xgb_clf.predict_proba(X_test)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_proba_test[:,1])
plt.figure(figsize=(8,6))
plt.plot(fpr, tpr)
plt.plot([0,1], [0,1], '--', color = 'black')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.tight_layout()

In [None]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_test, y_pred_test)

lables = {'Negative': 0, 'Positive': 1} 

plt.figure(figsize=(10,8))
sns.heatmap(conf_matrix, 
            annot=True,
            annot_kws={'size':28},
            fmt="d",
            xticklabels=lables.keys(), 
            yticklabels=lables.keys(),
            cmap="Blues")
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion matrix')

In [None]:
# plot precision - recall curve
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba_test[:,1])
plt.figure(figsize=(8,6))
plt.plot(recall, precision)
plt.plot([0, 1], [0.5, 0.5], '--', color = 'black')
plt.title('Precision-Recall Curve for Optimized Logistic Regression')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.tight_layout()

In [None]:
search.best_params_

In [None]:
# Preprocessing for numerical data
numerical_transformer_rev = SimpleImputer(strategy=search.best_params_['preprocessor__num__strategy'])

# Preprocessing for binary data
binary_transformer_rev = SimpleImputer(strategy=search.best_params_['preprocessor__bin__strategy'])

# Preprocessing for categorical data encoded as numerical ID's
categorical_num_transformer_rev = SimpleImputer(strategy=search.best_params_['preprocessor__cat_num__strategy'])

# Preprocessing for categorical data
categorical_transformer_rev = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy=search.best_params_['preprocessor__cat__imputer__strategy'])),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


# Bundle preprocessing for numerical and categorical data
preprocessor_rev = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer_rev, numerical_columns),
        ('bin', binary_transformer_rev, binary_columns),
        ('cat_num', categorical_num_transformer_rev, cat_num_columns),
        ('cat', categorical_transformer_rev, categorical_columns)
    ])

# Bundle preprocessing and modeling code in a pipeline
pipeline_rev = Pipeline(steps=[('preprocessor', preprocessor_rev),
                              ('VarThresh', VarThresh)
                             ])

In [None]:
X_values = preprocessor_rev.fit_transform(X_train)
onehot_cat_columns = pipeline_rev.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names(input_features=categorical_columns)
onehot = onehot_cat_columns.tolist() 
columns_tot = numerical_columns + binary_columns + cat_num_columns + onehot

feature_importance = pd.Series(data=opt_xgb_clf.named_steps['classifier'].feature_importances_, index = np.array(columns_tot))

In [None]:
feature_importance = feature_importance.sort_values(ascending=False)

In [None]:
plt.figure(figsize=(20,100))
sns.barplot(y=feature_importance.index, 
            x=feature_importance.values, 
            orient='h')

In [None]:
from sklearn.model_selection import learning_curve

opt_xgb_clf_rev = opt_xgb_clf
train_sizes, train_scores, test_scores = learning_curve(opt_xgb_clf_rev, 
                                                        X_train, 
                                                        y_train, 
                                                        cv=5, 
                                                        scoring='roc_auc',
                                                        n_jobs=1,
                                                        train_sizes=np.linspace(.01, 1.0, 5), 
                                                        verbose=3)

In [None]:
# Plot learning curve
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10,8))
plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.1,
                 color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.1,
                 color="g")
plt.plot(train_sizes, train_scores_mean, 'd-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    
plt.ylim(0.5, 1.01)
   
    
plt.legend(loc='best')
plt.xlabel('Training examples')
plt.ylabel('ROC_AUC Score')
plt.title('Learning Curve (XGB)')

In [None]:
X_train_full = [X_train, X_test]
y_train_full= [y_train, y_test]

X_train_full = pd.concat(X_train_full)
y_train_full = pd.concat(y_train_full)

X_train_full.shape, y_train_full.shape

In [None]:
opt_xgb_clf.fit(X_train_full, y_train_full)

In [None]:
with open('../models/optimized_XGB.pickle', 'wb') as handle:
    pickle.dump(opt_xgb_clf, handle)