## Test adfraud/models.py on data ##

In [1]:
# ----------- import useful packages ------------------

import pandas as pd
import numpy as np

# ---------- plotting imports and setup --------------
# import matplotlib.pyplot as plt
# import matplotlib as mpl
%matplotlib inline

# import seaborn as sns
# sns.set()

# mpl.rcParams['lines.linewidth']=2.0
# mpl.rcParams['xtick.labelsize']=13
# mpl.rcParams['ytick.labelsize']=13
# mpl.rcParams['axes.labelsize']=15
# mpl.rcParams['axes.labelweight']='heavy'
# mpl.rcParams['axes.titlesize']=18
# mpl.rcParams['axes.titleweight']='heavy'
# mpl.rcParams['legend.fontsize']=12

# --------------- ML imports -------------------------

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline

import category_encoders as ce

# ---------- filesystem imports -----------------------
import os
import sys

#### import own modules from adfraud ####

In [2]:
src_dir = os.path.join(os.path.dirname(os.getcwd()))
sys.path.append(src_dir)

from adfraud import models

#### load in the data ####

In [3]:
data_dir = os.path.join(os.path.dirname(os.getcwd()),'data')

data_filename = 'train_sample.csv'
data_location = os.path.join(data_dir,data_filename)
ad_data = pd.read_csv(data_location)

#### process the data ####

 - change 'click time' field into datetime format and only keep the hour
 - split data into features and labels

In [4]:

ad_data['click_time'] = pd.to_datetime(ad_data['click_time'],format='%Y-%m-%d %H:%M:%S')
ad_data['click_time'] = ad_data['click_time'].dt.hour 

feats_df = ad_data[['ip','app','channel','click_time','os','device']]
labels   = ad_data['is_attributed'].values


#### Split Data into train and test sets ####

Since the data is heavily imblanced - use a Stratified split - which keeps the number of minority class cases to be the same fraction of the data in both sets

keep 10% of data for testing. Note that in cross validation of models, the training data will be further split into training and validation sets (again with Stratified split)


In [5]:
train_test_splitter = StratifiedShuffleSplit(n_splits=1,test_size=0.8,random_state=42)
split = train_test_splitter.split(np.zeros((len(labels),)),labels)
train_inds,test_inds = next(split)
x_train = feats_df.iloc[train_inds]
y_train = labels[train_inds]
x_test  = feats_df.iloc[test_inds]
y_test  = labels[test_inds] 

#### set up a parameter grid to search over for optimal Hyperparameters ####


note that ideally would do a rough search followed by a finer search in the vicinity of the best point in the parameter space.
Alse note that a random search may perform better than grid search.

Above to be considered for further development of solution to problem


In [6]:
param_grid = {
    'forest__n_estimators':[50],
    'forest__max_depth': [10],
    'forest__class_weight': [
                           {0:0.001,1:0.999}
                            ],
    'hash__n_components': [40]
}

#### define models using models.py #####

Here define two different models, both using models.HashForest() which implements a model (pipeline) for the hashing trick on chosen features, and then applies a random forest to the hashed data.

Below, initialize two different models - looking at different sets of features, in the second case we also include the 'click_time' feature column - and it is also hashed.

In [7]:
# only include ip,app,channel in model - hash only app,channel
model_ip_app_chan      = models.HashForest(['ip','app','channel'],['app','channel'])
# as above but alos hash the ip (hash components will be same though, so expect worse performance)
#model_iphash_app_chan  = models.HashForest(['ip','app','channel'],['ip','app','channel'])
# here include ip,app,channel,click_time and hash all except ip
model_ip_app_chan_time = models.HashForest(['ip','app','channel','click_time'],['app','channel','click_time'])

#### train the models and find the Area Under the ROC ####

note that inside HashForest, the grid search cross validation is perfomred by scoring on AUC - as this is what we wish to optimize 

In [None]:
mods = [model_ip_app_chan,model_ip_app_chan_time]
auc = []
for m in mods:
    m.train_CV(x_train,y_train,param_grid,n_splits=3)
    auc.append(m.test_auc(x_test,y_test))


Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] forest__class_weight={0: 0.001, 1: 0.999}, forest__max_depth=10, forest__n_estimators=50, hash__n_components=40 
[CV]  forest__class_weight={0: 0.001, 1: 0.999}, forest__max_depth=10, forest__n_estimators=50, hash__n_components=40, score=0.837, total=  11.7s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.7s remaining:    0.0s
[CV] forest__class_weight={0: 0.001, 1: 0.999}, forest__max_depth=10, forest__n_estimators=50, hash__n_components=40 
[CV]  forest__class_weight={0: 0.001, 1: 0.999}, forest__max_depth=10, forest__n_estimators=50, hash__n_components=40, score=0.891, total=  11.7s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   23.4s remaining:    0.0s
[CV] forest__class_weight={0: 0.001, 1: 0.999}, forest__max_depth=10, forest__n_estimators=50, hash__n_components=40 
[CV]  forest__class_weight={0: 0.001, 1: 0.999}, fores

#### See how the models performed ####

 - We can check the area under the (roc) curve on the test set
 - compare that to the perfomrance on the training set (for the optimal hyperparams)
 - we can see which hyperparams were selected in validation
 - see auc score for each model during validation

In [None]:
print(auc)
print([m.train_auc for m in mods])
print([m.model_cv.best_params_ for m in mods])
print([m.model_cv.cv_results_['mean_test_score'] for m in mods])

In [None]:
mods[0].plot_roc_acc(x_test,y_test)

In [None]:
#model_ip_app_chan.plot_roc_acc(x_test)
#model_ip_app_chan.train_auc