In [1]:
# import useful packages

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

import seaborn as sns
sns.set()

# Notebook-wide matplotlib defaults: line width plus tick, axis-label,
# title and legend font settings applied to every figure drawn below.
mpl.rcParams.update({
    'lines.linewidth': 2.0,
    'xtick.labelsize': 13,
    'ytick.labelsize': 13,
    'axes.labelsize': 15,
    'axes.labelweight': 'heavy',
    'axes.titlesize': 18,
    'axes.titleweight': 'heavy',
    'legend.fontsize': 12,
})



from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score

from sklearn.pipeline import Pipeline

import category_encoders as ce

import os
import sys

In [2]:
# Both project folders are siblings of the current working directory,
# so resolve the shared parent once.
project_root = os.path.dirname(os.getcwd())
src_dir = os.path.join(project_root, 'adfraud')
data_dir = os.path.join(project_root, 'data')

# Put the 'adfraud' directory on the import path.
sys.path.append(src_dir)

# Read the sample CSV into a DataFrame.
data_filename = 'train_sample.csv'
data_location = os.path.join(data_dir, data_filename)
ad_data = pd.read_csv(data_location)

# Features are the ip/app/channel columns; the target is the
# 'is_attributed' column extracted as a NumPy array.
feature_columns = ['ip', 'app', 'channel']
feats_df = ad_data[feature_columns]
labels_df = ad_data['is_attributed'].values

In [3]:
# Single stratified shuffle split on the labels: 10% of the rows are held
# out as the test set, with a fixed seed so the partition is repeatable.
train_test_splitter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.1,
    random_state=42,
)

# The splitter only needs the labels to stratify; a zero vector of matching
# length stands in for the feature matrix.
split = train_test_splitter.split(np.zeros(len(labels_df)), labels_df)
train_inds, test_inds = next(split)

x_train, x_test = feats_df.iloc[train_inds], feats_df.iloc[test_inds]
y_train, y_test = labels_df[train_inds], labels_df[test_inds]

In [4]:
# Sanity check of the split: partition shapes, then the fraction of
# positive labels in the test and train sets (should be nearly equal).
print(x_train.shape, x_test.shape, sep='\n')
print(np.sum(y_test) / len(y_test), np.sum(y_train) / len(y_train), sep='\n')

(90000, 3)
(10000, 3)
0.0023
0.002266666666666667


In [5]:

# feature encoder: hash the 'app' and 'channel' columns into 50 components
hasher = ce.hashing.HashingEncoder(n_components=50, cols=['app', 'channel'])

# classifier: random forest with class 1 weighted far above class 0
# (0.999 vs 0.001). n_estimators and max_depth here are the baseline values
# used when the pipeline is fitted directly; the grid search overrides them.
# random_state is fixed so repeated runs build the same forest, matching the
# seeded splitters used elsewhere in this notebook.
clf = RandomForestClassifier(
    n_estimators=20,
    max_depth=5,
    class_weight={0: 0.001, 1: 0.999},
    random_state=42,
)


In [6]:
# Cross-validation scheme: 5 stratified shuffle splits, each holding out
# 20% of the rows it is given, with a fixed seed.
cv = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Two-step pipeline: hashing encoder ('hash') feeding the random forest ('forest').
pipeline_steps = [
    ('hash', hasher),
    ('forest', clf),
]
model = Pipeline(steps=pipeline_steps)

In [8]:
# Hyper-parameter grid for the pipeline. Keys follow the
# '<step name>__<parameter>' convention; every list currently holds a
# single value, so the search evaluates exactly one candidate.
param_grid = dict(
    forest__n_estimators=[50],
    forest__max_depth=[10],
    forest__class_weight=[{0: 0.001, 1: 0.999}],
    hash__n_components=[50],
)


In [9]:
# Grid search over `param_grid`, scored by ROC AUC on the `cv` splits,
# with per-fit progress logging.
model_cv = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    verbose=50,
)
model_cv

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=5, random_state=42, test_size=0.2,
            train_size=None),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('hash',
                                        HashingEncoder(cols=['app', 'channel'],
                                                       drop_invariant=False,
                                                       hash_method='md5',
                                                       n_components=50,
                                                       return_df=True,
                                                       verbose=0)),
                                       ('forest',
                                        RandomForestClassifier(bootstrap=True,
                                                               class_w...
                                                               n_estimators=20,
                       

In [10]:
# Run the cross-validated grid search on the training partition
# (slow: the captured log shows roughly a minute per fold).
model_cv.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] forest__class_weight={0: 0.001, 1: 0.999}, forest__max_depth=10, forest__n_estimators=50, hash__n_components=50 
[CV]  forest__class_weight={0: 0.001, 1: 0.999}, forest__max_depth=10, forest__n_estimators=50, hash__n_components=50, score=0.929, total=  54.5s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   54.5s remaining:    0.0s
[CV] forest__class_weight={0: 0.001, 1: 0.999}, forest__max_depth=10, forest__n_estimators=50, hash__n_components=50 


KeyboardInterrupt: 

In [None]:
# Report the grid-search outcome: best parameters, refit time,
# best cross-validated score and the full results table.
for result_attr in ('best_params_', 'refit_time_', 'best_score_', 'cv_results_'):
    print(getattr(model_cv, result_attr))

In [None]:
# Score the held-out test set with the grid-searched model; column 1 of
# the probability matrix is used as the positive-class score.
preds = model_cv.predict_proba(x_test)
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_test, y_score=preds[:, 1])
auc = metrics.roc_auc_score(y_true=y_test, y_score=preds[:, 1])
print(auc)

In [None]:
# Fit the plain pipeline directly on the training partition, using the
# parameters `hasher` and `clf` were constructed with.
model.fit(X=x_train, y=y_train)

In [None]:
# Class-probability predictions of the directly fitted pipeline on the test set.
# NOTE: this rebinds `preds`, replacing any earlier grid-search predictions.
preds = model.predict_proba(X=x_test)

In [None]:
# ROC curve and AUC on the test set for the current `preds`;
# column 1 is used as the positive-class score.
fpr, tpr, thresholds = metrics.roc_curve(y_true=y_test, y_score=preds[:, 1])
auc = metrics.roc_auc_score(y_true=y_test, y_score=preds[:, 1])
print(auc)

In [None]:
# ROC curve (red stars) against the chance diagonal (dashed black);
# the AUC is shown in the legend.
chance_line = np.linspace(0, 1, 20)

ax = plt.gca()
ax.plot(fpr, tpr, 'r*', label='area={:.4f}'.format(auc))
ax.plot(chance_line, chance_line, 'k--')

ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC')

ax.legend(frameon=False)

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.0])
plt.show()

In [None]:
def pred_from_prob(a,thrshld):
    """Convert class probabilities into hard 0/1 predictions.

    Parameters
    ----------
    a : 2-D array
        One row per sample; column 1 holds the score that is compared
        against the threshold.
    thrshld : scalar
        Cut-off; the comparison is strict (``>``).

    Returns
    -------
    1-D float array with one entry per row of ``a``: 1.0 where
    ``a[:, 1] > thrshld`` and 0.0 elsewhere.
    """
    above_cutoff = a[:, 1] > thrshld
    return np.where(above_cutoff, 1.0, 0.0)

In [None]:
# Accuracy at every ROC threshold: overall (acc_tot), restricted to the
# actual negatives (acc_0) and restricted to the actual positives (acc_1).
#
# The class masks do not depend on the threshold, so they are built once
# here instead of being recomputed on every iteration.
# NOTE(review): pred_from_prob uses a strict '>' whereas roc_curve counts
# score >= threshold as positive, so acc_1 may lag tpr by one step -
# confirm whether that offset is intended.
is_positive = y_test == 1
is_negative = y_test == 0

acc_tot = []
acc_0 = []
acc_1 = []
for th in thresholds:
    binary_preds = pred_from_prob(preds, th)
    acc_tot.append(accuracy_score(y_test, binary_preds))
    acc_1.append(accuracy_score(y_test[is_positive], binary_preds[is_positive]))
    acc_0.append(accuracy_score(y_test[is_negative], binary_preds[is_negative]))


In [None]:
# Wide canvas for the two side-by-side panels.
mpl.rcParams['figure.figsize']=[15.0,5.0]

# Left panel: ROC curve with the chance diagonal, overlaid with the
# accuracy on the actual positives plotted against FPR.
plt.subplot(1,2,1)
plt.plot(fpr,tpr,'r*', label='roc auc={:.4f}'.format(auc))
plt.plot(np.linspace(0,1,20),np.linspace(0,1,20),'k--')
plt.plot(fpr,acc_1,'y--',alpha=0.9,label='accuracy - cases:1')

plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC')

plt.legend(frameon=False)

plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])

# Right panel: per-class accuracy as a function of the decision threshold,
# with every fifth FPR point for reference.
plt.subplot(1,2,2)
plt.plot(thresholds,acc_0,'go--',label='cases: 0',alpha=0.3)
plt.plot(thresholds,acc_1,'yo--',label='cases: 1')
plt.plot(thresholds[::5],fpr[::5],'c*',alpha=0.2,label='fpr')

# Both axes are clamped to [0, 1]. The original called xlim twice and never
# set the y range; the second call is now ylim, matching the left panel.
plt.xlim([0,1])
plt.ylim([0,1])

plt.xlabel('threshold')
plt.ylabel('accuracy')
plt.title('accuracy')

plt.legend(frameon=False)

plt.show()