In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import os

import numpy as np
import pandas as pd
import tensorflow as tf

import atecml.data

from contextlib import contextmanager
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegressionCV
#build Models...
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from imblearn.over_sampling import SMOTE, ADASYN
import random

plt.style.use('ggplot')

In [39]:
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='binary_error', model_type='gbdt',
                 feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=50, categorical_features=None):
    
    lgb_params = {
        'boosting_type': model_type,
        'objective': objective,
        'metric': metrics,
        'use_missing' : 'true',
        'learning_rate': 0.05,
        'num_leaves': 64,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        #'min_child_samples': 600,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 511,  # Number of bucketed bin for feature values
        #'colsample_bytree': 0.9,
        #'subsample': 0.85,  # Subsample ratio of the training instance.
        #'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
        #'min_child_weight': 0.05,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        #'subsample_for_bin': 200000,  # Number of samples for constructing bin
        #'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        #'reg_alpha': 0.01,  # L1 regularization term on weights
        #'reg_lambda': 0.1,  # L2 regularization term on weights
        'nthread': 40,
        'verbose': -1,
    }

    lgb_params.update(params)
    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    evals_results = {}

    bst1 = lgb.train(lgb_params, 
                     xgtrain, 
                     valid_sets=[xgtrain, xgvalid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=verbose_eval, 
                     feval=feval)

    return bst1

def model_validation(train_df,val_df,predictors,target,params):
    categorical=[]
    for item in predictors:
        if (item in atecml.data.CATE_FEATURE_LIST):
            categorical.append(item)
            
    if (target == 'Normal'):
        params = {
            'scale_pos_weight' : 0.01,
        } 
    else:
        params = {
            'scale_pos_weight' : 60,
        }
    bst = lgb_modelfit_nocv(params, 
                        train_df, 
                        val_df, 
                        predictors, 
                        target, 
                        model_type='dart',
                        objective='binary', 
                        #metrics ='binary',
                        metrics={'auc'},
                        early_stopping_rounds=100, 
                        verbose_eval=50, 
                        num_boost_round=1000, 
                        categorical_features=categorical
                        )
    y_predict = bst.predict(val_df[predictors])
    if (target == 'Normal'):
        y_predict = 1 - y_predict
    atec_Score,b,c=atecml.data.accuracy_validation(val_df['Fraud'],y_predict)
    return bst,atec_Score

In [3]:
train_df = pd.read_pickle('./01_train.dat')

predictors = [x for x in train_df.columns if x not in atecml.data.NOT_FEATURE_COLUMNS]
#predictors = ['mean', 'n_f7', 'f7', 'f15', 'f238', 'f253', 'f210', 'f248', 'f243', 'f247', 'f18', 'n_f6', 'f218', 'f5', 'f234', 'f19', 'f82', 'f4', 'f99', 'f237', 'f215', 'f246', 'n_NaN_LIST', 'f81', 'f84', 'f209', 'f101', 'f85', 'f106', 'f245', 'f17', 'f217', 'f31', 'f216', 'f244', 'f86', 'f242', 'f235', 'f233', 'f58', 'n_f29', 'f204', 'f236', 'f6', 'f8', 'n_f244', 'f250', 'f105', 'f11', 'f53', 'n_f28', 'n_f236', 'f241', 'n_f245', 'f14', 'f226', 'n_f238', 'n_f215', 'f232', 'f223', 'f83', 'f30', 'f266', 'n_f284', 'f214', 'f207', 'f100', 'f252', 'f239', 'f9', 'f231', 'f164', 'n_f235', 'f262', 'f27', 'f208', 'f10', 'f229', 'f55', 'f161', 'n_f30', 'f263', 'f230', 'f12', 'f222', 'f52', 'n_f14', 'f249', 'f219', 'f163', 'n_f294', 'n_f25', 'f240', 'f110', 'f16', 'n_f237', 'n_f17', 'f98', 'f63', 'n_f31', 'f104', 'n_f21', 'f211', 'f3', 'f178', 'f80', 'n_f210', 'n_f208', 'f224', 'f56', 'f213', 'n_f26', 'f251', 'n_f216', 'f57', 'f13', 'f284', 'f103', 'f206', 'f227', 'n_f105', 'f34', 'f291', 'n_f287', 'f212', 'f225', 'n_f24', 'n_f52', 'f185', 'f205', 'n_f286', 'f285', 'f221', 'f286', 'f54', 'f26', 'n_f262', 'f162', 'n_f33', 'f35', 'f278', 'n_f279', 'f73', 'n_f20', 'n_f293', 'f271', 'n_f234', 'f287', 'f79', 'f290', 'f33', 'f102', 'f1', 'n_f285', 'n_f106', 'f264', 'n_f289', 'f220', 'f265', 'n_f49', 'f184', 'f32', 'f75', 'f228', 'n_f95', 'n_f90', 'f50', 'f134', 'f270', 'f165', 'f296', 'f2', 'n_f266', 'f294', 'f283', 'n_f182', 'n_f50', 'n_f291', 'n_f209', 'f259', 'n_f175', 'f78', 'n_f5', 'n_f290', 'f61', 'f123', 'f183', 'f65', 'n_f53', 'f289', 'n_f34', 'n_f295', 'f297', 'n_f51', 'f62', 'f49', 'f21', 'f130', 'f48', 'n_f32']
#predictors = ['mean', 'n_f7', 'f7', 'f210', 'f238', 'f15', 'f253', 'f5', 'f99', 'f243', 'f82', 'f248', 'f247', 'f84', 'f234', 'f209', 'f106', 'n_NaN_LIST', 'f18', 'f218', 'n_f6', 'f101', 'f85', 'f204', 'f237', 'f81', 'f19', 'f86', 'f31', 'f215', 'f246', 'n_f28', 'f58', 'n_f284', 'f207', 'f53', 'n_f29', 'f178', 'f266', 'f244', 'f216', 'f4', 'f245', 'f55', 'f242', 'n_f244', 'f217', 'f235', 'f105', 'f9', 'f83', 'f233', 'n_f236', 'f262', 'n_f215', 'f17', 'n_f208', 'f236', 'f100', 'f214', 'f98', 'f30', 'n_f238', 'f208', 'n_f245', 'f6', 'f63', 'n_f25', 'f8', 'f263', 'f110', 'f130', 'f250', 'n_f235', 'f11', 'n_f210', 'n_f21', 'f14', 'n_f294', 'f291', 'f241', 'f226', 'f104', 'f10', 'f185', 'f231', 'n_f105', 'f52', 'f284', 'f161', 'f294', 'f103', 'f223', 'f27', 'f56', 'f278', 'f232', 'f16', 'f54', 'f239', 'f57', 'f285', 'f102', 'f206', 'f26', 'n_f30', 'f287', 'f75', 'f73', 'n_f287', 'f290', 'f12', 'n_f175', 'f264', 'f265', 'f184', 'f249', 'f164', 'f225', 'f240', 'n_f286', 'f252', 'f230', 'n_f20', 'f183', 'f34', 'n_f285', 'f271', 'f13', 'f286', 'n_f26', 'f205', 'f297', 'n_f237', 'n_f106', 'n_f182', 'f222', 'f80', 'f224', 'f134', 'f251', 'f283', 'f3', 'f79', 'n_f90', 'f259', 'f211', 'f219', 'n_f33', 'f21', 'f289', 'n_f289', 'n_f216', 'n_f209', 'n_f49', 'f227', 'n_f14', 'f163', 'n_f290', 'f48']

DateFold={}

DateFold[0] = set(atecml.data.filter_date(train_df,start_date='2017-09-05',end_date='2017-09-12').index)
DateFold[1] = set(atecml.data.filter_date(train_df,start_date='2017-09-13',end_date='2017-09-20').index)
DateFold[2] = set(atecml.data.filter_date(train_df,start_date='2017-09-21',end_date='2017-09-28').index)
DateFold[3] = set(atecml.data.filter_date(train_df,start_date='2017-09-29',end_date='2017-10-06').index)
DateFold[4] = set(atecml.data.filter_date(train_df,start_date='2017-10-07',end_date='2017-10-14').index)
DateFold[5] = list(atecml.data.filter_date(train_df,start_date='2017-10-15',end_date='2017-11-24').index)

all_list = set(train_df.index) - set(DateFold[5])
len(all_list),len(DateFold[5])

(634284, 360447)

In [4]:
pos_model_list  =[]
neg_model_list  =[]
score_posA = []
score_negA = []

'''
for idx in range(0,5):
    Train_DataSet = train_df[train_df.index.isin(list(all_list - DateFold[idx]))].reset_index(drop=True)
    Val_DataSet = train_df[train_df.index.isin(DateFold[5])].reset_index(drop=True)
    
    Normal_Set = Train_DataSet.copy()
    N_Fraud_DF = Normal_Set[Normal_Set['Normal']==0]
    fraud_num = len(Normal_Set[Normal_Set['Normal']==0])
    normal_num = len(Normal_Set[Normal_Set['Normal']==1])
    weight = normal_num // fraud_num -1
    n_templist = [Normal_Set]
    for item in range (0,weight):
        n_templist.append(N_Fraud_DF)
    Normal_Set =pd.concat(n_templist,ignore_index=True)
    post_fraud_num = len(Normal_Set[Normal_Set['Normal']==0])
    post_normal_num = len(Normal_Set[Normal_Set['Normal']==1])
    print('Normal Weight:',post_fraud_num,post_normal_num)
    

    Fraud_Set = Train_DataSet.copy()
    F_Fraud_DF = Fraud_Set[Fraud_Set['Fraud']==1]
    f_normal_num = len(Fraud_Set[Fraud_Set['Fraud']==0])
    f_fraud_num = len(Fraud_Set[Fraud_Set['Fraud']==1])
    fweight = f_normal_num // f_fraud_num -1
    f_templist = [Fraud_Set]
    for item in range (0,fweight):
        f_templist.append(F_Fraud_DF)
    Fraud_Set =pd.concat(f_templist,ignore_index=True)
    fpost_fraud_num = len(Fraud_Set[Fraud_Set['Fraud']==0])
    fpost_normal_num = len(Fraud_Set[Fraud_Set['Fraud']==1])
    print('Fraud Weight:',fpost_fraud_num,fpost_normal_num)    
    
    
    model_pos,score_pos = model_validation(Normal_Set,Val_DataSet,predictors,'Normal',{})
    model_neg,score_neg = model_validation(Fraud_Set,Val_DataSet,predictors,'Fraud',{})
    pos_model_list.append(model_pos)
    neg_model_list.append(model_neg)
    score_posA.append(score_pos)
    score_negA.append(score_neg)
'''    

"\nfor idx in range(0,5):\n    Train_DataSet = train_df[train_df.index.isin(list(all_list - DateFold[idx]))].reset_index(drop=True)\n    Val_DataSet = train_df[train_df.index.isin(DateFold[5])].reset_index(drop=True)\n    \n    Normal_Set = Train_DataSet.copy()\n    N_Fraud_DF = Normal_Set[Normal_Set['Normal']==0]\n    fraud_num = len(Normal_Set[Normal_Set['Normal']==0])\n    normal_num = len(Normal_Set[Normal_Set['Normal']==1])\n    weight = normal_num // fraud_num -1\n    n_templist = [Normal_Set]\n    for item in range (0,weight):\n        n_templist.append(N_Fraud_DF)\n    Normal_Set =pd.concat(n_templist,ignore_index=True)\n    post_fraud_num = len(Normal_Set[Normal_Set['Normal']==0])\n    post_normal_num = len(Normal_Set[Normal_Set['Normal']==1])\n    print('Normal Weight:',post_fraud_num,post_normal_num)\n    \n\n    Fraud_Set = Train_DataSet.copy()\n    F_Fraud_DF = Fraud_Set[Fraud_Set['Fraud']==1]\n    f_normal_num = len(Fraud_Set[Fraud_Set['Fraud']==0])\n    f_fraud_num = len

In [28]:
idx=2

Train_DataSet = train_df[train_df.index.isin(list(all_list - DateFold[idx]))].reset_index(drop=True)
Val_DataSet = train_df[train_df.index.isin(DateFold[5])].reset_index(drop=True)

#Train_DataSet = Train_DataSet[Train_DataSet['mean'] >0.5].reset_index(drop=True)
#Val_DataSet = Val_DataSet[Val_DataSet['mean']>0.5].reset_index(drop=True)

Normal_Set = Train_DataSet.copy()
N_Fraud_DF = Normal_Set[Normal_Set['Normal']==0]
fraud_num = len(Normal_Set[Normal_Set['Normal']==0])
normal_num = len(Normal_Set[Normal_Set['Normal']==1])
weight = normal_num // fraud_num -1
print(weight)
n_templist = [Normal_Set]
for item in range (0,weight):
    n_templist.append(N_Fraud_DF)
Normal_Set =pd.concat(n_templist,ignore_index=True)
post_fraud_num = len(Normal_Set[Normal_Set['Normal']==0])
post_normal_num = len(Normal_Set[Normal_Set['Normal']==1])
print('Normal Weight:',post_fraud_num,post_normal_num)


Fraud_Set = Train_DataSet.copy()
F_Fraud_DF = Fraud_Set[Fraud_Set['Fraud']==1]
f_normal_num = len(Fraud_Set[Fraud_Set['Fraud']==0])
f_fraud_num = len(Fraud_Set[Fraud_Set['Fraud']==1])
fweight = f_normal_num // f_fraud_num -1
f_templist = [Fraud_Set]
for item in range (0,fweight):
    f_templist.append(F_Fraud_DF)
Fraud_Set =pd.concat(f_templist,ignore_index=True)
fpost_fraud_num = len(Fraud_Set[Fraud_Set['Fraud']==0])
fpost_normal_num = len(Fraud_Set[Fraud_Set['Fraud']==1])
print('Fraud Weight:',fpost_fraud_num,fpost_normal_num)    

60
Normal Weight: 493124 500930
Fraud Weight: 503449 500850


In [35]:
predictors = [x for x in train_df.columns if x not in atecml.data.NOT_FEATURE_SUM]

In [33]:
#predictors = ['mean', 'n_f7', 'f7', 'f210', 'f238', 'f15', 'f253', 'f5', 'f99', 'f243', 'f82', 'f248', 'f247', 'f84', 'f234', 'f209', 'f106', 'n_NaN_LIST', 'f18', 'f218', 'n_f6', 'f101', 'f85', 'f204', 'f237', 'f81', 'f19', 'f86', 'f31', 'f215', 'f246', 'n_f28', 'f58', 'n_f284', 'f207', 'f53', 'n_f29', 'f178', 'f266', 'f244', 'f216', 'f4', 'f245', 'f55', 'f242', 'n_f244', 'f217', 'f235', 'f105', 'f9', 'f83', 'f233', 'n_f236', 'f262', 'n_f215', 'f17', 'n_f208', 'f236', 'f100', 'f214', 'f98', 'f30', 'n_f238', 'f208', 'n_f245', 'f6', 'f63', 'n_f25', 'f8', 'f263', 'f110', 'f130', 'f250', 'n_f235', 'f11', 'n_f210', 'n_f21', 'f14', 'n_f294', 'f291', 'f241', 'f226', 'f104', 'f10', 'f185', 'f231', 'n_f105', 'f52', 'f284', 'f161', 'f294', 'f103', 'f223', 'f27',]

In [None]:
model_neg,score_neg = model_validation(Fraud_Set,Val_DataSet,predictors,'Fraud',{})

preparing validation datasets
Training until validation scores don't improve for 100 rounds.
[50]	train's auc: 0.953141	valid's auc: 0.918661
[100]	train's auc: 0.980974	valid's auc: 0.955337
[150]	train's auc: 0.985333	valid's auc: 0.963471
[200]	train's auc: 0.988549	valid's auc: 0.968093
[250]	train's auc: 0.990976	valid's auc: 0.971059
[300]	train's auc: 0.992618	valid's auc: 0.97367
[350]	train's auc: 0.993302	valid's auc: 0.975483
[400]	train's auc: 0.994037	valid's auc: 0.976929
[450]	train's auc: 0.995124	valid's auc: 0.974533
[500]	train's auc: 0.995678	valid's auc: 0.97359
Early stopping, best iteration is:
[405]	train's auc: 0.994159	valid's auc: 0.97705


In [None]:
importance = model_neg.feature_importance(importance_type='split')
feature_name = model_neg.feature_name()
# for (feature_name,importance) in zip(feature_name,importance):
#     print (feature_name,importance) 
feature_importance = pd.DataFrame({'feature_name':feature_name,'importance':importance} )
feature_importance.to_csv('feature_importance.csv',index=False)



In [None]:
list(feature_importance.sort_values('importance',ascending=False).head(160)['feature_name'])