In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import lightgbm as lgb
from sklearn.metrics import f1_score
import numpy as np
from contextlib import contextmanager
import time
import sys
import gc 
sys.path.append("../fraud_detection/src/")
from util import s_to_time_format, string_to_datetime,hour_to_range

CATEGORY = ['ecfg', 'flbmk', 'flg_3dsmk', 'insfg', 'ovrlt', 'scity', 'csmcu', 'cano', 'mchno', 'hcefg', 'bacno', 'contp', 'etymd', 'acqic']


In [50]:
@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))
    
def main(args):
    with timer("Process train/test application"):
        #-------------------------
        # load dataset
        #-------------------------
        df_train = pd.read_csv(args.train_file)
        df_test = pd.read_csv(args.test_file)

        #-------------------------
        # pre-processing
        #-------------------------

        for cat in CATEGORY:
            df_train[cat] = df_train[cat].astype('category')#.cat.codes
            df_test[cat] = df_test[cat].astype('category')
            
        print("Train application df shape:", df_train.shape)
        print("Test application df shape:", df_test.shape)
        
        for df in [df_train, df_test]:
            # pre-processing
            df["loctm_"] = df.loctm.astype(int).astype(str)
            df.loctm_ = df.loctm_.apply(s_to_time_format).apply(string_to_datetime)
            # time-related feature
            df["loctm_hour_of_day"] = df.loctm_.apply(lambda x: x.hour).astype('category')
            df["loctm_minute_of_hour"] = df.loctm_.apply(lambda x: x.minute)
            df["loctm_second_of_min"] = df.loctm_.apply(lambda x: x.second)
            #df["loctm_absolute_time"] = [h*60+m for h,m in zip(df.loctm_hour_of_day,df.loctm_minute_of_hour)]
            df["hour_range"] = df.loctm_.apply(lambda x: hour_to_range(x.hour)).astype("category")
            # removed the columns no need
            df.drop(columns = ["loctm_"], axis = 1, inplace = True)
    return df_train, df_test

In [51]:
args = {
 "train_file":"/data/yunrui_li/fraud/dataset/train.csv",
 "test_file":"/data/yunrui_li/fraud/dataset/test.csv",
 "result_path":"/data/yunrui_li/fraud/fraud_detection/result/submission.csv",
 "feature_selection":False,
 "feature_importance_plot": True,
 "SEED": 1030,
 "NUM_FOLDS": 2, # 5
 "CPU_USE_RATE":1.0,
 "STRATIFIED": True,
 "TEST_NULL_HYPO":False,
 "NUM_LEAVES":31,
 "COLSAMPLE_BYTREE":1.0,
 "SUBSAMPLE": 1.0,
 "SUBSAMPLE_FREQ": 0,
 "MAX_DEPTH": -1,
 "REG_ALPHA": 0.0,
 "REG_LAMBDA": 0.0,
 "MIN_SPLIT_GAIN": 0.0,
 "MIN_CHILD_WEIGHT": 0.001,
 "MAX_BIN": 255,
 "SCALE_POS_WEIGHT": 3
    
}
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self
args = AttrDict(args)
args

{'train_file': '/data/yunrui_li/fraud/dataset/train.csv',
 'test_file': '/data/yunrui_li/fraud/dataset/test.csv',
 'result_path': '/data/yunrui_li/fraud/fraud_detection/result/submission.csv',
 'feature_selection': False,
 'feature_importance_plot': True,
 'SEED': 1030,
 'NUM_FOLDS': 2,
 'CPU_USE_RATE': 1.0,
 'STRATIFIED': True,
 'TEST_NULL_HYPO': False,
 'NUM_LEAVES': 31,
 'COLSAMPLE_BYTREE': 1.0,
 'SUBSAMPLE': 1.0,
 'SUBSAMPLE_FREQ': 0,
 'MAX_DEPTH': -1,
 'REG_ALPHA': 0.0,
 'REG_LAMBDA': 0.0,
 'MIN_SPLIT_GAIN': 0.0,
 'MIN_CHILD_WEIGHT': 0.001,
 'MAX_BIN': 255,
 'SCALE_POS_WEIGHT': 3}

In [58]:
df_train, df_test = main(args)


Train application df shape: (1521787, 23)
Test application df shape: (421665, 22)
Process train/test application - done in 79s


In [None]:
# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df_train, df_test, nan_as_category = True):
    df = pd.concat([df_train, df_test], axis = 0)
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

In [None]:
df = pd.concat([df_train, df_test], axis = 0)

In [25]:
original_columns = list(df.columns)
# categorical_columns = [col for col in df.columns if df[col].dtype == 'object']

In [71]:
CATEGORY = ['ecfg', 'flbmk', 'flg_3dsmk', 'insfg', 
            'ovrlt', 'scity', 'csmcu', 'cano', 
            'mchno', 'hcefg', 'bacno', 'contp', 
            'etymd', 'acqic', 'hour_range',"loctm_hour_of_day"]


In [72]:
from sklearn.preprocessing import LabelEncoder
for col in CATEGORY:
    try:
        le = LabelEncoder()
        le.fit(np.concatenate([df_train[col], df_test[col]]))
        df_train[col] = le.transform(df_train[col])
        df_test[col] = le.transform(df_test[col])
    except:
        df_train[col].cat.add_categories('NULL', inplace = True)
        df_test[col].cat.add_categories('NULL', inplace = True)
        le = LabelEncoder()
        le.fit(np.concatenate([df_train[col].fillna('NULL'), df_test[col].fillna('NULL')]))
        df_train[col] = le.transform(df_train[col].fillna('NULL'))
        df_test[col] = le.transform(df_test[col].fillna('NULL'))


In [73]:
df_train

Unnamed: 0,acqic,bacno,cano,conam,contp,csmcu,ecfg,etymd,flbmk,flg_3dsmk,...,mchno,ovrlt,scity,stocn,stscd,txkey,loctm_hour_of_day,loctm_minute_of_hour,loctm_second_of_min,hour_range
0,6862,112785,37846,513.80,5,0,0,0,0,0,...,59034,0,0,102,0,516056,17,26,52,0
1,0,133951,45476,465.62,5,0,0,2,0,0,...,0,0,5795,102,0,4376,10,51,14,2
2,6862,15350,187354,513.80,5,0,0,0,0,0,...,59034,0,0,102,0,483434,15,24,58,0
3,6697,156492,29812,1016.11,5,62,0,5,0,0,...,50185,0,3267,102,0,1407164,17,29,46,0
4,5959,105534,80881,713.66,5,62,0,4,0,0,...,93290,0,5795,102,0,1051004,18,21,29,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1521782,6305,90635,15111,578.38,5,75,1,8,1,1,...,38034,0,5795,102,0,1478280,19,16,42,3
1521783,3214,144503,115625,435.32,5,75,1,8,1,1,...,89666,0,1450,102,0,661087,10,23,38,2
1521784,6750,161473,93103,1.38,5,75,1,8,1,1,...,78823,0,5795,102,0,167073,23,46,18,3
1521785,6016,45215,196436,1.38,5,75,1,2,1,1,...,78823,0,5795,102,0,338215,21,52,18,3


In [74]:
from xgboost import XGBClassifier

In [106]:
def xgb_f1_score(y_pred, y_true):
    """evaluation metric"""
    y_hat = np.round(y_pred)
    y_true = y_true.get_label()
    return 'f1', 1-f1_score(y_true, y_hat)

clf = XGBClassifier(
    n_jobs = 1,
    max_depth=3,
    learning_rate=0.05,
    n_estimators=10000,
    silent=True,
    objective='binary:logistic',
    booster='gbtree',
    gamma=0, 
    min_child_weight=1, 
    max_delta_step=0, 
    subsample=0.8, 
    colsample_bytree=1, 
    colsample_bylevel=1, 
    colsample_bynode=0.8, 
    reg_alpha=0, 
    reg_lambda=1e-05,
    random_state=1030,
    scale_pos_weight=args.SCALE_POS_WEIGHT
    )
clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=0.8, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=10000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1030,
              reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=3, seed=None,
              silent=True, subsample=0.8, verbosity=1)

In [107]:
feats = [f for f in df_train.columns if f not in ["fraud_ind"]]
train_x, train_y = df_train[feats], df_train['fraud_ind']
valid_x, valid_y = df_train[feats], df_train['fraud_ind']

In [108]:
clf.fit(train_x, 
        train_y, 
        eval_set=[(train_x, train_y), (valid_x, valid_y)], 
        eval_metric= xgb_f1_score, 
        verbose= True, 
        early_stopping_rounds= 100, 
        #categorical_feature='auto'
       ) # early_stopping_rounds= 200

[0]	validation_0-error:0.013478	validation_1-error:0.013478	validation_0-f1:0.389438	validation_1-f1:0.389438
Multiple eval metrics have been passed: 'validation_1-f1' will be used for early stopping.

Will train until validation_1-f1 hasn't improved in 100 rounds.
[1]	validation_0-error:0.012532	validation_1-error:0.012532	validation_0-f1:0.370449	validation_1-f1:0.370449
[2]	validation_0-error:0.012468	validation_1-error:0.012468	validation_0-f1:0.390256	validation_1-f1:0.390256
[3]	validation_0-error:0.013299	validation_1-error:0.013299	validation_0-f1:0.436632	validation_1-f1:0.436632
[4]	validation_0-error:0.012431	validation_1-error:0.012431	validation_0-f1:0.398678	validation_1-f1:0.398678
[5]	validation_0-error:0.012491	validation_1-error:0.012491	validation_0-f1:0.389987	validation_1-f1:0.389987
[6]	validation_0-error:0.012459	validation_1-error:0.012459	validation_0-f1:0.389608	validation_1-f1:0.389608
[7]	validation_0-error:0.012395	validation_1-error:0.012395	validation_0-f

[73]	validation_0-error:0.012002	validation_1-error:0.012002	validation_0-f1:0.431489	validation_1-f1:0.431489
[74]	validation_0-error:0.011984	validation_1-error:0.011984	validation_0-f1:0.437876	validation_1-f1:0.437876
[75]	validation_0-error:0.011937	validation_1-error:0.011937	validation_0-f1:0.442795	validation_1-f1:0.442795
[76]	validation_0-error:0.011941	validation_1-error:0.011941	validation_0-f1:0.443069	validation_1-f1:0.443069
[77]	validation_0-error:0.011958	validation_1-error:0.011958	validation_0-f1:0.44219	validation_1-f1:0.44219
[78]	validation_0-error:0.011992	validation_1-error:0.011992	validation_0-f1:0.4396	validation_1-f1:0.4396
[79]	validation_0-error:0.01195	validation_1-error:0.01195	validation_0-f1:0.443935	validation_1-f1:0.443935
[80]	validation_0-error:0.011973	validation_1-error:0.011973	validation_0-f1:0.445358	validation_1-f1:0.445358
[81]	validation_0-error:0.012012	validation_1-error:0.012012	validation_0-f1:0.442118	validation_1-f1:0.442118
[82]	vali

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=0.8, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=10000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1030,
              reg_alpha=0, reg_lambda=1e-05, scale_pos_weight=3, seed=None,
              silent=True, subsample=0.8, verbosity=1)