In [1]:
import pandas as pd
import numpy as np
import warnings
import time
warnings.filterwarnings("ignore")
import lightgbm as lgb
from bayes_opt import BayesianOptimization
import sys
sys.path.append("../fraud_detection/src/")
from util import lgb_f1_score

In [2]:
import sys
sys.path.append("../fraud_detection/src/")
import time
import argparse
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from contextlib import contextmanager
import gc 
from util import s_to_time_format, string_to_datetime, hour_to_range, kfold_lightgbm, kfold_xgb
from util import rolling_stats_target_by_cols
#from util import _time_elapsed_between_last_transactions,time_elapsed_between_last_transactions
#from util import num_transaction_in_past_n_days
#from util import add_auto_encoder_feature
#from util import group_target_by_cols_split_by_users
from time import strftime, localtime
import logging
import sys
from config import Configs

# logging
# Root logger: INFO and above goes to stdout and to a timestamped log file.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Re-running this cell used to stack one more stream/file handler pair on the
# root logger, so every message was emitted once per run of the cell. Handlers
# installed here are tagged, and any previously tagged pair is removed first.
for _stale_handler in [h for h in logger.handlers if getattr(h, "_fs_notebook_handler", False)]:
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

_stream_handler = logging.StreamHandler(sys.stdout)
_stream_handler._fs_notebook_handler = True
logger.addHandler(_stream_handler)

# One log file per minute-resolution timestamp of when this cell ran.
log_file = '../fraud_detection/result/fs_{}.log'.format(strftime("%y%m%d-%H%M", localtime()))
_file_handler = logging.FileHandler(log_file)
_file_handler._fs_notebook_handler = True
logger.addHandler(_file_handler)

def group_target_by_cols(df_train, df_test, recipe):
    """Add group-aggregate features computed over train and test combined.

    The aggregates are always computed on the concatenation of the two input
    frames as they were passed in; columns added by earlier recipe entries are
    not part of that concatenation.

    Args:
        df_train: training frame containing every grouping/target column
            named in ``recipe``.
        df_test: test frame containing the same grouping/target columns.
        recipe: sequence of entries where ``entry[0]`` is a list of grouping
            column names and ``entry[1]`` is a sequence of aggregations; each
            aggregation has the target column at index 0 and an aggregation
            method name (a string passed to ``GroupBy.agg``) at index 1.

    Returns:
        ``(df_train, df_test)``: new frames with one extra column per
        aggregation, named ``"<method>_<target>_BY_<col1>_<col2>..."`` and
        left-joined on the grouping columns.
    """
    combined = pd.concat([df_train, df_test], axis=0)
    for entry in recipe:
        cols, aggregations = entry[0], entry[1]
        for aggregation in aggregations:
            target, method = aggregation[0], aggregation[1]
            feature_name = method + "_" + target + '_BY_' + '_'.join(cols)
            grouped = (
                combined[cols + [target]]
                .groupby(cols)
                .agg(method)
                .reset_index()
                .rename(columns={target: feature_name})
            )
            df_train = df_train.merge(grouped, how='left', on=cols)
            df_test = df_test.merge(grouped, how='left', on=cols)
            # Release the per-aggregation table here, inside the loop: the old
            # code deleted it after the loop and raised UnboundLocalError when
            # an entry had no aggregations.
            del grouped

        # reduce memory once per recipe entry
        gc.collect()

    return df_train, df_test

@contextmanager
def timer(title):
    """Context manager that logs how long the ``with`` body took.

    On normal exit it writes ``"<title> - done in <N>s"`` (whole seconds) to
    the module logger at INFO level. If the body raises, the exception
    propagates and no line is logged.
    """
    started_at = time.time()
    yield
    elapsed_seconds = time.time() - started_at
    logger.info("{} - done in {:.0f}s".format(title, elapsed_seconds))
    
# Ordered feature-engineering steps run by main() after pre-processing.
# Each entry is (timer title, kind, payload):
#   kind "recipe": payload is the name of a Configs attribute handed to
#                  group_target_by_cols (looked up only when the step runs).
#   kind "latent": payload is a sequence of (csv path, join key) pairs merged
#                  in order onto both frames.
_FEATURE_STEPS = (
    ("Add bacno/cano feature", "recipe", "CONAM_AGG_RECIPE_1"),
    ("Add iterm-related feature", "recipe", "ITERM_AGG_RECIPE"),
    ("Add conam-related feature", "recipe", "CONAM_AGG_RECIPE_2"),
    ("Add hour-related feature", "recipe", "HOUR_AGG_RECIPE"),
    ("Add cano/conam feature", "recipe", "CANO_CONAM_COUNT_RECIPE"),
    ("Add cano/bacno latent feature", "latent", (
        ("../fraud_detection/features/bacno_latent_features_w_cano.csv", "bacno"),
        ("../fraud_detection/features/bacno_cano_latent_features.csv", "cano"),
    )),
    ("Add locdt-related feature", "recipe", "LOCDT_CONAM_RECIPE"),
    ("Add mchno-related feature", "recipe", "MCHNO_CONAM_RECIPE"),
    ("Add scity-related feature", "recipe", "SCITY_CONAM_RECIPE"),
    ("Add stocn-related feature", "recipe", "STOCN_CONAM_RECIPE"),
    ("Add mchno/bacno latent feature", "latent", (
        ("../fraud_detection/features/bacno_latent_features_w_mchno.csv", "bacno"),
        ("../fraud_detection/features/bacno_mchno_latent_features.csv", "mchno"),
    )),
    ("Add time second-level feature on bacno", "recipe", "HOUR_AGG_SEC_LEVEL_RECIPE_BACNO"),
    ("Add time second-level feature on cano", "recipe", "HOUR_AGG_SEC_LEVEL_RECIPE_CANO"),
    ("Add time second-level feature on mchno", "recipe", "HOUR_AGG_SEC_LEVEL_RECIPE_MCHNO"),
    ("Add time second-level feature on csmcu/stocn/scity", "recipe", "HOUR_AGG_SEC_LEVEL_RECIPE"),
    ("Add time second-level feature on acqic/csmcu/stocn/scity", "recipe", "HOUR_AGG_SEC_LEVEL_RECIPE_2"),
    ("Add conam-related feature v3", "recipe", "CONAM_AGG_RECIPE_3"),
    ("Add locdt-related feature v2", "recipe", "LOCDT_CONAM_RECIPE_2"),
    ("Add conam-related feature v4", "recipe", "CONAM_AGG_RECIPE_4"),
    ("Add cano/mchno latent feature", "latent", (
        ("../fraud_detection/features/cano_latent_features_w_mchno.csv", "cano"),
        ("../fraud_detection/features/cano_mchno_latent_features.csv", "mchno"),
    )),
    ("Add cano/locdt latent feature", "latent", (
        ("../fraud_detection/features/cano_latent_features_w_locdt.csv", "cano"),
        ("../fraud_detection/features/cano_locdt_latent_features.csv", "locdt"),
    )),
    ("Add mchno/locdt latent feature", "latent", (
        ("../fraud_detection/features/mchno_latent_features_w_locdt.csv", "mchno"),
        ("../fraud_detection/features/mchno_locdt_latent_features.csv", "locdt"),
    )),
)

# Helper columns created during pre-processing that are only needed as
# grouping keys while features are built; main() drops them before returning.
_AUXILIARY_TIME_COLUMNS = (
    "loctm_hour_of_day",
    "loctm_minute_of_hour",
    "loctm_second_of_min",
    "day_hr_min",
    "day_hr_min_sec",
)


def _log_shapes(df_train, df_test):
    """Log the current (rows, columns) shape of the train and test frames."""
    logger.info("Train application df shape: {}".format(df_train.shape))
    logger.info("Test application df shape: {}".format(df_test.shape))


def _add_time_features(df):
    """Derive time-of-day columns from ``loctm`` on ``df``, in place."""
    # loctm -> int -> str, then parsed by the project helpers into an object
    # exposing .hour/.minute/.second.
    # NOTE(review): presumably loctm encodes a clock time as a number; confirm in util.
    df["loctm_"] = df.loctm.astype(int).astype(str)
    df["loctm_"] = df["loctm_"].apply(s_to_time_format).apply(string_to_datetime)
    # time-related features
    df["loctm_hour_of_day"] = df["loctm_"].apply(lambda x: x.hour).astype('category')
    df["loctm_minute_of_hour"] = df["loctm_"].apply(lambda x: x.minute)
    df["loctm_second_of_min"] = df["loctm_"].apply(lambda x: x.second)
    df["hour_range"] = df["loctm_"].apply(lambda x: hour_to_range(x.hour)).astype("category")
    # the parsed time itself is no longer needed
    df.drop(columns=["loctm_"], inplace=True)
    # auxiliary string keys: day:hour:minute and day:hour:minute:second
    df["day_hr_min"] = ["{}:{}:{}".format(i,j,k) for i,j,k in zip(df.locdt,df.loctm_hour_of_day,df.loctm_minute_of_hour)]
    df["day_hr_min_sec"] = ["{}:{}:{}:{}".format(i,j,k,z) for i,j,k,z in zip(df.locdt,df.loctm_hour_of_day,df.loctm_minute_of_hour,df.loctm_second_of_min)]


def _merge_latent_features(df_train, df_test, sources):
    """Left-join each precomputed feature CSV in ``sources`` onto both frames."""
    for csv_path, key in sources:
        latent = pd.read_csv(csv_path)
        df_train = df_train.merge(latent, on = key, how = "left")
        df_test = df_test.merge(latent, on = key, how = "left")
    return df_train, df_test


def main(args):
    """Build the train and test feature frames.

    Loads the two CSVs, casts ``Configs.CATEGORY`` columns to category, derives
    time-of-day columns, then runs every step of ``_FEATURE_STEPS`` in order,
    logging frame shapes and elapsed time per step. Finally it optionally
    prunes the columns listed in ``Configs.FEATURE_GRAVEYARD`` and
    ``Configs.FEATURE_USELESSNESS`` and removes the auxiliary time keys.

    Args:
        args: object exposing ``train_file``, ``test_file`` and
            ``feature_selection`` as attributes.

    Returns:
        ``(df_train, df_test)`` with all engineered features attached.
    """
    with timer("Process train/test application"):
        # load dataset
        df_train = pd.read_csv(args.train_file)
        df_test = pd.read_csv(args.test_file)

        # pre-processing
        for cat in Configs.CATEGORY:
            df_train[cat] = df_train[cat].astype('category')
            df_test[cat] = df_test[cat].astype('category')

        for df in (df_train, df_test):
            _add_time_features(df)

        _log_shapes(df_train, df_test)

    for title, kind, payload in _FEATURE_STEPS:
        with timer(title):
            if kind == "recipe":
                df_train, df_test = group_target_by_cols(df_train, df_test, getattr(Configs, payload))
            else:
                df_train, df_test = _merge_latent_features(df_train, df_test, payload)
            _log_shapes(df_train, df_test)

    # NOTE: the timer title below is kept byte-for-byte so existing logs stay
    # comparable; this block only prunes columns, no model is trained here.
    with timer("Run LightGBM with kfold"):
        if args.feature_selection:
            logger.info("==============Feature Selection==============")
            for df in (df_train, df_test):
                # drop random features (by null hypothesis)
                df.drop(Configs.FEATURE_GRAVEYARD, axis=1, inplace=True, errors='ignore')

                # drop unused features (no importance at least twice)
                df.drop(Configs.FEATURE_USELESSNESS, axis=1, inplace=True, errors='ignore')

                gc.collect()
            _log_shapes(df_train, df_test)

        for df in (df_train, df_test):
            # errors='ignore': the feature-selection drops above may already
            # have removed some of these columns, which used to raise KeyError
            # here after the whole pipeline had run.
            df.drop(columns=list(_AUXILIARY_TIME_COLUMNS), inplace=True, errors='ignore')

    return df_train, df_test

# Run configuration. In the code visible in this notebook, main() reads only
# train_file, test_file and feature_selection; the other keys are not read here.
args = dict(
    # input / output locations
    train_file="/data/yunrui_li/fraud/dataset/train.csv",
    test_file="/data/yunrui_li/fraud/dataset/test.csv",
    result_path="/data/yunrui_li/fraud/fraud_detection/result/submission.csv",
    # pipeline switches
    feature_selection=True,
    feature_importance_plot=True,
    # run settings
    SEED=1030,
    NUM_FOLDS=2,  # 5
    CPU_USE_RATE=1.0,
    STRATIFIED=True,
    TEST_NULL_HYPO=False,
    # model hyper-parameters
    NUM_LEAVES=31,
    COLSAMPLE_BYTREE=1.0,
    SUBSAMPLE=1.0,
    SUBSAMPLE_FREQ=0,
    MAX_DEPTH=-1,
    REG_ALPHA=0.0,
    REG_LAMBDA=0.0,
    MIN_SPLIT_GAIN=0.0,
    MIN_CHILD_WEIGHT=0.001,
    MAX_BIN=255,
    SCALE_POS_WEIGHT=3,
)

class AttrDict(dict):
    """A dict whose entries are also readable and writable as attributes.

    The instance's attribute namespace is the mapping itself, so ``d.key`` and
    ``d["key"]`` always refer to the same entry.
    """

    def __init__(self, *args, **kwargs):
        dict.__init__(self, *args, **kwargs)
        # Alias attribute storage to the dict so both access styles share state.
        self.__dict__ = self
# Wrap the plain dict so main() can read settings as attributes (args.train_file, ...).
args = AttrDict(args)
# Build the engineered train/test frames; main() logs shapes and timings per step.
df_train, df_test = main(args)


Train application df shape: (1521787, 29)
Test application df shape: (421665, 28)
Process train/test application - done in 67s
Train application df shape: (1521787, 43)
Test application df shape: (421665, 42)
Add bacno/cano feature - done in 17s
Train application df shape: (1521787, 115)
Test application df shape: (421665, 114)
Add iterm-related feature - done in 108s
Train application df shape: (1521787, 185)
Test application df shape: (421665, 184)
Add conam-related feature - done in 130s
Train application df shape: (1521787, 209)
Test application df shape: (421665, 208)
Add hour-related feature - done in 545s
Train application df shape: (1521787, 210)
Test application df shape: (421665, 209)
Add cano/conam feature - done in 42s
Train application df shape: (1521787, 230)
Test application df shape: (421665, 229)
Add cano/bacno latent feature - done in 4s
Train application df shape: (1521787, 265)
Test application df shape: (421665, 264)
Add locdt-related feature - done in 188s
Train a

In [3]:
# Every column except the label is a model input; the label is fraud_ind.
feats = [col for col in df_train.columns if col != "fraud_ind"]
X = df_train[feats]
y = df_train["fraud_ind"]

In [4]:
# Sanity check: feature matrix and label vector must have the same number of rows.
X.shape,y.shape

((1521787, 695), (1521787,))

In [5]:
# Always-false assertion: raises AssertionError on every run, so "Run All" halts at this cell.
# NOTE(review): presumably a deliberate guard against launching the long-running search below; confirm.
assert 1==0

AssertionError: 

In [None]:
from sklearn.metrics import f1_score

def lgb_f1_score(y_pred, y_true):
    """Custom F1 evaluation metric, passed as ``feval`` to ``lgb.cv`` below.

    Note: defining this here rebinds the name ``lgb_f1_score`` that the first
    cell imports from ``util``.

    Args:
        y_pred: array of predicted scores; rounded with ``np.round`` to get
            hard 0/1 predictions.
        y_true: object exposing ``get_label()`` that returns the true labels
            (despite the name, not the label array itself).

    Returns:
        ``('f1', score, True)``: metric name, F1 value, and the flag marking
        the metric as higher-is-better.
    """
    labels = y_true.get_label()
    hard_predictions = np.round(y_pred)
    score = f1_score(labels, hard_predictions)
    return 'f1', score, True

def bayes_parameter_opt_lgb(X, y, 
                            init_round=15, 
                            opt_round=25, 
                            n_folds=5, 
                            random_seed=1030,
                            n_estimators=10000,
                            learning_rate=0.05, 
                            output_process=True):
    """Tune LightGBM hyper-parameters with Bayesian optimization on CV F1.

    Each candidate parameter set is scored by stratified ``lgb.cv`` using
    ``lgb_f1_score`` as the custom metric; the score is the best mean F1 seen
    over the boosting rounds.

    Args:
        X: feature matrix passed to ``lgb.Dataset``.
        y: label vector passed to ``lgb.Dataset``.
        init_round: number of random exploration points (``init_points``).
        opt_round: number of Bayesian optimization iterations (``n_iter``).
        n_folds: number of CV folds.
        random_seed: seed handed to ``lgb.cv``.
        n_estimators: maximum boosting rounds (``num_iterations``); early
            stopping is set to 100 rounds.
        learning_rate: LightGBM learning rate.
        output_process: when truthy, write the full search history, sorted by
            score, to ``../fraud_detection/result/bayes_opt_result.csv``.

    Returns:
        ``(best_score, best_params)``: the best target value and the raw
        parameter dict found by the optimizer. The candidates are evaluated
        with ``bagging_freq=1`` and with ``num_leaves``/``max_bin`` rounded to
        int, so apply the same settings when training with ``best_params``.
    """
    # prepare data
    # free_raw_data=False keeps the raw data attached to the Dataset.
    # NOTE(review): presumably needed because max_bin changes between trials; confirm.
    train_data = lgb.Dataset(data=X, label=y, categorical_feature='auto', free_raw_data = False)

    # objective: parameter names must match the keys of the search space below
    def lgb_eval(num_leaves, feature_fraction, bagging_fraction,
                 lambda_l1, lambda_l2, min_split_gain, 
                 min_child_weight, max_bin, scale_pos_weight):
        params = {
            'application': 'binary',
            'num_iterations': n_estimators,
            'learning_rate': learning_rate,
            'early_stopping_round': 100,
            'n_jobs': 5,
            # LightGBM only applies bagging_fraction when bagging_freq is
            # non-zero. Without this the bagging_fraction search dimension had
            # no effect on the model.
            'bagging_freq': 1,
            # the optimizer proposes floats; integer-valued parameters are rounded
            'num_leaves': int(round(num_leaves)),
            'feature_fraction': max(min(feature_fraction, 1), 0),
            'bagging_fraction': max(min(bagging_fraction, 1), 0),
            'lambda_l1': max(lambda_l1, 0),
            'lambda_l2': max(lambda_l2, 0),
            'min_split_gain': min_split_gain,
            'min_child_weight': min_child_weight,
            'max_bin': int(round(max_bin)),
            'scale_pos_weight': scale_pos_weight,
        }

        cv_result = lgb.cv(params, 
                           train_data, 
                           nfold=n_folds,
                           seed=random_seed, 
                           stratified=True, 
                           categorical_feature = "auto",
                           feval=lgb_f1_score)
        print (cv_result)
        # LightGBM >= 4.0 prefixes validation metrics with "valid "; older
        # releases use the bare metric name.
        f1_key = 'valid f1-mean' if 'valid f1-mean' in cv_result else 'f1-mean'
        return max(cv_result[f1_key])

    # search space: (lower, upper) bound per parameter
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                            'feature_fraction': (0.5, 1.0),
                                            'bagging_fraction': (0.5, 1.0),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.0, 0.1),
                                            'min_child_weight': (1, 50),
                                            'scale_pos_weight': (1, 10),
                                            'max_bin': (255,355),
                                           }, 
                                 random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # output optimization process
    if output_process:
        pd.DataFrame(lgbBO.res).sort_values(by = "target", ascending=False).to_csv("../fraud_detection/result/bayes_opt_result.csv")
    return lgbBO.max["target"], lgbBO.max["params"] # best score and best parameter
#     return lgbBO
#     # return best parameters
#     return lgbBO.res['max']['max_params']

# Run the search with a smaller budget than the function defaults
# (5 random points + 10 guided iterations) and a lower learning rate (0.02).
# Each evaluation is a 5-fold CV with up to 10000 boosting rounds.
opt_score, opt_params = bayes_parameter_opt_lgb(X, y, 
                                     init_round=5, 
                                     opt_round=10, 
                                     n_folds=5, 
                                     random_seed=1030, 
                                     n_estimators=10000, 
                                     learning_rate=0.02)

In [None]:
# Best parameter set found (second value returned by bayes_parameter_opt_lgb).
opt_params

In [None]:
# Best score reached by the search (first value returned by bayes_parameter_opt_lgb).
opt_score

In [None]:
# bayes_parameter_opt_lgb returns (best score, best params), not the optimizer,
# so opt_params is a plain dict and has no `.max`; show the returned values.
opt_score, opt_params

In [None]:
pd.options.display.max_colwidth = 1000
# opt_params is the best-parameter dict, not the optimizer, so it has no `.res`.
# The full search history is the CSV that bayes_parameter_opt_lgb writes when
# output_process is truthy; it is already sorted by target, best first.
pd.read_csv("../fraud_detection/result/bayes_opt_result.csv", index_col=0)

In [None]:
# Small toy frame used by the column-shuffle cells that follow.
df = pd.DataFrame({"A":[1,2,3,4],"B":[1,1,1,1]})
df

In [None]:
# Shuffle column A in place: sample(frac=1.0) returns every row in random order,
# and .values drops the index so the assignment is positional. No random_state
# is passed, so the order differs between runs.
df["A"] = df["A"].copy().sample(frac = 1.0).values

In [None]:
# Show the frame after the shuffle: A is permuted, B is untouched.
df

In [None]:
 # Expression only: displays a fresh random permutation of A without assigning it back.
 df.A.copy().sample(frac = 1.0).values

In [None]:
# Confirm the previous cell did not modify df.
df

In [None]:
# LightGBM classifier in random-forest mode with 10000 trees on 3 threads.
# NOTE(review): LightGBM's "rf" boosting needs bagging enabled (subsample < 1 and
# subsample_freq > 0) before fit will succeed; confirm before training with this.
clf = lgb.LGBMClassifier(boosting_type="rf", n_estimators=10000, n_jobs=3)
clf

In [None]:
# Interactive lookup of the lgb.cv signature and docstring; prints help text only.
help(lgb.cv)