In [1]:
import os
import glob
import sys
import time
import copy
import pickle
from functools import reduce

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import sklearn
from sklearn.metrics import roc_auc_score
import xgboost

  import pandas.util.testing as tm


In [2]:
def display_df(df):
    """Show ``df`` in the notebook output as a complete HTML table.

    The frame is serialised with its ``to_html`` method, wrapped in
    ``IPython.display.HTML`` and handed to ``display``. Nothing is returned.
    """
    html_table = HTML(df.to_html())
    display(html_table)

## XGBoost

In [3]:
def set_xgb_version_v06(flag):
    """Select the keyword names used for thread count and seed.

    Stores ``flag`` in the module-level ``FLAG_XGBOOST_V06`` and sets
    ``ATTR_NJOB`` / ``ATTR_SEED`` to ``"nthread"`` / ``"seed"`` when the flag
    is truthy, otherwise to ``"n_jobs"`` / ``"random_state"``.
    """
    global ATTR_NJOB, ATTR_SEED, FLAG_XGBOOST_V06
    FLAG_XGBOOST_V06 = flag
    ATTR_NJOB, ATTR_SEED = (
        ("nthread", "seed") if FLAG_XGBOOST_V06 else ("n_jobs", "random_state")
    )

# True only when the installed xgboost version string starts with "0.6".
FLAG_XGBOOST_V06 = xgboost.__version__.startswith("0.6")
# Initial values only: the set_xgb_version_v06 call below reassigns both
# names according to the flag.
ATTR_NJOB = "nthread"
ATTR_SEED = "seed"
set_xgb_version_v06(FLAG_XGBOOST_V06)
print("xgboost version = {}".format(xgboost.__version__))


def complete_xgboost_option(option_init={},option_fit={},mode="classify"):
    """Return ``(init_options, fit_options)`` for an XGBoost model.

    Each dict starts from the defaults of the requested ``mode`` and is then
    overlaid with the caller's values.

    Parameters
    ----------
    option_init : dict
        Overrides for the model-constructor keyword arguments.
    option_fit : dict
        Overrides for the ``fit`` keyword arguments.
    mode : str
        ``"classify"`` or ``"ranker"``; any other value selects the regressor
        defaults.

    Returns
    -------
    tuple of dict
        Freshly built ``(init_options, fit_options)``; the argument dicts are
        only read, never modified.

    Notes
    -----
    The seed and thread-count keys are taken from the module-level
    ``ATTR_SEED`` and ``ATTR_NJOB`` names.
    """
    # Values that differ between modes:
    # (eval metric, objective, max_depth, n_estimators, min_child_weight,
    #  row/column sampling fraction, seed, whether tree_method is set)
    if mode=="classify":
        profile = ("auc", "binary:logistic", 4, 1000, 10, 0.8, 0, True)
    elif mode=="ranker":
        profile = ("auc", "rank:pairwise", 5, 1000, 10, 0.5, 0, True)
    else: # regressor
        profile = ("rmse", "reg:linear", 4, 500, 20, 0.8, 1, False)
    (metric, objective, depth, n_trees,
     child_weight, fraction, seed, gpu_hist) = profile

    fit_kwargs = {
        "eval_metric": metric,
        "early_stopping_rounds": 100,
        "verbose": False,
    }
    init_kwargs = {
        "objective": objective,
        "max_depth": depth,
        "n_estimators": n_trees,
        "learning_rate": 0.025,
        "gamma": 0.0,
        "min_child_weight": child_weight,
        "subsample": fraction,
        "tree_method": "gpu_hist",
        "colsample_bytree": fraction,
        "colsample_bylevel": fraction,
        "reg_alpha": 0.0,
        "reg_lambda": 1.0,
        ATTR_SEED: seed,
        ATTR_NJOB: 32,
    }
    if not gpu_hist:
        # The regressor defaults do not specify a tree_method at all.
        del init_kwargs["tree_method"]

    # Caller-supplied values win over the defaults.
    fit_kwargs.update(option_fit)
    init_kwargs.update(option_init)
    return init_kwargs, fit_kwargs

xgboost version = 0.90


In [4]:
def refresh_gpu_model(model,pickle_filename='tmp/NT_tmp.pkl'):
    """Round-trip a fitted model through a pickle file and return the reloaded copy.

    The model is pickled to ``pickle_filename``, the ``__del__`` method of its
    booster is invoked explicitly, and the model is then read back from the
    file.
    NOTE(review): presumably this is done to release GPU resources held by
    the booster — confirm against the xgboost version in use.

    Parameters
    ----------
    model : object
        Fitted model exposing ``get_booster()``.
    pickle_filename : str or os.PathLike
        Scratch file used for the round trip. Its parent directory is created
        when missing, so the default ``tmp/`` location works in a fresh
        checkout instead of failing after training has already finished.

    Returns
    -------
    object
        The model as loaded back from ``pickle_filename``.
    """
    if isinstance(pickle_filename, (str, os.PathLike)):
        scratch_dir = os.path.dirname(pickle_filename)
        if scratch_dir:
            os.makedirs(scratch_dir, exist_ok=True)
    pd.to_pickle(model,pickle_filename)
    # Explicitly run the booster's finaliser before loading the fresh copy.
    model.get_booster().__del__()
    model  = pd.read_pickle(pickle_filename)
    return model


def extract_dataset_idx(pdXY, data_sets, tvt, tvt_cname="tvt", data_set_cname="data_set"):
    """Build one boolean row mask per data set, restricted to a single split.

    Parameters
    ----------
    pdXY : pandas.DataFrame
        Frame holding the ``tvt_cname`` and ``data_set_cname`` columns.
    data_sets : iterable
        Data-set names to select. Each must occur among the rows of the
        requested split.
    tvt : object
        Split label to match in ``tvt_cname`` (e.g. ``"train"`` or ``"test"``).
    tvt_cname, data_set_cname : str
        Names of the split column and the data-set column.

    Returns
    -------
    idx_list : list of pandas.Series
        For each entry of ``data_sets``, in order, the boolean mask
        ``(data set matches) & (split matches)``.
    group_sizes : list
        Number of selected rows per mask, in the same order.

    Raises
    ------
    AssertionError
        If ``tvt`` does not occur in ``tvt_cname``, or a requested data set
        has no row in that split.
    """
    # Materialise once so generators / one-shot iterables are handled too.
    data_sets = list(data_sets)
    assert tvt in pdXY[tvt_cname].unique().tolist(), \
        "split {!r} not found in column {!r}".format(tvt, tvt_cname)

    # Loop-invariant: the split mask and the data sets present in that split.
    in_split = pdXY[tvt_cname] == tvt
    available = pdXY.loc[in_split, data_set_cname].unique().tolist()
    for data_set in data_sets:
        assert data_set in available, \
            "data set {!r} has no rows with {} == {!r}".format(data_set, tvt_cname, tvt)

    idx_list = [(pdXY[data_set_cname] == data_set) & in_split for data_set in data_sets]
    group_sizes = [idx.sum() for idx in idx_list]
    return idx_list, group_sizes

In [5]:
def run_xgboost_classify(pdXY, train_sets, early_sets, test_sets, 
                         feature_cnames, target_cname="bad", 
                         target_pred_posval_ind=1,
                         option_init={}, option_fit = {}, verbose=0):
    """Fit an ``xgboost.XGBClassifier`` and report AUC on train and test sets.

    Rows are selected with ``extract_dataset_idx``: ``train_sets`` from the
    ``"train"`` split, ``early_sets`` (used as the ``eval_set`` during
    fitting) and ``test_sets`` from the ``"test"`` split. Training sets and
    feature names are sorted before use.

    Parameters
    ----------
    pdXY : pandas.DataFrame
        Frame with features, target and the split / data-set columns.
    train_sets, early_sets, test_sets : iterable of str
        Data-set names for fitting, early stopping and evaluation.
    feature_cnames : iterable of str
        Feature column names.
    target_cname : str
        Target column name.
    target_pred_posval_ind : int
        Column of ``predict_proba`` output used as the score.
    option_init, option_fit : dict
        Keyword arguments for the constructor and for ``fit``.
    verbose : int
        Any non-zero value prints one summary line per test set.

    Returns
    -------
    dict
        Keys ``feature_cnames``, ``n_train``, ``train_sets``, ``model``,
        ``importance`` (gain scores), ``ntree``, ``auc_train`` and
        ``auc_test`` (dict keyed by test-set name).
    """
    def _stack(masks, columns):
        # Concatenate the masked slices row-wise under a fresh 0..n-1 index.
        return pd.concat([pdXY.loc[mask, columns] for mask in masks],
                         axis=0, ignore_index=True)

    train_sets = sorted(train_sets)
    train_masks = extract_dataset_idx(pdXY, train_sets, "train")[0]
    early_masks = extract_dataset_idx(pdXY, early_sets, "test")[0]
    test_masks = {name: extract_dataset_idx(pdXY, [name], "test")[0][0]
                  for name in test_sets}

    feature_cnames = sorted(feature_cnames)
    results = {"feature_cnames": feature_cnames}
    print("Num features", len(feature_cnames))

    X_train = _stack(train_masks, feature_cnames)
    y_train = _stack(train_masks, target_cname)
    X_early = _stack(early_masks, feature_cnames)
    y_early = _stack(early_masks, target_cname)

    print("Train shape", X_train.shape)

    results["n_train"] = X_train.shape[0]
    results["train_sets"] = train_sets

    model_ = xgboost.XGBClassifier(**option_init)
    model_.fit(X_train, y_train, eval_set=[(X_early, y_early)], **option_fit)
    model_ = refresh_gpu_model(model_)

    # Every prediction below is limited to the booster's best_ntree_limit.
    ntree = model_.get_booster().best_ntree_limit
    print("ntree", ntree)

    train_scores = model_.predict_proba(X_train, ntree_limit=ntree)[:, target_pred_posval_ind]
    auc_train = roc_auc_score(y_train, train_scores)

    auc_test = {}
    for name, mask in test_masks.items():
        X_test = pdXY.loc[mask, feature_cnames]
        y_test = pdXY.loc[mask, target_cname]
        test_scores = model_.predict_proba(X_test, ntree_limit=ntree)[:, target_pred_posval_ind]
        auc_test[name] = roc_auc_score(y_test, test_scores)
        if verbose != 0:
            print("Test: ", name, "Test size", X_test.shape[0], "Test bad rate:", y_test.mean(), "auc", auc_test[name])

    results.update({
        "model": model_,
        "importance": model_.get_booster().get_score(importance_type='gain'),
        "ntree": ntree,
        "auc_train": auc_train,
        "auc_test": auc_test,
    })
    print("---------------------\n")
    return results


def run_xgboost_ranker(pdXY, train_sets, early_sets, test_sets, 
                       feature_cnames, target_cname="bad", 
                       option_init={}, option_fit = {}, verbose=0):
    """Fit an ``xgboost.XGBRanker`` and report AUC on train and test sets.

    Rows are selected with ``extract_dataset_idx``: ``train_sets`` from the
    ``"train"`` split, ``early_sets`` and ``test_sets`` from the ``"test"``
    split. Each training data set forms one ranking group, using the sizes
    returned by ``extract_dataset_idx``. All early-stopping rows are passed
    as a single evaluation group.

    Parameters
    ----------
    pdXY : pandas.DataFrame
        Frame with features, target and the split / data-set columns.
    train_sets, early_sets, test_sets : iterable of str
        Data-set names for fitting, early stopping and evaluation.
    feature_cnames : iterable of str
        Feature column names.
    target_cname : str
        Target column name; values are cast to 32-bit integers for fitting.
    option_init, option_fit : dict
        Keyword arguments for the constructor and for ``fit``.
    verbose : int
        Any non-zero value prints one summary line per test set.

    Returns
    -------
    dict
        Keys ``feature_cnames``, ``n_train``, ``train_sets``, ``model``,
        ``importance`` (gain scores), ``ntree``, ``auc_train`` and
        ``auc_test`` (dict keyed by test-set name).
    """
    def _stack(masks, columns):
        # Concatenate the masked slices row-wise under a fresh 0..n-1 index.
        return pd.concat([pdXY.loc[mask, columns] for mask in masks],
                         axis=0, ignore_index=True)

    train_sets = sorted(train_sets)
    train_masks, train_grp_sizes = extract_dataset_idx(pdXY, train_sets, "train")
    print("train_grp_sizes", train_grp_sizes)
    # NOTE(review): the per-data-set sizes of the early sets are discarded;
    # the evaluation data is treated as one group below — confirm intended.
    early_masks, _ = extract_dataset_idx(pdXY, early_sets, "test")
    test_masks = {name: extract_dataset_idx(pdXY, [name], "test")[0][0]
                  for name in test_sets}

    feature_cnames = sorted(feature_cnames)
    results = {"feature_cnames": feature_cnames}
    print("Num features", len(feature_cnames))

    X_train = _stack(train_masks, feature_cnames)
    y_train = _stack(train_masks, target_cname)
    X_early = _stack(early_masks, feature_cnames)
    y_early = _stack(early_masks, target_cname)

    print("Train shape", X_train.shape)

    results["n_train"] = X_train.shape[0]
    results["train_sets"] = train_sets

    model_ = xgboost.XGBRanker(**option_init)
    # The ranker is fed plain arrays with int32 labels.
    eval_groups = [[X_early.shape[0]]]
    eval_pairs = [(X_early.values, y_early.values.astype("i4"))]
    model_.fit(X_train.values, y_train.values.astype("i4"), train_grp_sizes,
               eval_set=eval_pairs, eval_group=eval_groups, **option_fit)
    model_ = refresh_gpu_model(model_)

    # Every prediction below is limited to the booster's best_ntree_limit.
    ntree = model_.get_booster().best_ntree_limit
    print("ntree", ntree)

    train_scores = model_.predict(X_train.values, ntree_limit=ntree)
    auc_train = roc_auc_score(y_train, train_scores)

    auc_test = {}
    for name, mask in test_masks.items():
        X_test = pdXY.loc[mask, feature_cnames]
        y_test = pdXY.loc[mask, target_cname]
        test_scores = model_.predict(X_test.values, ntree_limit=ntree)
        auc_test[name] = roc_auc_score(y_test, test_scores)
        if verbose != 0:
            print("Test: ", name, "Test size", X_test.shape[0], "Test bad rate:", y_test.mean(), "auc", auc_test[name])

    results.update({
        "model": model_,
        "importance": model_.get_booster().get_score(importance_type='gain'),
        "ntree": ntree,
        "auc_train": auc_train,
        "auc_test": auc_test,
    })
    print("---------------------\n")
    return results

In [6]:
def display_eval_results(result_pkl_path, test_sets):
    """Summarise one pickled result dict as a single-row DataFrame.

    Parameters
    ----------
    result_pkl_path : str
        Path of a pickle holding a dict with the keys ``train_sets``,
        ``n_train``, ``feature_cnames``, ``ntree``, ``auc_train`` and
        ``auc_test``.
    test_sets : iterable of str
        Test-set names to report. Each must be a key of ``auc_test``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``model_id`` (file name up to ``.pkl``),
        ``train_set_list``, ``n_train``, ``feat_cnames``, ``n_feat``,
        ``ntree``, ``model_path`` (absolute path), ``train_auc`` and one
        column per requested test set, in that order.

    Raises
    ------
    KeyError
        If an expected key or a requested test set is missing.
    """
    # SECURITY: unpickling can execute arbitrary code; only load result files
    # produced by trusted runs.
    # The context manager closes the handle even when loading fails.
    with open(result_pkl_path, "rb") as result_file:
        model_info = pickle.load(result_file)

    displ = {}
    displ["model_id"] = os.path.basename(result_pkl_path).split(".pkl")[0]
    displ["train_set_list"] = model_info["train_sets"]
    displ["n_train"] = model_info["n_train"]
    displ["feat_cnames"] = model_info["feature_cnames"]
    displ["n_feat"] = len(model_info["feature_cnames"])
    displ["ntree"] = model_info["ntree"]
    displ["model_path"] = os.path.abspath(result_pkl_path)
    displ["train_auc"] = model_info["auc_train"]

    for test_set in test_sets:
        displ[test_set] = model_info["auc_test"][test_set]

    # Build a one-row frame, then select the columns with an explicit list so
    # they follow the insertion order of ``displ``.
    column_order = list(displ)
    return pd.DataFrame({"displ": displ}).T[column_order].reset_index(drop=True)

## Functions to reduce the PDL training set

In [None]:
# data_set values treated as "PDL" sets by reduce_train_pdl and
# remove_pdl_nonhit (matched against the pdXY["data_set"] column).
TRAIN_SETS_PDL = ['pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit',]

def reduce_train_pdl(pdXY, pdl_size):
    """Keep only ``pdl_size`` randomly chosen phone numbers in the PDL train rows.

    PDL train rows are those with ``tvt == "train"`` whose ``data_set`` is in
    ``TRAIN_SETS_PDL``. ``pdl_size`` distinct phone numbers are drawn from
    them without replacement, after seeding NumPy's global generator with 42.
    Every PDL train row of any other phone number gets its ``tvt`` set to
    ``"PDL_NOTUSED"``.

    Sizes used so far: 17061, 10_000, 5_000.

    ``pdXY`` is modified in place and also returned. The function asserts
    that no row is already labelled ``"PDL_NOTUSED"``.
    """
    assert "PDL_NOTUSED" not in pdXY["tvt"].unique().tolist()

    pdl_train_rows = (pdXY["tvt"] == "train") & pdXY["data_set"].isin(TRAIN_SETS_PDL)

    candidates = pdXY.loc[pdl_train_rows, "phone_number"].unique()
    print("pn", len(candidates))
    np.random.seed(42)
    kept_numbers = np.random.choice(candidates, size=pdl_size, replace=False)
    print("pn", len(kept_numbers))

    # PDL train rows whose phone number was not drawn.
    dropped_rows = pdl_train_rows & ~pdXY["phone_number"].isin(kept_numbers)
    print(pdXY[dropped_rows].shape)
    pdXY.loc[dropped_rows, "tvt"] = "PDL_NOTUSED"
    return pdXY


def remove_pdl_nonhit(pdXY, hit_size):
    """Drop all PDL non-hits from training and keep only ``hit_size`` hit numbers.

    PDL train rows are those with ``tvt == "train"`` whose ``data_set`` is in
    ``TRAIN_SETS_PDL``. Two relabelling steps set ``tvt`` to
    ``"PDL_NOTUSED"``:

    1. every PDL train row with ``bad == 0`` (the non-hits);
    2. every PDL train row with ``bad == 1`` whose phone number is not among
       ``hit_size`` numbers drawn without replacement, after seeding NumPy's
       global generator with 346.

    Hit sizes used so far: 1951, 1000, 500.

    ``pdXY`` is modified in place and also returned. The function asserts
    that no row is already labelled ``"PDL_NOTUSED"``.
    """
    assert "PDL_NOTUSED" not in pdXY["tvt"].unique().tolist()

    # All masks are computed up front, before any row is relabelled.
    pdl_train_rows = pdXY["data_set"].isin(TRAIN_SETS_PDL) & (pdXY["tvt"] == "train")
    non_hit_rows = pdl_train_rows & (pdXY["bad"] == 0)
    hit_rows = pdl_train_rows & (pdXY["bad"] == 1)

    print(pdXY[non_hit_rows].shape)
    pdXY.loc[non_hit_rows, "tvt"] = "PDL_NOTUSED"

    hit_numbers = pdXY.loc[hit_rows, "phone_number"].unique()
    print("pn", len(hit_numbers))
    np.random.seed(346)
    retained_numbers = np.random.choice(hit_numbers, size=hit_size, replace=False)

    surplus_hit_rows = hit_rows & ~pdXY["phone_number"].isin(retained_numbers)
    print(pdXY[surplus_hit_rows].shape)
    pdXY.loc[surplus_hit_rows, "tvt"] = "PDL_NOTUSED"
    return pdXY


## Load pdXY

In [None]:
# Labels: one composite data_set key per row, built as code__product__bad_def.
pdY = pd.read_pickle("../../data/pdY_v03b.pkl")
pdY["data_set"] = pdY["code"] + "__" + pdY["product"] + "__" + pdY["bad_def"]
print("pdY", pdY.shape)

# Features: the pickle is a mapping holding the feature frame under "pdX"
# and the feature grouping under "feat_groups".
pdX = pd.read_pickle("data/pdX/pdX_v03.pkl")
pdX.keys()  # no visible effect here: only a cell's last expression is displayed
feat_groups = pdX["feat_groups"]
pdX = pdX["pdX"]  # the name now refers to the feature frame itself
print("pdX", pdX.shape)

# Left join keeps every label row; features are matched on phone number and date.
pdXY = pdY.merge(pdX, how="left", on=["phone_number", "upto_date"])
print("pdXY", pdXY.shape)
#del pdY, pdX


# Optional down-sampling of the PDL training rows (currently disabled).
#pdXY = reduce_train_pdl(pdXY, pdl_size=17061)
#pdXY = remove_pdl_nonhit(pdXY, 1500)

In [None]:
# Guard: always raises AssertionError, so the export below does not run
# unless this line is removed.
assert False
# Export the test-split label rows, with the composite data_set key, to a pickle.
pdY = pd.read_pickle("../../data/pdY_v03b.pkl")
print("pdY", pdY.shape)
pdY = pdY[pdY["tvt"] == "test"]
# NOTE(review): pdY is a filtered selection at this point; adding a column to
# it may trigger pandas' SettingWithCopyWarning — confirm and consider .copy().
pdY["data_set"] = pdY["code"] + "__" + pdY["product"] + "__" + pdY["bad_def"]
print("pdY", pdY.shape)
pdY.to_pickle("../../pdl/data/risk_label_test.pkl")

In [None]:
# Guard: always raises AssertionError, so the export below does not run
# unless this line is removed.
assert False
pdY = pd.read_pickle("../../data/pdY_v03b.pkl")
print("pdY", pdY.shape)

# Keep only test-split rows whose code is "vib".
pdY = pdY[(pdY["code"] == "vib") & (pdY["tvt"] == "test")]
print("pdY", pdY.shape)

# Score column "score02" is exported under the name "csv3".
pdX_csv3 = pd.read_pickle("data/pdX/pdX_csv3.pkl")
pdX_csv3 = pdX_csv3[["phone_number", "upto_date", "score02"]].rename(columns={"score02": "csv3"})

pdX_csv4 = pd.read_pickle("data/pdX/pdX_csv4.pkl")
pdX_csv4 = pdX_csv4[["phone_number", "upto_date", "csv4"]]

# Attach both scores to the labels; left joins keep every label row.
# NOTE: this rebinds the global pdXY used by other cells.
pdXY = pdY.merge(pdX_csv3, how="left", on=["phone_number", "upto_date"])
pdXY = pdXY.merge(pdX_csv4, how="left", on=["phone_number", "upto_date"])
print("pdXY", pdXY.shape)

pdXY = pdXY.drop(["reject_code"], axis=1)
print("pdXY", pdXY.shape)
pdXY.to_csv("data/DA_AM/mbf_vibcc_csv3_csv4_20220810.csv", index=False)

pdXY.head()

In [None]:
# Guard: always raises AssertionError, so the export below does not run
# unless this line is removed.
assert False
# Export the new_id -> lead_unique_token mapping, with new_id renamed to app_id.
df_vib_cc = pd.read_csv("../../data/csv4_new_labels/VIB_mbf.csv")
df_vib_cc = df_vib_cc[["new_id", "lead_unique_token"]].rename(columns={"new_id": "app_id"})
df_vib_cc.to_csv("data/DA_AM/mbf_vibcc_token_20220810.csv", index=False)
df_vib_cc.head()

In [None]:
# Compare the row count with the number of distinct app_id values.
# NOTE(review): in the visible cells df_vib_cc is only assigned in the cell
# guarded by `assert False`, so this fails on a fresh kernel — confirm.
df_vib_cc.shape, df_vib_cc["app_id"].nunique()

In [None]:
# List the keys of the feature grouping loaded together with pdX.
feat_groups.keys()

In [None]:
# Number of distinct data_set values among the test-split rows.
len(pdXY.loc[pdXY["tvt"] == "test", "data_set"].unique().tolist())

In [None]:
# data_set values evaluated in the cells below. They are matched against
# pdXY["data_set"], which is built as code__product__bad_def.
TEST_SETS = ['be_hcvn01__na__bad',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_mirae02b__Installment Loan__dpd09',
 'be_mobivi__na__bad',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'be_pvcombank__na__bad',
 'be_vietcapital__Credit card__bad',
 'be_vietcapital__Unsecured personal loans__bad',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__FPD30',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
             
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
             
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']

# Test size

In [None]:
# Per-test-set summary, each dict keyed by data_set name.
test_size = {}       # count of non-null "bad" values
bad_count = {}       # sum of "bad"
bad_rate = {}        # mean of "bad"
min_upto_date = {}   # earliest upto_date
max_upto_date = {}   # latest upto_date

ii = pdXY["tvt"] == "test"  # restrict every statistic to the test split
for ds in TEST_SETS:
    jj = pdXY["data_set"] == ds
    test_size[ds] = pdXY.loc[ii&jj, "bad"].count()
    bad_count[ds] = pdXY.loc[ii&jj, "bad"].sum()
    bad_rate[ds] = pdXY.loc[ii&jj, "bad"].mean()
    min_upto_date[ds] = pdXY.loc[ii&jj, "upto_date"].min()
    max_upto_date[ds] = pdXY.loc[ii&jj, "upto_date"].max()

# Transposed so that each statistic is a row and each data set a column.
df_disp = pd.DataFrame({"test_size": test_size,
             "bad_count": bad_count,
             "bad_rate": bad_rate,
             "min_upto_date": min_upto_date,
             "max_upto_date": max_upto_date}).T
display_df(df_disp)

# CSV3, CSV3_HCVN AUC

In [None]:
# AUC of two existing score columns ("csv3" and "csv3_hcvn") on each test set.
ii = pdXY["tvt"] == "test"

print("csv3")
csv3_auc = {}
for ds in TEST_SETS:
    jj = pdXY["data_set"] == ds
    nnull = pdXY.loc[ii&jj, "csv3"].isnull().mean()  # fraction of missing scores
    ytrue = pdXY.loc[ii&jj, "bad"]
    # The score is negated before scoring.
    # NOTE(review): presumably a higher csv3 means lower risk — confirm.
    ypred = -pdXY.loc[ii&jj, "csv3"]
    print(ds, nnull)
    if nnull > 0:
        # Any missing score makes the AUC for this data set NaN.
        csv3_auc[ds] = np.nan
    else:
        csv3_auc[ds] = sklearn.metrics.roc_auc_score(y_true=ytrue, y_score=ypred)
        

# Same procedure for the "csv3_hcvn" column.
print("csv3_hcvn")
csv3_hcvn_auc = {}
for ds in TEST_SETS: 
    jj = pdXY["data_set"] == ds
    nnull = pdXY.loc[ii&jj, "csv3_hcvn"].isnull().mean()
    ytrue = pdXY.loc[ii&jj, "bad"]
    ypred = -pdXY.loc[ii&jj, "csv3_hcvn"]
    print(ds, nnull)
    if nnull > 0:
        csv3_hcvn_auc[ds] = np.nan
    else:
        csv3_hcvn_auc[ds] = sklearn.metrics.roc_auc_score(y_true=ytrue, y_score=ypred)


        
# One row per score column, one column per test set.
auc = pd.DataFrame({"csv3_auc": csv3_auc, "csv3_hcvn_auc": csv3_hcvn_auc}).T
display_df(auc)

## csv3_hcvn 328 features

In [None]:
# Take the feature-name list stored under "cname_feature_list" in a saved
# model pickle, then drop the loaded object.
# NOTE(review): hard-coded absolute path; the cell only runs where this
# location exists.
fmodel_load = pd.read_pickle('/ts/working/core_dev/csv3/explo/tri/final_model/layer2_csv3_hcvn_328_transform_20210615.pkl')
print(fmodel_load.keys())
CNAMES_328 = fmodel_load["cname_feature_list"]
print(len(CNAMES_328))
del fmodel_load

In [None]:
# Scratch value: a single feature name, looked up in CNAMES_328 in the next cell.
a = "tacidx_avg_rh_avg_amount_recharge_on_promotion_last_90d_4w"

In [None]:
# Membership check of the scratch feature name in the 328-feature list.
a in CNAMES_328

## Binh 355 features

In [None]:
CNAMES_355 = ['ac_ac_real_age',
 'ac_avg_mth_dataplan_expense_last_3mth',
 'ac_max_mth_usage_last_3mth',
 'ac_sd_mth_expense_last_3mth',
 'ac_sd_mth_usage_last_3mth',
 'afternoon_count_max_step_dist_last_90d',
 'avg_areas_cross_last_90d',
 'bh_avg_day_balance_last_30d',
 'bh_avg_day_promotion_balance_last_30d',
 'bh_max_day_balance_last_30d',
 'bh_max_day_promotion_balance_last_30d',
 'bh_min_day_balance_last_30d',
 'bh_pct_day_balance_from_50k_to_100k_last_30d',
 'bh_pct_day_balance_ge_100k_last_30d',
 'bh_pct_day_balance_lt_10k_last_30d',
 'bh_pct_day_balance_lt_1k_last_30d',
 'bh_pct_day_balance_lt_5k_last_30d',
 'bh_sd_day_balance_last_30d',
 'bh_sd_day_promotion_balance_last_30d',
 'call_avg_call_duration_during_morning_in_day_last_3_months',
 'call_avg_num_night_calls_in_day_last_3_months',
 'call_std_num_calls_in_day_last_3_months',
 'call_std_num_night_calls_in_day_last_3_months',
 'call_sum_call_duration_during_morning_in_day_last_3_months',
 'call_sum_num_night_calls_in_day_last_3_months',
 'callsmssim_call_in_contact_sid_L1sim_4w21',
 'callsmssim_call_in_contact_sid_L1sim_4w8',
 'callsmssim_call_in_contact_sid_L2sim_4w21',
 'callsmssim_call_in_contact_sid_L2sim_4w8',
 'callsmssim_call_out_contact_sid_L1sim_4w21',
 'callsmssim_call_out_contact_sid_L1sim_4w8',
 'callsmssim_call_out_contact_sid_L2sim_4w21',
 'callsmssim_call_out_contact_sid_L2sim_4w8',
 'callsmssim_call_sms_dcontact_sid_4w21',
 'callsmssim_call_sms_dcontact_sid_4w8',
 'callsmssim_callout_sms_prc_common_contact_4w21',
 'callsmssim_callout_sms_prc_common_contact_4w8',
 'callsmssim_callsms_prc_common_contact_4w21',
 'callsmssim_callsms_prc_common_contact_4w8',
 'callsmssim_sms_contact_sid_L1sim_4w8',
 'callsmssim_sms_contact_sid_L2sim_4w8',
 'ch_avg_per_day_num_calls_last_30d',
 'ch_avg_per_day_num_evening_contacts_last_30d',
 'ch_avg_per_day_num_out_calls_last_30d',
 'ch_num_out_calls_last_30d',
 'ch_sd_per_day_num_freq_contacts_noc_ge6_last_30d',
 'cmtyb_count_18w_og',
 'cmtyb_count_3w_og',
 'cmtyb_count_9w_og',
 'cmtyb_num_calls_18w_og',
 'cmtyb_num_calls_3w_og',
 'cmtyb_num_calls_9w_og',
 'cmtyb_ratio_count_3w18w_ic',
 'cmtyb_ratio_count_3w9w_ic',
 'cmtyb_sum_seconds_18w_og',
 'cmtyb_sum_seconds_3w_og',
 'cmtyb_sum_seconds_9w_og',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_0_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_15_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_20_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_25_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_30_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_35_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_lt_2_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_lt_3_last21w',
 'cs_abnormal_min_wk_ndays_num_interactions_gt_0_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_0_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_10_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_20_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_25_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_30_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_35_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_5_last21w',
 'cs_abnormal_sum_wk_ndays_num_interactions_gt_0_last21w',
 'cs_abnormal_sum_wk_ndays_num_interactions_lt_2_last21w',
 'cs_abnormal_sum_wk_ndays_num_interactions_lt_3_last21w',
 'cs_avg_derv_wk_num_uses_in_weekend_last21w',
 'cs_avg_derv_wk_numd_contacts_in_evening_last21w',
 'cs_avg_wk_numd_contacts_in_evening_last21w',
 'cs_avg_wk_numd_contacts_last21w',
 'cs_avg_wk_numd_hours_in_weekend_last21w',
 'cs_max_wk_numd_contacts_in_evening_last21w',
 'cs_max_wk_numd_contacts_last21w',
 'cs_min_wk_numd_contacts_in_evening_last21w',
 'cs_std_derv_wk_num_uses_in_weekend_last21w',
 'cs_std_derv_wk_numd_contacts_in_evening_last21w',
 'cs_std_wk_num_uses_in_evening_last21w',
 'cs_std_wk_num_uses_in_office_hours_last21w',
 'cs_std_wk_num_uses_in_weekend_last21w',
 'cs_std_wk_numd_contacts_in_evening_last21w',
 'cs_std_wk_numd_contacts_in_office_hours_last21w',
 'cs_std_wk_numd_contacts_in_weekend_last21w',
 'cs_std_wk_numd_contacts_last21w',
 'cs_std_wk_numd_hours_at_night_last21w',
 'cs_std_wk_numd_hours_in_office_hours_last21w',
 'cs_std_wk_numd_hours_in_weekend_last21w',
 'cs_std_wk_numd_hours_last21w',
 'cs_std_wk_sum_num_events_last21w',
 'dnd01_avg_callin_ratio_avg_call_duration_last12w',
 'dnd01_avg_callin_ratio_dcount_csid_last12w',
 'dnd01_avg_callin_ratio_num_events_last12w',
 'dnd01_avg_callout_ratio_avg_call_duration_last12w',
 'dnd01_avg_callout_ratio_dcount_csid_last12w',
 'dnd01_avg_callout_ratio_dhour_last12w',
 'dnd01_avg_callsms_ratio_dcount_csid_last12w',
 'dnd01_avg_callsms_ratio_dhour_last12w',
 'dnd01_callin_avg_day_avg_call_duration_last12w',
 'dnd01_callin_avg_day_dcount_csid_last12w',
 'dnd01_callin_avg_day_num_events_last12w',
 'dnd01_callin_avg_dhour_last12w',
 'dnd01_callin_avg_night_dcount_csid_last12w',
 'dnd01_callin_max_day_avg_call_duration_last12w',
 'dnd01_callin_max_day_num_events_last12w',
 'dnd01_callin_max_night_num_events_last12w',
 'dnd01_callin_min_night_avg_call_duration_last12w',
 'dnd01_callin_min_night_dcount_csid_last12w',
 'dnd01_callout_avg_dhour_last12w',
 'dnd01_callsms_max_dhour_last12w',
 'dnd01_max_callin_ratio_avg_call_duration_last12w',
 'dnd01_max_callout_ratio_avg_call_duration_last12w',
 'dnd01_max_callout_ratio_dcount_csid_last12w',
 'dnd01_max_callsms_ratio_dcount_csid_last12w',
 'dnd01_max_callsms_ratio_dhour_last12w',
 'dnd01_min_callin_ratio_dcount_csid_last12w',
 'dnd01_min_callin_ratio_dhour_last12w',
 'dnd01_min_callout_ratio_dcount_csid_last12w',
 'dnd01_min_callsms_ratio_dcount_csid_last12w',
 'evening_count_max_step_dist_last_90d',
 'fhist12w_ac_ac_real_age',
 'fhist12w_nbor4w_ac_ac_real_age',
 'fhist12w_nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist12w_nbor4w_pred_proba_tc_lb1',
 'fhist12w_nbor4w_pred_proba_travel_entropy',
 'fhist12w_pred_proba_callsms_similarity_v2_4w8',
 'fhist12w_pred_proba_tc_lb1',
 'fhist12w_pred_proba_travel_entropy',
 'fhist16w_ac_ac_real_age',
 'fhist16w_nbor4w_ac_ac_real_age',
 'fhist16w_nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist16w_nbor4w_pred_proba_tc_lb1',
 'fhist16w_nbor4w_pred_proba_travel_entropy',
 'fhist16w_pred_proba_callsms_similarity_v2_4w8',
 'fhist16w_pred_proba_tc_lb1',
 'fhist16w_pred_proba_travel_entropy',
 'fhist20w_ac_ac_real_age',
 'fhist20w_nbor4w_ac_ac_real_age',
 'fhist20w_nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist20w_nbor4w_pred_proba_tc_lb1',
 'fhist20w_nbor4w_pred_proba_travel_entropy',
 'fhist20w_pred_proba_callsms_similarity_v2_4w8',
 'fhist20w_pred_proba_tc_lb1',
 'fhist20w_pred_proba_travel_entropy',
 'fhist4w_ac_ac_real_age',
 'fhist4w_nbor4w_ac_ac_real_age',
 'fhist4w_nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist4w_nbor4w_pred_proba_layer1_264',
 'fhist4w_nbor4w_pred_proba_tc_lb1',
 'fhist4w_nbor4w_pred_proba_travel_entropy',
 'fhist4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist4w_pred_proba_layer1_264',
 'fhist4w_pred_proba_tc_lb1',
 'fhist4w_pred_proba_travel_entropy',
 'fhist8w_ac_ac_real_age',
 'fhist8w_nbor4w_ac_ac_real_age',
 'fhist8w_nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist8w_nbor4w_pred_proba_layer1_264',
 'fhist8w_nbor4w_pred_proba_tc_lb1',
 'fhist8w_nbor4w_pred_proba_travel_entropy',
 'fhist8w_pred_proba_callsms_similarity_v2_4w8',
 'fhist8w_pred_proba_layer1_264',
 'fhist8w_pred_proba_tc_lb1',
 'fhist8w_pred_proba_travel_entropy',
 'lacci01_avgwk_churn_city_lb12wk',
 'lacci01_avgwk_churn_lacci_lb12wk',
 'lacci01_avgwk_entropy_of_cell_towers_lb12wk',
 'lacci01_avgwk_entropy_of_cities_lb12wk',
 'lacci01_avgwk_num_cell_towers_lb12wk',
 'lacci01_avgwk_num_cities_lb12wk',
 'lacci01_avgwk_num_day_lacci_lb12wk',
 'lacci01_avgwk_num_days_lt_10_cells_lb12wk',
 'lacci01_devwk_churn_city_lb12wk',
 'lacci01_devwk_churn_lacci_lb12wk',
 'lacci01_devwk_entropy_of_cell_towers_lb12wk',
 'lacci01_devwk_entropy_of_cities_lb12wk',
 'lacci01_devwk_num_cell_towers_lb12wk',
 'lacci01_devwk_num_cities_lb12wk',
 'lacci01_devwk_num_day_lacci_lb12wk',
 'lacci01_devwk_num_days_lt_10_cells_lb12wk',
 'lacci01_maxwk_churn_city_lb12wk',
 'lacci01_maxwk_churn_lacci_lb12wk',
 'lacci01_maxwk_entropy_of_cell_towers_lb12wk',
 'lacci01_maxwk_entropy_of_cities_lb12wk',
 'lacci01_maxwk_num_cell_towers_lb12wk',
 'lacci01_maxwk_num_cities_lb12wk',
 'lacci01_maxwk_num_day_lacci_lb12wk',
 'lacci01_maxwk_num_days_lt_10_cells_lb12wk',
 'lacci01_minwk_churn_city_lb12wk',
 'lacci01_minwk_churn_lacci_lb12wk',
 'lacci01_minwk_entropy_of_cell_towers_lb12wk',
 'lacci01_minwk_entropy_of_cities_lb12wk',
 'lacci01_minwk_num_cell_towers_lb12wk',
 'lacci01_minwk_num_cities_lb12wk',
 'lacci01_minwk_num_day_lacci_lb12wk',
 'lacci01_minwk_num_days_lt_10_cells_lb12wk',
 'locidx_Area_avg_places_5w',
 'locidx_Educated_labour_gt15_ratio_avg_places_5w',
 'locidx_Emigra_rate_avg_places_5w',
 'locidx_GRDP_avg_places_5w',
 'locidx_Net_migra_rate_avg_places_5w',
 'locidx_dist_rcg_avg_places_5w',
 'locidx_prov_rcg_avg_places_5w',
 'max_distance_range_last_90d',
 'max_max_step_distance_travel_km_last_90d',
 'mh_avg_per_day_num_receivers_last_30d',
 'mh_avg_per_day_num_send_messages_last_30d',
 'mh_num_send_messages_last_30d',
 'mi_ever_use_dpp_sum_3_lbm',
 'min_areas_cross_last_90d',
 'min_distance_range_last_90d',
 'min_distance_travel_km_last_90d',
 'min_max_step_distance_travel_km_last_90d',
 'missed_calls_in_8w',
 'mob_daynight_cosine_places_5w',
 'mob_have_home_place_5w',
 'mob_n_homes_5w',
 'mob_n_placese_5w',
 'mob_n_works_5w',
 'mob_real_dominant_rate_5w',
 'nbor4w_ac_ac_real_age',
 'nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'nbor4w_pred_proba_layer1_264',
 'nbor4w_pred_proba_recharge_call_intervent_time_lb1',
 'nbor4w_pred_proba_tc_lb1',
 'nbor4w_pred_proba_travel_entropy',
 'nday_activity_8w',
 'nday_activity_strict_8w',
 'nday_callsms_out_8w',
 'num_calls_in_8w',
 'num_calls_out_8w',
 'num_pickup_calls_in_8w',
 'num_recharges_8w',
 'num_sms_in_8w',
 'num_sms_out_8w',
 'num_vas_8w',
 'pred_proba_callsms_similarity_v2_4w8',
 'pred_proba_layer1_264',
 'pred_proba_recharge_call_intervent_time_lb1',
 'pred_proba_tc_lb1',
 'pred_proba_travel_entropy',
 'rcg_avg_recharge_amount_per_month_last_3_months',
 'rcg_call_mean_diff_timestamp_last_1_months',
 'rcg_call_mean_diff_timestamp_last_3_months',
 'rcg_call_min_diff_timestamp_last_1_months',
 'rcg_call_min_diff_timestamp_last_3_months',
 'rcg_mean_diff_recharge_time_last_1_months',
 'rcg_mean_max_recharge_amount_per_day_last_12_weeks',
 'rcg_mean_min_recharge_amount_per_day_last_12_weeks',
 'rcg_min_diff_recharge_time_last_3_months',
 'sms_avg_wk_num_send_messages_in_day_last_8_weeks',
 'sms_smt_num_week_sms_last_8_weeks',
 'sms_sum_wk_num_send_messages_in_day_last_8_weeks',
 'tacidx_avg_callsmssim_call_sms_dcontact_sid_4w8_4w',
 'tacidx_avg_dnd01_callin_avg_night_dcount_csid_last12w_4w',
 'tacidx_avg_rh_avg_amount_recharge_on_promotion_last_90d_4w',
 'tacidx_avg_rh_avg_recharge_amount_last_90d_4w',
 'tacidx_num_device_4w',
 'tacidx_percent_new_sim_4w',
 'tacidx_rank_avg_ac_ac_real_age_4w',
 'tacidx_rank_avg_vas_4w',
 'tacidx_rank_population_4w',
 'tacidx_tac_age_4w',
 'tc_avg_sms_expense_ratio_last_3_months',
 'tc_avg_total_sms_expense_last_3_months',
 'tc_avg_total_vas_expense_last_3_months',
 'tc_avg_vas_expense_ratio_last_3_months',
 'tc_max_sms_expense_ratio_last_3_months',
 'tc_max_total_sms_expense_last_3_months',
 'tc_max_total_vas_expense_last_3_months',
 'tc_max_vas_expense_ratio_last_3_months',
 'tc_min_sms_expense_ratio_last_3_months',
 'tc_min_total_sms_expense_last_3_months',
 'tc_min_total_vas_expense_last_3_months',
 'tc_rcg_avg_avg_recharge_amount_expense_ratio_last_3_months',
 'tc_rcg_avg_max_recharge_amount_expense_ratio_last_3_months',
 'tc_rcg_avg_recharge_amount_avg_expense_ratio_last_3_months',
 'tc_rcg_max_avg_recharge_amount_expense_ratio_last_3_months',
 'tc_rcg_max_max_recharge_amount_expense_ratio_last_3_months',
 'tc_rcg_min_avg_recharge_amount_expense_ratio_last_3_months',
 'tc_rcg_min_max_recharge_amount_expense_ratio_last_3_months',
 'tc_std_total_vas_expense_last_3_months',
 'tc_std_vas_expense_ratio_last_3_months',
 'total_calls_8w',
 'total_distance_range_last_90d',
 'total_max_step_distance_travel_km_last_90d',
 'total_recharge_amount_8w',
 'total_sms_8w',
 'total_vas_amount_8w',
 'vas_ex_credit_avg_wk_avg_iet_in_weekend_last8w',
 'vas_ex_credit_avg_wk_avg_iet_last8w',
 'vas_ex_credit_avg_wk_avg_service_amount_last8w',
 'vas_ex_credit_avg_wk_max_2nd_iet_last8w',
 'vas_ex_credit_avg_wk_max_service_amount_last8w',
 'vas_ex_credit_avg_wk_min_service_amount_in_weekend_last8w',
 'vas_ex_credit_avg_wk_min_service_amount_last8w',
 'vas_ex_credit_avg_wk_num_uses_last8w',
 'vas_ex_credit_avg_wk_numd_days_in_office_hours_last8w',
 'vas_ex_credit_avg_wk_numd_days_last8w',
 'vas_ex_credit_avg_wk_numd_days_of_week_last8w',
 'vas_ex_credit_avg_wk_numd_hours_in_office_hours_last8w',
 'vas_ex_credit_avg_wk_numd_hours_last8w',
 'vas_ex_credit_avg_wk_sum_iet_in_weekend_last8w',
 'vas_ex_credit_avg_wk_sum_iet_last8w',
 'vas_ex_credit_avg_wk_sum_service_amount_at_night_last8w',
 'vas_ex_credit_avg_wk_sum_service_amount_in_evening_last8w',
 'vas_ex_credit_avg_wk_sum_service_amount_in_office_hours_last8w',
 'vas_ex_credit_avg_wk_sum_service_amount_last8w',
 'vas_ex_credit_max_derv_wk_num_uses_last8w',
 'vas_ex_credit_max_wk_avg_iet_in_weekend_last8w',
 'vas_ex_credit_max_wk_avg_iet_last8w',
 'vas_ex_credit_max_wk_avg_service_amount_last8w',
 'vas_ex_credit_max_wk_max_2nd_iet_last8w',
 'vas_ex_credit_max_wk_max_service_amount_last8w',
 'vas_ex_credit_max_wk_min_service_amount_in_weekend_last8w',
 'vas_ex_credit_max_wk_min_service_amount_last8w',
 'vas_ex_credit_max_wk_num_uses_last8w',
 'vas_ex_credit_max_wk_numd_days_in_office_hours_last8w',
 'vas_ex_credit_max_wk_numd_hours_in_office_hours_last8w',
 'vas_ex_credit_max_wk_numd_hours_last8w',
 'vas_ex_credit_max_wk_sum_2nd_iet_last8w',
 'vas_ex_credit_max_wk_sum_iet_in_weekend_last8w',
 'vas_ex_credit_max_wk_sum_iet_last8w',
 'vas_ex_credit_max_wk_sum_service_amount_last8w',
 'vas_ex_credit_min_wk_avg_iet_last8w',
 'vas_ex_credit_min_wk_avg_service_amount_last8w',
 'vas_ex_credit_min_wk_max_service_amount_last8w',
 'vas_ex_credit_min_wk_min_service_amount_in_weekend_last8w',
 'vas_ex_credit_min_wk_min_service_amount_last8w',
 'vas_ex_credit_min_wk_sum_service_amount_last8w',
 'vas_ex_credit_std_derv_wk_num_uses_last8w',
 'vas_ex_credit_std_wk_avg_iet_in_weekend_last8w',
 'vas_ex_credit_std_wk_avg_iet_last8w',
 'vas_ex_credit_std_wk_avg_service_amount_last8w',
 'vas_ex_credit_std_wk_max_2nd_iet_in_weekend_last8w',
 'vas_ex_credit_std_wk_max_2nd_iet_last8w',
 'vas_ex_credit_std_wk_max_service_amount_last8w',
 'vas_ex_credit_std_wk_min_service_amount_in_weekend_last8w',
 'vas_ex_credit_std_wk_min_service_amount_last8w',
 'vas_ex_credit_std_wk_num_uses_last8w',
 'vas_ex_credit_std_wk_numd_days_in_office_hours_last8w',
 'vas_ex_credit_std_wk_numd_days_last8w',
 'vas_ex_credit_std_wk_numd_hours_last8w',
 'vas_ex_credit_std_wk_sum_2nd_iet_last8w',
 'vas_ex_credit_std_wk_sum_iet_in_weekend_last8w',
 'vas_ex_credit_std_wk_sum_iet_last8w',
 'vas_ex_credit_std_wk_sum_service_amount_last8w']
print(len(CNAMES_355))

## Khoa 304 features

In [None]:
# Feature-name list for the "Khoa" feature set: 304 column names, one per line.
# The length is printed after the literal as a quick sanity check.
CNAMES_304 = ['ac_ac_real_age',
 'ac_avg_mth_dataplan_expense_last_3mth',
 'ac_max_mth_usage_last_3mth',
 'ac_sd_mth_expense_last_3mth',
 'ac_sd_mth_usage_last_3mth',
 'afternoon_count_max_step_dist_last_90d',
 'avg_areas_cross_last_90d',
 'bh_avg_day_balance_last_30d',
 'bh_avg_day_promotion_balance_last_30d',
 'bh_max_day_balance_last_30d',
 'bh_max_day_promotion_balance_last_30d',
 'bh_min_day_balance_last_30d',
 'bh_pct_day_balance_from_50k_to_100k_last_30d',
 'bh_pct_day_balance_ge_100k_last_30d',
 'bh_pct_day_balance_lt_10k_last_30d',
 'bh_pct_day_balance_lt_1k_last_30d',
 'bh_pct_day_balance_lt_5k_last_30d',
 'bh_sd_day_balance_last_30d',
 'bh_sd_day_promotion_balance_last_30d',
 'call_avg_call_duration_during_morning_in_day_last_3_months',
 'call_avg_num_night_calls_in_day_last_3_months',
 'call_std_num_calls_in_day_last_3_months',
 'call_std_num_night_calls_in_day_last_3_months',
 'call_sum_call_duration_during_morning_in_day_last_3_months',
 'call_sum_num_night_calls_in_day_last_3_months',
 'callsmssim_call_in_contact_sid_L1sim_4w21',
 'callsmssim_call_in_contact_sid_L1sim_4w8',
 'callsmssim_call_in_contact_sid_L2sim_4w21',
 'callsmssim_call_in_contact_sid_L2sim_4w8',
 'callsmssim_call_out_contact_sid_L1sim_4w21',
 'callsmssim_call_out_contact_sid_L1sim_4w8',
 'callsmssim_call_out_contact_sid_L2sim_4w21',
 'callsmssim_call_out_contact_sid_L2sim_4w8',
 'callsmssim_call_sms_dcontact_sid_4w21',
 'callsmssim_call_sms_dcontact_sid_4w8',
 'callsmssim_callout_sms_prc_common_contact_4w21',
 'callsmssim_callout_sms_prc_common_contact_4w8',
 'callsmssim_callsms_prc_common_contact_4w21',
 'callsmssim_callsms_prc_common_contact_4w8',
 'callsmssim_sms_contact_sid_L1sim_4w8',
 'callsmssim_sms_contact_sid_L2sim_4w8',
 'ch_avg_per_day_num_calls_last_30d',
 'ch_avg_per_day_num_evening_contacts_last_30d',
 'ch_avg_per_day_num_out_calls_last_30d',
 'ch_num_out_calls_last_30d',
 'ch_sd_per_day_num_freq_contacts_noc_ge6_last_30d',
 'cmtyb_count_18w_og',
 'cmtyb_count_3w_og',
 'cmtyb_count_9w_og',
 'cmtyb_num_calls_18w_og',
 'cmtyb_num_calls_3w_og',
 'cmtyb_num_calls_9w_og',
 'cmtyb_ratio_count_3w18w_ic',
 'cmtyb_ratio_count_3w9w_ic',
 'cmtyb_sum_seconds_18w_og',
 'cmtyb_sum_seconds_3w_og',
 'cmtyb_sum_seconds_9w_og',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_0_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_15_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_20_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_25_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_30_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_gt_35_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_lt_2_last21w',
 'cs_abnormal_avg_wk_ndays_num_interactions_lt_3_last21w',
 'cs_abnormal_min_wk_ndays_num_interactions_gt_0_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_0_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_10_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_20_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_25_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_30_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_35_last21w',
 'cs_abnormal_std_wk_ndays_num_interactions_gt_5_last21w',
 'cs_abnormal_sum_wk_ndays_num_interactions_gt_0_last21w',
 'cs_abnormal_sum_wk_ndays_num_interactions_lt_2_last21w',
 'cs_abnormal_sum_wk_ndays_num_interactions_lt_3_last21w',
 'cs_avg_derv_wk_num_uses_in_weekend_last21w',
 'cs_avg_derv_wk_numd_contacts_in_evening_last21w',
 'cs_avg_wk_numd_contacts_in_evening_last21w',
 'cs_avg_wk_numd_contacts_last21w',
 'cs_avg_wk_numd_hours_in_weekend_last21w',
 'cs_max_wk_numd_contacts_in_evening_last21w',
 'cs_max_wk_numd_contacts_last21w',
 'cs_min_wk_numd_contacts_in_evening_last21w',
 'cs_std_derv_wk_num_uses_in_weekend_last21w',
 'cs_std_derv_wk_numd_contacts_in_evening_last21w',
 'cs_std_wk_num_uses_in_evening_last21w',
 'cs_std_wk_num_uses_in_office_hours_last21w',
 'cs_std_wk_num_uses_in_weekend_last21w',
 'cs_std_wk_numd_contacts_in_evening_last21w',
 'cs_std_wk_numd_contacts_in_office_hours_last21w',
 'cs_std_wk_numd_contacts_in_weekend_last21w',
 'cs_std_wk_numd_contacts_last21w',
 'cs_std_wk_numd_hours_at_night_last21w',
 'cs_std_wk_numd_hours_in_office_hours_last21w',
 'cs_std_wk_numd_hours_in_weekend_last21w',
 'cs_std_wk_numd_hours_last21w',
 'cs_std_wk_sum_num_events_last21w',
 'dnd01_avg_callin_ratio_avg_call_duration_last12w',
 'dnd01_avg_callin_ratio_dcount_csid_last12w',
 'dnd01_avg_callin_ratio_num_events_last12w',
 'dnd01_avg_callout_ratio_avg_call_duration_last12w',
 'dnd01_avg_callout_ratio_dcount_csid_last12w',
 'dnd01_avg_callout_ratio_dhour_last12w',
 'dnd01_avg_callsms_ratio_dcount_csid_last12w',
 'dnd01_avg_callsms_ratio_dhour_last12w',
 'dnd01_callin_avg_day_avg_call_duration_last12w',
 'dnd01_callin_avg_day_dcount_csid_last12w',
 'dnd01_callin_avg_day_num_events_last12w',
 'dnd01_callin_avg_dhour_last12w',
 'dnd01_callin_avg_night_dcount_csid_last12w',
 'dnd01_callin_max_day_avg_call_duration_last12w',
 'dnd01_callin_max_day_num_events_last12w',
 'dnd01_callin_max_night_num_events_last12w',
 'dnd01_callin_min_night_avg_call_duration_last12w',
 'dnd01_callin_min_night_dcount_csid_last12w',
 'dnd01_callout_avg_dhour_last12w',
 'dnd01_callsms_max_dhour_last12w',
 'dnd01_max_callin_ratio_avg_call_duration_last12w',
 'dnd01_max_callout_ratio_avg_call_duration_last12w',
 'dnd01_max_callout_ratio_dcount_csid_last12w',
 'dnd01_max_callsms_ratio_dcount_csid_last12w',
 'dnd01_max_callsms_ratio_dhour_last12w',
 'dnd01_min_callin_ratio_dcount_csid_last12w',
 'dnd01_min_callin_ratio_dhour_last12w',
 'dnd01_min_callout_ratio_dcount_csid_last12w',
 'dnd01_min_callsms_ratio_dcount_csid_last12w',
 'evening_count_max_step_dist_last_90d',
 'fhist12w_ac_ac_real_age',
 'fhist12w_nbor4w_ac_ac_real_age',
 'fhist12w_nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist12w_nbor4w_pred_proba_tc_lb1',
 'fhist12w_nbor4w_pred_proba_travel_entropy',
 'fhist12w_pred_proba_callsms_similarity_v2_4w8',
 'fhist12w_pred_proba_tc_lb1',
 'fhist12w_pred_proba_travel_entropy',
 'fhist16w_ac_ac_real_age',
 'fhist16w_nbor4w_ac_ac_real_age',
 'fhist16w_nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist16w_nbor4w_pred_proba_tc_lb1',
 'fhist16w_nbor4w_pred_proba_travel_entropy',
 'fhist16w_pred_proba_callsms_similarity_v2_4w8',
 'fhist16w_pred_proba_tc_lb1',
 'fhist16w_pred_proba_travel_entropy',
 'fhist20w_ac_ac_real_age',
 'fhist20w_nbor4w_ac_ac_real_age',
 'fhist20w_nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist20w_nbor4w_pred_proba_recharge_call_intervent_time_lb1',
 'fhist20w_nbor4w_pred_proba_tc_lb1',
 'fhist20w_nbor4w_pred_proba_travel_entropy',
 'fhist20w_pred_proba_callsms_similarity_v2_4w8',
 'fhist20w_pred_proba_recharge_call_intervent_time_lb1',
 'fhist20w_pred_proba_tc_lb1',
 'fhist20w_pred_proba_travel_entropy',
 'fhist4w_ac_ac_real_age',
 'fhist4w_nbor4w_ac_ac_real_age',
 'fhist4w_nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist4w_nbor4w_pred_proba_layer1_264',
 'fhist4w_nbor4w_pred_proba_tc_lb1',
 'fhist4w_nbor4w_pred_proba_travel_entropy',
 'fhist4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist4w_pred_proba_layer1_264',
 'fhist4w_pred_proba_tc_lb1',
 'fhist4w_pred_proba_travel_entropy',
 'fhist8w_ac_ac_real_age',
 'fhist8w_nbor4w_ac_ac_real_age',
 'fhist8w_nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'fhist8w_nbor4w_pred_proba_layer1_264',
 'fhist8w_nbor4w_pred_proba_tc_lb1',
 'fhist8w_nbor4w_pred_proba_travel_entropy',
 'fhist8w_pred_proba_callsms_similarity_v2_4w8',
 'fhist8w_pred_proba_layer1_264',
 'fhist8w_pred_proba_tc_lb1',
 'fhist8w_pred_proba_travel_entropy',
 'lacci01_avgwk_churn_city_lb12wk',
 'lacci01_avgwk_churn_lacci_lb12wk',
 'lacci01_avgwk_entropy_of_cell_towers_lb12wk',
 'lacci01_avgwk_entropy_of_cities_lb12wk',
 'lacci01_avgwk_num_cell_towers_lb12wk',
 'lacci01_avgwk_num_cities_lb12wk',
 'lacci01_avgwk_num_day_lacci_lb12wk',
 'lacci01_avgwk_num_days_lt_10_cells_lb12wk',
 'lacci01_devwk_churn_city_lb12wk',
 'lacci01_devwk_churn_lacci_lb12wk',
 'lacci01_devwk_entropy_of_cell_towers_lb12wk',
 'lacci01_devwk_entropy_of_cities_lb12wk',
 'lacci01_devwk_num_cell_towers_lb12wk',
 'lacci01_devwk_num_cities_lb12wk',
 'lacci01_devwk_num_day_lacci_lb12wk',
 'lacci01_devwk_num_days_lt_10_cells_lb12wk',
 'lacci01_maxwk_churn_city_lb12wk',
 'lacci01_maxwk_churn_lacci_lb12wk',
 'lacci01_maxwk_entropy_of_cell_towers_lb12wk',
 'lacci01_maxwk_entropy_of_cities_lb12wk',
 'lacci01_maxwk_num_cell_towers_lb12wk',
 'lacci01_maxwk_num_cities_lb12wk',
 'lacci01_maxwk_num_day_lacci_lb12wk',
 'lacci01_maxwk_num_days_lt_10_cells_lb12wk',
 'lacci01_minwk_churn_city_lb12wk',
 'lacci01_minwk_churn_lacci_lb12wk',
 'lacci01_minwk_entropy_of_cell_towers_lb12wk',
 'lacci01_minwk_entropy_of_cities_lb12wk',
 'lacci01_minwk_num_cell_towers_lb12wk',
 'lacci01_minwk_num_cities_lb12wk',
 'lacci01_minwk_num_day_lacci_lb12wk',
 'lacci01_minwk_num_days_lt_10_cells_lb12wk',
 'locidx_Area_avg_places_5w',
 'locidx_Educated_labour_gt15_ratio_avg_places_5w',
 'locidx_Emigra_rate_avg_places_5w',
 'locidx_GRDP_avg_places_5w',
 'locidx_Net_migra_rate_avg_places_5w',
 'locidx_dist_rcg_avg_places_5w',
 'locidx_prov_rcg_avg_places_5w',
 'max_distance_range_last_90d',
 'max_max_step_distance_travel_km_last_90d',
 'mh_avg_per_day_num_receivers_last_30d',
 'mh_avg_per_day_num_send_messages_last_30d',
 'mh_num_send_messages_last_30d',
 'mi_ever_use_dpp_sum_3_lbm',
 'min_areas_cross_last_90d',
 'min_distance_range_last_90d',
 'min_distance_travel_km_last_90d',
 'min_max_step_distance_travel_km_last_90d',
 'missed_calls_in_8w',
 'mob_daynight_cosine_places_5w',
 'mob_have_home_place_5w',
 'mob_n_homes_5w',
 'mob_n_placese_5w',
 'mob_n_works_5w',
 'mob_real_dominant_rate_5w',
 'nbor4w_ac_ac_real_age',
 'nbor4w_pred_proba_callsms_similarity_v2_4w8',
 'nbor4w_pred_proba_layer1_264',
 'nbor4w_pred_proba_recharge_call_intervent_time_lb1',
 'nbor4w_pred_proba_tc_lb1',
 'nbor4w_pred_proba_travel_entropy',
 'nday_activity_8w',
 'nday_activity_strict_8w',
 'nday_callsms_out_8w',
 'num_calls_in_8w',
 'num_calls_out_8w',
 'num_pickup_calls_in_8w',
 'num_recharges_8w',
 'num_sms_in_8w',
 'num_sms_out_8w',
 'rcg_avg_recharge_amount_per_month_last_3_months',
 'rcg_call_mean_diff_timestamp_last_1_months',
 'rcg_call_mean_diff_timestamp_last_3_months',
 'rcg_mean_diff_recharge_time_last_1_months',
 'rcg_mean_max_recharge_amount_per_day_last_12_weeks',
 'rcg_mean_min_recharge_amount_per_day_last_12_weeks',
 'rcg_min_diff_recharge_time_last_3_months',
 'sms_avg_wk_num_send_messages_in_day_last_8_weeks',
 'sms_smt_num_week_sms_last_8_weeks',
 'sms_sum_wk_num_send_messages_in_day_last_8_weeks',
 'tacidx_avg_callsmssim_call_sms_dcontact_sid_4w8_4w',
 'tacidx_avg_dnd01_callin_avg_night_dcount_csid_last12w_4w',
 'tacidx_avg_rh_avg_amount_recharge_on_promotion_last_90d_4w',
 'tacidx_avg_rh_avg_recharge_amount_last_90d_4w',
 'tacidx_num_device_4w',
 'tacidx_percent_new_sim_4w',
 'tacidx_rank_avg_ac_ac_real_age_4w',
 'tacidx_rank_avg_vas_4w',
 'tacidx_rank_population_4w',
 'tacidx_tac_age_4w',
 'tc_avg_sms_expense_ratio_last_3_months',
 'tc_avg_total_sms_expense_last_3_months',
 'tc_avg_total_vas_expense_last_3_months',
 'tc_avg_vas_expense_ratio_last_3_months',
 'tc_max_sms_expense_ratio_last_3_months',
 'tc_max_total_sms_expense_last_3_months',
 'tc_max_total_vas_expense_last_3_months',
 'tc_max_vas_expense_ratio_last_3_months',
 'tc_min_sms_expense_ratio_last_3_months',
 'tc_min_total_sms_expense_last_3_months',
 'tc_min_total_vas_expense_last_3_months',
 'tc_rcg_avg_avg_recharge_amount_expense_ratio_last_3_months',
 'tc_rcg_avg_max_recharge_amount_expense_ratio_last_3_months',
 'tc_rcg_avg_recharge_amount_avg_expense_ratio_last_3_months',
 'tc_rcg_max_avg_recharge_amount_expense_ratio_last_3_months',
 'tc_rcg_max_max_recharge_amount_expense_ratio_last_3_months',
 'tc_rcg_min_avg_recharge_amount_expense_ratio_last_3_months',
 'tc_rcg_min_max_recharge_amount_expense_ratio_last_3_months',
 'tc_std_total_vas_expense_last_3_months',
 'tc_std_vas_expense_ratio_last_3_months',
 'total_calls_8w',
 'total_distance_range_last_90d',
 'total_max_step_distance_travel_km_last_90d',
 'total_recharge_amount_8w',
 'total_sms_8w',
 'vas_ex_credit_avg_wk_numd_days_in_office_hours_last8w',
 'vas_ex_credit_avg_wk_numd_days_last8w',
 'vas_ex_credit_avg_wk_numd_days_of_week_last8w',
 'vas_ex_credit_avg_wk_numd_hours_in_office_hours_last8w',
 'vas_ex_credit_avg_wk_numd_hours_last8w',
 'vas_ex_credit_avg_wk_sum_iet_in_weekend_last8w',
 'vas_ex_credit_avg_wk_sum_iet_last8w',
 'vas_ex_credit_avg_wk_sum_service_amount_at_night_last8w',
 'vas_ex_credit_avg_wk_sum_service_amount_in_evening_last8w',
 'vas_ex_credit_avg_wk_sum_service_amount_in_office_hours_last8w',
 'vas_ex_credit_avg_wk_sum_service_amount_last8w',
 'vas_ex_credit_max_derv_wk_num_uses_last8w',
 'vas_ex_credit_max_wk_avg_iet_in_weekend_last8w',
 'vas_ex_credit_max_wk_avg_iet_last8w']
print(len(CNAMES_304))

In [None]:
# Feature names of the "activity_8w" group, read from feat_groups (not defined in
# this part of the notebook). The bare name on the last line displays the value.
CNAMES_ACT = feat_groups["activity_8w"]
CNAMES_ACT

In [None]:
# Khoa's 304 features plus the two VAS activity columns, kept in sorted order.
CNAMES_306 = sorted([*CNAMES_304, "num_vas_8w", "total_vas_amount_8w"])
print(len(CNAMES_306))

In [None]:
# CNAMES_306 extended with the "pred_proba_pdlhit" column, kept in sorted order.
CNAMES_307 = sorted([*CNAMES_306, "pred_proba_pdlhit"])
print(len(CNAMES_307))

In [None]:
# Columns that the Binh list (CNAMES_355) has but the Khoa list (CNAMES_304) lacks.
for col in CNAMES_355:
    if col in CNAMES_304:
        continue
    print(col)

In [None]:
# Columns that CNAMES_328 has but the Khoa list (CNAMES_304) lacks.
for col in CNAMES_328:
    if col in CNAMES_304:
        continue
    print(col)

In [None]:
# Columns that the Binh list (CNAMES_355) has but CNAMES_328 lacks.
for col in CNAMES_355:
    if col in CNAMES_328:
        continue
    print(col)

In [None]:
# Columns that the Khoa list (CNAMES_304) has but CNAMES_328 lacks.
for col in CNAMES_304:
    if col in CNAMES_328:
        continue
    print(col)

In [None]:
# Seven recharge / call-timing feature names. Each of them also appears in
# CNAMES_304; they are merged with CNAMES_328 and CNAMES_ACT to build CNAMES_350.
CNAMES_KHOA_ADDED = [
    "fhist20w_nbor4w_pred_proba_recharge_call_intervent_time_lb1",
    "fhist20w_pred_proba_recharge_call_intervent_time_lb1",
    "nbor4w_pred_proba_recharge_call_intervent_time_lb1",
    "rcg_call_mean_diff_timestamp_last_1_months",
    "rcg_call_mean_diff_timestamp_last_3_months",
    "rcg_mean_diff_recharge_time_last_1_months",
    "rcg_min_diff_recharge_time_last_3_months",
]

In [None]:
# De-duplicated, sorted union of CNAMES_328, the activity features and the
# Khoa-added features. sorted() accepts any iterable, so the set is passed
# directly instead of being copied into a list first.
CNAMES_350 = sorted(set(CNAMES_328 + CNAMES_ACT + CNAMES_KHOA_ADDED))
print(len(CNAMES_350))

In [None]:
# De-duplicated, sorted union of CNAMES_328 and the activity features.
# sorted() accepts any iterable, so the set is passed directly instead of
# being copied into a list first.
CNAMES_343 = sorted(set(CNAMES_328 + CNAMES_ACT))
print(len(CNAMES_343))

## Train sets

In [None]:
# Sorted names of every data set whose rows are tagged "train" in pdXY,
# leaving out the members of TRAIN_SETS_PDL.
TRAIN_SETS_ALL = [
    ts
    for ts in sorted(pdXY.loc[pdXY["tvt"] == "train", "data_set"].unique())
    if ts not in TRAIN_SETS_PDL
]
print(len(TRAIN_SETS_ALL))
TRAIN_SETS_ALL

In [None]:
# Hand-written list of 22 training data-set names.
# NOTE(review): not referenced by the cells visible in this part of the notebook —
# confirm it is still used before keeping it.
TRAIN_SETS_CSV3_HCVN = ['be_mcredit01a__CashLoan__bad', 
                        'be_mirae02a__Cash Loan__dpd04', 
                        'be_mirae02a__Cash Loan__dpd09', 
                        'be_hcvn03__CD__bad', 
                        'be_hcvn03__TW__bad', 
                        'be_hcvn04__Xsell-target__DPD30_04', 
                        'be_hcvn04__Xsell-non-target__DPD30_04', 
                        'be_hcvn04__Xsell-target__DPD30_06', 
                        'be_hcvn04__Xsell-non-target__DPD30_06', 
                        'be_hcvn04__Xsell-target__DPD30_09', 
                        'be_hcvn04__Xsell-non-target__DPD30_09', 
                        'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
                        'be_cash24a__CL__bad',
                        'be_cash24a__CL__bad2',
                        'be_hcvn00x__CD__MAXDPD04_30',
                        'be_ocb02y__NONTS__DEL91_MOB12',
                        'be_ocb02y__NONTS__dpd01',
                        'be_ocb02y__NONTS__dpd04',
                        'be_ocb02y__NONTS__dpd06',
                        'be_cash24a__CL+Rejected__bad3',
                        'be_hcvn00x__TW__MAXDPD04_30',
                        'be_hcvn02x__na__DPD12_90']

In [None]:
# Training sets for the "Binh" configuration: TRAIN_SETS_ALL without the members of
# TRAIN_SETS_PDL and without "be_hcvn01__na__bad".
# The exclusions are collected once into a set; the previous form rebuilt the
# concatenated list for every candidate and searched it linearly.
_excluded_binh = set(TRAIN_SETS_PDL) | {"be_hcvn01__na__bad"}
TRAIN_SETS_BINH = [ts for ts in TRAIN_SETS_ALL if ts not in _excluded_binh]
len(TRAIN_SETS_BINH)

In [None]:
# Training sets for the "Khoa" configuration: TRAIN_SETS_ALL without the members of
# TRAIN_SETS_PDL and without the two data sets named below.
# The exclusions are collected once into a set; the previous form rebuilt the
# concatenated list for every candidate and searched it linearly.
_excluded_khoa = set(TRAIN_SETS_PDL) | {
    "be_mirae02ax__Cash Loan__DEL90_MOB12",
    "be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3",
}
TRAIN_SETS_KHOA = [ts for ts in TRAIN_SETS_ALL if ts not in _excluded_khoa]
len(TRAIN_SETS_KHOA)

In [None]:
# Data-set name(s) handed to run_xgboost_ranker as its third positional argument.
# NOTE(review): presumably the set(s) monitored for early stopping — confirm against
# run_xgboost_ranker. The commented-out line is a previously used alternative.
#EARLY_SETS = ['be_hcvn04__Xsell-non-target__DPD30_09']
EARLY_SETS = ['fe02a01__none__DEL30_MOB7_APP']
EARLY_SETS

## Test AUC vs. number of estimators

In [None]:
# Sweep n_estimators from 600 to 3200 (step 200) and pickle one run_xgboost_ranker
# result per setting; the following cell reads these files back to compare test AUC.
option_init = {
    # "booster": 'gbtree',
    'tree_method': 'gpu_hist',
    # 'colsample_bylevel': 1,
    # 'colsample_bytree': 0.5,
    'learning_rate': 0.03,
    'n_estimators': 3000,  # placeholder, overwritten on every loop iteration
    # 'subsample': 0.3,
    # 'min_child_weight': 10,
    'random_state': 0,
    'predictor': 'cpu_predictor',
}

# early_stopping_rounds is None (presumably disabling early stopping so that every
# run trains the full n_estimators — confirm in run_xgboost_ranker).
option_fit = {
    "early_stopping_rounds": None,
    "verbose": 0,
}
train_sets = TRAIN_SETS_BINH
cnames = CNAMES_350

# Create the output folder up front: a missing directory would otherwise only be
# noticed at dump time, after a complete training run.
os.makedirs("models_select_early", exist_ok=True)

for n in range(600, 3300, 200):
    print(n)
    option_init["n_estimators"] = n
    # The completed options are fed back in on the next iteration, as before.
    option_init, option_fit = complete_xgboost_option(
        option_init=option_init, option_fit=option_fit, mode="ranker"
    )
    reslt = run_xgboost_ranker(
        pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames,
        target_cname="bad",
        option_init=option_init, option_fit=option_fit,
    )

    model_path = "models_select_early/train_56_ft350_est{}.pkl".format(n)
    print("Save model to:", model_path)
    # The context manager closes (and flushes) the file even if pickling fails;
    # the bare open() used previously left the handle to the garbage collector.
    with open(model_path, "wb") as fout:
        pickle.dump(reslt, fout)

In [None]:
# Read back the pickled sweep results and collect each run's "auc_test" entry,
# keyed by n_estimators. After the transpose there is one row per n_estimators
# value and one column per key of "auc_test".
auc_by_n = {}
for n in range(600, 3300, 200):
    model_path = "models_select_early/train_56_ft350_est{}.pkl".format(n)
    print("Loading", model_path)
    fmodel = pd.read_pickle(model_path)
    auc_by_n[n] = fmodel["auc_test"]

auc = pd.DataFrame(auc_by_n).T
auc

In [None]:
# One small line plot per column of `auc`: AUC (y) against n_estimators (x, the index).
# The grid is sized from the number of columns; the previous fixed 10x5 grid
# silently dropped every column beyond the 50th because zip() stops at the shorter input.
cols = auc.columns
grid_ncols = 5
grid_nrows = max(1, -(-len(cols) // grid_ncols))  # ceiling division, at least one row

# 4.5 inches per row keeps the original proportions (45 inches for 10 rows).
# squeeze=False guarantees a 2-D array of axes even for a single row.
fig, axes = plt.subplots(
    nrows=grid_nrows, ncols=grid_ncols,
    figsize=(25, 4.5 * grid_nrows), squeeze=False,
)
axes = axes.flatten()

for col, ax in zip(cols, axes):
    auc[col].plot(ax=ax)
    ax.set_title(col)
    ax.set_xlabel("n_estimators")
    ax.set_ylabel("AUC")

# Hide the leftover panels of the last row instead of showing empty axes.
for ax in axes[len(cols):]:
    ax.set_visible(False)
    

# Experiments

In [None]:
# Guard: this cell always raises here, so "Run All" stops and the training below is
# not launched by accident. Comment the assert out to run the experiment on purpose.
assert False, "experiment cell is disabled; comment out this assert to run it"

# Train one ranker on train_sets_d2 with the 307-feature list, pickle the result
# and display its evaluation table.
# NOTE(review): train_sets_d2 is not defined in the visible part of the notebook.
option_init = {
    # "booster": 'gbtree',
    'tree_method': 'gpu_hist',
    # 'colsample_bylevel': 1,
    # 'colsample_bytree': 0.5,
    'learning_rate': 0.025,
    'n_estimators': 3000,
    # 'subsample': 0.5,
    # 'min_child_weight': 10,
    'random_state': 0,
    'predictor': 'cpu_predictor',
}

option_fit = {
    "early_stopping_rounds": 100,
    "verbose": 500,
}

option_init, option_fit = complete_xgboost_option(
    option_init=option_init, option_fit=option_fit, mode="ranker"
)

train_sets = train_sets_d2
cnames = CNAMES_307

# Resolve the output location before training so a missing folder cannot fail the
# dump after the model has already been fitted.
model_path = "models_v02/train_d2_nopdl_ft307.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

reslt = run_xgboost_ranker(
    pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames,
    target_cname="bad",
    option_init=option_init, option_fit=option_fit,
)

# The context manager closes the file before display_eval_results receives the same
# path; the bare open() used previously never closed the handle explicitly.
with open(model_path, "wb") as fout:
    pickle.dump(reslt, fout)

displ = display_eval_results(model_path, TEST_SETS)
display_df(displ)
print(displ["feat_cnames"].values[0])

In [None]:
# Guard: this cell always raises here, so "Run All" stops and the training below is
# not launched by accident. Comment the assert out to run the experiment on purpose.
assert False, "experiment cell is disabled; comment out this assert to run it"

# Train one ranker per candidate training-set selection (e1 .. e11) with the
# 306-feature list and pickle each result under models_v03b/.
# NOTE: train_sets_e1 .. train_sets_e5 are defined in cells further down this
# notebook (train_sets_e6 .. e11 are not visible here); those cells must be run first.
option_init = {
    # "booster": 'gbtree',
    'tree_method': 'gpu_hist',
    # 'colsample_bylevel': 1,
    # 'colsample_bytree': 0.5,
    'learning_rate': 0.025,
    'n_estimators': 3000,
    # 'subsample': 0.5,
    # 'min_child_weight': 10,
    'random_state': 0,
    'predictor': 'cpu_predictor',
}

option_fit = {
    "early_stopping_rounds": 100,
    "verbose": 500,
}

option_init, option_fit = complete_xgboost_option(
    option_init=option_init, option_fit=option_fit, mode="ranker"
)

# Label -> list of training data-set names (a dict, despite the historical name).
train_set_list = {
    "e1": train_sets_e1,
    "e2": train_sets_e2,
    "e3": train_sets_e3,
    "e4": train_sets_e4,
    "e5": train_sets_e5,
    "e6": train_sets_e6,
    "e7": train_sets_e7,
    "e8": train_sets_e8,
    "e9": train_sets_e9,
    "e10": train_sets_e10,
    "e11": train_sets_e11,
}
cnames = CNAMES_306
print("cnames", len(cnames))

# Create the output folder up front: a missing directory would otherwise only be
# noticed at dump time, after a complete training run.
os.makedirs("models_v03b", exist_ok=True)

for set_label, tr_set in train_set_list.items():
    # A repeated data-set name in a selection is treated as an error.
    assert len(tr_set) == len(set(tr_set)), (
        "duplicate data set names in selection {}".format(set_label)
    )

    train_sets = tr_set
    reslt = run_xgboost_ranker(
        pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames,
        target_cname="bad",
        option_init=option_init, option_fit=option_fit,
    )

    model_path = "models_v03b/train_{}_early_fe_ft306.pkl".format(set_label)

    print("Save model to:", model_path)
    # The context manager closes (and flushes) the file even if pickling fails;
    # the bare open() used previously left one handle open per iteration.
    with open(model_path, "wb") as fout:
        pickle.dump(reslt, fout)

In [None]:
# List the files in models_v03 via an IPython shell escape.
# NOTE(review): the training cells visible nearby write to models_v03b, not
# models_v03 — confirm this is the directory meant to be inspected.
!ls models_v03

In [None]:
# Show the evaluation table of one saved result on TEST_SETS, then print the first
# entry of its "feat_cnames" column.
# display_eval_results is defined elsewhere; presumably it loads the pickle at
# model_path — TODO confirm.
model_path = "models_v03/train_e6_pdl_hit_nonhit_17K_ft306_rnd19.pkl"

displ = display_eval_results(model_path, TEST_SETS)
display_df(displ)
print(displ["feat_cnames"].values[0])

In [None]:
# Train the e14 selection with 20 different random seeds (0..19) on the 306-feature
# list and pickle one result per seed.
# NOTE(review): train_sets_e14 is not defined in the visible part of the notebook.

# Loop-invariant inputs are set once instead of on every iteration.
train_sets = train_sets_e14
cnames = CNAMES_306

# Create the output folder up front: a missing directory would otherwise only be
# noticed at dump time, after a complete training run.
os.makedirs("models_v03b", exist_ok=True)

for rnd in range(20):
    # The option dicts are rebuilt from scratch for every seed, as before, so that
    # nothing returned by complete_xgboost_option carries over between runs.
    option_init = {
        # "booster": 'gbtree',
        'tree_method': 'gpu_hist',
        # 'colsample_bylevel': 1,
        # 'colsample_bytree': 0.5,
        'learning_rate': 0.025,
        'n_estimators': 3000,
        # 'subsample': 0.5,
        # 'min_child_weight': 10,
        'random_state': rnd,
        'predictor': 'cpu_predictor',
    }

    option_fit = {
        "early_stopping_rounds": 100,
        "verbose": 500,
    }

    option_init, option_fit = complete_xgboost_option(
        option_init=option_init, option_fit=option_fit, mode="ranker"
    )

    reslt = run_xgboost_ranker(
        pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames,
        target_cname="bad",
        option_init=option_init, option_fit=option_fit,
    )

    model_path = "models_v03b/train_e14_early_fe_ft306_rnd{}.pkl".format(rnd)
    print("Save model to:", model_path)
    # The context manager closes (and flushes) the file even if pickling fails;
    # the bare open() used previously left one handle open per seed.
    with open(model_path, "wb") as fout:
        pickle.dump(reslt, fout)

## Data set selection results

### No pdl

### With pdl hit and non-hit

In [None]:
# Training-set selection "e1".
#   auc_e1        - hard-coded AUC value (presumably the score obtained with this
#                   selection — TODO confirm which evaluation set it refers to)
#   train_sets_e1 - data-set names included in the selection
#   c_e1          - complement: members of TRAIN_SETS_ALL + TRAIN_SETS_PDL that the
#                   selection leaves out (displayed by the last line)
auc_e1 = 0.7525762684702132
train_sets_e1 = ['be_hcvn01__na__bad',
 'be_mcredit01a__CashLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_mirae02b__Installment Loan__dpd09',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24b__PCB__dpd06',
 'be_cash24b__PCB__dpd12',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob04',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']
c_e1 = [s for s in TRAIN_SETS_ALL+TRAIN_SETS_PDL if s not in train_sets_e1]
c_e1

In [None]:
# Training-set selection "e2".
#   auc_e2        - hard-coded AUC value (presumably the score obtained with this
#                   selection — TODO confirm which evaluation set it refers to)
#   train_sets_e2 - data-set names included in the selection
#   c_e2          - complement: members of TRAIN_SETS_ALL + TRAIN_SETS_PDL that the
#                   selection leaves out (displayed by the last line)
auc_e2 = 0.7523320244283304
train_sets_e2 = ['be_hcvn01__na__bad',
 'be_mcredit01a__CashLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_mirae02b__Installment Loan__dpd09',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24b__PCB__dpd06',
 'be_cash24b__PCB__dpd12',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_ferr__none__del30_mob3',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob04',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']
c_e2 = [s for s in TRAIN_SETS_ALL+TRAIN_SETS_PDL if s not in train_sets_e2]
c_e2

In [None]:
# Training-set selection "e3".
#   auc_e3        - hard-coded AUC value (presumably the score obtained with this
#                   selection — TODO confirm which evaluation set it refers to)
#   train_sets_e3 - data-set names included in the selection
#   c_e3          - complement: members of TRAIN_SETS_ALL + TRAIN_SETS_PDL that the
#                   selection leaves out (displayed by the last line)
auc_e3 = 0.7520116722784955
train_sets_e3 = ['be_hcvn01__na__bad',
 'be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_mirae02b__Installment Loan__dpd09',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24a__CL+Rejected__bad3',
 'be_cash24b__PCB__dpd06',
 'be_cash24b__PCB__dpd12',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_ferr__none__del30_mob3',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']
c_e3 = [s for s in TRAIN_SETS_ALL+TRAIN_SETS_PDL if s not in train_sets_e3]
c_e3

In [None]:
# Training-set selection "e4".
#   auc_e4        - hard-coded AUC value (presumably the score obtained with this
#                   selection — TODO confirm which evaluation set it refers to)
#   train_sets_e4 - data-set names included in the selection
#   c_e4          - complement: members of TRAIN_SETS_ALL + TRAIN_SETS_PDL that the
#                   selection leaves out (displayed by the last line)
auc_e4 = 0.7522896951563102
train_sets_e4 = ['be_hcvn01__na__bad',
 'be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_mirae02b__Installment Loan__dpd09',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24a__CL+Rejected__bad3',
 'be_cash24b__PCB__dpd06',
 'be_cash24b__PCB__dpd12',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_ferr__none__del30_mob3',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob04',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']
c_e4 = [s for s in TRAIN_SETS_ALL+TRAIN_SETS_PDL if s not in train_sets_e4]
c_e4

In [None]:
auc_e5 = 0.752232433691089
train_sets_e5 = ['be_hcvn01__na__bad',
 'be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_mirae02b__Installment Loan__dpd09',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24b__PCB__dpd06',
 'be_cash24b__PCB__dpd12',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_ferr__none__del30_mob3',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob04',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']
# Candidate data sets that are absent from train_sets_e5
# (order follows TRAIN_SETS_ALL + TRAIN_SETS_PDL).
c_e5 = [
    ds_name
    for ds_name in TRAIN_SETS_ALL + TRAIN_SETS_PDL
    if ds_name not in train_sets_e5
]
c_e5

In [None]:
auc_e6 = 0.7524323530885103
train_sets_e6 = ['be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_mirae02b__Installment Loan__dpd09',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24a__CL+Rejected__bad3',
 'be_cash24b__PCB__dpd12',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_ferr__none__del30_mob3',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']
# Candidate data sets that are absent from train_sets_e6
# (order follows TRAIN_SETS_ALL + TRAIN_SETS_PDL).
c_e6 = [
    ds_name
    for ds_name in TRAIN_SETS_ALL + TRAIN_SETS_PDL
    if ds_name not in train_sets_e6
]
c_e6

In [None]:
auc_e7 = 0.7523133239395282
train_sets_e7 = ['be_hcvn01__na__bad',
 'be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_mirae02b__Installment Loan__dpd09',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24a__CL+Rejected__bad3',
 'be_cash24b__PCB__dpd06',
 'be_cash24b__PCB__dpd12',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']

# Candidate data sets that are absent from train_sets_e7
# (order follows TRAIN_SETS_ALL + TRAIN_SETS_PDL).
c_e7 = [
    ds_name
    for ds_name in TRAIN_SETS_ALL + TRAIN_SETS_PDL
    if ds_name not in train_sets_e7
]
c_e7

In [None]:
auc_e8 = 0.7516607916877259
train_sets_e8 = ['be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24a__CL+Rejected__bad3',
 'be_cash24b__PCB__dpd12',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_ferr__none__del30_mob3',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']
# Candidate data sets that are absent from train_sets_e8
# (order follows TRAIN_SETS_ALL + TRAIN_SETS_PDL).
c_e8 = [
    ds_name
    for ds_name in TRAIN_SETS_ALL + TRAIN_SETS_PDL
    if ds_name not in train_sets_e8
]
c_e8

In [None]:
auc_e9 = 0.7519613290680477
train_sets_e9 = ['be_hcvn01__na__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd09',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24a__CL+Rejected__bad3',
 'be_cash24b__PCB__dpd06',
 'be_cash24b__PCB__dpd12',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_ferr__none__del30_mob3',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob04',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']
# Candidate data sets that are absent from train_sets_e9
# (order follows TRAIN_SETS_ALL + TRAIN_SETS_PDL).
c_e9 = [
    ds_name
    for ds_name in TRAIN_SETS_ALL + TRAIN_SETS_PDL
    if ds_name not in train_sets_e9
]
c_e9

In [None]:
auc_e10 = 0.7519125581926742
train_sets_e10 = ['be_hcvn01__na__bad',
 'be_mcredit01a__CashLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24a__CL+Rejected__bad3',
 'be_cash24b__PCB__dpd06',
 'be_cash24b__PCB__dpd12',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_ferr__none__del30_mob3',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob04',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']

# Candidate data sets that are absent from train_sets_e10
# (order follows TRAIN_SETS_ALL + TRAIN_SETS_PDL).
c_e10 = [
    ds_name
    for ds_name in TRAIN_SETS_ALL + TRAIN_SETS_PDL
    if ds_name not in train_sets_e10
]
c_e10

In [None]:
auc_e11 = 0.7522083806063001
train_sets_e11 = ['be_hcvn01__na__bad',
 'be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_mirae02b__Installment Loan__dpd09',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_cash24a__CL+Rejected__bad3',
 'be_cash24b__PCB__dpd06',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_ferr__none__del30_mob3',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob04',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']

# Candidate data sets that are absent from train_sets_e11
# (order follows TRAIN_SETS_ALL + TRAIN_SETS_PDL).
c_e11 = [
    ds_name
    for ds_name in TRAIN_SETS_ALL + TRAIN_SETS_PDL
    if ds_name not in train_sets_e11
]
c_e11

In [None]:
# Experiment e12: the selection is the full candidate list
# (TRAIN_SETS_ALL followed by TRAIN_SETS_PDL).
auc_e12 = 0.75127
train_sets_e12 = TRAIN_SETS_ALL + TRAIN_SETS_PDL
# Complement of the selection; empty here because nothing is excluded.
c_e12 = [
    ds_name
    for ds_name in TRAIN_SETS_ALL + TRAIN_SETS_PDL
    if ds_name not in train_sets_e12
]
c_e12

In [None]:
auc_e13 = 0.7512554926502539
train_sets_e13 = ['be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad2',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']

# Candidate data sets that are absent from train_sets_e13
# (order follows TRAIN_SETS_ALL + TRAIN_SETS_PDL).
c_e13 = [
    ds_name
    for ds_name in TRAIN_SETS_ALL + TRAIN_SETS_PDL
    if ds_name not in train_sets_e13
]
c_e13

In [None]:
auc_e14 = 0.7514265886970779

train_sets_e14 = ['be_hcvn01__na__bad',
 'be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']

# Candidate data sets that are absent from train_sets_e14
# (order follows TRAIN_SETS_ALL + TRAIN_SETS_PDL).
c_e14 = [
    ds_name
    for ds_name in TRAIN_SETS_ALL + TRAIN_SETS_PDL
    if ds_name not in train_sets_e14
]
c_e14

## FE covid

In [None]:
# NOTE(review): this assert always passes; presumably it is flipped to False by
# hand to block accidental re-runs of this training cell — confirm.
assert True

# Single ranker run on train_sets_e12 with random_state 5.
# Options left commented out are not passed to complete_xgboost_option.
option_init = {
    # "booster": "gbtree",
    "tree_method": "gpu_hist",
    # "colsample_bylevel": 1,
    # "colsample_bytree": 0.5,
    "learning_rate": 0.025,
    "n_estimators": 3000,
    # "subsample": 0.5,
    # "min_child_weight": 10,
    "random_state": 5,
    "predictor": "cpu_predictor",
}

option_fit = {
    "early_stopping_rounds": 100,
    "verbose": 500,
}

# complete_xgboost_option returns the (init, fit) option pair to use for mode="ranker".
option_init, option_fit = complete_xgboost_option(
    option_init=option_init, option_fit=option_fit, mode="ranker"
)

train_sets = train_sets_e12

cnames = CNAMES_306
reslt = run_xgboost_ranker(
    pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames,
    target_cname="bad",
    option_init=option_init, option_fit=option_fit,
)

model_path = "models_fe_covid/train_e12_early_fe_ft306_rnd5.pkl"
# Create the output folder if needed so a finished run is not lost at save time.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# The context manager flushes and closes the file before the path is handed
# to display_eval_results below (the original left the handle open).
with open(model_path, "wb") as model_file:
    pickle.dump(reslt, model_file)

displ = display_eval_results(model_path, TEST_SETS)
display_df(displ)
print(displ["feat_cnames"].values[0])

In [None]:
# Data sets removed from the e12 selection to form e12a.
# NOTE(review): "coid" in the name is presumably a typo of "covid"; the name is
# kept because other cells may reference it.
fe_post_coid = [
    'fe02a02_NonTS__none__FPD30',
    'fe02a02_TS__none__FPD30',
    'fe02a02_NonTS__none__DEL30_MOB4_APP',
    'fe02a02_TS__none__DEL30_MOB4_APP',
]
train_sets_e12a = [
    ds_name
    for ds_name in train_sets_e12
    if ds_name not in fe_post_coid
]
train_sets_e12a

In [None]:
# Seed sweep: train the ranker on train_sets_e12a once per random_state in
# 0..19 and pickle each result to its own file.
train_sets = train_sets_e12a
cnames = CNAMES_306

for rnd in range(20):
    # Options left commented out are not passed to complete_xgboost_option.
    option_init = {
        # "booster": "gbtree",
        "tree_method": "gpu_hist",
        # "colsample_bylevel": 1,
        # "colsample_bytree": 0.5,
        "learning_rate": 0.025,
        "n_estimators": 3000,
        # "subsample": 0.5,
        # "min_child_weight": 10,
        "random_state": rnd,  # the only option that changes between iterations
        "predictor": "cpu_predictor",
    }
    option_fit = {
        "early_stopping_rounds": 100,
        "verbose": 500,
    }
    option_init, option_fit = complete_xgboost_option(
        option_init=option_init, option_fit=option_fit, mode="ranker"
    )

    reslt = run_xgboost_ranker(
        pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames,
        target_cname="bad",
        option_init=option_init, option_fit=option_fit,
    )

    model_path = "models_fe_covid/train_e12a_early_fe_ft306_rnd{}.pkl".format(rnd)
    print("Save to", model_path)
    # Create the output folder if needed so a finished run is not lost at save time.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    # The context manager flushes and closes the file (the original leaked the handle).
    with open(model_path, "wb") as model_file:
        pickle.dump(reslt, model_file)

In [None]:
# Show the evaluation table for the e12a result saved with random_state 19.
model_path = "models_fe_covid/train_e12a_early_fe_ft306_rnd19.pkl"

# display_eval_results is defined outside this section; it receives the pickle
# path and TEST_SETS and its return value is rendered as an HTML table.
displ = display_eval_results(model_path, TEST_SETS)
display_df(displ)
# First value of the "feat_cnames" column.
print(displ["feat_cnames"].values[0])

## No pdl

In [None]:
# Display TRAIN_SETS_PDL, the list whose members are removed from train_sets_e10 in the next cell.
TRAIN_SETS_PDL

In [None]:
# e10 selection with every member of TRAIN_SETS_PDL removed.
train_sets_e10b = [
    ds_name
    for ds_name in train_sets_e10
    if ds_name not in TRAIN_SETS_PDL
]
train_sets_e10b

In [None]:
# Seed sweep: train the ranker on train_sets_e10b once per random_state in
# 0..19 and pickle each result to its own file.
train_sets = train_sets_e10b
cnames = CNAMES_306

for rnd in range(20):
    # Options left commented out are not passed to complete_xgboost_option.
    option_init = {
        # "booster": "gbtree",
        "tree_method": "gpu_hist",
        # "colsample_bylevel": 1,
        # "colsample_bytree": 0.5,
        "learning_rate": 0.025,
        "n_estimators": 3000,
        # "subsample": 0.5,
        # "min_child_weight": 10,
        "random_state": rnd,  # the only option that changes between iterations
        "predictor": "cpu_predictor",
    }
    option_fit = {
        "early_stopping_rounds": 100,
        "verbose": 500,
    }
    option_init, option_fit = complete_xgboost_option(
        option_init=option_init, option_fit=option_fit, mode="ranker"
    )

    reslt = run_xgboost_ranker(
        pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames,
        target_cname="bad",
        option_init=option_init, option_fit=option_fit,
    )

    model_path = "models_nopdl/train_e10b_early_fe_ft306_rnd{}.pkl".format(rnd)
    print("Save to", model_path)
    # Create the output folder if needed so a finished run is not lost at save time.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    # The context manager flushes and closes the file (the original leaked the handle).
    with open(model_path, "wb") as model_file:
        pickle.dump(reslt, model_file)

In [None]:
# Show the evaluation table for the e10b result saved with random_state 0.
model_path = "models_nopdl/train_e10b_early_fe_ft306_rnd0.pkl"

# display_eval_results is defined outside this section; it receives the pickle
# path and TEST_SETS and its return value is rendered as an HTML table.
displ = display_eval_results(model_path, TEST_SETS)
display_df(displ)
# First value of the "feat_cnames" column.
print(displ["feat_cnames"].values[0])

## No `pdl_vibcc`

In [None]:
# The two pdl_vibcc data sets dropped from the e10 selection to form e10d.
pdl_exld = [
    'pdl_vibcc__none__pdl_hit',
    'pdl_vibcc_bwd3m__none__pdl_hit',
]
train_sets_e10d = [
    ds_name
    for ds_name in train_sets_e10
    if ds_name not in pdl_exld
]
train_sets_e10d

In [None]:
# Seed sweep: train the ranker on train_sets_e10d once per random_state in
# 0..19 and pickle each result to its own file.
train_sets = train_sets_e10d
cnames = CNAMES_306

for rnd in range(20):
    # Options left commented out are not passed to complete_xgboost_option.
    option_init = {
        # "booster": "gbtree",
        "tree_method": "gpu_hist",
        # "colsample_bylevel": 1,
        # "colsample_bytree": 0.5,
        "learning_rate": 0.025,
        "n_estimators": 3000,
        # "subsample": 0.5,
        # "min_child_weight": 10,
        "random_state": rnd,  # the only option that changes between iterations
        "predictor": "cpu_predictor",
    }
    option_fit = {
        "early_stopping_rounds": 100,
        "verbose": 500,
    }
    option_init, option_fit = complete_xgboost_option(
        option_init=option_init, option_fit=option_fit, mode="ranker"
    )

    reslt = run_xgboost_ranker(
        pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames,
        target_cname="bad",
        option_init=option_init, option_fit=option_fit,
    )

    model_path = "models_nopdl/train_e10d_early_fe_ft306_rnd{}.pkl".format(rnd)
    print("Save to", model_path)
    # Create the output folder if needed so a finished run is not lost at save time.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    # The context manager flushes and closes the file (the original leaked the handle).
    with open(model_path, "wb") as model_file:
        pickle.dump(reslt, model_file)

In [None]:
# Show the evaluation table for the e10d result saved with random_state 0.
model_path = "models_nopdl/train_e10d_early_fe_ft306_rnd0.pkl"

# display_eval_results is defined outside this section; it receives the pickle
# path and TEST_SETS and its return value is rendered as an HTML table.
displ = display_eval_results(model_path, TEST_SETS)
display_df(displ)
# First value of the "feat_cnames" column.
print(displ["feat_cnames"].values[0])

## No random churn

In [None]:
# List the distinct values of the "data_set" column in pdXY.
pdXY["data_set"].unique()

In [None]:
# The single churn data set dropped from the e10 selection to form e10e.
churn_exld = [
    'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
]
train_sets_e10e = [
    ds_name
    for ds_name in train_sets_e10
    if ds_name not in churn_exld
]
train_sets_e10e

In [None]:
# Seed sweep: train the ranker on train_sets_e10e once per random_state in
# 0..19 and pickle each result to its own file.
train_sets = train_sets_e10e
cnames = CNAMES_306

for rnd in range(20):
    # Options left commented out are not passed to complete_xgboost_option.
    option_init = {
        # "booster": "gbtree",
        "tree_method": "gpu_hist",
        # "colsample_bylevel": 1,
        # "colsample_bytree": 0.5,
        "learning_rate": 0.025,
        "n_estimators": 3000,
        # "subsample": 0.5,
        # "min_child_weight": 10,
        "random_state": rnd,  # the only option that changes between iterations
        "predictor": "cpu_predictor",
    }
    option_fit = {
        "early_stopping_rounds": 100,
        "verbose": 500,
    }
    option_init, option_fit = complete_xgboost_option(
        option_init=option_init, option_fit=option_fit, mode="ranker"
    )

    reslt = run_xgboost_ranker(
        pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames,
        target_cname="bad",
        option_init=option_init, option_fit=option_fit,
    )

    model_path = "models_nochurn/train_e10e_early_fe_ft306_rnd{}.pkl".format(rnd)
    print("Save to", model_path)
    # Create the output folder if needed so a finished run is not lost at save time.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    # The context manager flushes and closes the file (the original leaked the handle).
    with open(model_path, "wb") as model_file:
        pickle.dump(reslt, model_file)

In [None]:
# Show the evaluation table for the e10e result saved with random_state 19.
model_path = "models_nochurn/train_e10e_early_fe_ft306_rnd19.pkl"

# display_eval_results is defined outside this section; it receives the pickle
# path and TEST_SETS and its return value is rendered as an HTML table.
displ = display_eval_results(model_path, TEST_SETS)
display_df(displ)
# First value of the "feat_cnames" column.
print(displ["feat_cnames"].values[0])

## No pdl + churn

In [None]:
# Exclusions for e10f: every TRAIN_SETS_PDL member plus the churn data set.
# Built once up front; the original re-created this list for every element tested.
pdl_churn_exld = TRAIN_SETS_PDL + ["be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3"]
train_sets_e10f = [
    ds_name
    for ds_name in train_sets_e10
    if ds_name not in pdl_churn_exld
]
train_sets_e10f

In [None]:
# Seed sweep: train the ranker on train_sets_e10f once per random_state in
# 0..19 and pickle each result to its own file.
train_sets = train_sets_e10f
cnames = CNAMES_306

for rnd in range(20):
    # Options left commented out are not passed to complete_xgboost_option.
    option_init = {
        # "booster": "gbtree",
        "tree_method": "gpu_hist",
        # "colsample_bylevel": 1,
        # "colsample_bytree": 0.5,
        "learning_rate": 0.025,
        "n_estimators": 3000,
        # "subsample": 0.5,
        # "min_child_weight": 10,
        "random_state": rnd,  # the only option that changes between iterations
        "predictor": "cpu_predictor",
    }
    option_fit = {
        "early_stopping_rounds": 100,
        "verbose": 500,
    }
    option_init, option_fit = complete_xgboost_option(
        option_init=option_init, option_fit=option_fit, mode="ranker"
    )

    reslt = run_xgboost_ranker(
        pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames,
        target_cname="bad",
        option_init=option_init, option_fit=option_fit,
    )

    model_path = "models_no_pdl_churn/train_e10f_early_fe_ft306_rnd{}.pkl".format(rnd)
    print("Save to", model_path)
    # Create the output folder if needed so a finished run is not lost at save time.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    # The context manager flushes and closes the file (the original leaked the handle).
    with open(model_path, "wb") as model_file:
        pickle.dump(reslt, model_file)

## Flipping

In [None]:
def gen_initial_state(init_train_sets, pdXY):
    """Build the on/off selection state over all training data sets.

    Parameters
    ----------
    init_train_sets : list of str
        Data-set names that start switched on.
    pdXY : pandas.DataFrame
        Frame with a ``tvt`` column and a ``data_set`` column; the rows where
        ``tvt == "train"`` define the candidate data sets.

    Returns
    -------
    dict
        ``{data_set: bool}`` keyed in order of first appearance among the
        training rows; True for the names in ``init_train_sets``.

    Raises
    ------
    AssertionError
        If ``init_train_sets`` names a data set that has no training rows.
    """
    is_train_row = pdXY["tvt"] == "train"
    candidates = pdXY.loc[is_train_row, "data_set"].unique().tolist()
    # Every requested set must be one of the candidates.
    assert set(init_train_sets).issubset(candidates)
    # Start with everything off, then switch the requested sets on.
    state = dict.fromkeys(candidates, False)
    state.update(dict.fromkeys(init_train_sets, True))
    return state


def flip(state_current, at_set):
    """Return a deep copy of ``state_current`` with the flag of ``at_set`` inverted.

    The input mapping is left untouched.

    Raises
    ------
    AssertionError
        If ``at_set`` is not a key of ``state_current``.
    """
    assert at_set in state_current.keys()
    flipped = copy.deepcopy(state_current)
    flipped[at_set] = False if flipped[at_set] else True
    return flipped



In [None]:
def _greedy_flip_search(run_model, pdXY, initial_state, trial_train_sets,
                        early_sets, test_sets, eval_sets,
                        feature_cnames, target_cname, option_init, option_fit):
    """Shared greedy search behind the ranker and classifier entry points.

    ``run_model`` is called once for the initial state and once per trial set,
    always with ``verbose=0``. Its return value must expose
    ``results["auc_test"]``, indexable by every name in ``eval_sets``.
    Returns ``(auc_current, state_current)`` for the best state found.
    """
    def mean_eval_auc(state):
        # Train on the data sets switched on in `state`, then average the
        # test AUC over the evaluation sets.
        train_sets = [s for s in state if state[s]]
        results = run_model(pdXY, train_sets, early_sets, test_sets,
                            feature_cnames, target_cname=target_cname,
                            option_init=option_init, option_fit=option_fit,
                            verbose=0)
        auc_test = results["auc_test"]
        return np.mean([auc_test[s] for s in eval_sets])

    state_current = initial_state
    auc_current = mean_eval_auc(state_current)
    print("auc_current", auc_current)

    for trial_set in trial_train_sets:
        print("Trying", trial_set)
        # flip() returns a modified copy, so a rejected trial leaves
        # state_current untouched.
        state_trial = flip(state_current, trial_set)
        auc_trial = mean_eval_auc(state_trial)
        print("auc_trial", auc_trial)

        # Keep the flip only on a strict improvement.
        if auc_trial > auc_current:
            print("Accepted")
            auc_current = auc_trial
            state_current = state_trial
        else:
            print("Rejected")
        print("auc_current", auc_current)
        print("state_current:", [s for s in state_current if state_current[s]])
        print("-------------------------\n\n")
    return auc_current, state_current


def flip_over_trial_sets(pdXY, initial_state, trial_train_sets,
                         early_sets, test_sets, eval_sets,
                         feature_cnames, target_cname="bad",
                         option_init=None, option_fit=None):
    """Greedily toggle training data sets, scoring with ``run_xgboost_ranker``.

    Each name in ``trial_train_sets`` is visited once, in order. Its on/off
    flag is flipped, a model is trained on the resulting selection, and the
    flip is kept only if the mean test AUC over ``eval_sets`` strictly
    improves on the best value so far.

    Parameters
    ----------
    pdXY : data passed through unchanged to the runner.
    initial_state : dict
        ``{data_set: bool}`` starting selection; not modified.
    trial_train_sets : iterable of str
        Keys of ``initial_state`` to try flipping.
    early_sets, test_sets, feature_cnames, target_cname :
        Passed through unchanged to the runner.
    eval_sets : iterable of str
        Names whose test AUC is averaged to score a selection.
    option_init, option_fit : dict, optional
        Passed to the runner; ``None`` means a fresh empty dict per call
        (avoids sharing one mutable default between calls).

    Returns
    -------
    tuple
        ``(auc_current, state_current)``: best mean AUC and the state that
        produced it. ``state_current`` is ``initial_state`` itself when no
        flip was accepted.
    """
    option_init = {} if option_init is None else option_init
    option_fit = {} if option_fit is None else option_fit
    return _greedy_flip_search(run_xgboost_ranker, pdXY, initial_state,
                               trial_train_sets, early_sets, test_sets,
                               eval_sets, feature_cnames, target_cname,
                               option_init, option_fit)


def flip_over_trial_sets_clf(pdXY, initial_state, trial_train_sets,
                             early_sets, test_sets, eval_sets,
                             feature_cnames, target_cname="bad",
                             option_init=None, option_fit=None):
    """Greedily toggle training data sets, scoring with ``run_xgboost_classify``.

    Same procedure, parameters and return value as ``flip_over_trial_sets``;
    only the model runner differs.
    """
    option_init = {} if option_init is None else option_init
    option_fit = {} if option_fit is None else option_fit
    return _greedy_flip_search(run_xgboost_classify, pdXY, initial_state,
                               trial_train_sets, early_sets, test_sets,
                               eval_sets, feature_cnames, target_cname,
                               option_init, option_fit)

In [None]:
# Data sets whose test AUCs are averaged into the score compared by the
# flip_over_trial_sets search (this list is passed there as `eval_sets`).
EVAL_SETS = ['be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'fe02a01__none__FPD30',
             
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__FPD30',
             
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
             
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
             
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30']

In [None]:
# Display the current list of test-set names.
# NOTE(review): the only assignment of TEST_SETS visible in this part of the
# notebook is in a later cell — confirm it is defined before this cell on a
# fresh top-to-bottom run.
TEST_SETS

In [None]:
# Test-set names to filter out below. None of them appears in the long
# EVAL_SETS list assigned earlier in this notebook.
NOT_IN_EVAL = ['be_pvcombank__na__bad',
               'be_vietcapital__Credit card__bad',
               'be_vietcapital__Unsecured personal loans__bad',
               
               'be_mirae02b__Installment Loan__dpd04',
               'be_mirae02b__Installment Loan__dpd09',
               'be_mobivi__na__bad',
               
               'be_hcvn01__na__bad',
              ]

# Show the test sets that remain once the NOT_IN_EVAL names are removed.
[s for s in TEST_SETS if s not in NOT_IN_EVAL]

## Round 1

In [None]:
# Explicit overrides for the XGBoost constructor options used in this round.
# NOTE(review): the seed is keyed as 'random_state' here, while the notebook's
# ATTR_SEED resolves to "seed" on xgboost 0.6 — confirm the two agree.
option_init = {#"booster" : 'gbtree' ,
               'tree_method': 'gpu_hist',
               #'colsample_bylevel':1,
               #'colsample_bytree':0.5,
                 'learning_rate':0.025,
                 'n_estimators':3000,
                #'subsample':0.5,
                #'min_child_weight':10,
               
               'random_state':0,
               'predictor': 'cpu_predictor',
                }

# Fit-time overrides: early-stopping rounds and the `verbose` setting.
option_fit={"early_stopping_rounds": 100,
            "verbose"              : 500
            }

# complete_xgboost_option returns the (option_init, option_fit) pair used below.
# Presumably it fills in defaults for the given mode; the "ranker" branch is
# not visible from here — confirm.
option_init, option_fit = complete_xgboost_option(option_init=option_init,option_fit=option_fit, mode="ranker")


# Training sets that the Round 1 search starts from.
init_train_sets = ['be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
                   'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn01__na__bad',
                   
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad2',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']

# Starting state handed to the search, which reads it as a mapping of
# set name -> flag. Presumably the names listed above are switched on and
# the other data sets in pdXY are switched off — confirm in gen_initial_state.
init_state = gen_initial_state(init_train_sets, pdXY)

# Sets whose on/off flag the search toggles one at a time.
# The three be_hcvn00x__* names and 'be_hcvn01__na__bad' also appear in
# init_train_sets; flip() negates the flag, so for those the trial presumably
# switches the set off rather than on.
trial_train_sets = ['be_cash24a__CL+Rejected__bad3',
 'be_cash24a__CL__bad',
 'be_cash24b__PCB__dpd06',
 'be_cash24b__PCB__dpd12',
                    'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__CLX__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn01__na__bad',
                    
 'be_mirae02b__Installment Loan__dpd09',
 'fsv5_ferr__none__del30_mob3',
 'fsv5_mafc01_20210813_11K__none__mob04']

# Seeded shuffle: drawing len(x) items without replacement is a permutation,
# and the fixed seed fixes the order in which the sets are tried.
np.random.seed(3009)
trial_train_sets = np.random.choice(trial_train_sets, size=len(trial_train_sets), replace=False).tolist()

# Feature columns used for this round.
cnames = CNAMES_306

# Run the greedy flip search with the ranker. It returns the best mean AUC
# over EVAL_SETS and the accepted on/off state of the training sets.
auc_current, state_current = flip_over_trial_sets(pdXY, init_state, trial_train_sets, 
                                                  EARLY_SETS, TEST_SETS, EVAL_SETS,
                                                  cnames, option_init=option_init, option_fit=option_fit)

In [None]:
# List the set names that are switched on in the state returned by the search.
[s for s in state_current if state_current[s]]

In [None]:
# Recorded outcome of the Round 1 search, hard-coded so that it can be reused
# without re-running the search cell.
auc_current = 0.7514265886970779
# The list below was produced by: [s for s in state_current if state_current[s]]
# (kept as a comment: as a bare statement its value was discarded, and it made
# this cell fail whenever state_current was not already defined).
train_sets = ['be_hcvn01__na__bad',
 'be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3',
 'be_cash24a__CL__bad',
 'be_cash24a__CL__bad2',
 'be_mirae02ax__Cash Loan__DEL90_MOB12',
 'be_hcvn00x__CD__MAXDPD04_30',
 'be_hcvn00x__TW__MAXDPD04_30',
 'be_hcvn02x__na__DPD12_90',
 'be_ocb02x__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'fe02a01__none__FPD30',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__FPD30',
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'fsv5_mafc01_20210629_30K__none__fpd30',
 'fsv5_mafc01_20210629_30K__none__mob04',
 'fsv5_mafc01_20210629_30K__none__mob06',
 'fsv5_mafc01_20210629_30K__none__mob12',
 'fsv5_mafc01_20210813_11K__none__mob06',
 'fsv5_mafc01_20210813_11K__none__mob12',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']

In [None]:
# Show the distinct values found in the "data_set" column of pdXY.
pdXY["data_set"].unique()

# pdl model

In [None]:
# Sets passed as `early_sets` to the training calls in the pdl-model cells below
# (presumably the data used for early stopping — confirm in run_xgboost_classify).
EARLY_SETS = ['pdl_avay__none__pdl_hit']

In [None]:
# Rebinds EVAL_SETS (a longer list is assigned earlier in this notebook) to the
# six pdl_* sets, so the pdl search below is scored on these only.
EVAL_SETS = ['pdl_avay__none__pdl_hit',  'pdl_avay_bwd3m__none__pdl_hit', 
             'pdl_vib__none__pdl_hit',   'pdl_vib_bwd3m__none__pdl_hit', 
             'pdl_vibcc__none__pdl_hit', 'pdl_vibcc_bwd3m__none__pdl_hit',
            ]

In [None]:
# Always raises AssertionError, so nothing below runs while this line is
# present (presumably a deliberate guard against re-running the search).
assert False
# Explicit overrides for the XGBoost constructor options of the pdl classifier.
option_init = {#"booster" : 'gbtree' ,
               'tree_method': 'gpu_hist',
               #'colsample_bylevel':1,
               #'colsample_bytree':0.5,
                 'learning_rate':0.025,
                 'n_estimators':3000,
                #'subsample':0.5,
                #'min_child_weight':10,
               
               'random_state':0,
               'predictor': 'cpu_predictor',
                }

# Fit-time overrides: early-stopping rounds and the `verbose` setting.
option_fit={"early_stopping_rounds": 100,
            "verbose"              : 500
            }

# Returns the (option_init, option_fit) pair used below, for mode "classify".
option_init, option_fit = complete_xgboost_option(option_init=option_init,option_fit=option_fit, mode="classify")


# Training sets that the pdl search starts from.
init_train_sets = ['be_mcredit01b__InstallmentLoan__bad',
 'be_cash24a__CL+Rejected__bad3',
 'be_cash24b__PCB__dpd12',
 'be_ocb02y__NONTS__dpd01',
 'vib__cc__MOB12_DPD90',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']

# Starting state handed to the search (read there as set name -> flag).
init_state = gen_initial_state(init_train_sets, pdXY)

# Every name in TRAIN_SETS_ALL gets its flag toggled once, in shuffled order.
trial_train_sets = TRAIN_SETS_ALL

# Seeded shuffle: drawing len(x) items without replacement is a permutation.
np.random.seed(520)
trial_train_sets = np.random.choice(trial_train_sets, size=len(trial_train_sets), replace=False).tolist()

# Feature columns used for the pdl classifier.
cnames = CNAMES_350

# Greedy flip search with the classifier; returns the best mean AUC over
# EVAL_SETS and the accepted on/off state of the training sets.
auc_current, state_current = flip_over_trial_sets_clf(pdXY, init_state, trial_train_sets, 
                                                  EARLY_SETS, TEST_SETS, EVAL_SETS,
                                                  cnames, option_init=option_init, option_fit=option_fit)

In [None]:
# Hard-coded AUC value and training-set list for the pdl classifier; nothing
# is computed in this cell. The list holds the same eleven names, in the same
# order, as init_train_sets in the pdl search cell above.
auc_01 = 0.8415830818649689
train_set_01 = ['be_mcredit01b__InstallmentLoan__bad',
 'be_cash24a__CL+Rejected__bad3',
 'be_cash24b__PCB__dpd12',
 'be_ocb02y__NONTS__dpd01',
 'vib__cc__MOB12_DPD90',
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']

In [None]:
# Always raises AssertionError, so nothing below runs while this line is
# present (presumably a deliberate guard against retraining by accident).
assert False

# Train the pdl classifier once per seed (0..19) on train_set_01 and pickle
# each result under models_pdl/.
for rnd in range(20):
    option_init = {#"booster" : 'gbtree' ,
                   'tree_method': 'gpu_hist',
                   #'colsample_bylevel':1,
                   #'colsample_bytree':0.5,
                     'learning_rate':0.025,
                     'n_estimators':3000,
                    #'subsample':0.5,
                    #'min_child_weight':10,

                   'random_state': rnd,  # the only option that varies between runs
                   'predictor': 'cpu_predictor',
                    }

    option_fit={"early_stopping_rounds": 100,
                "verbose"              : 500
                }

    option_init, option_fit = complete_xgboost_option(option_init=option_init,option_fit=option_fit, mode="classify")

    train_sets = train_set_01

    cnames = CNAMES_350
    reslt = run_xgboost_classify(pdXY, train_sets, EARLY_SETS, TEST_SETS, cnames, 
                                 target_cname="bad", 
                                option_init=option_init, option_fit = option_fit)

    model_path = "models_pdl/clf_sel04_ft350_rnd{}.pkl".format(rnd)
    # Use a context manager so the file is flushed and closed after each dump,
    # instead of leaving twenty handles for the garbage collector.
    with open(model_path, "wb") as model_file:
        pickle.dump(reslt, model_file)

# model_path still points at the last seed's file here, so only that model
# is summarised.
displ = display_eval_results(model_path, TEST_SETS)
display_df(displ)
print(displ["feat_cnames"].values[0])

In [None]:
# Summarise the saved seed-19 pdl classifier on TEST_SETS, then print the
# value of the "feat_cnames" column for the first row.
model_path = "models_pdl/clf_sel04_ft350_rnd19.pkl"
displ = display_eval_results(model_path, TEST_SETS)
display_df(displ)
print(displ["feat_cnames"].values[0])

# Predict pdl model

In [None]:
# Load a saved pdl model and score every row of pdXY with it.
# NOTE(review): absolute, machine-specific path — confirm it is still valid.
model_path = "/bricks/brick1/gv0/working/core_dev/csv4/explo/hai/models_pdl/clf_sel00_rnd04_ft306.pkl"
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
fmodel = pd.read_pickle(model_path)
# The pickled object is indexed by "feature_cnames" and "model".
feature_cnames = fmodel["feature_cnames"]
model = fmodel["model"]
# Column 1 of predict_proba is stored as the score; ntree_limit is set to the
# booster's best_ntree_limit.
pdXY["pred_proba_pdlhit"] = model.predict_proba(pdXY[feature_cnames], ntree_limit=model.get_booster().best_ntree_limit)[:, 1]
# NOTE(review): pdXY is rebound to just three columns here, so the feature
# columns are no longer reachable through this name and the cell cannot be
# re-run as-is — confirm no later cell needs the full frame.
pdXY = pdXY[["phone_number", "upto_date", "pred_proba_pdlhit"]]
print(pdXY.shape)
# Keep one row per (phone_number, upto_date) pair.
pdXY = pdXY.drop_duplicates(subset=["phone_number", "upto_date"])
print(pdXY.shape)

# Always raises AssertionError, so the export below does not run while this
# line is present.
assert False
pdXY.to_pickle("../../data/pdX_proba_pdl.pkl")

In [8]:
# Data-set names passed as the test sets to display_eval_results and to the
# training/search calls in this notebook.
TEST_SETS = ['be_hcvn01__na__bad',
 'be_hcvn03__CD__bad',
 'be_hcvn03__TW__bad',
 'be_hcvn04__Xsell-non-target__DPD30_04',
 'be_hcvn04__Xsell-non-target__DPD30_06',
 'be_hcvn04__Xsell-non-target__DPD30_09',
 'be_hcvn04__Xsell-target__DPD30_04',
 'be_hcvn04__Xsell-target__DPD30_06',
 'be_hcvn04__Xsell-target__DPD30_09',
 'be_mcredit01a__CashLoan__bad',
 'be_mcredit01b__InstallmentLoan__bad',
 'be_mirae02a__Cash Loan__dpd04',
 'be_mirae02a__Cash Loan__dpd09',
 'be_mirae02b__Installment Loan__dpd04',
 'be_mirae02b__Installment Loan__dpd09',
 'be_mobivi__na__bad',
 'be_ocb02y__NONTS__DEL91_MOB12',
 'be_ocb02y__NONTS__dpd01',
 'be_ocb02y__NONTS__dpd04',
 'be_ocb02y__NONTS__dpd06',
 'be_pvcombank__na__bad',
 'be_vietcapital__Credit card__bad',
 'be_vietcapital__Unsecured personal loans__bad',
             
 'fe02a01__none__DEL30_MOB4_APP',
 'fe02a01__none__DEL30_MOB7_APP',
 'fe02a01__none__FPD30',
             
 'fe02a01_long__none__query_dpd30',
 'fe02a01_long__none__query_dpd90',
 'fe02a01_long__none__query_dpd30_rm_fpd30',
 'fe02a01_long__none__query_dpd90_rm_fpd30',
             
 'fe02a01_long_rm560__none__query_dpd30',
 'fe02a01_long_rm560__none__query_dpd90',
 'fe02a01_long_rm560__none__query_dpd30_rm_fpd30',
 'fe02a01_long_rm560__none__query_dpd90_rm_fpd30',
             
 'fe02a02_NonTS__none__DEL30_MOB4_APP',
 'fe02a02_NonTS__none__FPD30',
 'fe02a02_TS__none__DEL30_MOB4_APP',
 'fe02a02_TS__none__FPD30',
             
 'mafc04a__none__DEL30_MOB4',
 'mafc04a__none__DEL30_MOB6',
 'mafc05a__none__del30mob12',
             
 'shb06a__none__type01',
 'shb06a__none__type02',
 'shb06a__none__type03',
 'shb06a__none__type04',
             
 'vib__cc__MOB12_DPD90',
 'vib__cc__MOB12_ever_DPD90',
 'vib__cc__MOB6_DPD30',
 'vib__cc__MOB6_ever_DPD30',
 
 'pdl_avay__none__pdl_hit',
 'pdl_avay_bwd3m__none__pdl_hit',
 'pdl_vib__none__pdl_hit',
 'pdl_vib_bwd3m__none__pdl_hit',
 'pdl_vibcc__none__pdl_hit',
 'pdl_vibcc_bwd3m__none__pdl_hit']
# Number of test sets (the list above has 55 entries).
len(TEST_SETS)

55

In [30]:
# Summarise the saved csv4_felong seed-0 model on every entry of TEST_SETS.
model_path = "models_v03c_felong/csv4_felong_rnd0.pkl"

displ = display_eval_results(model_path, TEST_SETS)
display_df(displ)
# Print the value of the "feat_cnames" column for the first row.
print(displ["feat_cnames"].values[0])

Unnamed: 0,model_id,train_set_list,n_train,feat_cnames,n_feat,ntree,model_path,train_auc,be_hcvn01__na__bad,be_hcvn03__CD__bad,be_hcvn03__TW__bad,be_hcvn04__Xsell-non-target__DPD30_04,be_hcvn04__Xsell-non-target__DPD30_06,be_hcvn04__Xsell-non-target__DPD30_09,be_hcvn04__Xsell-target__DPD30_04,be_hcvn04__Xsell-target__DPD30_06,be_hcvn04__Xsell-target__DPD30_09,be_mcredit01a__CashLoan__bad,be_mcredit01b__InstallmentLoan__bad,be_mirae02a__Cash Loan__dpd04,be_mirae02a__Cash Loan__dpd09,be_mirae02b__Installment Loan__dpd04,be_mirae02b__Installment Loan__dpd09,be_mobivi__na__bad,be_ocb02y__NONTS__DEL91_MOB12,be_ocb02y__NONTS__dpd01,be_ocb02y__NONTS__dpd04,be_ocb02y__NONTS__dpd06,be_pvcombank__na__bad,be_vietcapital__Credit card__bad,be_vietcapital__Unsecured personal loans__bad,fe02a01__none__DEL30_MOB4_APP,fe02a01__none__DEL30_MOB7_APP,fe02a01__none__FPD30,fe02a01_long__none__query_dpd30,fe02a01_long__none__query_dpd90,fe02a01_long__none__query_dpd30_rm_fpd30,fe02a01_long__none__query_dpd90_rm_fpd30,fe02a01_long_rm560__none__query_dpd30,fe02a01_long_rm560__none__query_dpd90,fe02a01_long_rm560__none__query_dpd30_rm_fpd30,fe02a01_long_rm560__none__query_dpd90_rm_fpd30,fe02a02_NonTS__none__DEL30_MOB4_APP,fe02a02_NonTS__none__FPD30,fe02a02_TS__none__DEL30_MOB4_APP,fe02a02_TS__none__FPD30,mafc04a__none__DEL30_MOB4,mafc04a__none__DEL30_MOB6,mafc05a__none__del30mob12,shb06a__none__type01,shb06a__none__type02,shb06a__none__type03,shb06a__none__type04,vib__cc__MOB12_DPD90,vib__cc__MOB12_ever_DPD90,vib__cc__MOB6_DPD30,vib__cc__MOB6_ever_DPD30,pdl_avay__none__pdl_hit,pdl_avay_bwd3m__none__pdl_hit,pdl_vib__none__pdl_hit,pdl_vib_bwd3m__none__pdl_hit,pdl_vibcc__none__pdl_hit,pdl_vibcc_bwd3m__none__pdl_hit
0,csv4_felong_rnd0,"[be_cash24a__CL+Rejected__bad3, be_cash24a__CL__bad, be_cash24a__CL__bad2, be_cash24b__PCB__dpd06, be_cash24b__PCB__dpd12, be_hcvn00x__CD__MAXDPD04_30, be_hcvn00x__CLX__MAXDPD04_30, be_hcvn00x__TW__MAXDPD04_30, be_hcvn01__na__bad, be_hcvn02x__na__DPD12_90, be_hcvn03__CD__bad, be_hcvn03__TW__bad, be_hcvn04__Xsell-non-target__DPD30_04, be_hcvn04__Xsell-non-target__DPD30_06, be_hcvn04__Xsell-non-target__DPD30_09, be_hcvn04__Xsell-target__DPD30_04, be_hcvn04__Xsell-target__DPD30_06, be_hcvn04__Xsell-target__DPD30_09, be_mcredit01a__CashLoan__bad, be_mcredit01b__InstallmentLoan__bad, be_mirae02a__Cash Loan__dpd04, be_mirae02a__Cash Loan__dpd09, be_mirae02ax__Cash Loan__DEL90_MOB12, be_mirae02b__Installment Loan__dpd04, be_mirae02b__Installment Loan__dpd09, be_ocb02x__NONTS__DEL91_MOB12, be_ocb02y__NONTS__DEL91_MOB12, be_ocb02y__NONTS__dpd01, be_ocb02y__NONTS__dpd04, be_ocb02y__NONTS__dpd06, be_rnd_csv3_hcvn__none__churn_16w_or_low_csv3, fe02a01__none__DEL30_MOB4_APP, fe02a01__none__DEL30_MOB7_APP, fe02a01__none__FPD30, fe02a01_long__none__query_dpd30, fe02a01_long__none__query_dpd90, fe02a02_NonTS__none__DEL30_MOB4_APP, fe02a02_NonTS__none__FPD30, fe02a02_TS__none__DEL30_MOB4_APP, fe02a02_TS__none__FPD30, fsv5_ferr__none__del30_mob3, fsv5_mafc01_20210629_30K__none__fpd30, fsv5_mafc01_20210629_30K__none__mob04, fsv5_mafc01_20210629_30K__none__mob06, fsv5_mafc01_20210629_30K__none__mob12, fsv5_mafc01_20210813_11K__none__mob04, fsv5_mafc01_20210813_11K__none__mob06, fsv5_mafc01_20210813_11K__none__mob12, mafc04a__none__DEL30_MOB4, mafc04a__none__DEL30_MOB6, mafc05a__none__del30mob12, pdl_avay__none__pdl_hit, pdl_avay_bwd3m__none__pdl_hit, pdl_vib__none__pdl_hit, pdl_vib_bwd3m__none__pdl_hit, pdl_vibcc__none__pdl_hit, pdl_vibcc_bwd3m__none__pdl_hit, shb06a__none__type01, shb06a__none__type02, shb06a__none__type03, shb06a__none__type04, vib__cc__MOB12_DPD90, vib__cc__MOB12_ever_DPD90, vib__cc__MOB6_DPD30, 
vib__cc__MOB6_ever_DPD30]",1544132,"[ac_ac_real_age, ac_avg_mth_dataplan_expense_last_3mth, ac_max_mth_usage_last_3mth, ac_sd_mth_expense_last_3mth, ac_sd_mth_usage_last_3mth, afternoon_count_max_step_dist_last_90d, avg_areas_cross_last_90d, bh_avg_day_balance_last_30d, bh_avg_day_promotion_balance_last_30d, bh_max_day_balance_last_30d, bh_max_day_promotion_balance_last_30d, bh_min_day_balance_last_30d, bh_pct_day_balance_from_50k_to_100k_last_30d, bh_pct_day_balance_ge_100k_last_30d, bh_pct_day_balance_lt_10k_last_30d, bh_pct_day_balance_lt_1k_last_30d, bh_pct_day_balance_lt_5k_last_30d, bh_sd_day_balance_last_30d, bh_sd_day_promotion_balance_last_30d, call_avg_call_duration_during_morning_in_day_last_3_months, call_avg_num_night_calls_in_day_last_3_months, call_std_num_calls_in_day_last_3_months, call_std_num_night_calls_in_day_last_3_months, call_sum_call_duration_during_morning_in_day_last_3_months, call_sum_num_night_calls_in_day_last_3_months, callsmssim_call_in_contact_sid_L1sim_4w21, callsmssim_call_in_contact_sid_L1sim_4w8, callsmssim_call_in_contact_sid_L2sim_4w21, callsmssim_call_in_contact_sid_L2sim_4w8, callsmssim_call_out_contact_sid_L1sim_4w21, callsmssim_call_out_contact_sid_L1sim_4w8, callsmssim_call_out_contact_sid_L2sim_4w21, callsmssim_call_out_contact_sid_L2sim_4w8, callsmssim_call_sms_dcontact_sid_4w21, callsmssim_call_sms_dcontact_sid_4w8, callsmssim_callout_sms_prc_common_contact_4w21, callsmssim_callout_sms_prc_common_contact_4w8, callsmssim_callsms_prc_common_contact_4w21, callsmssim_callsms_prc_common_contact_4w8, callsmssim_sms_contact_sid_L1sim_4w8, callsmssim_sms_contact_sid_L2sim_4w8, ch_avg_per_day_num_calls_last_30d, ch_avg_per_day_num_evening_contacts_last_30d, ch_avg_per_day_num_out_calls_last_30d, ch_num_out_calls_last_30d, ch_sd_per_day_num_freq_contacts_noc_ge6_last_30d, cmtyb_count_18w_og, cmtyb_count_3w_og, cmtyb_count_9w_og, cmtyb_num_calls_18w_og, cmtyb_num_calls_3w_og, cmtyb_num_calls_9w_og, cmtyb_ratio_count_3w18w_ic, 
cmtyb_ratio_count_3w9w_ic, cmtyb_sum_seconds_18w_og, cmtyb_sum_seconds_3w_og, cmtyb_sum_seconds_9w_og, cs_abnormal_avg_wk_ndays_num_interactions_gt_0_last21w, cs_abnormal_avg_wk_ndays_num_interactions_gt_15_last21w, cs_abnormal_avg_wk_ndays_num_interactions_gt_20_last21w, cs_abnormal_avg_wk_ndays_num_interactions_gt_25_last21w, cs_abnormal_avg_wk_ndays_num_interactions_gt_30_last21w, cs_abnormal_avg_wk_ndays_num_interactions_gt_35_last21w, cs_abnormal_avg_wk_ndays_num_interactions_lt_2_last21w, cs_abnormal_avg_wk_ndays_num_interactions_lt_3_last21w, cs_abnormal_min_wk_ndays_num_interactions_gt_0_last21w, cs_abnormal_std_wk_ndays_num_interactions_gt_0_last21w, cs_abnormal_std_wk_ndays_num_interactions_gt_10_last21w, cs_abnormal_std_wk_ndays_num_interactions_gt_20_last21w, cs_abnormal_std_wk_ndays_num_interactions_gt_25_last21w, cs_abnormal_std_wk_ndays_num_interactions_gt_30_last21w, cs_abnormal_std_wk_ndays_num_interactions_gt_35_last21w, cs_abnormal_std_wk_ndays_num_interactions_gt_5_last21w, cs_abnormal_sum_wk_ndays_num_interactions_gt_0_last21w, cs_abnormal_sum_wk_ndays_num_interactions_lt_2_last21w, cs_abnormal_sum_wk_ndays_num_interactions_lt_3_last21w, cs_avg_derv_wk_num_uses_in_weekend_last21w, cs_avg_derv_wk_numd_contacts_in_evening_last21w, cs_avg_wk_numd_contacts_in_evening_last21w, cs_avg_wk_numd_contacts_last21w, cs_avg_wk_numd_hours_in_weekend_last21w, cs_max_wk_numd_contacts_in_evening_last21w, cs_max_wk_numd_contacts_last21w, cs_min_wk_numd_contacts_in_evening_last21w, cs_std_derv_wk_num_uses_in_weekend_last21w, cs_std_derv_wk_numd_contacts_in_evening_last21w, cs_std_wk_num_uses_in_evening_last21w, cs_std_wk_num_uses_in_office_hours_last21w, cs_std_wk_num_uses_in_weekend_last21w, cs_std_wk_numd_contacts_in_evening_last21w, cs_std_wk_numd_contacts_in_office_hours_last21w, cs_std_wk_numd_contacts_in_weekend_last21w, cs_std_wk_numd_contacts_last21w, cs_std_wk_numd_hours_at_night_last21w, cs_std_wk_numd_hours_in_office_hours_last21w, 
cs_std_wk_numd_hours_in_weekend_last21w, cs_std_wk_numd_hours_last21w, cs_std_wk_sum_num_events_last21w, dnd01_avg_callin_ratio_avg_call_duration_last12w, dnd01_avg_callin_ratio_dcount_csid_last12w, ...]",306,1729,/bricks/brick1/gv0/working/core_dev/csv4/explo/hai/models_v03c_felong/csv4_felong_rnd0.pkl,0.80481,0.878772,0.800146,0.729013,0.830476,0.811989,0.809567,0.714201,0.692875,0.684678,0.745792,0.788201,0.726881,0.732547,0.811702,0.809672,0.670117,0.72106,0.772317,0.741892,0.725983,0.754831,0.821844,0.745771,0.89132,0.847341,0.912055,0.595488,0.606575,0.588823,0.59939,0.613338,0.62506,0.604216,0.615352,0.698002,0.7051,0.752109,0.716021,0.702138,0.673842,0.758721,0.742811,0.708613,0.697724,0.74662,0.716688,0.707656,0.75167,0.753165,0.67753,0.614616,0.816009,0.787992,0.78674,0.798391


['ac_ac_real_age', 'ac_avg_mth_dataplan_expense_last_3mth', 'ac_max_mth_usage_last_3mth', 'ac_sd_mth_expense_last_3mth', 'ac_sd_mth_usage_last_3mth', 'afternoon_count_max_step_dist_last_90d', 'avg_areas_cross_last_90d', 'bh_avg_day_balance_last_30d', 'bh_avg_day_promotion_balance_last_30d', 'bh_max_day_balance_last_30d', 'bh_max_day_promotion_balance_last_30d', 'bh_min_day_balance_last_30d', 'bh_pct_day_balance_from_50k_to_100k_last_30d', 'bh_pct_day_balance_ge_100k_last_30d', 'bh_pct_day_balance_lt_10k_last_30d', 'bh_pct_day_balance_lt_1k_last_30d', 'bh_pct_day_balance_lt_5k_last_30d', 'bh_sd_day_balance_last_30d', 'bh_sd_day_promotion_balance_last_30d', 'call_avg_call_duration_during_morning_in_day_last_3_months', 'call_avg_num_night_calls_in_day_last_3_months', 'call_std_num_calls_in_day_last_3_months', 'call_std_num_night_calls_in_day_last_3_months', 'call_sum_call_duration_during_morning_in_day_last_3_months', 'call_sum_num_night_calls_in_day_last_3_months', 'callsmssim_call_in_co