https://github.com/PanJianning/DCIC-2019-Credit-2th-Place/tree/master/code

In [1]:
import os

# Cap the OpenMP worker pool at 4 threads. The variable OpenMP runtimes read is
# OMP_NUM_THREADS; it is set here, ahead of the numeric-library imports below,
# so it is already in the environment when those libraries load.
os.environ['OMP_NUM_THREADS'] = "4"
# The original notebook set this (non-standard) key instead; it is kept in case
# project code reads it -- TODO confirm and drop if unused.
os.environ['NUM_OMP_THREADS'] = "4"

import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import time
from sklearn.linear_model import HuberRegressor
import sklearn.ensemble as tree_model
from tqdm import tqdm
import datetime
pd.set_option('display.max_column',100)
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
from utils import make_dir, score, timer, kf_lgbm, kf_xgbm, kf_ctbm, kf_sklearn

In [2]:
def make_features(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    Required input columns: ``uid``, ``age``, ``top_up_amount``,
    ``recent_6month_avg_use``, ``total_account_fee`` and the seven app-usage
    count columns listed in ``app_feature``.

    Columns appended, in this order:
      * ``round_log1p_<col>`` for every column in ``app_feature``
        (``log1p`` rounded to the nearest integer),
      * ``ago_5month_total_fee`` = 6 * recent_6month_avg_use - total_account_fee,
      * ``ago_5month_avg_fee`` = ago_5month_total_fee / 5,
      * ``total_account_fee/ago_5month_total_fee`` (denominator shifted by +1),
      * ``total_account_fee-ago_5month_avg_fee``,
      * six ``count_*`` columns: for each row, the number of rows (non-null
        ``uid``) sharing the same value of the key column(s),
      * ``is_998_discount`` and ``age_0_as_nan``.

    The caller's frame is left untouched. The returned frame keeps the input
    row order and carries a fresh 0..n-1 index.
    """
    app_feature = [
        'online_shopping_count',
        'express_count',
        'finance_app_count',
        'video_app_count',
        'flight_count',
        'train_count',
        'tour_app_count'
    ]

    # Work on a re-indexed copy: the combined train/test frame may carry
    # duplicate index labels, and the input must not be mutated.
    df = df.reset_index(drop=True)

    for f in app_feature:
        df['round_log1p_' + f] = np.round(np.log1p(df[f])).astype(int)

    df['ago_5month_total_fee'] = 6 * df['recent_6month_avg_use'] - df['total_account_fee']
    df['ago_5month_avg_fee'] = df['ago_5month_total_fee'] / 5
    df['total_account_fee/ago_5month_total_fee'] = (df['total_account_fee']) / (1 + df['ago_5month_total_fee'])
    df['total_account_fee-ago_5month_avg_fee'] = df['total_account_fee'] - df['ago_5month_avg_fee']

    def make_count_feature(df, col, fea_name):
        # Per-row size of the `col` group, counted over non-null `uid`.
        # transform() writes the count straight back onto each row, so no
        # merge (and no temporary ordering column) is needed, and an already
        # existing `fea_name` column is simply overwritten instead of being
        # picked up as an extra join key.
        df[fea_name] = df.groupby(col)['uid'].transform('count')
        return df

    count_specs = [
        ('top_up_amount', 'count_top_up_amount'),
        ('total_account_fee', 'count_total_account_fee'),
        ('ago_5month_total_fee', 'count_ago_5month_total_fee'),
        ('total_account_fee-ago_5month_avg_fee', 'count_total_account_fee-ago_5month_avg_fee'),
        ('recent_6month_avg_use', 'count_recent_6month_avg_use'),
        (['total_account_fee', 'recent_6month_avg_use'], 'count_total_account_fee_recent_6month_avg_use'),
    ]
    for col, fea_name in count_specs:
        df = make_count_feature(df, col, fea_name)

    # True for non-zero top-ups that are an exact float multiple of 0.998
    # (presumably a whole amount bought at a 0.2% discount -- confirm).
    arr = df['top_up_amount']
    df['is_998_discount'] = ((arr / 0.998) % 1 == 0) & (arr != 0)

    # Age 0 is replaced by NaN; the result is always a float column.
    df['age_0_as_nan'] = np.where(df['age'] == 0, np.nan, df['age'])

    return df
    
def load_df_and_make_features(train_path='../../input/train_dataset.csv',
                              test_path='../../input/test_dataset.csv'):
    """Load the train and test CSVs, stack them and add the engineered features.

    Parameters
    ----------
    train_path : str
        CSV with the 29 input columns followed by the target; its columns are
        renamed positionally to the English names below (last one ``score``).
    test_path : str
        CSV with the same 29 input columns and no target.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, with a ``train`` column (1 for train
        rows, 0 for test rows) and everything ``make_features`` adds. Test rows
        have no ``score`` value.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Positional rename: the order here must match the column order in the files.
    column_names = [
        'uid', 'true_name_flag', 'age', 'uni_student_flag', 'blk_list_flag',
        '4g_unhealth_flag', 'net_age_till_now', 'top_up_month_diff', 'top_up_amount',
        'recent_6month_avg_use', 'total_account_fee', 'curr_month_balance',
        'curr_overdue_flag', 'cost_sensitivity', 'connect_num', 'freq_shopping_flag',
        'recent_3month_shopping_count', 'wanda_flag', 'sam_flag', 'movie_flag',
        'tour_flag', 'sport_flag', 'online_shopping_count', 'express_count',
        'finance_app_count', 'video_app_count', 'flight_count', 'train_count',
        'tour_app_count', 'score',
    ]
    train_df.columns = column_names
    test_df.columns = column_names[:-1]  # the test file has no target column

    train_df['train'] = 1
    test_df['train'] = 0

    # ignore_index gives the stacked frame unique row labels instead of two
    # overlapping 0..n-1 ranges.
    df = pd.concat([train_df, test_df], ignore_index=True)
    return make_features(df)

In [3]:
# Column groups shared by the six feature lists below. Every list is
#   [age column] + account columns (+ engineered columns) + activity columns
#   + app-usage columns + flag columns
# in exactly that order.
_ACCOUNT_COLS = [
    'net_age_till_now',
    'true_name_flag',
    'uni_student_flag',
    '4g_unhealth_flag',
    'top_up_month_diff',
    'top_up_amount',
    'recent_6month_avg_use',
    'total_account_fee',
    'curr_month_balance',
    'cost_sensitivity',
    'total_account_fee-ago_5month_avg_fee',
]

_ENGINEERED_COLS = [
    'ago_5month_total_fee',
    'count_top_up_amount',
    'count_total_account_fee',
    'count_total_account_fee-ago_5month_avg_fee',
    'count_recent_6month_avg_use',
    'count_total_account_fee_recent_6month_avg_use',
    'is_998_discount',
]

_ACTIVITY_COLS = [
    'connect_num',
    'recent_3month_shopping_count',
]

_RAW_APP_COLS = [
    'online_shopping_count',
    'express_count',
    'finance_app_count',
    'video_app_count',
    'flight_count',
    'train_count',
    'tour_app_count',
]

# Rounded-log1p variants of the app-usage counts, in the same order.
_LOG_APP_COLS = ['round_log1p_' + col for col in _RAW_APP_COLS]

_FLAG_COLS = [
    'wanda_flag',
    'sam_flag',
    'movie_flag',
    'tour_flag',
    'sport_flag',
    'freq_shopping_flag',
    'blk_list_flag',
    'curr_overdue_flag',
]


def _feature_list(age_col, app_cols, with_engineered=True):
    """Assemble one feature list (a fresh ``list`` on every call).

    age_col          -- 'age' or 'age_0_as_nan'
    app_cols         -- raw or rounded-log1p app-usage columns
    with_engineered  -- include ago_5month_total_fee, the count_* columns and
                        is_998_discount
    """
    names = [age_col]
    names.extend(_ACCOUNT_COLS)
    if with_engineered:
        names.extend(_ENGINEERED_COLS)
    names.extend(_ACTIVITY_COLS)
    names.extend(app_cols)
    names.extend(_FLAG_COLS)
    return names


# 1/2: raw app counts, with engineered columns (age vs. age_0_as_nan).
feature_name1 = _feature_list('age', _RAW_APP_COLS)

feature_name2 = _feature_list('age_0_as_nan', _RAW_APP_COLS)

# 3/4: rounded-log1p app counts, with engineered columns.
feature_name3 = _feature_list('age', _LOG_APP_COLS)

feature_name4 = _feature_list('age_0_as_nan', _LOG_APP_COLS)

# 5/6: raw app counts, without the engineered columns.
feature_name5 = _feature_list('age', _RAW_APP_COLS, with_engineered=False)

feature_name6 = _feature_list('age_0_as_nan', _RAW_APP_COLS, with_engineered=False)

In [4]:
# Build the combined feature frame once, then split it back into its two parts
# using the `train` marker column (1 = train rows, anything else = test rows).
df = load_df_and_make_features()
train_df = df.loc[df['train'] == 1]
test_df = df.loc[df['train'] != 1]

In [5]:
# Directory handed to every kf_lgbm call below as `output_dir`
# (presumably where the per-model stacking outputs are written -- confirm in utils).
output_dir = 'stacking_files/'

In [6]:
# Model 'gotcha_lgb1': feature_name1 with the "mae_fair" objective.
# kf_lgbm is the project helper imported from utils; the keyword names below
# match LightGBM parameter names and are presumably forwarded -- confirm in utils.
x = train_df[feature_name1]
y = train_df['score'].values
x_test = test_df[feature_name1]

model = kf_lgbm(
    x=x,
    y=y,
    x_test=x_test,
    # run bookkeeping
    name='gotcha_lgb1',
    output_dir=output_dir,
    stratify=True,
    # loss and metric ("mae_fair" is not a built-in LightGBM objective name, so
    # it is presumably resolved in utils together with fair_c / huber_delta)
    objective="mae_fair",
    fair_c=25,
    huber_delta=2,
    eval_metric="mae",
    # boosting schedule
    boosting_type='gbdt',
    learning_rate=0.01,
    n_estimators=8000,
    early_stopping_rounds=80,
    # tree shape and split constraints
    max_depth=5,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # row / column subsampling
    subsample_freq=2,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    # categorical handling
    categorical_feature=['cost_sensitivity'],
    max_cat_to_onehot=4,
)


Training until validation scores don't improve for 80 rounds
[500]	train's l1: 14.4725	train's mae: 14.4725	test's l1: 14.8903	test's mae: 14.8903
[1000]	train's l1: 14.0933	train's mae: 14.0933	test's l1: 14.7451	test's mae: 14.7451
[1500]	train's l1: 13.8385	train's mae: 13.8385	test's l1: 14.6922	test's mae: 14.6922
[2000]	train's l1: 13.6193	train's mae: 13.6193	test's l1: 14.6579	test's mae: 14.6579
Early stopping, best iteration is:
[2330]	train's l1: 13.4874	train's mae: 13.4874	test's l1: 14.6461	test's mae: 14.6461

Training until validation scores don't improve for 80 rounds
[500]	train's l1: 14.5281	train's mae: 14.5281	test's l1: 14.4704	test's mae: 14.4704
[1000]	train's l1: 14.1461	train's mae: 14.1461	test's l1: 14.3129	test's mae: 14.3129
[1500]	train's l1: 13.8896	train's mae: 13.8896	test's l1: 14.2574	test's mae: 14.2574
[2000]	train's l1: 13.6721	train's mae: 13.6721	test's l1: 14.2233	test's mae: 14.2233
Early stopping, best iteration is:
[2343]	train's l1: 13.530

In [7]:
# Model 'gotcha_lgb2': feature_name2 (age_0_as_nan variant) with the
# "fair_huber" objective and fair_c=23.
# kf_lgbm is the project helper imported from utils; the keyword names below
# match LightGBM parameter names and are presumably forwarded -- confirm in utils.
x = train_df[feature_name2]
y = train_df['score'].values
x_test = test_df[feature_name2]

model = kf_lgbm(
    x=x,
    y=y,
    x_test=x_test,
    # run bookkeeping
    name='gotcha_lgb2',
    output_dir=output_dir,
    stratify=True,
    # loss and metric ("fair_huber" is not a built-in LightGBM objective name,
    # so it is presumably resolved in utils together with fair_c / huber_delta)
    objective="fair_huber",
    fair_c=23,
    huber_delta=2,
    eval_metric="mae",
    # boosting schedule
    boosting_type='gbdt',
    learning_rate=0.01,
    n_estimators=8000,
    early_stopping_rounds=80,
    # tree shape and split constraints
    max_depth=5,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # row / column subsampling
    subsample_freq=2,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    # categorical handling
    categorical_feature=['cost_sensitivity'],
    max_cat_to_onehot=4,
)


Training until validation scores don't improve for 80 rounds
[500]	train's l1: 14.4934	train's mae: 14.4934	test's l1: 14.9063	test's mae: 14.9063
[1000]	train's l1: 14.0941	train's mae: 14.0941	test's l1: 14.7418	test's mae: 14.7418
[1500]	train's l1: 13.8433	train's mae: 13.8433	test's l1: 14.6956	test's mae: 14.6956
[2000]	train's l1: 13.6329	train's mae: 13.6329	test's l1: 14.6673	test's mae: 14.6673
[2500]	train's l1: 13.4412	train's mae: 13.4412	test's l1: 14.6534	test's mae: 14.6534
[3000]	train's l1: 13.27	train's mae: 13.27	test's l1: 14.6433	test's mae: 14.6433
[3500]	train's l1: 13.109	train's mae: 13.109	test's l1: 14.634	test's mae: 14.634
Early stopping, best iteration is:
[3457]	train's l1: 13.1229	train's mae: 13.1229	test's l1: 14.6326	test's mae: 14.6326

Training until validation scores don't improve for 80 rounds
[500]	train's l1: 14.5433	train's mae: 14.5433	test's l1: 14.5045	test's mae: 14.5045
[1000]	train's l1: 14.1469	train's mae: 14.1469	test's l1: 14.3308	t

0.06396882561504774	0.06581746686266536	0.06402136341163739	0.06415931021741644	0.06370541187477802	0.06372033829942685	0.06381814747236468	0.06515569503247276	0.06284134519722118	0.0646712074120622
min score: 0.062841
max score: 0.065817
median score: 0.063995
mean score: 0.064188
[601.17341606 524.38687437 670.3201957  675.70379786 657.79585174
 614.60295649 640.71176115 569.18260506 674.53827234 589.78891582]


In [8]:
# Model 'gotcha_lgb3': feature_name3 (rounded-log1p app counts) with the
# "mae_fair" objective.
# kf_lgbm is the project helper imported from utils; the keyword names below
# match LightGBM parameter names and are presumably forwarded -- confirm in utils.
x = train_df[feature_name3]
y = train_df['score'].values
x_test = test_df[feature_name3]

model = kf_lgbm(
    x=x,
    y=y,
    x_test=x_test,
    # run bookkeeping
    name='gotcha_lgb3',
    output_dir=output_dir,
    stratify=True,
    # loss and metric ("mae_fair" is not a built-in LightGBM objective name, so
    # it is presumably resolved in utils together with fair_c / huber_delta)
    objective="mae_fair",
    fair_c=25,
    huber_delta=2,
    eval_metric="mae",
    # boosting schedule
    boosting_type='gbdt',
    learning_rate=0.01,
    n_estimators=8000,
    early_stopping_rounds=80,
    # tree shape and split constraints
    max_depth=5,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # row / column subsampling
    subsample_freq=2,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    # categorical handling
    categorical_feature=['cost_sensitivity'],
    max_cat_to_onehot=4,
)


Training until validation scores don't improve for 80 rounds
[500]	train's l1: 14.4936	train's mae: 14.4936	test's l1: 14.9001	test's mae: 14.9001
[1000]	train's l1: 14.1261	train's mae: 14.1261	test's l1: 14.7475	test's mae: 14.7475
[1500]	train's l1: 13.878	train's mae: 13.878	test's l1: 14.6923	test's mae: 14.6923
[2000]	train's l1: 13.6679	train's mae: 13.6679	test's l1: 14.6573	test's mae: 14.6573
Early stopping, best iteration is:
[2313]	train's l1: 13.549	train's mae: 13.549	test's l1: 14.6482	test's mae: 14.6482

Training until validation scores don't improve for 80 rounds
[500]	train's l1: 14.544	train's mae: 14.544	test's l1: 14.4744	test's mae: 14.4744
[1000]	train's l1: 14.1713	train's mae: 14.1713	test's l1: 14.3281	test's mae: 14.3281
[1500]	train's l1: 13.9221	train's mae: 13.9221	test's l1: 14.2696	test's mae: 14.2696
[2000]	train's l1: 13.7147	train's mae: 13.7147	test's l1: 14.2419	test's mae: 14.2419
[2500]	train's l1: 13.5208	train's mae: 13.5208	test's l1: 14.2262

In [9]:
# Model 'gotcha_lgb4': feature_name4 (age_0_as_nan + rounded-log1p app counts)
# with the "fair_huber" objective and fair_c=23.
# kf_lgbm is the project helper imported from utils; the keyword names below
# match LightGBM parameter names and are presumably forwarded -- confirm in utils.
x = train_df[feature_name4]
y = train_df['score'].values
x_test = test_df[feature_name4]

model = kf_lgbm(
    x=x,
    y=y,
    x_test=x_test,
    # run bookkeeping
    name='gotcha_lgb4',
    output_dir=output_dir,
    stratify=True,
    # loss and metric ("fair_huber" is not a built-in LightGBM objective name,
    # so it is presumably resolved in utils together with fair_c / huber_delta)
    objective="fair_huber",
    fair_c=23,
    huber_delta=2,
    eval_metric="mae",
    # boosting schedule
    boosting_type='gbdt',
    learning_rate=0.01,
    n_estimators=8000,
    early_stopping_rounds=80,
    # tree shape and split constraints
    max_depth=5,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # row / column subsampling
    subsample_freq=2,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    # categorical handling
    categorical_feature=['cost_sensitivity'],
    max_cat_to_onehot=4,
)


Training until validation scores don't improve for 80 rounds
[500]	train's l1: 14.5096	train's mae: 14.5096	test's l1: 14.9051	test's mae: 14.9051
[1000]	train's l1: 14.1253	train's mae: 14.1253	test's l1: 14.7341	test's mae: 14.7341
[1500]	train's l1: 13.8841	train's mae: 13.8841	test's l1: 14.6797	test's mae: 14.6797
[2000]	train's l1: 13.6788	train's mae: 13.6788	test's l1: 14.6448	test's mae: 14.6448
[2500]	train's l1: 13.4972	train's mae: 13.4972	test's l1: 14.6336	test's mae: 14.6336
Early stopping, best iteration is:
[2874]	train's l1: 13.3763	train's mae: 13.3763	test's l1: 14.6226	test's mae: 14.6226

Training until validation scores don't improve for 80 rounds
[500]	train's l1: 14.5609	train's mae: 14.5609	test's l1: 14.501	test's mae: 14.501
[1000]	train's l1: 14.1774	train's mae: 14.1774	test's l1: 14.3404	test's mae: 14.3404
[1500]	train's l1: 13.9377	train's mae: 13.9377	test's l1: 14.287	test's mae: 14.287
[2000]	train's l1: 13.7346	train's mae: 13.7346	test's l1: 14.25

In [10]:
# Model 'gotcha_lgb5': feature_name6 with the "fair_huber" objective and fair_c=23.
# NOTE(review): this run is saved as 'gotcha_lgb5' but trains on feature_name6;
# feature_name5 is not used by any cell visible here -- confirm this is intended.
# kf_lgbm is the project helper imported from utils; the keyword names below
# match LightGBM parameter names and are presumably forwarded -- confirm in utils.
x = train_df[feature_name6]
y = train_df['score'].values
x_test = test_df[feature_name6]

model = kf_lgbm(
    x=x,
    y=y,
    x_test=x_test,
    # run bookkeeping
    name='gotcha_lgb5',
    output_dir=output_dir,
    stratify=True,
    # loss and metric ("fair_huber" is not a built-in LightGBM objective name,
    # so it is presumably resolved in utils together with fair_c / huber_delta)
    objective="fair_huber",
    fair_c=23,
    huber_delta=2,
    eval_metric="mae",
    # boosting schedule
    boosting_type='gbdt',
    learning_rate=0.01,
    n_estimators=8000,
    early_stopping_rounds=80,
    # tree shape and split constraints
    max_depth=5,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # row / column subsampling
    subsample_freq=2,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    # categorical handling
    categorical_feature=['cost_sensitivity'],
    max_cat_to_onehot=4,
)


Training until validation scores don't improve for 80 rounds
[500]	train's l1: 14.5287	train's mae: 14.5287	test's l1: 14.9158	test's mae: 14.9158
[1000]	train's l1: 14.1462	train's mae: 14.1462	test's l1: 14.7504	test's mae: 14.7504
[1500]	train's l1: 13.91	train's mae: 13.91	test's l1: 14.6932	test's mae: 14.6932
[2000]	train's l1: 13.7114	train's mae: 13.7114	test's l1: 14.6658	test's mae: 14.6658
[2500]	train's l1: 13.5365	train's mae: 13.5365	test's l1: 14.6477	test's mae: 14.6477
Early stopping, best iteration is:
[2560]	train's l1: 13.518	train's mae: 13.518	test's l1: 14.6457	test's mae: 14.6457

Training until validation scores don't improve for 80 rounds
[500]	train's l1: 14.5613	train's mae: 14.5613	test's l1: 14.5149	test's mae: 14.5149
[1000]	train's l1: 14.181	train's mae: 14.181	test's l1: 14.3388	test's mae: 14.3388
[1500]	train's l1: 13.9497	train's mae: 13.9497	test's l1: 14.2786	test's mae: 14.2786
[2000]	train's l1: 13.751	train's mae: 13.751	test's l1: 14.2465	tes

In [11]:
# Inputs for the transformed-target model: feature_name1 columns, raw score as target.
x = train_df[feature_name1]
y = train_df['score'].values
x_test = test_df[feature_name1]

def fn_transform(x):
    """Map a target value (scalar or array) to ``1.005 ** x``.

    Inverse of ``fn_reverse_transform``.
    """
    base = 1.005
    return np.power(base, x)
def fn_reverse_transform(x):
    """Undo ``fn_transform``: return the base-1.005 logarithm of ``x``.

    Computed as ``log(x) / log(1.005)``; works on scalars and arrays.
    """
    base = 1.005
    return np.log(x) / np.log(base)

# Model 'gotcha_lgb6': train on the 1.005**score target with the "huber"
# objective. fn_reverse_transform is handed to kf_lgbm, presumably so that
# predictions / the reported 'mae' are mapped back to the score scale (the
# training log prints 'l1' and 'mae' with different values) -- confirm in utils.
y = fn_transform(y)

model = kf_lgbm(
    x=x,
    y=y,
    x_test=x_test,
    fn_reverse_transform=fn_reverse_transform,
    # run bookkeeping
    name='gotcha_lgb6',
    output_dir=output_dir,
    stratify=True,
    split_seed=8888,
    verbose=200,
    # loss and metric
    objective="huber",
    fair_c=25,
    huber_delta=2,
    eval_metric="mae",
    # boosting schedule
    boosting_type='gbdt',
    learning_rate=0.03,
    n_estimators=8000,
    early_stopping_rounds=80,
    # tree shape and split constraints
    max_depth=7,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # row / column subsampling
    subsample_freq=2,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    # categorical handling
    categorical_feature=['cost_sensitivity'],
    max_cat_to_onehot=4,
)


Training until validation scores don't improve for 80 rounds
[200]	train's l1: 2.01684	train's mae: 19.4369	test's l1: 2.03953	test's mae: 19.6309
[400]	train's l1: 1.61909	train's mae: 15.2966	test's l1: 1.65438	test's mae: 15.6069
[600]	train's l1: 1.54196	train's mae: 14.4403	test's l1: 1.59319	test's mae: 14.9126
[800]	train's l1: 1.51407	train's mae: 14.1602	test's l1: 1.58069	test's mae: 14.7824
[1000]	train's l1: 1.49478	train's mae: 13.9772	test's l1: 1.5761	test's mae: 14.734
[1200]	train's l1: 1.48004	train's mae: 13.8396	test's l1: 1.57325	test's mae: 14.7054
[1400]	train's l1: 1.46965	train's mae: 13.7432	test's l1: 1.57157	test's mae: 14.689
[1600]	train's l1: 1.46331	train's mae: 13.6858	test's l1: 1.57097	test's mae: 14.683
[1800]	train's l1: 1.45823	train's mae: 13.6385	test's l1: 1.57035	test's mae: 14.6772
Early stopping, best iteration is:
[1872]	train's l1: 1.45683	train's mae: 13.6254	test's l1: 1.57029	test's mae: 14.6764

Training until validation scores don't i

[200]	train's l1: 2.01319	train's mae: 19.4051	test's l1: 2.066	test's mae: 19.8706
[400]	train's l1: 1.61511	train's mae: 15.2576	test's l1: 1.69149	test's mae: 15.9671
[600]	train's l1: 1.53801	train's mae: 14.4009	test's l1: 1.6288	test's mae: 15.2658
[800]	train's l1: 1.51041	train's mae: 14.1242	test's l1: 1.61277	test's mae: 15.0974
[1000]	train's l1: 1.4917	train's mae: 13.9484	test's l1: 1.60609	test's mae: 15.0313
[1200]	train's l1: 1.47719	train's mae: 13.8125	test's l1: 1.60278	test's mae: 14.9991
[1400]	train's l1: 1.46719	train's mae: 13.7201	test's l1: 1.60188	test's mae: 14.9891
[1600]	train's l1: 1.45988	train's mae: 13.6524	test's l1: 1.6013	test's mae: 14.984
[1800]	train's l1: 1.45419	train's mae: 13.6003	test's l1: 1.60096	test's mae: 14.9807
[2000]	train's l1: 1.45154	train's mae: 13.5762	test's l1: 1.60065	test's mae: 14.9778
Early stopping, best iteration is:
[1925]	train's l1: 1.45154	train's mae: 13.5762	test's l1: 1.60065	test's mae: 14.9778

Training until va