In [1]:
# https://github.com/PanJianning/DCIC-2019-Credit-2th-Place/tree/master/code

In [2]:
import os

# Cap the number of OpenMP worker threads. The variable OpenMP runtimes read is
# OMP_NUM_THREADS (the previous key, NUM_OMP_THREADS, is not a recognised name).
# It is set here, ahead of the numpy / lightgbm imports below, so it is already
# in the environment when those libraries are loaded.
os.environ['OMP_NUM_THREADS'] = "4"

import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import time
from sklearn.linear_model import HuberRegressor
import sklearn.ensemble as tree_model
from tqdm import tqdm
import datetime
# Show up to 100 columns when a DataFrame is displayed. The full option name is
# spelled out: abbreviated names only resolve while they match exactly one option.
pd.set_option('display.max_columns', 100)
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
from utils import make_dir, score, timer, kf_lgbm, kf_xgbm, kf_ctbm, kf_sklearn

In [3]:
def make_features(df):
    """Return a copy of ``df`` extended with the engineered feature columns.

    Columns added, in this order:

    * ``round_log1p_<col>`` for each of the seven app-usage counters:
      ``log1p`` of the value, rounded to the nearest integer.
    * ``ago_5month_total_fee`` = ``6 * recent_6month_avg_use - total_account_fee``,
      ``ago_5month_avg_fee`` = that total divided by 5, plus
      ``total_account_fee/ago_5month_total_fee`` and
      ``total_account_fee-ago_5month_avg_fee`` built from them.
    * ``count_*``: for every row, the number of rows (with a non-null ``uid``)
      that share the row's value in the given key column(s).
    * ``is_998_discount``: True when ``top_up_amount`` is non-zero and
      ``top_up_amount / 0.998`` has no fractional part.
    * ``age_0_as_nan``: ``age`` with zeros replaced by NaN.

    Args:
        df: DataFrame containing at least ``uid``, ``age``, ``top_up_amount``,
            ``recent_6month_avg_use``, ``total_account_fee`` and the seven
            app-usage counter columns listed in ``app_feature``.

    Returns:
        A new DataFrame with the same rows in the same order, a fresh
        ``0..n-1`` index and the columns above appended. The input frame is
        not modified.
    """
    app_feature = [
        'online_shopping_count',
        'express_count',
        'finance_app_count',
        'video_app_count',
        'flight_count',
        'train_count',
        'tour_app_count'
    ]

    # reset_index returns a new frame, so the columns added below never leak
    # into the caller's object; it also removes duplicate index labels (the
    # caller passes a concatenation of two frames).
    df = df.reset_index(drop=True)

    for f in app_feature:
        df['round_log1p_' + f] = np.round(np.log1p(df[f])).astype(int)

    df['ago_5month_total_fee'] = 6 * df['recent_6month_avg_use'] - df['total_account_fee']
    df['ago_5month_avg_fee'] = df['ago_5month_total_fee'] / 5
    # The +1 in the denominator avoids a division by zero when the total is 0.
    df['total_account_fee/ago_5month_total_fee'] = (df['total_account_fee']) / (1 + df['ago_5month_total_fee'])
    df['total_account_fee-ago_5month_avg_fee'] = df['total_account_fee'] - df['ago_5month_avg_fee']

    def make_count_feature(df, col, fea_name):
        """Append ``fea_name``: how many rows share each row's value of ``col``."""
        counts = df.groupby(col)['uid'].count().rename(fea_name).reset_index()
        # Explicit left join on the key column(s): it keeps every input row in
        # its original order. groupby drops NaN keys, so with the previous
        # implicit inner join any row whose key was missing disappeared from
        # the data set; such rows now simply get a NaN count.
        return df.merge(counts, on=col, how='left')

    df = make_count_feature(df, 'top_up_amount', 'count_top_up_amount')
    df = make_count_feature(df, 'total_account_fee', 'count_total_account_fee')
    df = make_count_feature(df, 'ago_5month_total_fee', 'count_ago_5month_total_fee')
    df = make_count_feature(df, 'total_account_fee-ago_5month_avg_fee', 'count_total_account_fee-ago_5month_avg_fee')
    df = make_count_feature(df, 'recent_6month_avg_use', 'count_recent_6month_avg_use')
    df = make_count_feature(df, ['total_account_fee', 'recent_6month_avg_use'], 'count_total_account_fee_recent_6month_avg_use')

    arr = df['top_up_amount']
    # NOTE(review): exact floating-point comparison; left unchanged on purpose
    # so that the produced feature values stay the same.
    df['is_998_discount'] = ((arr / 0.998) % 1 == 0) & (arr != 0)

    df['age_0_as_nan'] = np.where(df['age'] == 0, np.nan, df['age'])

    return df
    
def load_df_and_make_features(train_path='../../input/train_dataset.csv',
                              test_path='../../input/test_dataset.csv'):
    """Read the train and test CSV files and return one feature-engineered frame.

    Both files get English column names, a ``train`` marker column (1 for rows
    from the train file, 0 for rows from the test file), are stacked with the
    train rows first, and are passed through ``make_features``.

    Args:
        train_path: Path of the train CSV. It must have exactly the 30 columns
            named below, in that order; the last one is ``score``.
        test_path: Path of the test CSV. It must have the same columns as the
            train file except the trailing ``score``.

    Returns:
        The DataFrame returned by ``make_features`` for the stacked data.
        ``score`` is NaN for the test rows because the test file has no such
        column.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    train_df.columns = ['uid','true_name_flag','age','uni_student_flag','blk_list_flag',
                        '4g_unhealth_flag','net_age_till_now','top_up_month_diff','top_up_amount',
                        'recent_6month_avg_use','total_account_fee','curr_month_balance',
                        'curr_overdue_flag','cost_sensitivity','connect_num','freq_shopping_flag',
                        'recent_3month_shopping_count','wanda_flag','sam_flag','movie_flag',
                        'tour_flag','sport_flag','online_shopping_count','express_count',
                        'finance_app_count','video_app_count','flight_count','train_count',
                        'tour_app_count','score']
    # The test file carries every column except the trailing 'score'.
    test_df.columns = train_df.columns[:-1]
    train_df['train'] = 1
    test_df['train'] = 0
    # ignore_index gives the stacked frame unique 0..n-1 labels instead of
    # repeating each file's own row numbers.
    df = pd.concat([train_df, test_df], ignore_index=True)
    df = make_features(df)
    return df

In [4]:
# The six feature lists below are assembled from shared column groups. They
# differ only in (a) which age column leads the list, (b) whether the app-usage
# counters are used raw or as their 'round_log1p_' versions, and (c) whether the
# engineered fee/count columns are included.

# Columns that follow the age column in every list.
_account_columns = [
    'net_age_till_now',
    'true_name_flag',
    'uni_student_flag',
    '4g_unhealth_flag',
    'top_up_month_diff',
    'top_up_amount',
    'recent_6month_avg_use',
    'total_account_fee',
    'curr_month_balance',
    'cost_sensitivity',
    'total_account_fee-ago_5month_avg_fee',
]

# Engineered columns present in lists 1-4 and left out of lists 5-6.
_engineered_columns = [
    'ago_5month_total_fee',
    'count_top_up_amount',
    'count_total_account_fee',
    'count_total_account_fee-ago_5month_avg_fee',
    'count_recent_6month_avg_use',
    'count_total_account_fee_recent_6month_avg_use',
    'is_998_discount',
]

_activity_columns = [
    'connect_num',
    'recent_3month_shopping_count',
]

# Raw app-usage counters (lists 1, 2, 5, 6) ...
_app_count_columns = [
    'online_shopping_count',
    'express_count',
    'finance_app_count',
    'video_app_count',
    'flight_count',
    'train_count',
    'tour_app_count',
]
# ... and their rounded-log1p counterparts (lists 3, 4).
_log_app_count_columns = ['round_log1p_' + name for name in _app_count_columns]

_flag_columns = [
    'wanda_flag',
    'sam_flag',
    'movie_flag',
    'tour_flag',
    'sport_flag',
    'freq_shopping_flag',
    'blk_list_flag',
    'curr_overdue_flag',
]

# Raw age, engineered columns, raw app counters.
feature_name1 = (['age'] + _account_columns + _engineered_columns
                 + _activity_columns + _app_count_columns + _flag_columns)

# As list 1 with 'age_0_as_nan' in place of 'age'.
feature_name2 = (['age_0_as_nan'] + _account_columns + _engineered_columns
                 + _activity_columns + _app_count_columns + _flag_columns)

# As list 1 with the rounded-log1p app counters.
feature_name3 = (['age'] + _account_columns + _engineered_columns
                 + _activity_columns + _log_app_count_columns + _flag_columns)

# As list 3 with 'age_0_as_nan' in place of 'age'.
feature_name4 = (['age_0_as_nan'] + _account_columns + _engineered_columns
                 + _activity_columns + _log_app_count_columns + _flag_columns)

# As list 1 without the engineered columns.
feature_name5 = (['age'] + _account_columns
                 + _activity_columns + _app_count_columns + _flag_columns)

# As list 5 with 'age_0_as_nan' in place of 'age'.
feature_name6 = (['age_0_as_nan'] + _account_columns
                 + _activity_columns + _app_count_columns + _flag_columns)

NameError: name 'train_data' is not defined

In [None]:
# Build the combined, feature-engineered frame once, then split it back into
# the rows that came from the train file and the rows that came from the test
# file, using the 'train' marker column.
df = load_df_and_make_features()
is_train_row = df['train'] == 1
train_df = df[is_train_row]
test_df = df[~is_train_row]

In [None]:
# Directory name handed to every kf_lgbm call below through its output_dir argument.
output_dir = 'stacking_files/'

In [None]:
# Model 'gotcha_lgb1': feature list 1, objective "mae_fair".
x = train_df[feature_name1]
y = train_df['score'].values
x_test = test_df[feature_name1]

# Keyword arguments forwarded unchanged to kf_lgbm (imported from utils).
lgb_params = dict(
    learning_rate=0.01,
    stratify=True,
    min_split_gain=1,
    categorical_feature=['cost_sensitivity'],
    boosting_type='gbdt',
    early_stopping_rounds=80,
    fair_c=25,
    huber_delta=2,
    max_cat_to_onehot=4,
    objective="mae_fair",
    eval_metric="mae",
    subsample_freq=2,
    min_child_samples=20,
    num_leaves=31,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    max_depth=5,
    output_dir=output_dir,
    name='gotcha_lgb1',
    n_estimators=8000,
)
model = kf_lgbm(x=x, y=y, x_test=x_test, **lgb_params)

In [None]:
# Model 'gotcha_lgb2': feature list 2, objective "fair_huber".
x = train_df[feature_name2]
y = train_df['score'].values
x_test = test_df[feature_name2]

# Keyword arguments forwarded unchanged to kf_lgbm (imported from utils).
lgb_params = dict(
    learning_rate=0.01,
    stratify=True,
    min_split_gain=1,
    categorical_feature=['cost_sensitivity'],
    boosting_type='gbdt',
    early_stopping_rounds=80,
    fair_c=23,
    huber_delta=2,
    max_cat_to_onehot=4,
    objective="fair_huber",
    eval_metric="mae",
    subsample_freq=2,
    min_child_samples=20,
    num_leaves=31,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    max_depth=5,
    output_dir=output_dir,
    name='gotcha_lgb2',
    n_estimators=8000,
)
model = kf_lgbm(x=x, y=y, x_test=x_test, **lgb_params)

In [None]:
# Model 'gotcha_lgb3': feature list 3, objective "mae_fair".
x = train_df[feature_name3]
y = train_df['score'].values
x_test = test_df[feature_name3]

# Keyword arguments forwarded unchanged to kf_lgbm (imported from utils).
lgb_params = dict(
    learning_rate=0.01,
    stratify=True,
    min_split_gain=1,
    categorical_feature=['cost_sensitivity'],
    boosting_type='gbdt',
    early_stopping_rounds=80,
    fair_c=25,
    huber_delta=2,
    max_cat_to_onehot=4,
    objective="mae_fair",
    eval_metric="mae",
    subsample_freq=2,
    min_child_samples=20,
    num_leaves=31,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    max_depth=5,
    output_dir=output_dir,
    name='gotcha_lgb3',
    n_estimators=8000,
)
model = kf_lgbm(x=x, y=y, x_test=x_test, **lgb_params)

In [None]:
# Model 'gotcha_lgb4': feature list 4, objective "fair_huber".
x = train_df[feature_name4]
y = train_df['score'].values
x_test = test_df[feature_name4]

# Keyword arguments forwarded unchanged to kf_lgbm (imported from utils).
lgb_params = dict(
    learning_rate=0.01,
    stratify=True,
    min_split_gain=1,
    categorical_feature=['cost_sensitivity'],
    boosting_type='gbdt',
    early_stopping_rounds=80,
    fair_c=23,
    huber_delta=2,
    max_cat_to_onehot=4,
    objective="fair_huber",
    eval_metric="mae",
    subsample_freq=2,
    min_child_samples=20,
    num_leaves=31,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    max_depth=5,
    output_dir=output_dir,
    name='gotcha_lgb4',
    n_estimators=8000,
)
model = kf_lgbm(x=x, y=y, x_test=x_test, **lgb_params)

In [None]:
# Model 'gotcha_lgb5': feature list 6, objective "fair_huber".
# NOTE(review): the model is numbered 5 but trains on feature_name6, and
# feature_name5 is not used by any cell in this notebook - confirm this is
# intended before renaming anything (the name is passed to kf_lgbm).
x = train_df[feature_name6]
y = train_df['score'].values
x_test = test_df[feature_name6]

# Keyword arguments forwarded unchanged to kf_lgbm (imported from utils).
lgb_params = dict(
    learning_rate=0.01,
    stratify=True,
    min_split_gain=1,
    categorical_feature=['cost_sensitivity'],
    boosting_type='gbdt',
    early_stopping_rounds=80,
    fair_c=23,
    huber_delta=2,
    max_cat_to_onehot=4,
    objective="fair_huber",
    eval_metric="mae",
    subsample_freq=2,
    min_child_samples=20,
    num_leaves=31,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    max_depth=5,
    output_dir=output_dir,
    name='gotcha_lgb5',
    n_estimators=8000,
)
model = kf_lgbm(x=x, y=y, x_test=x_test, **lgb_params)

In [None]:
# Inputs for model 'gotcha_lgb6': feature list 1 and the raw score target
# (the target is transformed further down in this cell).
x = train_df[feature_name1]
y = train_df['score'].values
x_test = test_df[feature_name1]

def fn_transform(x):
    """Return 1.005 raised to the power ``x`` (element-wise for arrays)."""
    base = 1.005
    return np.power(base, x)
def fn_reverse_transform(x):
    """Invert ``fn_transform``: return the base-1.005 logarithm of ``x``."""
    base = 1.005
    return np.log(x) / np.log(base)

# Model 'gotcha_lgb6': trained on the transformed target; fn_reverse_transform
# is handed to kf_lgbm alongside the other keyword arguments.
y = fn_transform(y)

# Keyword arguments forwarded unchanged to kf_lgbm (imported from utils).
lgb_params = dict(
    learning_rate=0.03,
    fn_reverse_transform=fn_reverse_transform,
    stratify=True,
    split_seed=8888,
    min_split_gain=1,
    categorical_feature=['cost_sensitivity'],
    boosting_type='gbdt',
    early_stopping_rounds=80,
    fair_c=25,
    huber_delta=2,
    max_cat_to_onehot=4,
    objective="huber",
    eval_metric="mae",
    subsample_freq=2,
    min_child_samples=20,
    num_leaves=31,
    bagging_fraction=0.8,
    feature_fraction=0.5,
    max_depth=7,
    output_dir=output_dir,
    name='gotcha_lgb6',
    verbose=200,
    n_estimators=8000,
)
model = kf_lgbm(x=x, y=y, x_test=x_test, **lgb_params)