In [1]:
import os

# Cap the OpenMP thread pool at 4. The variable OpenMP runtimes read is
# OMP_NUM_THREADS; the previous key 'NUM_OMP_THREADS' is not a recognised name,
# so the cap was silently ignored. This stays ahead of the numeric imports
# below so the setting is in place before those libraries are loaded.
os.environ['OMP_NUM_THREADS'] = "4"

import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
import time
from sklearn.linear_model import HuberRegressor
import sklearn.ensemble as tree_model
from tqdm import tqdm
import datetime
pd.set_option('display.max_column',100)
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2
from utils import make_dir, score, timer, kf_lgbm, kf_xgbm, kf_ctbm, kf_sklearn

In [2]:
def make_features(df):
    """Return a copy of ``df`` with the engineered feature columns appended.

    Columns added, in this order:

    * ``round_log1p<col>`` for each of the seven app-usage count columns:
      ``round(log1p(value))`` cast to int.
    * ``前五个月消费总费用`` / ``前五个月消费平均费用``: spend over the five months
      before the current one, derived as ``6 * six_month_average - current_bill``,
      and that total divided by 5.
    * ``当月费用/前五个月消费平均费用`` and ``当月费用-前五个月消费平均费用``: ratio
      (with +1 in the denominator) and difference of the current bill against
      that five-month average.
    * ``count_*``: for each key column (or pair of columns), the number of
      non-null ``用户编码`` values among rows sharing the same key.
    * ``是否998折``: True when the last payment amount is non-zero and
      ``amount / 0.998`` is a whole number.
    * ``年龄_0_as_nan``: ``用户年龄`` with 0 replaced by NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns referenced above, including ``用户编码``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows in the same order and a fresh
        ``0..n-1`` index.
    """
    # reset_index returns a new frame, so the caller's object is never mutated.
    # The fresh RangeIndex also removes duplicate labels that a plain
    # pd.concat of two frames leaves behind.
    df = df.reset_index(drop=True)

    app_feature = [
        '当月网购类应用使用次数',
        '当月物流快递类应用使用次数',
        '当月金融理财类应用使用总次数',
        '当月视频播放类应用使用次数',
        '当月飞机类应用使用次数',
        '当月火车类应用使用次数',
        '当月旅游资讯类应用使用次数',
    ]

    # Coarse integer buckets of the app-usage counts.
    for f in app_feature:
        df['round_log1p'+f] = np.round(np.log1p(df[f])).astype(int)

    df['前五个月消费总费用'] = 6*df['用户近6个月平均消费值（元）'] - df['用户账单当月总费用（元）']
    df['前五个月消费平均费用'] = df['前五个月消费总费用'] / 5
    # +1 in the denominator keeps the ratio finite when the average is 0.
    df['当月费用/前五个月消费平均费用'] = (df['用户账单当月总费用（元）']) \
                        / (1+df['前五个月消费平均费用'])
    df['当月费用-前五个月消费平均费用'] = df['用户账单当月总费用（元）'] - df['前五个月消费平均费用']

    def make_count_feature(df, col, fea_name):
        # transform('count') broadcasts each group's size back onto its rows,
        # so row order and row count are untouched. The earlier groupby + inner
        # merge needed a helper 'idx' column to restore order and dropped any
        # row whose key was missing; such rows now stay, with a missing count.
        df[fea_name] = df.groupby(col)['用户编码'].transform('count')
        return df

    df = make_count_feature(df, '缴费用户最近一次缴费金额（元）','count_缴费')
    df = make_count_feature(df, '用户账单当月总费用（元）','count_当月费用')
    df = make_count_feature(df, '前五个月消费总费用', 'count_总费用')
    df = make_count_feature(df, '当月费用-前五个月消费平均费用', 'count_费用差')
    df = make_count_feature(df, '用户近6个月平均消费值（元）', 'count_平均费用')
    df = make_count_feature(df, ['用户账单当月总费用（元）','用户近6个月平均消费值（元）'],
                            'count_当月费用_平均费用')

    arr = df['缴费用户最近一次缴费金额（元）']
    # NOTE(review): exact float comparison; amounts whose quotient is off by a
    # rounding error are not flagged. Left unchanged to keep the feature stable.
    df['是否998折'] = ((arr/0.998)%1==0)&(arr!=0)

    # Age 0 is recoded as NaN; np.where broadcasts the scalar NaN.
    df['年龄_0_as_nan'] = np.where(df['用户年龄']==0, np.nan, df['用户年龄'])

    return df
    
def load_df_and_make_features(train_path='../input/train_dataset.csv',
                              test_path='../input/test_dataset.csv'):
    """Read the train and test CSVs, stack them, and add the engineered features.

    The two files are concatenated before ``make_features`` runs, so the
    count-based features are computed over train and test rows together.
    A ``train`` column (1 for train rows, 0 for test rows) is added so the
    caller can split the result again.

    Parameters
    ----------
    train_path, test_path : str
        CSV locations; the defaults are the paths this notebook has always used.

    Returns
    -------
    pandas.DataFrame
        The combined frame returned by ``make_features``.
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    train_df['train'] = 1
    test_df['train'] = 0
    # ignore_index avoids carrying duplicate row labels (0..n-1 twice) into
    # the feature code.
    df = pd.concat([train_df, test_df], ignore_index=True)
    return make_features(df)

In [3]:
# The six feature sets differ only in three choices:
#   * age column: raw age vs. age with 0 recoded as NaN,
#   * whether the count / engineered columns are included,
#   * app-usage columns: raw counts vs. their round(log1p) buckets.
# They are assembled from the shared column groups below; '+' on lists builds
# a new list each time, so the six results are independent objects.

_AGE_RAW = ['用户年龄']
_AGE_ZERO_AS_NAN = ['年龄_0_as_nan']

_ACCOUNT_COLS = [
    '用户网龄（月）',
    '用户实名制是否通过核实',
    '是否大学生客户',
    '是否4G不健康客户',
    '用户最近一次缴费距今时长（月）',
    '缴费用户最近一次缴费金额（元）',
    '用户近6个月平均消费值（元）',
    '用户账单当月总费用（元）',
    '用户当月账户余额（元）',
    '用户话费敏感度',
    '当月费用-前五个月消费平均费用',
]

_ENGINEERED_COLS = [
    '前五个月消费总费用',
    'count_缴费',
    'count_当月费用',
    'count_费用差',
    'count_平均费用',
    'count_当月费用_平均费用',
    '是否998折',
]

_ACTIVITY_COLS = [
    '当月通话交往圈人数',
    '近三个月月均商场出现次数',
]

_APP_COLS = [
    '当月网购类应用使用次数',
    '当月物流快递类应用使用次数',
    '当月金融理财类应用使用总次数',
    '当月视频播放类应用使用次数',
    '当月飞机类应用使用次数',
    '当月火车类应用使用次数',
    '当月旅游资讯类应用使用次数',
]
_APP_LOG_COLS = ['round_log1p' + app_col for app_col in _APP_COLS]

_FLAG_COLS = [
    '当月是否逛过福州仓山万达',
    '当月是否到过福州山姆会员店',
    '当月是否看电影',
    '当月是否景点游览',
    '当月是否体育场馆消费',
    '是否经常逛商场的人',
    '是否黑名单客户',
    '缴费用户当前是否欠费缴费',
]

# Raw age, engineered columns, raw app counts.
feature_name1 = (_AGE_RAW + _ACCOUNT_COLS + _ENGINEERED_COLS
                 + _ACTIVITY_COLS + _APP_COLS + _FLAG_COLS)

# As feature_name1, with age 0 recoded as NaN.
feature_name2 = (_AGE_ZERO_AS_NAN + _ACCOUNT_COLS + _ENGINEERED_COLS
                 + _ACTIVITY_COLS + _APP_COLS + _FLAG_COLS)

# Raw age, engineered columns, log-bucketed app counts.
feature_name3 = (_AGE_RAW + _ACCOUNT_COLS + _ENGINEERED_COLS
                 + _ACTIVITY_COLS + _APP_LOG_COLS + _FLAG_COLS)

# As feature_name3, with age 0 recoded as NaN.
feature_name4 = (_AGE_ZERO_AS_NAN + _ACCOUNT_COLS + _ENGINEERED_COLS
                 + _ACTIVITY_COLS + _APP_LOG_COLS + _FLAG_COLS)

# Raw age, no engineered columns, raw app counts.
feature_name5 = (_AGE_RAW + _ACCOUNT_COLS
                 + _ACTIVITY_COLS + _APP_COLS + _FLAG_COLS)

# As feature_name5, with age 0 recoded as NaN.
feature_name6 = (_AGE_ZERO_AS_NAN + _ACCOUNT_COLS
                 + _ACTIVITY_COLS + _APP_COLS + _FLAG_COLS)

In [4]:
# Build the combined train+test feature frame once, then split it back apart
# using the 'train' flag set by load_df_and_make_features (1 = train row).
df = load_df_and_make_features()
train_df = df.loc[df['train'] == 1]
test_df = df.loc[df['train'] != 1]

In [6]:
# Directory passed as output_dir= to every kf_lgbm call below.
# Disabled alternative: derive a fresh timestamped directory name per run
# (e.g. 'lgb' + 'YYYY-MM-DD_HH-MM-SS') instead of one fixed folder.
# now = str(datetime.datetime.now()).split('.')[0]
# now=now.replace(' ','_')
# now=now.replace(':','-')
# output_dir = 'lgb'+now
output_dir = './stacking_files/' 

In [7]:
# Model 1: feature_name1 (raw age, engineered columns, raw app counts).
x = train_df[feature_name1]
y = train_df['信用分'].values
x_test = test_df[feature_name1]

# kf_lgbm comes from utils; how it consumes these keywords is not visible here.
model = kf_lgbm(
    # --- data ---
    x=x,
    y=y,
    x_test=x_test,
    # --- run bookkeeping ---
    name='gotcha_lgb1',
    output_dir=output_dir,
    stratify=True,
    # --- loss and metric ---
    # "mae_fair" is not a stock LightGBM objective name; presumably resolved
    # inside utils -- TODO confirm.
    objective="mae_fair",
    fair_c=25,
    huber_delta=2,
    eval_metric="mae",
    # --- boosting schedule ---
    boosting_type='gbdt',
    learning_rate=0.01,
    n_estimators=8000,
    early_stopping_rounds=80,
    # --- tree shape ---
    max_depth=5,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # --- categorical handling ---
    categorical_feature=['用户话费敏感度'],
    max_cat_to_onehot=4,
    # --- row / column subsampling ---
    bagging_fraction=0.8,
    subsample_freq=2,
    feature_fraction=0.5,
)


Training until validation scores don't improve for 80 rounds.
[200]	train's l1: 15.6015	train's mae: 15.6015	test's l1: 15.8681	test's mae: 15.8681
[400]	train's l1: 14.6312	train's mae: 14.6312	test's l1: 15.0797	test's mae: 15.0797
[600]	train's l1: 14.3699	train's mae: 14.3699	test's l1: 14.9224	test's mae: 14.9224
[800]	train's l1: 14.2164	train's mae: 14.2164	test's l1: 14.8466	test's mae: 14.8466
[1000]	train's l1: 14.0943	train's mae: 14.0943	test's l1: 14.8012	test's mae: 14.8012
[1200]	train's l1: 13.986	train's mae: 13.986	test's l1: 14.7672	test's mae: 14.7672
[1400]	train's l1: 13.8829	train's mae: 13.8829	test's l1: 14.7333	test's mae: 14.7333
[1600]	train's l1: 13.7907	train's mae: 13.7907	test's l1: 14.7107	test's mae: 14.7107
[1800]	train's l1: 13.7037	train's mae: 13.7037	test's l1: 14.6936	test's mae: 14.6936
[2000]	train's l1: 13.6225	train's mae: 13.6225	test's l1: 14.6796	test's mae: 14.6796
[2200]	train's l1: 13.54	train's mae: 13.54	test's l1: 14.6659	test's mae

In [11]:
# Model 2: feature_name2 (age 0 as NaN, engineered columns, raw app counts).
x = train_df[feature_name2]
y = train_df['信用分'].values
x_test = test_df[feature_name2]

# kf_lgbm comes from utils; how it consumes these keywords is not visible here.
model = kf_lgbm(
    # --- data ---
    x=x,
    y=y,
    x_test=x_test,
    # --- run bookkeeping ---
    name='gotcha_lgb2',
    output_dir=output_dir,
    stratify=True,
    # --- loss and metric ---
    # "fair_huber" is not a stock LightGBM objective name; presumably resolved
    # inside utils -- TODO confirm.
    objective="fair_huber",
    fair_c=23,
    huber_delta=2,
    eval_metric="mae",
    # --- boosting schedule ---
    boosting_type='gbdt',
    learning_rate=0.01,
    n_estimators=8000,
    early_stopping_rounds=80,
    # --- tree shape ---
    max_depth=5,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # --- categorical handling ---
    categorical_feature=['用户话费敏感度'],
    max_cat_to_onehot=4,
    # --- row / column subsampling ---
    bagging_fraction=0.8,
    subsample_freq=2,
    feature_fraction=0.5,
)


Training until validation scores don't improve for 80 rounds.
[200]	train's l1: 15.6963	train's mae: 15.6963	test's l1: 15.9533	test's mae: 15.9533
[400]	train's l1: 14.6584	train's mae: 14.6584	test's l1: 15.0967	test's mae: 15.0967
[600]	train's l1: 14.3789	train's mae: 14.3789	test's l1: 14.9297	test's mae: 14.9297
[800]	train's l1: 14.2188	train's mae: 14.2188	test's l1: 14.8505	test's mae: 14.8505
[1000]	train's l1: 14.0979	train's mae: 14.0979	test's l1: 14.8023	test's mae: 14.8023
[1200]	train's l1: 13.993	train's mae: 13.993	test's l1: 14.7703	test's mae: 14.7703
[1400]	train's l1: 13.895	train's mae: 13.895	test's l1: 14.7371	test's mae: 14.7371
[1600]	train's l1: 13.8049	train's mae: 13.8049	test's l1: 14.7171	test's mae: 14.7171
[1800]	train's l1: 13.7176	train's mae: 13.7176	test's l1: 14.6966	test's mae: 14.6966
[2000]	train's l1: 13.6398	train's mae: 13.6398	test's l1: 14.6834	test's mae: 14.6834
[2200]	train's l1: 13.5612	train's mae: 13.5612	test's l1: 14.6704	test's m

In [12]:
# Model 3: feature_name3 (raw age, engineered columns, log-bucketed app counts).
x = train_df[feature_name3]
y = train_df['信用分'].values
x_test = test_df[feature_name3]

# kf_lgbm comes from utils; how it consumes these keywords is not visible here.
model = kf_lgbm(
    # --- data ---
    x=x,
    y=y,
    x_test=x_test,
    # --- run bookkeeping ---
    name='gotcha_lgb3',
    output_dir=output_dir,
    stratify=True,
    # --- loss and metric ---
    # "mae_fair" is not a stock LightGBM objective name; presumably resolved
    # inside utils -- TODO confirm.
    objective="mae_fair",
    fair_c=25,
    huber_delta=2,
    eval_metric="mae",
    # --- boosting schedule ---
    boosting_type='gbdt',
    learning_rate=0.01,
    n_estimators=8000,
    early_stopping_rounds=80,
    # --- tree shape ---
    max_depth=5,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # --- categorical handling ---
    categorical_feature=['用户话费敏感度'],
    max_cat_to_onehot=4,
    # --- row / column subsampling ---
    bagging_fraction=0.8,
    subsample_freq=2,
    feature_fraction=0.5,
)


Training until validation scores don't improve for 80 rounds.
[200]	train's l1: 15.6113	train's mae: 15.6113	test's l1: 15.8654	test's mae: 15.8654
[400]	train's l1: 14.6492	train's mae: 14.6492	test's l1: 15.0782	test's mae: 15.0782
[600]	train's l1: 14.3951	train's mae: 14.3951	test's l1: 14.9205	test's mae: 14.9205
[800]	train's l1: 14.2447	train's mae: 14.2447	test's l1: 14.849	test's mae: 14.849
[1000]	train's l1: 14.1242	train's mae: 14.1242	test's l1: 14.798	test's mae: 14.798
[1200]	train's l1: 14.0181	train's mae: 14.0181	test's l1: 14.7622	test's mae: 14.7622
[1400]	train's l1: 13.9196	train's mae: 13.9196	test's l1: 14.7293	test's mae: 14.7293
[1600]	train's l1: 13.8311	train's mae: 13.8311	test's l1: 14.7093	test's mae: 14.7093
[1800]	train's l1: 13.7485	train's mae: 13.7485	test's l1: 14.6958	test's mae: 14.6958
[2000]	train's l1: 13.6685	train's mae: 13.6685	test's l1: 14.6816	test's mae: 14.6816
[2200]	train's l1: 13.5911	train's mae: 13.5911	test's l1: 14.6701	test's m

In [13]:
# Model 4: feature_name4 (age 0 as NaN, engineered columns, log-bucketed app counts).
x = train_df[feature_name4]
y = train_df['信用分'].values
x_test = test_df[feature_name4]

# kf_lgbm comes from utils; how it consumes these keywords is not visible here.
model = kf_lgbm(
    # --- data ---
    x=x,
    y=y,
    x_test=x_test,
    # --- run bookkeeping ---
    name='gotcha_lgb4',
    output_dir=output_dir,
    stratify=True,
    # --- loss and metric ---
    # "fair_huber" is not a stock LightGBM objective name; presumably resolved
    # inside utils -- TODO confirm.
    objective="fair_huber",
    fair_c=23,
    huber_delta=2,
    eval_metric="mae",
    # --- boosting schedule ---
    boosting_type='gbdt',
    learning_rate=0.01,
    n_estimators=8000,
    early_stopping_rounds=80,
    # --- tree shape ---
    max_depth=5,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # --- categorical handling ---
    categorical_feature=['用户话费敏感度'],
    max_cat_to_onehot=4,
    # --- row / column subsampling ---
    bagging_fraction=0.8,
    subsample_freq=2,
    feature_fraction=0.5,
)


Training until validation scores don't improve for 80 rounds.
[200]	train's l1: 15.7028	train's mae: 15.7028	test's l1: 15.9519	test's mae: 15.9519
[400]	train's l1: 14.6729	train's mae: 14.6729	test's l1: 15.0955	test's mae: 15.0955
[600]	train's l1: 14.3987	train's mae: 14.3987	test's l1: 14.9265	test's mae: 14.9265
[800]	train's l1: 14.2423	train's mae: 14.2423	test's l1: 14.8486	test's mae: 14.8486
[1000]	train's l1: 14.1208	train's mae: 14.1208	test's l1: 14.7959	test's mae: 14.7959
[1200]	train's l1: 14.0167	train's mae: 14.0167	test's l1: 14.7588	test's mae: 14.7588
[1400]	train's l1: 13.9235	train's mae: 13.9235	test's l1: 14.7299	test's mae: 14.7299
[1600]	train's l1: 13.836	train's mae: 13.836	test's l1: 14.7125	test's mae: 14.7125
[1800]	train's l1: 13.7545	train's mae: 13.7545	test's l1: 14.6952	test's mae: 14.6952
[2000]	train's l1: 13.6773	train's mae: 13.6773	test's l1: 14.6835	test's mae: 14.6835
[2200]	train's l1: 13.603	train's mae: 13.603	test's l1: 14.6721	test's m

In [14]:
# Model 5: feature_name6 (age 0 as NaN, no engineered columns, raw app counts).
# NOTE(review): the run is saved as 'gotcha_lgb5' but trains on feature_name6;
# feature_name5 is not used by any visible cell -- confirm this is intended.
x = train_df[feature_name6]
y = train_df['信用分'].values
x_test = test_df[feature_name6]

# kf_lgbm comes from utils; how it consumes these keywords is not visible here.
model = kf_lgbm(
    # --- data ---
    x=x,
    y=y,
    x_test=x_test,
    # --- run bookkeeping ---
    name='gotcha_lgb5',
    output_dir=output_dir,
    stratify=True,
    # --- loss and metric ---
    # "fair_huber" is not a stock LightGBM objective name; presumably resolved
    # inside utils -- TODO confirm.
    objective="fair_huber",
    fair_c=23,
    huber_delta=2,
    eval_metric="mae",
    # --- boosting schedule ---
    boosting_type='gbdt',
    learning_rate=0.01,
    n_estimators=8000,
    early_stopping_rounds=80,
    # --- tree shape ---
    max_depth=5,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # --- categorical handling ---
    categorical_feature=['用户话费敏感度'],
    max_cat_to_onehot=4,
    # --- row / column subsampling ---
    bagging_fraction=0.8,
    subsample_freq=2,
    feature_fraction=0.5,
)


Training until validation scores don't improve for 80 rounds.
[200]	train's l1: 15.7732	train's mae: 15.7732	test's l1: 16.0722	test's mae: 16.0722
[400]	train's l1: 14.6916	train's mae: 14.6916	test's l1: 15.154	test's mae: 15.154
[600]	train's l1: 14.4138	train's mae: 14.4138	test's l1: 14.9887	test's mae: 14.9887
[800]	train's l1: 14.2572	train's mae: 14.2572	test's l1: 14.9157	test's mae: 14.9157
[1000]	train's l1: 14.1433	train's mae: 14.1433	test's l1: 14.8653	test's mae: 14.8653
[1200]	train's l1: 14.0501	train's mae: 14.0501	test's l1: 14.8334	test's mae: 14.8334
[1400]	train's l1: 13.9573	train's mae: 13.9573	test's l1: 14.8048	test's mae: 14.8048
[1600]	train's l1: 13.8753	train's mae: 13.8753	test's l1: 14.7809	test's mae: 14.7809
[1800]	train's l1: 13.7981	train's mae: 13.7981	test's l1: 14.7639	test's mae: 14.7639
[2000]	train's l1: 13.7258	train's mae: 13.7258	test's l1: 14.7533	test's mae: 14.7533
[2200]	train's l1: 13.6554	train's mae: 13.6554	test's l1: 14.74	test's m

In [15]:
# Model 6 reuses feature_name1; the target is re-read here so that the
# transform applied further down always starts from the raw scores.
x = train_df[feature_name1]
y = train_df['信用分'].values
x_test = test_df[feature_name1]

def fn_transform(x, base=1.005):
    """Map a target value to ``base ** x`` (element-wise for arrays).

    Parameters
    ----------
    x : scalar or array-like
        Value(s) to transform.
    base : float, optional
        Exponential base. The default 1.005 is the value this notebook has
        always used; fn_reverse_transform must be called with the same base
        to undo the mapping.

    Returns
    -------
    numpy scalar or ndarray
        ``np.power(base, x)``.
    """
    return np.power(base, x)
def fn_reverse_transform(x, base=1.005):
    """Inverse of fn_transform: return the logarithm of ``x`` in ``base``.

    Parameters
    ----------
    x : scalar or array-like
        Value(s) in the transformed space.
    base : float, optional
        Logarithm base. The default 1.005 matches fn_transform's default, so
        a single-argument call keeps the original behaviour.

    Returns
    -------
    numpy scalar or ndarray
        ``np.log(x) / np.log(base)``.
    """
    ret = np.log(x)/np.log(base)
    return ret

# Model 6 is fitted on the exponentially transformed target; the inverse
# function is handed to kf_lgbm as fn_reverse_transform. The captured log shows
# 'l1' on the transformed scale and 'mae' on the original score scale.
y = fn_transform(y)

# kf_lgbm comes from utils; how it consumes these keywords is not visible here.
model = kf_lgbm(
    # --- data ---
    x=x,
    y=y,
    x_test=x_test,
    fn_reverse_transform=fn_reverse_transform,
    # --- run bookkeeping ---
    name='gotcha_lgb6',
    output_dir=output_dir,
    stratify=True,
    split_seed=8888,
    verbose=200,
    # --- loss and metric ---
    objective="huber",
    huber_delta=2,
    fair_c=25,
    eval_metric="mae",
    # --- boosting schedule ---
    boosting_type='gbdt',
    learning_rate=0.03,
    n_estimators=8000,
    early_stopping_rounds=80,
    # --- tree shape ---
    max_depth=7,
    num_leaves=31,
    min_child_samples=20,
    min_split_gain=1,
    # --- categorical handling ---
    categorical_feature=['用户话费敏感度'],
    max_cat_to_onehot=4,
    # --- row / column subsampling ---
    bagging_fraction=0.8,
    subsample_freq=2,
    feature_fraction=0.5,
)


Training until validation scores don't improve for 80 rounds.
[200]	train's l1: 2.01778	train's mae: 19.4444	test's l1: 2.03708	test's mae: 19.6335
[400]	train's l1: 1.61963	train's mae: 15.3019	test's l1: 1.65911	test's mae: 15.6763
[600]	train's l1: 1.54135	train's mae: 14.4329	test's l1: 1.60048	test's mae: 14.9969
[800]	train's l1: 1.51387	train's mae: 14.1584	test's l1: 1.58761	test's mae: 14.8504
[1000]	train's l1: 1.49434	train's mae: 13.9724	test's l1: 1.58121	test's mae: 14.7817
[1200]	train's l1: 1.4803	train's mae: 13.8426	test's l1: 1.57812	test's mae: 14.7482
[1400]	train's l1: 1.47022	train's mae: 13.7499	test's l1: 1.57622	test's mae: 14.7275
[1600]	train's l1: 1.46324	train's mae: 13.6857	test's l1: 1.57543	test's mae: 14.7183
[1800]	train's l1: 1.45834	train's mae: 13.6409	test's l1: 1.57473	test's mae: 14.711
[2000]	train's l1: 1.45477	train's mae: 13.6083	test's l1: 1.57454	test's mae: 14.7084
Early stopping, best iteration is:
[1963]	train's l1: 1.45545	train's mae