In [1]:
import numpy as np
np.random.seed(42)

import pandas as pd
from tqdm import tqdm, tqdm_notebook 

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold

import os 
import gc

%matplotlib inline

In [2]:
train = pd.read_csv('../input/train_dataset.csv')
test = pd.read_csv('../input/test_dataset.csv')

In [3]:
def _simple_features(df_):
    df = df_.copy() 
    df['次数'] = (df['当月网购类应用使用次数'] +  df['当月物流快递类应用使用次数'] +  df['当月金融理财类应用使用总次数'] +
                  df['当月视频播放类应用使用次数'] + df['当月飞机类应用使用次数'] + df['当月火车类应用使用次数'] +
                  df['当月旅游资讯类应用使用次数']  + 1)

    for col in ['当月金融理财类应用使用总次数', '当月旅游资讯类应用使用次数']: # 这两个比较积极向上一点
        df[col + '百分比'] = df[col].values / df['次数'].values 

    df['当月通话人均话费'] = df['用户账单当月总费用（元）'].values / (df['当月通话交往圈人数'].values + 1)
    df['上个月费用'] = df['用户当月账户余额（元）'].values + df['用户账单当月总费用（元）'].values
    df['用户上网年龄'] = df['用户年龄'] - df['用户网龄（月）']
    df['用户上网年龄百分比'] = df['用户网龄（月）'] / (df['用户年龄'] + 1)
    df['近似总消费'] = df['用户近6个月平均消费值（元）'] / 6 * df['用户网龄（月）']

    return df

In [4]:
train_fea = _simple_features(train)

test_fea  = _simple_features(test)

In [5]:
fea_cols = [col for col in train_fea.columns if train_fea[col].dtypes!='object' and train_fea[col].dtypes != '<M8[ns]' and col!='用户编码' and col!='信用分']   

len(fea_cols)

36

In [6]:
def _get_values_lgbregresser_models(df_fea, df_label,  feature_names):
    kf = KFold(n_splits=5,shuffle=False)#,random_state=1)

    models  = [] 

    models_1 = []

    models_2 = []

    importances = pd.DataFrame() 

    lgb_params = {'num_leaves': 31,

         'min_data_in_leaf': 32, 

#          'objective':'mae',

         'max_depth': -1,

         'learning_rate': 0.005,

         "min_child_samples": 20,

         "boosting": "gbdt",

         "feature_fraction": 0.9,

         "bagging_freq": 1,

         "bagging_fraction": 0.9 ,

         'n_estimators': 10000,

         "bagging_seed": 11,

         "metric": 'rmse',

         "lambda_l1": 0.1,

         "nthread": 50,

         "verbosity": -1}





    lgb_params1 = {'num_leaves': 31,

         'min_data_in_leaf': 32, 

         'objective':'mae',

         'max_depth': -1,

         'learning_rate': 0.005,

         "min_child_samples": 20,

         "boosting": "gbdt",

         "feature_fraction": 0.9,

         "bagging_freq": 1,

         "bagging_fraction": 0.9 ,

         'n_estimators': 10000,

         "bagging_seed": 11,

         "lambda_l1": 0.1,

         "nthread": 50,

         "verbosity": -1}

    

    min_val = np.min(df_label)

    print(min_val)

    for fold_, (trn_, val_) in enumerate(kf.split(df_fea)): 

        trn_x, trn_y= df_fea[trn_,:], df_label[trn_]#, df_label1[trn_] 

        val_x, val_y = df_fea[val_,:], df_label[val_]#, df_label1[val_] 

        tmp = pd.DataFrame()

         

        

        model = lgb.LGBMRegressor(**lgb_params1)

        model.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)], eval_metric ='mae',verbose=50,early_stopping_rounds=250)     

        tmp['target'] = val_y

        tmp['pred1'] = model.predict(val_x)

        models.append(model)

        

        model1 = lgb.LGBMRegressor(**lgb_params)

        model1.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)], eval_metric ='mae',verbose=50,early_stopping_rounds=250)     

        tmp['pred2'] = model1.predict(val_x)

        models_1.append(model1)

  

        tmp = tmp.sort_values('pred1')

        tmp['ranks'] = list(range(tmp.shape[0]))

        tmp['preds'] = tmp['pred1'].values

        tmp.loc[tmp.ranks<2000,'preds']  = tmp.loc[tmp.ranks< 2000,'pred2'].values *0.4 + tmp.loc[tmp.ranks< 2000,'pred1'].values * 0.6

        tmp.loc[tmp.ranks>8000,'preds']  = tmp.loc[tmp.ranks> 8000,'pred2'].values *0.4 + tmp.loc[tmp.ranks> 8000,'pred1'].values * 0.6

         

        print('*' * 100)

        print('MAE Model',     1 / (1 + (mean_absolute_error(y_true= tmp['target'] , y_pred= tmp['pred1'] ))))

        print('MSE Model',     1 / (1 + (mean_absolute_error(y_true= tmp['target'] , y_pred= tmp['pred2'] ))))

        print('Merge Model12', 1 / (1 + (mean_absolute_error(y_true= tmp['target'] , y_pred= tmp['preds'] )))) 

        

        imp_df = pd.DataFrame()

        imp_df['feature'] = feature_names

        imp_df['gain'] = model.feature_importances_

        imp_df['fold'] = fold_ + 1

        

        importances = pd.concat([importances, imp_df], axis=0)

        

        gc.collect() 

    return models,models_1,importances

In [7]:
models_mae, models_mse, importances = _get_values_lgbregresser_models(train_fea[fea_cols].values, train_fea['信用分'].values, feature_names=fea_cols)

422
Training until validation scores don't improve for 250 rounds
[50]	training's l1: 27.6411	valid_1's l1: 27.8676
[100]	training's l1: 23.8993	valid_1's l1: 24.1131
[150]	training's l1: 21.2376	valid_1's l1: 21.487
[200]	training's l1: 19.3633	valid_1's l1: 19.6374
[250]	training's l1: 18.044	valid_1's l1: 18.3506
[300]	training's l1: 17.1245	valid_1's l1: 17.4783
[350]	training's l1: 16.4794	valid_1's l1: 16.8891
[400]	training's l1: 16.0182	valid_1's l1: 16.471
[450]	training's l1: 15.6845	valid_1's l1: 16.1732
[500]	training's l1: 15.4377	valid_1's l1: 15.954
[550]	training's l1: 15.2476	valid_1's l1: 15.7838
[600]	training's l1: 15.0954	valid_1's l1: 15.6569
[650]	training's l1: 14.9712	valid_1's l1: 15.5571
[700]	training's l1: 14.8663	valid_1's l1: 15.4745
[750]	training's l1: 14.7747	valid_1's l1: 15.4054
[800]	training's l1: 14.6936	valid_1's l1: 15.3467
[850]	training's l1: 14.6233	valid_1's l1: 15.2981
[900]	training's l1: 14.5626	valid_1's l1: 15.2589
[950]	training's l1: 

[500]	training's l1: 15.424	training's rmse: 20.0806	valid_1's l1: 15.8929	valid_1's rmse: 20.6193
[550]	training's l1: 15.2476	training's rmse: 19.8421	valid_1's l1: 15.7394	valid_1's rmse: 20.4049
[600]	training's l1: 15.1076	training's rmse: 19.6546	valid_1's l1: 15.6239	valid_1's rmse: 20.2425
[650]	training's l1: 14.9939	training's rmse: 19.5011	valid_1's l1: 15.5319	valid_1's rmse: 20.1135
[700]	training's l1: 14.8979	training's rmse: 19.3706	valid_1's l1: 15.4554	valid_1's rmse: 20.0055
[750]	training's l1: 14.8137	training's rmse: 19.2571	valid_1's l1: 15.3894	valid_1's rmse: 19.9154
[800]	training's l1: 14.7404	training's rmse: 19.1572	valid_1's l1: 15.336	valid_1's rmse: 19.8412
[850]	training's l1: 14.6743	training's rmse: 19.0666	valid_1's l1: 15.2873	valid_1's rmse: 19.7742
[900]	training's l1: 14.6179	training's rmse: 18.9901	valid_1's l1: 15.25	valid_1's rmse: 19.7207
[950]	training's l1: 14.5658	training's rmse: 18.918	valid_1's l1: 15.2178	valid_1's rmse: 19.6737
[1000

[4550]	training's l1: 13.0109	training's rmse: 16.7865	valid_1's l1: 14.8913	valid_1's rmse: 19.1887
[4600]	training's l1: 12.9954	training's rmse: 16.7658	valid_1's l1: 14.891	valid_1's rmse: 19.1879
[4650]	training's l1: 12.9803	training's rmse: 16.746	valid_1's l1: 14.8911	valid_1's rmse: 19.188
[4700]	training's l1: 12.9649	training's rmse: 16.7255	valid_1's l1: 14.8907	valid_1's rmse: 19.1877
[4750]	training's l1: 12.9499	training's rmse: 16.7055	valid_1's l1: 14.8908	valid_1's rmse: 19.1877
[4800]	training's l1: 12.9349	training's rmse: 16.6857	valid_1's l1: 14.8913	valid_1's rmse: 19.1885
[4850]	training's l1: 12.9198	training's rmse: 16.6662	valid_1's l1: 14.8909	valid_1's rmse: 19.1878
Early stopping, best iteration is:
[4620]	training's l1: 12.9895	training's rmse: 16.7579	valid_1's l1: 14.8905	valid_1's rmse: 19.1876
****************************************************************************************************
MAE Model 0.0628546330443074
MSE Model 0.06293077681140374


[750]	training's l1: 14.837	training's rmse: 19.2556	valid_1's l1: 15.1715	valid_1's rmse: 19.8481
[800]	training's l1: 14.7633	training's rmse: 19.1554	valid_1's l1: 15.1233	valid_1's rmse: 19.7854
[850]	training's l1: 14.6966	training's rmse: 19.0642	valid_1's l1: 15.0805	valid_1's rmse: 19.7309
[900]	training's l1: 14.638	training's rmse: 18.9841	valid_1's l1: 15.0463	valid_1's rmse: 19.6888
[950]	training's l1: 14.584	training's rmse: 18.9094	valid_1's l1: 15.0136	valid_1's rmse: 19.647
[1000]	training's l1: 14.534	training's rmse: 18.8407	valid_1's l1: 14.986	valid_1's rmse: 19.6113
[1050]	training's l1: 14.4887	training's rmse: 18.7775	valid_1's l1: 14.9631	valid_1's rmse: 19.581
[1100]	training's l1: 14.4459	training's rmse: 18.7172	valid_1's l1: 14.9434	valid_1's rmse: 19.5534
[1150]	training's l1: 14.405	training's rmse: 18.6604	valid_1's l1: 14.9256	valid_1's rmse: 19.5276
[1200]	training's l1: 14.3667	training's rmse: 18.6066	valid_1's l1: 14.9078	valid_1's rmse: 19.504
[125

[2250]	training's l1: 13.8257	valid_1's l1: 14.6748
[2300]	training's l1: 13.8069	valid_1's l1: 14.6727
[2350]	training's l1: 13.787	valid_1's l1: 14.6687
[2400]	training's l1: 13.7672	valid_1's l1: 14.6649
[2450]	training's l1: 13.7486	valid_1's l1: 14.6626
[2500]	training's l1: 13.73	valid_1's l1: 14.6592
[2550]	training's l1: 13.7121	valid_1's l1: 14.6573
[2600]	training's l1: 13.6932	valid_1's l1: 14.6549
[2650]	training's l1: 13.6746	valid_1's l1: 14.6508
[2700]	training's l1: 13.6579	valid_1's l1: 14.6486
[2750]	training's l1: 13.6402	valid_1's l1: 14.6463
[2800]	training's l1: 13.6233	valid_1's l1: 14.6443
[2850]	training's l1: 13.6046	valid_1's l1: 14.6415
[2900]	training's l1: 13.5873	valid_1's l1: 14.6392
[2950]	training's l1: 13.5707	valid_1's l1: 14.6369
[3000]	training's l1: 13.5545	valid_1's l1: 14.6353
[3050]	training's l1: 13.5378	valid_1's l1: 14.6338
[3100]	training's l1: 13.5205	valid_1's l1: 14.6317
[3150]	training's l1: 13.5037	valid_1's l1: 14.6292
[3200]	training

[750]	training's l1: 14.8909	training's rmse: 19.3392	valid_1's l1: 15.1195	valid_1's rmse: 19.6756
[800]	training's l1: 14.8164	training's rmse: 19.2374	valid_1's l1: 15.0581	valid_1's rmse: 19.5948
[850]	training's l1: 14.7503	training's rmse: 19.1465	valid_1's l1: 15.0048	valid_1's rmse: 19.5245
[900]	training's l1: 14.6932	training's rmse: 19.0688	valid_1's l1: 14.9645	valid_1's rmse: 19.4714
[950]	training's l1: 14.6412	training's rmse: 18.9967	valid_1's l1: 14.9302	valid_1's rmse: 19.4243
[1000]	training's l1: 14.5936	training's rmse: 18.931	valid_1's l1: 14.8984	valid_1's rmse: 19.3822
[1050]	training's l1: 14.5479	training's rmse: 18.8678	valid_1's l1: 14.8699	valid_1's rmse: 19.3444
[1100]	training's l1: 14.5046	training's rmse: 18.8079	valid_1's l1: 14.8429	valid_1's rmse: 19.3089
[1150]	training's l1: 14.4645	training's rmse: 18.7532	valid_1's l1: 14.8215	valid_1's rmse: 19.2806
[1200]	training's l1: 14.4256	training's rmse: 18.6986	valid_1's l1: 14.7981	valid_1's rmse: 19.2

[4800]	training's l1: 13.0206	training's rmse: 16.7823	valid_1's l1: 14.5564	valid_1's rmse: 18.919
[4850]	training's l1: 13.0055	training's rmse: 16.7623	valid_1's l1: 14.557	valid_1's rmse: 18.9192
[4900]	training's l1: 12.9902	training's rmse: 16.7415	valid_1's l1: 14.5564	valid_1's rmse: 18.9188
Early stopping, best iteration is:
[4664]	training's l1: 13.0618	training's rmse: 16.8362	valid_1's l1: 14.5559	valid_1's rmse: 18.9192
****************************************************************************************************
MAE Model 0.06423031041880967
MSE Model 0.06428425554607979
Merge Model12 0.06436476871165739
Training until validation scores don't improve for 250 rounds
[50]	training's l1: 27.7966	valid_1's l1: 27.2938
[100]	training's l1: 24.0336	valid_1's l1: 23.6674
[150]	training's l1: 21.3562	valid_1's l1: 21.0897
[200]	training's l1: 19.4577	valid_1's l1: 19.2744
[250]	training's l1: 18.1242	valid_1's l1: 18.0239
[300]	training's l1: 17.1941	valid_1's l1: 17.1591
[

[1000]	training's l1: 14.5349	training's rmse: 18.8358	valid_1's l1: 14.9974	valid_1's rmse: 19.5529
[1050]	training's l1: 14.4892	training's rmse: 18.7731	valid_1's l1: 14.9757	valid_1's rmse: 19.5264
[1100]	training's l1: 14.4456	training's rmse: 18.7113	valid_1's l1: 14.9545	valid_1's rmse: 19.499
[1150]	training's l1: 14.4047	training's rmse: 18.6542	valid_1's l1: 14.9366	valid_1's rmse: 19.4756
[1200]	training's l1: 14.3669	training's rmse: 18.5999	valid_1's l1: 14.9228	valid_1's rmse: 19.4564
[1250]	training's l1: 14.3313	training's rmse: 18.5499	valid_1's l1: 14.9097	valid_1's rmse: 19.4391
[1300]	training's l1: 14.2979	training's rmse: 18.5027	valid_1's l1: 14.8986	valid_1's rmse: 19.4253
[1350]	training's l1: 14.2651	training's rmse: 18.456	valid_1's l1: 14.8884	valid_1's rmse: 19.412
[1400]	training's l1: 14.2341	training's rmse: 18.4124	valid_1's l1: 14.8792	valid_1's rmse: 19.4009
[1450]	training's l1: 14.205	training's rmse: 18.3714	valid_1's l1: 14.8713	valid_1's rmse: 19

[1250]	training's l1: 14.2645	valid_1's l1: 15.0439
[1300]	training's l1: 14.2312	valid_1's l1: 15.0305
[1350]	training's l1: 14.1999	valid_1's l1: 15.0192
[1400]	training's l1: 14.1695	valid_1's l1: 15.0088
[1450]	training's l1: 14.1398	valid_1's l1: 14.9992
[1500]	training's l1: 14.112	valid_1's l1: 14.9915
[1550]	training's l1: 14.0847	valid_1's l1: 14.9834
[1600]	training's l1: 14.0585	valid_1's l1: 14.9768
[1650]	training's l1: 14.0332	valid_1's l1: 14.9716
[1700]	training's l1: 14.0081	valid_1's l1: 14.9653
[1750]	training's l1: 13.9837	valid_1's l1: 14.9591
[1800]	training's l1: 13.9599	valid_1's l1: 14.954
[1850]	training's l1: 13.9375	valid_1's l1: 14.9512
[1900]	training's l1: 13.9158	valid_1's l1: 14.9474
[1950]	training's l1: 13.8942	valid_1's l1: 14.9435
[2000]	training's l1: 13.8735	valid_1's l1: 14.9405
[2050]	training's l1: 13.8539	valid_1's l1: 14.9386
[2100]	training's l1: 13.8348	valid_1's l1: 14.9364
[2150]	training's l1: 13.8137	valid_1's l1: 14.9327
[2200]	trainin

[600]	training's l1: 15.1142	training's rmse: 19.627	valid_1's l1: 15.5183	valid_1's rmse: 20.2752
[650]	training's l1: 15.0021	training's rmse: 19.4728	valid_1's l1: 15.4276	valid_1's rmse: 20.1566
[700]	training's l1: 14.9047	training's rmse: 19.338	valid_1's l1: 15.355	valid_1's rmse: 20.0599
[750]	training's l1: 14.8199	training's rmse: 19.2199	valid_1's l1: 15.2938	valid_1's rmse: 19.9775
[800]	training's l1: 14.7458	training's rmse: 19.1176	valid_1's l1: 15.2413	valid_1's rmse: 19.9108
[850]	training's l1: 14.6804	training's rmse: 19.0265	valid_1's l1: 15.1984	valid_1's rmse: 19.8557
[900]	training's l1: 14.6242	training's rmse: 18.9481	valid_1's l1: 15.162	valid_1's rmse: 19.8119
[950]	training's l1: 14.5718	training's rmse: 18.8751	valid_1's l1: 15.1293	valid_1's rmse: 19.7725
[1000]	training's l1: 14.5233	training's rmse: 18.8065	valid_1's l1: 15.0988	valid_1's rmse: 19.737
[1050]	training's l1: 14.4785	training's rmse: 18.7434	valid_1's l1: 15.072	valid_1's rmse: 19.7058
[110

MAE Model 0.06314368002017899
MSE Model 0.06329154300724835
Merge Model12 0.06328324083589802


In [8]:
np.mean([0.06287124227830537,0.06356345258514168,0.06422149465677383, 0.06367346108002223,0.06314959550420497])

0.06349584922088962

In [9]:
np.mean([0.063039062547442,0.06359245145019841,0.06436835293198238, 0.0637306555814696, 0.06332385690501188])

0.06361087588322085

In [10]:
pred_mae = 0

for i,model in enumerate(models_mae): 

    pred_mae += model.predict(test_fea[fea_cols]) * 0.2

test_fea['pred_mae'] = pred_mae

In [11]:
pred_mse = 0

for i,model in enumerate(models_mse): 

    pred_mse += model.predict(test_fea[fea_cols]) * 0.2

test_fea['pred_mse'] = pred_mse

In [12]:
submit_mae = pd.DataFrame()

submit_mae['id']    = test_fea['用户编码'].values

submit_mae['score'] = test_fea['pred_mae'].values 

submit_mae['score'] = submit_mae['score'].astype(int)

submit_mae[['id','score']].to_csv('../sub/baseline_mae.csv',index = None)

submit_mae['score'].describe()

count    50000.00000
mean       618.78192
std         37.69697
min        477.00000
25%        598.00000
50%        628.00000
75%        646.00000
max        695.00000
Name: score, dtype: float64

In [13]:
test_fea = test_fea.sort_values('pred_mae')

test_fea['ranks'] = list(range(test_fea.shape[0]))

test_fea['score'] = test_fea['pred_mae'].values

test_fea.loc[test_fea.ranks<10000,'score']  = test_fea.loc[test_fea.ranks< 10000,'pred_mse'].values *0.4 + test_fea.loc[test_fea.ranks< 10000,'pred_mae'].values * 0.6

test_fea.loc[test_fea.ranks>40000,'score']  = test_fea.loc[test_fea.ranks> 40000,'pred_mse'].values *0.4 + test_fea.loc[test_fea.ranks> 40000,'pred_mae'].values * 0.6    

In [14]:
submit_mae_mse = pd.DataFrame()

submit_mae_mse['id']    = test_fea['用户编码'].values

submit_mae_mse['score'] = test_fea['score'].values 

submit_mae_mse['score'] = submit_mae_mse['score'].astype(int)

submit_mae_mse[['id','score']].to_csv('../sub/baseline_mae_mse.csv',index = None)

submit_mae_mse['score'].describe()

count    50000.000000
mean       618.588620
std         37.867445
min        472.000000
25%        598.000000
50%        628.000000
75%        646.000000
max        694.000000
Name: score, dtype: float64