In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFECV

warnings.filterwarnings('ignore')

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
df_user = reduce_mem_usage(pd.read_hdf('../input/user_features.h5'))
df_voc = reduce_mem_usage(pd.read_hdf('../input/voc_features.h5'))
df_sms = reduce_mem_usage(pd.read_hdf('../input/sms_features.h5'))
# df_app = reduce_mem_usage(pd.read_hdf('../input/app_features.h5'))

Memory usage of dataframe is 3651648.00 MB
Memory usage after optimization is: 1760616.00 MB
Decreased by 51.8%
Memory usage of dataframe is 8227056.00 MB
Memory usage after optimization is: 2165372.00 MB
Decreased by 73.7%
Memory usage of dataframe is 3850000.00 MB
Memory usage after optimization is: 1031250.00 MB
Decreased by 73.2%


In [4]:
df_user.columns

Index(['phone_no_m', 'city_name', 'county_name', 'idcard_cnt', 'label',
       'arpu_202004', 'city_name_county_name', 'idcard_cnt*arpu_202004',
       'arpu_202004/idcard_cnt', 'city_name_idcard_cnt_max',
       'city_name_idcard_cnt_min', 'city_name_idcard_cnt_median',
       'city_name_idcard_cnt_mean', 'city_name_idcard_cnt_skew',
       'city_name_idcard_cnt_nunique', 'city_name_arpu_202004_max',
       'city_name_arpu_202004_min', 'city_name_arpu_202004_median',
       'city_name_arpu_202004_mean', 'city_name_arpu_202004_skew',
       'city_name_arpu_202004_nunique', 'city_name_idcard_cnt*arpu_202004_max',
       'city_name_idcard_cnt*arpu_202004_min',
       'city_name_idcard_cnt*arpu_202004_median',
       'city_name_idcard_cnt*arpu_202004_mean',
       'city_name_idcard_cnt*arpu_202004_skew',
       'city_name_idcard_cnt*arpu_202004_nunique',
       'city_name_arpu_202004/idcard_cnt_max',
       'city_name_arpu_202004/idcard_cnt_min',
       'city_name_arpu_202004/idcard_cnt_m

In [5]:
df_user.shape, df_voc.shape, df_sms.shape #, df_app.shape

((8151, 85), (6788, 152), (6875, 69))

In [6]:
df = df_user.merge(df_voc, on='phone_no_m', how='left')
df = df.merge(df_sms, on='phone_no_m', how='left')
# df = df.merge(df_app, on='phone_no_m', how='left')

del df_user, df_voc, df_sms#, df_app
gc.collect()

11

In [7]:
df.shape

(8151, 304)

In [8]:
df_train = df[df.label.notna()]
df_test = df[df.label.isna()]

df_train.shape, df_test.shape

((6106, 304), (2045, 304))

In [9]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train.drop('label', axis=1), df_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [10]:
train_cols = [i for i in X_train.columns if i not in ['phone_no_m', 'label']]

In [11]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0.1,
          'lambda_l2': 0,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [12]:
useful_cols = []
useless_cols = []

for i in train_cols:
    print(i)
    
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_valid= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_valid, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.52:
        useful_cols.append(i)
    else:
        useless_cols.append(i)
    print('*' * 20)
    print('\n')

city_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.684155	valid_0's auc: 0.671655
[40]	training's auc: 0.684156	valid_0's auc: 0.672408
[60]	training's auc: 0.684156	valid_0's auc: 0.672408
[80]	training's auc: 0.684158	valid_0's auc: 0.671655
Early stopping, best iteration is:
[37]	training's auc: 0.684156	valid_0's auc: 0.672408
*****
0.6724078657865786
********************


county_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.767358	valid_0's auc: 0.72344
[40]	training's auc: 0.772283	valid_0's auc: 0.728765
[60]	training's auc: 0.77374	valid_0's auc: 0.730307
[80]	training's auc: 0.774604	valid_0's auc: 0.730896
[100]	training's auc: 0.775263	valid_0's auc: 0.73182
[120]	training's auc: 0.775831	valid_0's auc: 0.731191
[140]	training's auc: 0.776325	valid_0's auc: 0.732008
Early stopping, best iteration is:
[90]	training's auc: 0.775065	valid_0's auc: 0.732017
*****
0.7320170875783231
***********

[40]	training's auc: 0.684125	valid_0's auc: 0.671655
[60]	training's auc: 0.684125	valid_0's auc: 0.671655
[80]	training's auc: 0.684125	valid_0's auc: 0.671655
Early stopping, best iteration is:
[42]	training's auc: 0.684117	valid_0's auc: 0.672001
*****
0.6720013033912087
********************


city_name_arpu_202004/idcard_cnt_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.578299	valid_0's auc: 0.561841
[40]	training's auc: 0.578299	valid_0's auc: 0.561841
Early stopping, best iteration is:
[1]	training's auc: 0.578299	valid_0's auc: 0.561841
*****
0.5618408308222127
********************


city_name_arpu_202004/idcard_cnt_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.683867	valid_0's auc: 0.670468
[40]	training's auc: 0.683867	valid_0's auc: 0.670468
Early stopping, best iteration is:
[1]	training's auc: 0.683867	valid_0's auc: 0.670468
*****
0.6704677261204381
********************


city_name_arpu

[200]	training's auc: 0.775093	valid_0's auc: 0.72895
[220]	training's auc: 0.775222	valid_0's auc: 0.728884
[240]	training's auc: 0.775373	valid_0's auc: 0.729282
[260]	training's auc: 0.775445	valid_0's auc: 0.729339
[280]	training's auc: 0.77551	valid_0's auc: 0.729216
Early stopping, best iteration is:
[245]	training's auc: 0.775401	valid_0's auc: 0.729494
*****
0.729494009183527
********************


county_name_idcard_cnt*arpu_202004_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.598225	valid_0's auc: 0.555203
[40]	training's auc: 0.599398	valid_0's auc: 0.560246
[60]	training's auc: 0.59977	valid_0's auc: 0.562061
[80]	training's auc: 0.60037	valid_0's auc: 0.562076
[100]	training's auc: 0.600409	valid_0's auc: 0.56215
[120]	training's auc: 0.60044	valid_0's auc: 0.562216
[140]	training's auc: 0.600465	valid_0's auc: 0.562144
[160]	training's auc: 0.6006	valid_0's auc: 0.562183
Early stopping, best iteration is:
[120]	training's auc: 0.6

[300]	training's auc: 0.776218	valid_0's auc: 0.734234
[320]	training's auc: 0.776297	valid_0's auc: 0.733953
Early stopping, best iteration is:
[288]	training's auc: 0.77616	valid_0's auc: 0.734343
*****
0.7343428636341895
********************


county_name_arpu_202004/idcard_cnt_skew
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.760753	valid_0's auc: 0.719007
[40]	training's auc: 0.766226	valid_0's auc: 0.722786
[60]	training's auc: 0.768889	valid_0's auc: 0.724419
[80]	training's auc: 0.770585	valid_0's auc: 0.724952
[100]	training's auc: 0.772333	valid_0's auc: 0.726204
[120]	training's auc: 0.772965	valid_0's auc: 0.727244
[140]	training's auc: 0.773575	valid_0's auc: 0.727422
[160]	training's auc: 0.774061	valid_0's auc: 0.727395
[180]	training's auc: 0.774384	valid_0's auc: 0.727428
Early stopping, best iteration is:
[132]	training's auc: 0.77336	valid_0's auc: 0.727966
*****
0.7279664107715119
********************


county_name_arpu_202004/

[80]	training's auc: 0.774612	valid_0's auc: 0.726315
[100]	training's auc: 0.775115	valid_0's auc: 0.727316
[120]	training's auc: 0.775744	valid_0's auc: 0.728363
[140]	training's auc: 0.776071	valid_0's auc: 0.728285
[160]	training's auc: 0.776339	valid_0's auc: 0.729019
[180]	training's auc: 0.776563	valid_0's auc: 0.729279
[200]	training's auc: 0.776757	valid_0's auc: 0.729413
[220]	training's auc: 0.776916	valid_0's auc: 0.729661
[240]	training's auc: 0.777064	valid_0's auc: 0.729709
[260]	training's auc: 0.777142	valid_0's auc: 0.729769
[280]	training's auc: 0.777198	valid_0's auc: 0.729877
[300]	training's auc: 0.777244	valid_0's auc: 0.729859
[320]	training's auc: 0.7773	valid_0's auc: 0.729812
[340]	training's auc: 0.777359	valid_0's auc: 0.730252
[360]	training's auc: 0.777415	valid_0's auc: 0.73027
[380]	training's auc: 0.777449	valid_0's auc: 0.730231
Early stopping, best iteration is:
[341]	training's auc: 0.777357	valid_0's auc: 0.730374
*****
0.7303743961352657
*********

[200]	training's auc: 0.777238	valid_0's auc: 0.732969
Early stopping, best iteration is:
[155]	training's auc: 0.776443	valid_0's auc: 0.733173
*****
0.7331725020328119
********************


city_name_county_name_arpu_202004/idcard_cnt_skew
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.763054	valid_0's auc: 0.721787
[40]	training's auc: 0.767787	valid_0's auc: 0.725283
[60]	training's auc: 0.770716	valid_0's auc: 0.727348
[80]	training's auc: 0.772503	valid_0's auc: 0.727951
[100]	training's auc: 0.773771	valid_0's auc: 0.7288
[120]	training's auc: 0.774536	valid_0's auc: 0.729491
[140]	training's auc: 0.775127	valid_0's auc: 0.730177
[160]	training's auc: 0.775543	valid_0's auc: 0.729744
[180]	training's auc: 0.775995	valid_0's auc: 0.729956
[200]	training's auc: 0.776308	valid_0's auc: 0.729365
[220]	training's auc: 0.7766	valid_0's auc: 0.729058
Early stopping, best iteration is:
[171]	training's auc: 0.775798	valid_0's auc: 0.73018
*****
0.73

[8]	training's auc: 0.810773	valid_0's auc: 0.782377
*****
0.7823770148754006
********************


city_name_county_name_nunique_x
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.810828	valid_0's auc: 0.782029
[40]	training's auc: 0.811117	valid_0's auc: 0.781527
Early stopping, best iteration is:
[8]	training's auc: 0.810773	valid_0's auc: 0.782377
*****
0.7823770148754006
********************


phone2opposite_cnt_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.912874	valid_0's auc: 0.862673
[40]	training's auc: 0.915995	valid_0's auc: 0.86524
Early stopping, best iteration is:
[1]	training's auc: 0.90085	valid_0's auc: 0.866538
*****
0.8665384201463625
********************


phone2opposite_cnt_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.854745	valid_0's auc: 0.856477
[40]	training's auc: 0.854862	valid_0's auc: 0.856738
[60]	training's auc: 0.854901	valid_0's 

[20]	training's auc: 0.846148	valid_0's auc: 0.822881
[40]	training's auc: 0.846345	valid_0's auc: 0.822926
[60]	training's auc: 0.846369	valid_0's auc: 0.823189
Early stopping, best iteration is:
[17]	training's auc: 0.846121	valid_0's auc: 0.823404
*****
0.8234039436552351
********************


voc_hour_mode_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.872088	valid_0's auc: 0.854358
[40]	training's auc: 0.872424	valid_0's auc: 0.854655
[60]	training's auc: 0.87266	valid_0's auc: 0.855222
Early stopping, best iteration is:
[11]	training's auc: 0.87169	valid_0's auc: 0.855917
*****
0.855916977567322
********************


voc_hour_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.863728	valid_0's auc: 0.840329
[40]	training's auc: 0.86382	valid_0's auc: 0.840332
[60]	training's auc: 0.863851	valid_0's auc: 0.840332
[80]	training's auc: 0.863861	valid_0's auc: 0.840332
Early stopping, best iteration 

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.761157	valid_0's auc: 0.737866
[40]	training's auc: 0.761484	valid_0's auc: 0.735977
Early stopping, best iteration is:
[7]	training's auc: 0.760881	valid_0's auc: 0.738262
*****
0.738262005548381
********************


voc_day7_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.74877	valid_0's auc: 0.739713
[40]	training's auc: 0.748915	valid_0's auc: 0.741171
[60]	training's auc: 0.74908	valid_0's auc: 0.741299
[80]	training's auc: 0.749142	valid_0's auc: 0.741407
[100]	training's auc: 0.74942	valid_0's auc: 0.741332
[120]	training's auc: 0.749432	valid_0's auc: 0.741371
Early stopping, best iteration is:
[84]	training's auc: 0.749143	valid_0's auc: 0.741443
*****
0.7414427584062754
********************


voc_day14_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.75402	valid_0's auc: 0.744502
[40]	training's auc: 0.754056	

[1]	training's auc: 0.776765	valid_0's auc: 0.737561
*****
0.7375609843593055
********************


voc_day27_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.777922	valid_0's auc: 0.735639
[40]	training's auc: 0.779824	valid_0's auc: 0.732872
[60]	training's auc: 0.780725	valid_0's auc: 0.732941
Early stopping, best iteration is:
[21]	training's auc: 0.778092	valid_0's auc: 0.735968
*****
0.7359676185009806
********************


voc_day13_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.751661	valid_0's auc: 0.718642
[40]	training's auc: 0.753027	valid_0's auc: 0.717822
[60]	training's auc: 0.754018	valid_0's auc: 0.717814
[80]	training's auc: 0.754626	valid_0's auc: 0.717151
Early stopping, best iteration is:
[43]	training's auc: 0.753235	valid_0's auc: 0.720822
*****
0.7208216745587602
********************


voc_day28_call_dur_sum
Training until validation scores don't improve for 50 ro

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.737823	valid_0's auc: 0.712139
[40]	training's auc: 0.737925	valid_0's auc: 0.712418
Early stopping, best iteration is:
[7]	training's auc: 0.73691	valid_0's auc: 0.713658
*****
0.7136575070550534
********************


voc_hour18_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.789131	valid_0's auc: 0.782123
[40]	training's auc: 0.789336	valid_0's auc: 0.78184
Early stopping, best iteration is:
[8]	training's auc: 0.788726	valid_0's auc: 0.782506
*****
0.7825055603386426
********************


voc_hour11_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.808897	valid_0's auc: 0.762991
[40]	training's auc: 0.8091	valid_0's auc: 0.762862
Early stopping, best iteration is:
[1]	training's auc: 0.807812	valid_0's auc: 0.765337
*****
0.7653372674223944
********************


voc_hour15_count
Training until validation scores don't

[20]	training's auc: 0.748102	valid_0's auc: 0.714196
[40]	training's auc: 0.750391	valid_0's auc: 0.713541
[60]	training's auc: 0.751199	valid_0's auc: 0.712665
Early stopping, best iteration is:
[20]	training's auc: 0.748102	valid_0's auc: 0.714196
*****
0.714195604343043
********************


voc_hour18_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.795628	valid_0's auc: 0.782038
[40]	training's auc: 0.798427	valid_0's auc: 0.779561
[60]	training's auc: 0.800201	valid_0's auc: 0.778068
Early stopping, best iteration is:
[10]	training's auc: 0.793532	valid_0's auc: 0.782875
*****
0.782874754866791
********************


voc_hour11_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.820653	valid_0's auc: 0.778051
[40]	training's auc: 0.823168	valid_0's auc: 0.77638
Early stopping, best iteration is:
[2]	training's auc: 0.815176	valid_0's auc: 0.781724
*****
0.7817238245563687
**************

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.865047	valid_0's auc: 0.808621
[40]	training's auc: 0.869481	valid_0's auc: 0.804801
Early stopping, best iteration is:
[2]	training's auc: 0.855136	valid_0's auc: 0.811762
*****
0.811761610943703
********************


sms_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.851567	valid_0's auc: 0.811213
[40]	training's auc: 0.854139	valid_0's auc: 0.805103
Early stopping, best iteration is:
[2]	training's auc: 0.846415	valid_0's auc: 0.812941
*****
0.8129409408332138
********************


sms_avg
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.868811	valid_0's auc: 0.809376
[40]	training's auc: 0.87315	valid_0's auc: 0.80759
[60]	training's auc: 0.875144	valid_0's auc: 0.807523
Early stopping, best iteration is:
[17]	training's auc: 0.867674	valid_0's auc: 0.810898
*****
0.8108976658535418
********************


sms_calltype1

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.830731	valid_0's auc: 0.813301
[40]	training's auc: 0.83089	valid_0's auc: 0.813412
[60]	training's auc: 0.831012	valid_0's auc: 0.813055
Early stopping, best iteration is:
[16]	training's auc: 0.830803	valid_0's auc: 0.814497
*****
0.8144969388243172
********************


sms_day10_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.837666	valid_0's auc: 0.799508
[40]	training's auc: 0.838101	valid_0's auc: 0.7983
[60]	training's auc: 0.837797	valid_0's auc: 0.799179
Early stopping, best iteration is:
[23]	training's auc: 0.837507	valid_0's auc: 0.79988
*****
0.7998801238819534
********************


sms_day11_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.810393	valid_0's auc: 0.794777
[40]	training's auc: 0.81044	valid_0's auc: 0.794758
Early stopping, best iteration is:
[5]	training's auc: 0.810262	valid_0's auc: 0.795

[40]	training's auc: 0.864117	valid_0's auc: 0.836182
[60]	training's auc: 0.864243	valid_0's auc: 0.835806
Early stopping, best iteration is:
[21]	training's auc: 0.863997	valid_0's auc: 0.836423
*****
0.8364229085952073
********************


sms_hour10_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.840865	valid_0's auc: 0.802379
[40]	training's auc: 0.841822	valid_0's auc: 0.803316
[60]	training's auc: 0.842539	valid_0's auc: 0.803325
Early stopping, best iteration is:
[27]	training's auc: 0.841209	valid_0's auc: 0.804532
*****
0.8045316759936864
********************


sms_hour14_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.822721	valid_0's auc: 0.804301
[40]	training's auc: 0.824165	valid_0's auc: 0.807316
[60]	training's auc: 0.82493	valid_0's auc: 0.806279
[80]	training's auc: 0.825467	valid_0's auc: 0.805047
[100]	training's auc: 0.82567	valid_0's auc: 0.804476
Early stopping, best iteration 

[20]	training's auc: 0.799209	valid_0's auc: 0.781265
[40]	training's auc: 0.799179	valid_0's auc: 0.780999
Early stopping, best iteration is:
[3]	training's auc: 0.799054	valid_0's auc: 0.781999
*****
0.781998852059119
********************


sms_hour23_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.577381	valid_0's auc: 0.572232
[40]	training's auc: 0.5779	valid_0's auc: 0.573691
[60]	training's auc: 0.577917	valid_0's auc: 0.573652
[80]	training's auc: 0.577946	valid_0's auc: 0.573766
Early stopping, best iteration is:
[34]	training's auc: 0.577481	valid_0's auc: 0.577721
*****
0.5777206796766633
********************


sms_hour6_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.551104	valid_0's auc: 0.556588
[40]	training's auc: 0.551152	valid_0's auc: 0.55684
[60]	training's auc: 0.551189	valid_0's auc: 0.556804
[80]	training's auc: 0.551257	valid_0's auc: 0.556783
Early stopping, best iteration is:
[

In [13]:
print(useful_cols)

['city_name', 'county_name', 'idcard_cnt', 'arpu_202004', 'city_name_county_name', 'idcard_cnt*arpu_202004', 'arpu_202004/idcard_cnt', 'city_name_idcard_cnt_max', 'city_name_idcard_cnt_min', 'city_name_idcard_cnt_median', 'city_name_idcard_cnt_mean', 'city_name_idcard_cnt_skew', 'city_name_idcard_cnt_nunique', 'city_name_arpu_202004_max', 'city_name_arpu_202004_min', 'city_name_arpu_202004_median', 'city_name_arpu_202004_mean', 'city_name_arpu_202004_nunique', 'city_name_idcard_cnt*arpu_202004_max', 'city_name_idcard_cnt*arpu_202004_min', 'city_name_idcard_cnt*arpu_202004_median', 'city_name_idcard_cnt*arpu_202004_mean', 'city_name_idcard_cnt*arpu_202004_nunique', 'city_name_arpu_202004/idcard_cnt_max', 'city_name_arpu_202004/idcard_cnt_min', 'city_name_arpu_202004/idcard_cnt_median', 'city_name_arpu_202004/idcard_cnt_mean', 'city_name_arpu_202004/idcard_cnt_skew', 'city_name_arpu_202004/idcard_cnt_nunique', 'county_name_idcard_cnt_max', 'county_name_idcard_cnt_min', 'county_name_idcar

In [14]:
print(useless_cols)

['city_name_arpu_202004_skew', 'city_name_idcard_cnt*arpu_202004_skew', 'county_name_idcard_cnt*arpu_202004_skew', 'city_name_county_name_idcard_cnt*arpu_202004_skew', 'voc_hour1_count', 'voc_hour3_count', 'voc_hour2_count', 'voc_hour4_count', 'voc_hour1_call_dur_sum', 'voc_hour3_call_dur_sum', 'voc_hour2_call_dur_sum', 'voc_hour4_call_dur_sum', 'sms_hour3_count', 'sms_hour5_count', 'sms_hour4_count']


In [15]:
# # RFECV
# clf = lgb.LGBMClassifier(n_estimators=25)
# selector = RFECV(estimator=clf,
#                  step=2,
#                  n_jobs=-1,
#                  min_features_to_select=200,
#                  cv=3)  # 使用5折交叉验证

# selector = selector.fit(df_train[useful_cols], df_train['label'])

# df_train_rfe = selector.transform(df_train[useful_cols])
# df_test_rfe = selector.transform(df_test[useful_cols])

In [None]:
backward_model = SFS(lgb.LGBMClassifier(),
                     k_features=200,
                     forward=False,
                     verbose=20,
                     cv=3,
                     n_jobs=-1,
                     scoring='roc_auc')
backward_model.fit(df_train[useful_cols], df_train['label'])
backward_cols = backward_model.k_feature_names_
print(backward_cols)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   42.2s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   47.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   48.5s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  1

[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed: 13.3min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 137 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 13.8min
[Parallel(n_jobs=-1)]: Done 139 tasks      | elapsed: 14.0min
[Parallel(n_jobs=-1)]: Done 140 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed: 14.3min
[Parallel(n_jobs=-1)]: Done 142 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 143 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed: 14.4min
[Parallel(n_jobs=-1)]: Done 145 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 14.6min
[Parallel(n_jobs=-1)]: Done 147 tasks      | elapsed: 14.7min
[Paralle

[Parallel(n_jobs=-1)]: Done 265 tasks      | elapsed: 26.1min
[Parallel(n_jobs=-1)]: Done 266 tasks      | elapsed: 26.2min
[Parallel(n_jobs=-1)]: Done 267 tasks      | elapsed: 26.2min
[Parallel(n_jobs=-1)]: Done 268 tasks      | elapsed: 26.3min
[Parallel(n_jobs=-1)]: Done 269 tasks      | elapsed: 26.5min
[Parallel(n_jobs=-1)]: Done 270 tasks      | elapsed: 26.7min
[Parallel(n_jobs=-1)]: Done 271 tasks      | elapsed: 26.7min
[Parallel(n_jobs=-1)]: Done 272 tasks      | elapsed: 26.7min
[Parallel(n_jobs=-1)]: Done 287 out of 287 | elapsed: 28.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done 287 out of 287 | elapsed: 28.0min finished

[2020-06-18 15:11:02] Features: 286/200 -- score: 0.9597725099943427[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   39.4s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   39.6s
[Parallel(n_jobs=-1)]

[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed: 12.1min
[Parallel(n_jobs=-1)]: Done 122 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 123 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed: 12.5min
[Parallel(n_jobs=-1)]: Done 125 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done 126 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 127 tasks      | elapsed: 12.8min
[Parallel(n_jobs=-1)]: Done 128 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed: 13.0min
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 131 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done 132 tasks      | elapsed: 13.4min
[Parallel(n_jobs=-1)]: Done 133 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 134 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 135 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 136 tasks      | elapsed: 13.7min
[Paralle

In [None]:
lgb_train = lgb.Dataset(X_train[backward_cols].values, y_train) 

lgb_valid= lgb.Dataset(X_valid[backward_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

lgb_val_0 = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_valid, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

print('Done!')

In [None]:
# 验证集结果
X_valid['prob'] = lgb_val_0.predict(X_valid[backward_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.4735, 1, 0)

f1_04735 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc_04735 = roc_auc_score(y_valid, X_valid['prob'])

print('f1_04735: ', f1_04735)
print('auc_04735: ', auc_04735)

In [None]:
lgb_train_all = lgb.Dataset(df_train[backward_cols].values, df_train['label'])   

print('Start training...')

lgb_model = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_val_0.best_iteration + 20)

print('Done!')

In [None]:
df_test['label'] = np.where(lgb_model.predict(df_test[backward_cols]) > 0.4735, 1, 0)
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_04735), index=False)