In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import RFECV

warnings.filterwarnings('ignore')

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
df_user = reduce_mem_usage(pd.read_hdf('../input/user_features.h5'))
df_voc = reduce_mem_usage(pd.read_hdf('../input/voc_features.h5'))
df_sms = reduce_mem_usage(pd.read_hdf('../input/sms_features.h5'))
# df_app = reduce_mem_usage(pd.read_hdf('../input/app_features.h5'))

Memory usage of dataframe is 3651648.00 MB
Memory usage after optimization is: 1760616.00 MB
Decreased by 51.8%
Memory usage of dataframe is 8227056.00 MB
Memory usage after optimization is: 2165372.00 MB
Decreased by 73.7%
Memory usage of dataframe is 3850000.00 MB
Memory usage after optimization is: 1031250.00 MB
Decreased by 73.2%


In [4]:
df_user.columns

Index(['phone_no_m', 'city_name', 'county_name', 'idcard_cnt', 'label',
       'arpu_202004', 'city_name_county_name', 'idcard_cnt*arpu_202004',
       'arpu_202004/idcard_cnt', 'city_name_idcard_cnt_max',
       'city_name_idcard_cnt_min', 'city_name_idcard_cnt_median',
       'city_name_idcard_cnt_mean', 'city_name_idcard_cnt_skew',
       'city_name_idcard_cnt_nunique', 'city_name_arpu_202004_max',
       'city_name_arpu_202004_min', 'city_name_arpu_202004_median',
       'city_name_arpu_202004_mean', 'city_name_arpu_202004_skew',
       'city_name_arpu_202004_nunique', 'city_name_idcard_cnt*arpu_202004_max',
       'city_name_idcard_cnt*arpu_202004_min',
       'city_name_idcard_cnt*arpu_202004_median',
       'city_name_idcard_cnt*arpu_202004_mean',
       'city_name_idcard_cnt*arpu_202004_skew',
       'city_name_idcard_cnt*arpu_202004_nunique',
       'city_name_arpu_202004/idcard_cnt_max',
       'city_name_arpu_202004/idcard_cnt_min',
       'city_name_arpu_202004/idcard_cnt_m

In [5]:
df_user.shape, df_voc.shape, df_sms.shape #, df_app.shape

((8151, 85), (6788, 152), (6875, 69))

In [6]:
df = df_user.merge(df_voc, on='phone_no_m', how='left')
df = df.merge(df_sms, on='phone_no_m', how='left')
# df = df.merge(df_app, on='phone_no_m', how='left')

del df_user, df_voc, df_sms#, df_app
gc.collect()

11

In [7]:
df.shape

(8151, 304)

In [8]:
def correlation(df, threshold):
    """
    去除特征相关系数大于阈值的特征
    :param df:
    :param threshold:
    :return:
    """
    col_corr = set()
    corr_matrix = df.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colName = corr_matrix.columns[i]
                col_corr.add(colName)

    return col_corr

In [9]:
cols = correlation(df.drop('label', axis=1), threshold=0.95)
print(cols)

{'city_name_county_name_nunique_y', 'city_name_count', 'county_name_idcard_cnt*arpu_202004_nunique', 'city_name_idcard_cnt*arpu_202004_max', 'city_name_county_name_nunique_x', 'city_name_county_name_count', 'city_name_county_name_idcard_cnt*arpu_202004_skew', 'city_name_idcard_cnt*arpu_202004_min', 'city_name_county_name_arpu_202004_mean', 'city_name_county_name_arpu_202004_nunique', 'sms_hour10_count', 'city_name_arpu_202004/idcard_cnt_min', 'county_name_nunique_y', 'city_name_county_name_idcard_cnt_nunique', 'city_name_county_name_idcard_cnt*arpu_202004_max', 'city_name_arpu_202004/idcard_cnt_skew', 'city_name_county_name_idcard_cnt*arpu_202004_min', 'sms_hour_mode_count', 'sms_hour20_count', 'sms_hour3_count', 'city_name_county_name_idcard_cnt_min', 'city_name_county_name_idcard_cnt*arpu_202004_mean', 'sms_day7_count', 'city_name_county_name_arpu_202004/idcard_cnt_skew', 'sms_calltype1_cnt', 'sms_hour18_count', 'sms_hour9_count', 'sms_hour6_count', 'sms_hour5_count', 'sms_hour2_coun

In [10]:
df.drop(cols, axis=1, inplace=True)

In [11]:
df_train = df[df.label.notna()]
df_test = df[df.label.isna()]

df_train.shape, df_test.shape

((6106, 233), (2045, 233))

In [12]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train.drop('label', axis=1), df_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [13]:
train_cols = [i for i in X_train.columns if i not in ['phone_no_m', 'label']]

In [14]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0.1,
          'lambda_l2': 0,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [15]:
useful_cols = []
useless_cols = []

for i in train_cols:
    print(i)
    
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_valid= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_valid, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.52:
        useful_cols.append(i)
    else:
        useless_cols.append(i)
    print('*' * 20)
    print('\n')

city_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.684155	valid_0's auc: 0.671655
[40]	training's auc: 0.684156	valid_0's auc: 0.672408
[60]	training's auc: 0.684156	valid_0's auc: 0.672408
[80]	training's auc: 0.684158	valid_0's auc: 0.671655
Early stopping, best iteration is:
[37]	training's auc: 0.684156	valid_0's auc: 0.672408
*****
0.6724078657865786
********************


county_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.767358	valid_0's auc: 0.72344
[40]	training's auc: 0.772283	valid_0's auc: 0.728765
[60]	training's auc: 0.77374	valid_0's auc: 0.730307
[80]	training's auc: 0.774604	valid_0's auc: 0.730896
[100]	training's auc: 0.775263	valid_0's auc: 0.73182
[120]	training's auc: 0.775831	valid_0's auc: 0.731191
[140]	training's auc: 0.776325	valid_0's auc: 0.732008
Early stopping, best iteration is:
[90]	training's auc: 0.775065	valid_0's auc: 0.732017
*****
0.7320170875783231
***********

*****
0.6698100516573396
********************


county_name_idcard_cnt_max
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.633927	valid_0's auc: 0.644783
[40]	training's auc: 0.633927	valid_0's auc: 0.644783
Early stopping, best iteration is:
[1]	training's auc: 0.633927	valid_0's auc: 0.644783
*****
0.6447825489070647
********************


county_name_idcard_cnt_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.607294	valid_0's auc: 0.605336
[40]	training's auc: 0.607294	valid_0's auc: 0.605336
Early stopping, best iteration is:
[1]	training's auc: 0.607294	valid_0's auc: 0.605336
*****
0.6053355335533553
********************


county_name_idcard_cnt_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.605693	valid_0's auc: 0.605673
[40]	training's auc: 0.606365	valid_0's auc: 0.606492
[60]	training's auc: 0.606365	valid_0's auc: 0.606492
[80]	training's auc: 0.606365	valid

[80]	training's auc: 0.775041	valid_0's auc: 0.724427
[100]	training's auc: 0.776064	valid_0's auc: 0.725753
[120]	training's auc: 0.776988	valid_0's auc: 0.726455
[140]	training's auc: 0.777543	valid_0's auc: 0.727016
[160]	training's auc: 0.777934	valid_0's auc: 0.727061
[180]	training's auc: 0.778264	valid_0's auc: 0.727219
[200]	training's auc: 0.778505	valid_0's auc: 0.727234
[220]	training's auc: 0.778771	valid_0's auc: 0.727679
[240]	training's auc: 0.778926	valid_0's auc: 0.72779
[260]	training's auc: 0.779027	valid_0's auc: 0.728029
[280]	training's auc: 0.779183	valid_0's auc: 0.727503
Early stopping, best iteration is:
[245]	training's auc: 0.778968	valid_0's auc: 0.72811
*****
0.7281099033816425
********************


county_name_arpu_202004/idcard_cnt_max
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.763834	valid_0's auc: 0.724674
[40]	training's auc: 0.768782	valid_0's auc: 0.727325
[60]	training's auc: 0.771232	valid_0's auc: 0.72956

[60]	training's auc: 0.877354	valid_0's auc: 0.837341
Early stopping, best iteration is:
[11]	training's auc: 0.869341	valid_0's auc: 0.843964
*****
0.843963744200507
********************


voc_calltype_id_1_30s_cnt
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.834601	valid_0's auc: 0.792535
[40]	training's auc: 0.836674	valid_0's auc: 0.788126
Early stopping, best iteration is:
[2]	training's auc: 0.832094	valid_0's auc: 0.796179
*****
0.7961792103123356
********************


voc_calltype_id_1_300s_cnt
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.673163	valid_0's auc: 0.679384
[40]	training's auc: 0.673297	valid_0's auc: 0.679185
[60]	training's auc: 0.674437	valid_0's auc: 0.680079
[80]	training's auc: 0.675302	valid_0's auc: 0.679905
[100]	training's auc: 0.675362	valid_0's auc: 0.679801
Early stopping, best iteration is:
[56]	training's auc: 0.67381	valid_0's auc: 0.681131
*****
0.6811310207107667
*******

[200]	training's auc: 0.873703	valid_0's auc: 0.805541
[220]	training's auc: 0.874074	valid_0's auc: 0.806094
[240]	training's auc: 0.874316	valid_0's auc: 0.806085
[260]	training's auc: 0.874496	valid_0's auc: 0.806337
[280]	training's auc: 0.874661	valid_0's auc: 0.806445
[300]	training's auc: 0.874816	valid_0's auc: 0.806944
[320]	training's auc: 0.874925	valid_0's auc: 0.80718
[340]	training's auc: 0.875027	valid_0's auc: 0.807355
[360]	training's auc: 0.875086	valid_0's auc: 0.807648
[380]	training's auc: 0.875158	valid_0's auc: 0.80763
[400]	training's auc: 0.875209	valid_0's auc: 0.807929
[420]	training's auc: 0.875278	valid_0's auc: 0.807836
[440]	training's auc: 0.875326	valid_0's auc: 0.807884
Early stopping, best iteration is:
[396]	training's auc: 0.875213	valid_0's auc: 0.8081
*****
0.8080995599559956
********************


call_dur_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.842091	valid_0's auc: 0.836261
[40]	training's auc: 0.

[20]	training's auc: 0.7523	valid_0's auc: 0.730793
[40]	training's auc: 0.752645	valid_0's auc: 0.730512
[60]	training's auc: 0.753177	valid_0's auc: 0.729581
[80]	training's auc: 0.753962	valid_0's auc: 0.731228
[100]	training's auc: 0.753985	valid_0's auc: 0.731126
Early stopping, best iteration is:
[65]	training's auc: 0.753684	valid_0's auc: 0.731476
*****
0.7314760008609557
********************


voc_day28_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.762309	valid_0's auc: 0.743637
[40]	training's auc: 0.762482	valid_0's auc: 0.744081
[60]	training's auc: 0.762666	valid_0's auc: 0.742664
Early stopping, best iteration is:
[11]	training's auc: 0.762338	valid_0's auc: 0.744148
*****
0.7441481919931123
********************


voc_day30_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.77047	valid_0's auc: 0.752275
[40]	training's auc: 0.770554	valid_0's auc: 0.752052
Early stopping, best iteration is:

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.774361	valid_0's auc: 0.731767
[40]	training's auc: 0.775916	valid_0's auc: 0.731258
[60]	training's auc: 0.777294	valid_0's auc: 0.731388
Early stopping, best iteration is:
[14]	training's auc: 0.773543	valid_0's auc: 0.732403
*****
0.732402723968049
********************


voc_day1_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.736102	valid_0's auc: 0.721919
[40]	training's auc: 0.737709	valid_0's auc: 0.722627
[60]	training's auc: 0.738257	valid_0's auc: 0.722234
[80]	training's auc: 0.739056	valid_0's auc: 0.722508
Early stopping, best iteration is:
[43]	training's auc: 0.736963	valid_0's auc: 0.726826
*****
0.7268259434639116
********************


voc_day3_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.744264	valid_0's auc: 0.707839
[40]	training's auc: 0.745883	valid_0's auc: 0.708051
Early stopping

[13]	training's auc: 0.745266	valid_0's auc: 0.709969
*****
0.7099685512029463
********************


voc_day18_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.764157	valid_0's auc: 0.733144
[40]	training's auc: 0.766299	valid_0's auc: 0.731122
Early stopping, best iteration is:
[1]	training's auc: 0.759518	valid_0's auc: 0.734095
*****
0.7340947409958387
********************


voc_day31_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.77871	valid_0's auc: 0.751803
[40]	training's auc: 0.780369	valid_0's auc: 0.751484
Early stopping, best iteration is:
[1]	training's auc: 0.774404	valid_0's auc: 0.75361
*****
0.7536097359735974
********************


voc_day4_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.751538	valid_0's auc: 0.710025
[40]	training's auc: 0.754063	valid_0's auc: 0.707345
Early stopping, best iteration is:
[1]	training's auc

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.508312	valid_0's auc: 0.515309
[40]	training's auc: 0.508313	valid_0's auc: 0.515309
[60]	training's auc: 0.509004	valid_0's auc: 0.51275
Early stopping, best iteration is:
[12]	training's auc: 0.508308	valid_0's auc: 0.515357
*****
0.5153566987133495
********************


voc_hour21_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.748102	valid_0's auc: 0.714196
[40]	training's auc: 0.750391	valid_0's auc: 0.713541
[60]	training's auc: 0.751199	valid_0's auc: 0.712665
Early stopping, best iteration is:
[20]	training's auc: 0.748102	valid_0's auc: 0.714196
*****
0.714195604343043
********************


voc_hour18_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.795628	valid_0's auc: 0.782038
[40]	training's auc: 0.798427	valid_0's auc: 0.779561
[60]	training's auc: 0.800201	valid_0's auc: 0.778068
Early stopp

[40]	training's auc: 0.869481	valid_0's auc: 0.804801
Early stopping, best iteration is:
[2]	training's auc: 0.855136	valid_0's auc: 0.811762
*****
0.811761610943703
********************


sms_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.851567	valid_0's auc: 0.811213
[40]	training's auc: 0.854139	valid_0's auc: 0.805103
Early stopping, best iteration is:
[2]	training's auc: 0.846415	valid_0's auc: 0.812941
*****
0.8129409408332138
********************


sms_avg
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.868811	valid_0's auc: 0.809376
[40]	training's auc: 0.87315	valid_0's auc: 0.80759
[60]	training's auc: 0.875144	valid_0's auc: 0.807523
Early stopping, best iteration is:
[17]	training's auc: 0.867674	valid_0's auc: 0.810898
*****
0.8108976658535418
********************


sms_calltype1_rate
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.62821	valid_0's auc: 0.58

Early stopping, best iteration is:
[4]	training's auc: 0.834647	valid_0's auc: 0.82589
*****
0.8258896541828096
********************


sms_day18_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.816939	valid_0's auc: 0.797693
[40]	training's auc: 0.817099	valid_0's auc: 0.795744
[60]	training's auc: 0.817175	valid_0's auc: 0.794363
Early stopping, best iteration is:
[14]	training's auc: 0.816878	valid_0's auc: 0.79811
*****
0.798110381690343
********************


sms_day19_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.852865	valid_0's auc: 0.826189
[40]	training's auc: 0.852946	valid_0's auc: 0.825909
Early stopping, best iteration is:
[2]	training's auc: 0.852679	valid_0's auc: 0.827292
*****
0.8272916965609605
********************


sms_day20_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.807959	valid_0's auc: 0.766336
[40]	training's auc: 0.808019	valid_0's

[40]	training's auc: 0.5779	valid_0's auc: 0.573691
[60]	training's auc: 0.577917	valid_0's auc: 0.573652
[80]	training's auc: 0.577946	valid_0's auc: 0.573766
Early stopping, best iteration is:
[34]	training's auc: 0.577481	valid_0's auc: 0.577721
*****
0.5777206796766633
********************




In [16]:
print(useful_cols)

['city_name', 'county_name', 'idcard_cnt', 'arpu_202004', 'idcard_cnt*arpu_202004', 'arpu_202004/idcard_cnt', 'city_name_idcard_cnt_max', 'city_name_idcard_cnt_min', 'city_name_idcard_cnt_median', 'city_name_idcard_cnt_mean', 'city_name_idcard_cnt_skew', 'city_name_idcard_cnt_nunique', 'city_name_arpu_202004_max', 'city_name_arpu_202004_min', 'city_name_arpu_202004_median', 'city_name_arpu_202004_mean', 'city_name_arpu_202004_nunique', 'city_name_idcard_cnt*arpu_202004_median', 'city_name_idcard_cnt*arpu_202004_mean', 'city_name_arpu_202004/idcard_cnt_max', 'city_name_arpu_202004/idcard_cnt_median', 'city_name_arpu_202004/idcard_cnt_mean', 'county_name_idcard_cnt_max', 'county_name_idcard_cnt_min', 'county_name_idcard_cnt_median', 'county_name_idcard_cnt_mean', 'county_name_idcard_cnt_skew', 'county_name_idcard_cnt_nunique', 'county_name_arpu_202004_max', 'county_name_arpu_202004_min', 'county_name_arpu_202004_median', 'county_name_arpu_202004_mean', 'county_name_arpu_202004_nunique', 

In [17]:
print(useless_cols)

['city_name_arpu_202004_skew', 'city_name_idcard_cnt*arpu_202004_skew', 'voc_hour3_count', 'voc_hour4_count', 'voc_hour3_call_dur_sum', 'voc_hour4_call_dur_sum']


In [18]:
lgb_train = lgb.Dataset(X_train[useful_cols].values, y_train) 

lgb_valid= lgb.Dataset(X_valid[useful_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

lgb_val_0 = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_valid, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

print('Done!')

Start training...
Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.98758	valid_0's auc: 0.94422
[20]	training's auc: 0.996539	valid_0's auc: 0.946938
[30]	training's auc: 0.998658	valid_0's auc: 0.950494
[40]	training's auc: 0.999454	valid_0's auc: 0.95144
[50]	training's auc: 0.999663	valid_0's auc: 0.951057
[60]	training's auc: 0.999795	valid_0's auc: 0.952262
[70]	training's auc: 0.999852	valid_0's auc: 0.952561
[80]	training's auc: 0.999981	valid_0's auc: 0.953096
[90]	training's auc: 0.999995	valid_0's auc: 0.953801
[100]	training's auc: 1	valid_0's auc: 0.954058
[110]	training's auc: 1	valid_0's auc: 0.954301
[120]	training's auc: 1	valid_0's auc: 0.954387
[130]	training's auc: 1	valid_0's auc: 0.955296
[140]	training's auc: 1	valid_0's auc: 0.955099
[150]	training's auc: 1	valid_0's auc: 0.954815
[160]	training's auc: 1	valid_0's auc: 0.955718
[170]	training's auc: 1	valid_0's auc: 0.956247
[180]	training's auc: 1	valid_0's auc: 0.955963
[190]

In [19]:
# 验证集结果
X_valid['prob'] = lgb_val_0.predict(X_valid[useful_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.4735, 1, 0)

f1_04735 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc_04735 = roc_auc_score(y_valid, X_valid['prob'])

print('f1_04735: ', f1_04735)
print('auc_04735: ', auc_04735)

f1_04735:  0.8851
auc_04735:  0.9568774812263835


In [20]:
lgb_train_all = lgb.Dataset(df_train[useful_cols].values, df_train['label'])   

print('Start training...')

lgb_model = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_val_0.best_iteration + 20)

print('Done!')

Start training...
Done!


In [21]:
df_test['label'] = np.where(lgb_model.predict(df_test[useful_cols]) > 0.4735, 1, 0)
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_04735), index=False)