In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from itertools import product
import lightgbm as lgb
from sklearn.metrics import f1_score, roc_auc_score
import gc
import warnings
warnings.filterwarnings('ignore')

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
pd.set_option('max_colwidth', 200)

In [2]:
# count编码
def count_coding(df, fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return df

# 交叉特征统计
def cross_cat_num(df, num_col, cat_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            feat = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median'
            })
            df = df.merge(feat, on=f1, how='left')
    return df

In [3]:
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg(['mean', 'size'])
        col_avg_y['size'] = prior_weight_func(col_avg_y['size'])
        col_avg_y[nf_name] = col_avg_y['size'] * prior + (1 - col_avg_y['size']) * col_avg_y['mean']
        col_avg_y.drop(['size', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new

### 用户表

In [4]:
train_user = pd.read_hdf('../input/train_user.h5')
test_user = pd.read_hdf('../input/test_user.h5')

In [5]:
train_user.head()

Unnamed: 0,phone_no_m,city_name,county_name,idcard_cnt,label,arpu_202004
0,672ddbf02a5544d32e4ecc9433b1981bffe23bf912273a3a835f6cccb78b8ed7554e9ab0fbcd33d19eb6063ce00542dd223cc5cc83c68f07bcf933547b6776b1,绵阳,江油分公司,1,0,45.0
1,5e1272273e041e82cb275ae877710be98cdaf5b0a8f34de8d361f71d5268fa0851edffd3950e170df1e3846fcf90cc7cc8299be9139a2ac4b5c5e5121d832674,德阳,旌阳分公司,1,0,60.0
2,eaab3472ec87b076e69e6e8bb62b14341638fc63661a6c682d6add360a4332a8ad294d8470d64a73c6e53e8413f0ad93b9ea65afab717e58d312554f33553ee7,成都,金堂分公司,2,0,63.0
3,0ce1bb415704178bf44e9c9b431a39b083a132c8e6d99f3627cd4520064b93ec66a72d085a67f0f40eebb9f901072965073e398f04c4ae500a1db4dfb13a972c,成都,高新分公司,2,0,23.203125
4,28b87f35f63f65096a53e3a4c97eaffd4a6c43ffa7e92d7706b20520aa11fff0f4ce06203f66758ec0f0e70780e4e258aae6aa1f23439e115528e08eda71ed20,德阳,旌阳分公司,1,0,50.0


In [6]:
test_user.head()

Unnamed: 0,phone_no_m,city_name,county_name,idcard_cnt,arpu_202004
0,22d522340df77e2252c1a4d92b4bcb00d515e36f3ec6bf94e017b4ffd67e26520af18637ad91bde69abd831cc36cdc0c5af5e57bc280f95f79efef7bfa9c6926,成都,金牛分公司,1,9.0
1,5220d4b8429bdba3971a7b46a088c6b8fa6710f40607598ac1219daab6071510b3f66f78637420574dbb2d5488d44de9d4360a644bbe4ec45aaa4b7513bdc886,天府新区,天府直属部门,1,
2,7d19dd2b50ced56f03d23bf928cf34dc570a48525571a868ffdcdc53a865b7a41c386a6f1d6567dd4a710530b933308c77d1ef89f77b580d1983dadd3de162ea,成都,锦江分公司,1,9.0
3,134a4a591185c9d3788021896dcfc235e9e0a6a1e3f8a4fd3a9f9d9bf4a033e791b7199db980b0678020bdf689d719306a5c694ffa47b4ba2bec43ceda68dfc1,绵阳,江油分公司,1,138.75
4,2356dcd6759d50455ddaeed03c838843558e9182d5962f8a4c81a9a178063c49ab988750f795bb6c5462aee5dace2ad9fd18dbb1a6619dbb1e771e9b0f192da9,成都,青羊分公司,2,39.0


In [7]:
train_user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6106 entries, 0 to 6105
Data columns (total 6 columns):
phone_no_m     6106 non-null object
city_name      6106 non-null object
county_name    6106 non-null object
idcard_cnt     6106 non-null int8
label          6106 non-null int8
arpu_202004    5369 non-null float16
dtypes: float16(1), int8(2), object(3)
memory usage: 214.7+ KB


In [8]:
test_user.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2045 entries, 0 to 2044
Data columns (total 5 columns):
phone_no_m     2045 non-null object
city_name      2045 non-null object
county_name    2045 non-null object
idcard_cnt     2045 non-null int8
arpu_202004    1869 non-null float16
dtypes: float16(1), int8(1), object(3)
memory usage: 69.9+ KB


In [9]:
df_user = pd.concat([train_user, test_user])

In [10]:
# 号码量*消费值
df_user['idcard_cnt*arpu_202004'] = df_user['idcard_cnt'] * df_user['arpu_202004']

In [11]:
# count编码
count_list = ['city_name', 'county_name', 'idcard_cnt']

df_user = count_coding(df_user, count_list)

In [12]:
cross_cat = ['city_name', 'county_name']
cross_num = ['idcard_cnt']
df_user = cross_cat_num(df_user, cross_num, cross_cat)

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 62.54it/s]
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 37.76it/s]


### 电话表

In [13]:
train_voc = pd.read_hdf('../input/train_voc.h5')
test_voc = pd.read_hdf('../input/test_voc.h5')

In [14]:
train_voc.head()

Unnamed: 0,phone_no_m,opposite_no_m,calltype_id,start_datetime,call_dur,city_name,county_name,imei_m
9,b3994b4c46e39954cfe0cb8ebd2a79703a2ace6612fa18241509e8b2dfe9d75f503823b6df2d10180a9033ccd22390aaeda397037ca4cd26604d6db6e3470874,1b01124a6ad0443ad868757c6594a605a0c5179c2b5b57071d5cec8cd8c4c89d7598587f953f08887965b42d2e465d08388c5465fe43b67f1e9ca5509c5529f2,1,2020-03-22 21:38:40,175,,,90ba61111dad175330029e10da8264a86dfdb3a281ccb8556eca87a6f1af32e1d5ff4db4a7d4b11f760e4866de974933f247c4c389073f28634229889bfe18bf
10,b3994b4c46e39954cfe0cb8ebd2a79703a2ace6612fa18241509e8b2dfe9d75f503823b6df2d10180a9033ccd22390aaeda397037ca4cd26604d6db6e3470874,1b01124a6ad0443ad868757c6594a605a0c5179c2b5b57071d5cec8cd8c4c89d7598587f953f08887965b42d2e465d08388c5465fe43b67f1e9ca5509c5529f2,1,2020-03-23 18:05:41,33,,,90ba61111dad175330029e10da8264a86dfdb3a281ccb8556eca87a6f1af32e1d5ff4db4a7d4b11f760e4866de974933f247c4c389073f28634229889bfe18bf
219,c5502a6d3e4cabaf8c0e298c4aac693b6e80835d50d4e4f393d8426bd49c0d4d00c3959d2a2c8341410e57af28bee6dc5c80dbad7cf147018ec0ece7bfa0cf5f,f1b73c46a691c068bfd38253e20e882d4993c8d3e58016aeed804a8cdbf54542112e529ebf7fdd0e62cf3556571fb17b865d8c3d57c42be851d75c90f4df4603,1,2020-03-01 11:24:37,39,,,e1e1149b2c06e9972293c457293abb32ad05b0c82b3b64c3786896b0414c897b60a539329ce3b7df83ebee19ad20fee3971303edd776a593578f03d16ee8981a
222,c5502a6d3e4cabaf8c0e298c4aac693b6e80835d50d4e4f393d8426bd49c0d4d00c3959d2a2c8341410e57af28bee6dc5c80dbad7cf147018ec0ece7bfa0cf5f,5bc6d97a3017578edfa7979fa43653cd74bad827f18337c8c84728f79778939b6f939ea3aa2661907f2e79bcf70feb93adb79c3f7d4a934368c459d9b6aeb54e,1,2020-03-01 11:23:44,32,,,e1e1149b2c06e9972293c457293abb32ad05b0c82b3b64c3786896b0414c897b60a539329ce3b7df83ebee19ad20fee3971303edd776a593578f03d16ee8981a
276,2bdbc0a45d6228970b353d691e2ec229f0cf60790a1a39d5e01c0175b5e20cd2ca7d39aff8506741884fd7a4cd4fd9fdec5112ddb764d81c2983752c989f1078,c42eeddeff98feb87860441ea6548f8bda2a080c935e02c7a6ae30223cf37dfc2d16760224c79b93ea44a6d3fd77139e03a20f47608b9455372a96cb026f239e,2,2020-03-03 15:34:21,3,,,3e808a14fa2aa61524c70f262539d1c8532b8e3a8c7c87c863b3f8420750f1bf4a63aa900aef09620c1fae1a6a7fadf1f42e30477e24336fc5c2a8fae9b5de6e


In [15]:
test_voc.head()

Unnamed: 0,phone_no_m,opposite_no_m,calltype_id,start_datetime,call_dur,city_name,county_name,imei_m
0,b3dce36871f3e88164b18d4953b114163f008cb51c28fe932ae3c734f3b1d1e2853d63fb3fb52a09fd9d0997b64fe5796507d3b50768fe0dce23819c8b24729c,f87f526ee776ac8b6b28392620fbb6049af9eaadab7de2ee9d7c5a31d7404566b8a3c25f404f793e513448a8c5dad1940a597a488e6f165a1aad6b103cf40f59,1,2020-04-13 21:04:10,1909,,,685a0bfcd91b4ecec2ff35d656a35a4e4f9e1c7a92b15f5c3c0675fcf8b6c9adc35731ab15e91401f2367f1d9d2910f9e8829e2e5052c96bc800fcde6c505214
1,b3dce36871f3e88164b18d4953b114163f008cb51c28fe932ae3c734f3b1d1e2853d63fb3fb52a09fd9d0997b64fe5796507d3b50768fe0dce23819c8b24729c,f87f526ee776ac8b6b28392620fbb6049af9eaadab7de2ee9d7c5a31d7404566b8a3c25f404f793e513448a8c5dad1940a597a488e6f165a1aad6b103cf40f59,1,2020-04-13 18:32:50,2510,,,685a0bfcd91b4ecec2ff35d656a35a4e4f9e1c7a92b15f5c3c0675fcf8b6c9adc35731ab15e91401f2367f1d9d2910f9e8829e2e5052c96bc800fcde6c505214
2,b3dce36871f3e88164b18d4953b114163f008cb51c28fe932ae3c734f3b1d1e2853d63fb3fb52a09fd9d0997b64fe5796507d3b50768fe0dce23819c8b24729c,f87f526ee776ac8b6b28392620fbb6049af9eaadab7de2ee9d7c5a31d7404566b8a3c25f404f793e513448a8c5dad1940a597a488e6f165a1aad6b103cf40f59,1,2020-04-13 19:55:12,25,,,685a0bfcd91b4ecec2ff35d656a35a4e4f9e1c7a92b15f5c3c0675fcf8b6c9adc35731ab15e91401f2367f1d9d2910f9e8829e2e5052c96bc800fcde6c505214
3,7ec68a368fbbec3279a6a34847f7959623dbff4638351aa0c4ef3e6c719a4cce873e332d11c2c8f860a67edcc4bc3706da02976e26c218474c67a3ed5cdffed5,ed8280e5590e40e35a2af7d8708a0754540b0612600d5255b8b79e576d1543765e552e30cb50aae3f4d424e08e8104a848f021316949d72e20a9efe0c29eba7c,1,2020-04-20 14:11:44,276,,,a60e82a4383faa79ea972adc48686a729f689b7d1923bce82ca67c7deae306fec30004b30260b56e13d77f791271222594954cc4f88a65c3348b97a9eeb47ad2
4,7ec68a368fbbec3279a6a34847f7959623dbff4638351aa0c4ef3e6c719a4cce873e332d11c2c8f860a67edcc4bc3706da02976e26c218474c67a3ed5cdffed5,ba792e76c3e7ee47d2403083f6def313372ae450ddf0b946d2f3d237e05cd99e75265be9d91ce58bc149919c97e1355f56082da790dccb9cffb0d7e11c4fa7ed,2,2020-04-20 12:56:33,539,,,a60e82a4383faa79ea972adc48686a729f689b7d1923bce82ca67c7deae306fec30004b30260b56e13d77f791271222594954cc4f88a65c3348b97a9eeb47ad2


In [16]:
train_voc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 709144 entries, 9 to 5015413
Data columns (total 8 columns):
phone_no_m        709144 non-null object
opposite_no_m     709144 non-null object
calltype_id       709144 non-null int8
start_datetime    709144 non-null object
call_dur          709144 non-null int16
city_name         709144 non-null object
county_name       709144 non-null object
imei_m            709144 non-null object
dtypes: int16(1), int8(1), object(6)
memory usage: 39.9+ MB


In [17]:
test_voc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 276522 entries, 0 to 276521
Data columns (total 8 columns):
phone_no_m        276522 non-null object
opposite_no_m     276522 non-null object
calltype_id       276522 non-null int8
start_datetime    276522 non-null object
call_dur          276522 non-null int16
city_name         276522 non-null object
county_name       276522 non-null object
imei_m            276522 non-null object
dtypes: int16(1), int8(1), object(6)
memory usage: 15.6+ MB


In [18]:
train_voc.nunique()

phone_no_m          4823
opposite_no_m     281103
calltype_id            3
start_datetime    561615
call_dur            2653
city_name             22
county_name          193
imei_m              6025
dtype: int64

In [19]:
test_voc.nunique()

phone_no_m          1965
opposite_no_m     107683
calltype_id            3
start_datetime    253144
call_dur            2089
city_name             22
county_name          173
imei_m              2656
dtype: int64

In [20]:
df_voc = pd.concat([train_voc, test_voc])

In [21]:
df_voc['voc_day'] = df_voc['start_datetime'].astype('datetime64').dt.day
df_voc['voc_hour'] = df_voc['start_datetime'].astype('datetime64').dt.hour
df_voc['voc_dayofweek'] = df_voc['start_datetime'].astype('datetime64').dt.dayofweek

In [22]:
df_voc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 985666 entries, 9 to 276521
Data columns (total 11 columns):
phone_no_m        985666 non-null object
opposite_no_m     985666 non-null object
calltype_id       985666 non-null int8
start_datetime    985666 non-null object
call_dur          985666 non-null int16
city_name         985666 non-null object
county_name       985666 non-null object
imei_m            985666 non-null object
voc_day           985666 non-null int64
voc_hour          985666 non-null int64
voc_dayofweek     985666 non-null int64
dtypes: int16(1), int64(3), int8(1), object(6)
memory usage: 78.0+ MB


In [23]:
# 每天通话次数统计
df_voc['voc_day_count'] = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].transform('count')
df_voc['voc_day_count_max'] = df_voc.groupby('phone_no_m')['voc_day_count'].transform('max')
df_voc['voc_day_count_min'] = df_voc.groupby('phone_no_m')['voc_day_count'].transform('min')
df_voc['voc_day_count_mean'] = df_voc.groupby('phone_no_m')['voc_day_count'].transform('mean')
df_voc['voc_day_count_std'] = df_voc.groupby('phone_no_m')['voc_day_count'].transform('std')

del df_voc['voc_day_count']


# 每天与不同的人通话次数统计
df_voc['voc_day_diff_count'] = df_voc.groupby(['phone_no_m', 'opposite_no_m', 'voc_day'])['phone_no_m'].transform('count')
df_voc['voc_day_diff_count_max'] = df_voc.groupby('phone_no_m')['voc_day_diff_count'].transform('max')
df_voc['voc_day_diff_count_min'] = df_voc.groupby('phone_no_m')['voc_day_diff_count'].transform('min')
df_voc['voc_day_diff_count_mean'] = df_voc.groupby('phone_no_m')['voc_day_diff_count'].transform('mean')
df_voc['voc_day_diff_count_std'] = df_voc.groupby('phone_no_m')['voc_day_diff_count'].transform('std')

del df_voc['voc_day_diff_count']

In [24]:
# 一小时内通话统计量

df_voc['voc_hour_count'] = df_voc.groupby(['phone_no_m', 'voc_hour'])['phone_no_m'].transform('count')
df_voc['voc_hour_count_max'] = df_voc.groupby('phone_no_m')['voc_hour_count'].transform('max')
df_voc['voc_hour_count_min'] = df_voc.groupby('phone_no_m')['voc_hour_count'].transform('min')
df_voc['voc_hour_count_mean'] = df_voc.groupby('phone_no_m')['voc_hour_count'].transform('mean')
df_voc['voc_hour_count_std'] = df_voc.groupby('phone_no_m')['voc_hour_count'].transform('std')

del df_voc['voc_hour_count']


# 一小时内与不同的人通话统计量

df_voc['voc_hour_diff_count'] = df_voc.groupby(['phone_no_m', 'opposite_no_m', 'voc_hour'])['phone_no_m'].transform('count')
df_voc['voc_hour_diff_count_max'] = df_voc.groupby('phone_no_m')['voc_hour_diff_count'].transform('max')
df_voc['voc_hour_diff_count_min'] = df_voc.groupby('phone_no_m')['voc_hour_diff_count'].transform('min')
df_voc['voc_hour_diff_count_mean'] = df_voc.groupby('phone_no_m')['voc_hour_diff_count'].transform('mean')
df_voc['voc_hour_diff_count_std'] = df_voc.groupby('phone_no_m')['voc_hour_diff_count'].transform('std')

del df_voc['voc_hour_diff_count']

In [25]:
# 周几通话统计量

df_voc['voc_dayofweek_count'] = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['phone_no_m'].transform('count')
df_voc['voc_dayofweek_count_max'] = df_voc.groupby('phone_no_m')['voc_dayofweek'].transform('max')
df_voc['voc_dayofweek_count_min'] = df_voc.groupby('phone_no_m')['voc_dayofweek'].transform('min')
df_voc['voc_dayofweek_count_mean'] = df_voc.groupby('phone_no_m')['voc_dayofweek'].transform('mean')
df_voc['voc_dayofweek_count_std'] = df_voc.groupby('phone_no_m')['voc_dayofweek'].transform('std')

del df_voc['voc_dayofweek_count']

In [26]:
# 该月总的通话次数
df_voc['voc_count'] = df_voc.groupby('phone_no_m')['phone_no_m'].transform('count')

# 相互打电话次数
df_voc['voc_count_mutual'] = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['phone_no_m'].transform('count')

### 短信表

In [27]:
train_sms = pd.read_hdf('../input/train_sms.h5')
test_sms = pd.read_hdf('../input/test_sms.h5')

In [28]:
train_sms = train_sms[train_sms['request_datetime'] >= '2020-03-01 00:00:00']

In [29]:
df_sms = pd.concat([train_sms, test_sms])

In [30]:
df_sms.rename(columns={'calltype_id': 'calltype_id_sms'}, inplace=True)

In [31]:
df_sms['sms_day'] = df_sms['request_datetime'].astype('datetime64').dt.day
df_sms['sms_hour'] = df_sms['request_datetime'].astype('datetime64').dt.hour
df_sms['sms_dayofweek'] = df_sms['request_datetime'].astype('datetime64').dt.dayofweek

In [32]:
# 一天内短信统计量

df_sms['sms_day_count'] = df_sms.groupby(['phone_no_m', 'sms_day'])['phone_no_m'].transform('count')
df_sms['sms_day_count_max'] = df_sms.groupby('phone_no_m')['sms_day_count'].transform('max')
df_sms['sms_day_count_min'] = df_sms.groupby('phone_no_m')['sms_day_count'].transform('min')
df_sms['sms_day_count_mean'] = df_sms.groupby('phone_no_m')['sms_day_count'].transform('mean')
df_sms['sms_day_count_std'] = df_sms.groupby('phone_no_m')['sms_day_count'].transform('std')

del df_sms['sms_day_count']


# 一天内与不同的人短信统计量
df_sms['sms_day_diff_count'] = df_sms.groupby(['phone_no_m', 'opposite_no_m', 'sms_day'])['phone_no_m'].transform('count')
df_sms['sms_day_diff_count_max'] = df_sms.groupby('phone_no_m')['sms_day_diff_count'].transform('max')
df_sms['sms_day_diff_count_min'] = df_sms.groupby('phone_no_m')['sms_day_diff_count'].transform('min')
df_sms['sms_day_diff_count_mean'] = df_sms.groupby('phone_no_m')['sms_day_diff_count'].transform('mean')
df_sms['sms_day_diff_count_std'] = df_sms.groupby('phone_no_m')['sms_day_diff_count'].transform('std')

del df_sms['sms_day_diff_count']

In [33]:
# 一小时内短信统计量

df_sms['sms_hour_count'] = df_sms.groupby(['phone_no_m', 'sms_hour'])['phone_no_m'].transform('count')
df_sms['sms_hour_count_max'] = df_sms.groupby('phone_no_m')['sms_hour_count'].transform('max')
df_sms['sms_hour_count_min'] = df_sms.groupby('phone_no_m')['sms_hour_count'].transform('min')
df_sms['sms_hour_count_mean'] = df_sms.groupby('phone_no_m')['sms_hour_count'].transform('mean')
df_sms['sms_hour_count_std'] = df_sms.groupby('phone_no_m')['sms_hour_count'].transform('std')

del df_sms['sms_hour_count']


# 一小时内与不同的人短信统计量
df_sms['sms_hour_diff_count'] = df_sms.groupby(['phone_no_m', 'opposite_no_m', 'sms_hour'])['phone_no_m'].transform('count')
df_sms['sms_hour_diff_count_max'] = df_sms.groupby('phone_no_m')['sms_hour_diff_count'].transform('max')
df_sms['sms_hour_diff_count_min'] = df_sms.groupby('phone_no_m')['sms_hour_diff_count'].transform('min')
df_sms['sms_hour_diff_count_mean'] = df_sms.groupby('phone_no_m')['sms_hour_diff_count'].transform('mean')
df_sms['sms_hour_diff_count_std'] = df_sms.groupby('phone_no_m')['sms_hour_diff_count'].transform('std')

del df_sms['sms_hour_diff_count']

In [34]:
# 周几短信统计量

df_sms['sms_dayofweek_count'] = df_sms.groupby(['phone_no_m', 'sms_dayofweek'])['phone_no_m'].transform('count')
df_sms['sms_dayofweek_count_max'] = df_sms.groupby('phone_no_m')['sms_dayofweek_count'].transform('max')
df_sms['sms_dayofweek_count_min'] = df_sms.groupby('phone_no_m')['sms_dayofweek_count'].transform('min')
df_sms['sms_dayofweek_count_mean'] = df_sms.groupby('phone_no_m')['sms_dayofweek_count'].transform('mean')
df_sms['sms_dayofweek_count_std'] = df_sms.groupby('phone_no_m')['sms_dayofweek_count'].transform('std')

del df_sms['sms_dayofweek_count']

In [35]:
# 该月总的短信次数
df_sms['sms_count'] = df_sms.groupby('phone_no_m')['phone_no_m'].transform('count')

# 相互发送短信次数
df_sms['sms_count_mutual'] = df_sms.groupby(['phone_no_m', 'opposite_no_m'])['phone_no_m'].transform('count')

### 应用表

In [36]:
train_app = pd.read_hdf('../input/train_app.h5')
test_app = pd.read_hdf('../input/test_app.h5')

In [37]:
train_app = train_app[train_app['month_id'] == '2020-03']

In [38]:
df_app = pd.concat([train_app, test_app])

In [39]:
# 用户流量统计

df_app['total_flow'] = df_app.groupby('phone_no_m')['flow'].transform('sum')
df_app['flow_max'] = df_app.groupby('phone_no_m')['flow'].transform('max')
df_app['flow_min'] = df_app.groupby('phone_no_m')['flow'].transform('min')
df_app['flow_mean'] = df_app.groupby('phone_no_m')['flow'].transform('mean')
df_app['flow_std'] = df_app.groupby('phone_no_m')['flow'].transform('std')

# 应用数
df_app['app_count'] = df_app.groupby('phone_no_m')['phone_no_m'].transform('count')

In [40]:
# 用户在每个app上的流量统计

df_app['busi_name_total_flow'] = df_app.groupby(['phone_no_m', 'busi_name'])['flow'].transform('sum')
df_app['busi_name_flow_max'] = df_app.groupby(['phone_no_m', 'busi_name'])['flow'].transform('max')
df_app['busi_name_flow_min'] = df_app.groupby(['phone_no_m', 'busi_name'])['flow'].transform('min')
df_app['busi_name_flow_mean'] = df_app.groupby(['phone_no_m', 'busi_name'])['flow'].transform('mean')
df_app['busi_name_flow_std'] = df_app.groupby(['phone_no_m', 'busi_name'])['flow'].transform('std')

### 合并数据

In [41]:
df_user.shape, df_voc.shape, df_sms.shape, df_app.shape

((8151, 16), (985666, 33), (1319506, 29), (512100, 15))

In [42]:
df_voc.columns.tolist()

['phone_no_m',
 'opposite_no_m',
 'calltype_id',
 'start_datetime',
 'call_dur',
 'city_name',
 'county_name',
 'imei_m',
 'voc_day',
 'voc_hour',
 'voc_dayofweek',
 'voc_day_count_max',
 'voc_day_count_min',
 'voc_day_count_mean',
 'voc_day_count_std',
 'voc_day_diff_count_max',
 'voc_day_diff_count_min',
 'voc_day_diff_count_mean',
 'voc_day_diff_count_std',
 'voc_hour_count_max',
 'voc_hour_count_min',
 'voc_hour_count_mean',
 'voc_hour_count_std',
 'voc_hour_diff_count_max',
 'voc_hour_diff_count_min',
 'voc_hour_diff_count_mean',
 'voc_hour_diff_count_std',
 'voc_dayofweek_count_max',
 'voc_dayofweek_count_min',
 'voc_dayofweek_count_mean',
 'voc_dayofweek_count_std',
 'voc_count',
 'voc_count_mutual']

In [43]:
df_sms.columns.tolist()

['phone_no_m',
 'opposite_no_m',
 'calltype_id_sms',
 'request_datetime',
 'sms_day',
 'sms_hour',
 'sms_dayofweek',
 'sms_day_count_max',
 'sms_day_count_min',
 'sms_day_count_mean',
 'sms_day_count_std',
 'sms_day_diff_count_max',
 'sms_day_diff_count_min',
 'sms_day_diff_count_mean',
 'sms_day_diff_count_std',
 'sms_hour_count_max',
 'sms_hour_count_min',
 'sms_hour_count_mean',
 'sms_hour_count_std',
 'sms_hour_diff_count_max',
 'sms_hour_diff_count_min',
 'sms_hour_diff_count_mean',
 'sms_hour_diff_count_std',
 'sms_dayofweek_count_max',
 'sms_dayofweek_count_min',
 'sms_dayofweek_count_mean',
 'sms_dayofweek_count_std',
 'sms_count',
 'sms_count_mutual']

In [44]:
df_app.columns.tolist()

['phone_no_m',
 'busi_name',
 'flow',
 'month_id',
 'total_flow',
 'flow_max',
 'flow_min',
 'flow_mean',
 'flow_std',
 'app_count',
 'busi_name_total_flow',
 'busi_name_flow_max',
 'busi_name_flow_min',
 'busi_name_flow_mean',
 'busi_name_flow_std']

In [45]:
df_voc = df_voc[['phone_no_m', 'calltype_id', 'call_dur', 'voc_day', 'voc_hour', 'voc_dayofweek',
                 'voc_day_count_max', 'voc_day_count_min', 'voc_day_count_mean', 'voc_day_count_std',
                 'voc_hour_count_max', 'voc_hour_count_min', 'voc_hour_count_mean', 'voc_hour_count_std',
                 'voc_dayofweek_count_max', 'voc_dayofweek_count_min', 'voc_dayofweek_count_mean', 'voc_dayofweek_count_std',
                 'voc_count', 'voc_count_mutual',
                 'voc_day_diff_count_max', 'voc_day_diff_count_min', 'voc_day_diff_count_mean', 'voc_day_diff_count_std',
                 'voc_hour_diff_count_max', 'voc_hour_diff_count_min', 'voc_hour_diff_count_mean', 'voc_hour_diff_count_std'
                ]].drop_duplicates(subset=['phone_no_m'])

In [46]:
df_sms = df_sms[['phone_no_m', 'calltype_id_sms', 'sms_day', 'sms_hour', 'sms_dayofweek',
                 'sms_day_count_max', 'sms_day_count_min', 'sms_day_count_mean', 'sms_day_count_std',
                 'sms_hour_count_max', 'sms_hour_count_min', 'sms_hour_count_mean', 'sms_hour_count_std',
                 'sms_dayofweek_count_max', 'sms_dayofweek_count_min', 'sms_dayofweek_count_mean', 'sms_dayofweek_count_std',
                 'sms_count', 'sms_count_mutual',
                 'sms_day_diff_count_max', 'sms_day_diff_count_min', 'sms_day_diff_count_mean', 'sms_day_diff_count_std',
                 'sms_hour_diff_count_max', 'sms_hour_diff_count_min', 'sms_hour_diff_count_mean', 'sms_hour_diff_count_std'
                ]].drop_duplicates(subset=['phone_no_m'])

In [47]:
df_app = df_app[['phone_no_m', 'busi_name', 'flow',
                 'total_flow', 'flow_max', 'flow_min', 'flow_mean', 'flow_std',
                 'app_count',
                 'busi_name_total_flow','busi_name_flow_max', 'busi_name_flow_min', 'busi_name_flow_mean', 'busi_name_flow_std'
                 ]].drop_duplicates(subset=['phone_no_m'])

In [48]:
df = pd.merge(df_user, df_voc, how='left', on='phone_no_m')
df = pd.merge(df, df_sms, how='left', on='phone_no_m')
df = pd.merge(df, df_app, how='left', on='phone_no_m')

In [49]:
del df_user, df_voc, df_sms, df_app
gc.collect()

104

In [50]:
lbl = LabelEncoder()

for f in tqdm(['city_name', 'county_name', 'busi_name']):
    df[f] = df[f].fillna('NA')
    df[f] = lbl.fit_transform(df[f].astype(str))

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 115.45it/s]


In [51]:
df.columns

Index(['arpu_202004', 'city_name', 'county_name', 'idcard_cnt', 'label',
       'phone_no_m', 'idcard_cnt*arpu_202004', 'city_name_count',
       'county_name_count', 'idcard_cnt_count', 'city_name_idcard_cnt_max',
       'city_name_idcard_cnt_min', 'city_name_idcard_cnt_median',
       'county_name_idcard_cnt_max', 'county_name_idcard_cnt_min',
       'county_name_idcard_cnt_median', 'calltype_id', 'call_dur', 'voc_day',
       'voc_hour', 'voc_dayofweek', 'voc_day_count_max', 'voc_day_count_min',
       'voc_day_count_mean', 'voc_day_count_std', 'voc_hour_count_max',
       'voc_hour_count_min', 'voc_hour_count_mean', 'voc_hour_count_std',
       'voc_dayofweek_count_max', 'voc_dayofweek_count_min',
       'voc_dayofweek_count_mean', 'voc_dayofweek_count_std', 'voc_count',
       'voc_count_mutual', 'voc_day_diff_count_max', 'voc_day_diff_count_min',
       'voc_day_diff_count_mean', 'voc_day_diff_count_std',
       'voc_hour_diff_count_max', 'voc_hour_diff_count_min',
       'voc_ho

In [52]:
df_train = df[df.label.notna()]
df_test = df[df.label.isna()]

df_train.shape, df_test.shape

((6106, 82), (2045, 82))

In [53]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(df_train.drop('label', axis=1), df_train['label'], random_state=2020)

In [54]:
train_cols = [i for i in X_train if i not in ['phone_no_m', 'label']]

In [55]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0,
          'lambda_l2': 1,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [56]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0,
          'lambda_l2': 1,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

use_cols = []
useless_cols = []

for i in train_cols:
    print(i)
    
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_eval= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=10,
                         valid_sets=[lgb_eval, lgb_train],
                         early_stopping_rounds=5,
                         verbose_eval=1)
    
    auc = roc_auc_score(y_train, lgb_test.predict(X_train[[i]]))
    if auc > 0.5:
        use_cols.append(i)
    else:
        useless_cols.append(i)
    print('*' * 20)
    print('\n')

arpu_202004
[1]	training's auc: 0.803919	valid_0's auc: 0.774554
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.804885	valid_0's auc: 0.77426
[3]	training's auc: 0.806832	valid_0's auc: 0.774972
[4]	training's auc: 0.808242	valid_0's auc: 0.776228
[5]	training's auc: 0.80766	valid_0's auc: 0.777569
[6]	training's auc: 0.808825	valid_0's auc: 0.775938
[7]	training's auc: 0.809293	valid_0's auc: 0.776673
[8]	training's auc: 0.810724	valid_0's auc: 0.776911
[9]	training's auc: 0.811426	valid_0's auc: 0.777286
[10]	training's auc: 0.811849	valid_0's auc: 0.775407
Did not meet early stopping. Best iteration is:
[10]	training's auc: 0.811849	valid_0's auc: 0.775407
********************


city_name
[1]	training's auc: 0.684712	valid_0's auc: 0.671773
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.684824	valid_0's auc: 0.671976
[3]	training's auc: 0.684824	valid_0's auc: 0.671976
[4]	training's auc: 0.684824	valid_0's auc: 0

[4]	training's auc: 0.833226	valid_0's auc: 0.826044
[5]	training's auc: 0.833226	valid_0's auc: 0.826044
[6]	training's auc: 0.833226	valid_0's auc: 0.826044
Early stopping, best iteration is:
[1]	training's auc: 0.833226	valid_0's auc: 0.826044
********************


call_dur
[1]	training's auc: 0.836735	valid_0's auc: 0.790613
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.838043	valid_0's auc: 0.790494
[3]	training's auc: 0.838929	valid_0's auc: 0.787079
[4]	training's auc: 0.840119	valid_0's auc: 0.790451
[5]	training's auc: 0.840256	valid_0's auc: 0.789837
[6]	training's auc: 0.841841	valid_0's auc: 0.787631
Early stopping, best iteration is:
[1]	training's auc: 0.836735	valid_0's auc: 0.790613
********************


voc_day
[1]	training's auc: 0.837525	valid_0's auc: 0.81743
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.837525	valid_0's auc: 0.81743
[3]	training's auc: 0.83753	valid_0's auc: 0.817384
[4]	trai

[2]	training's auc: 0.839021	valid_0's auc: 0.822927
********************


voc_dayofweek_count_min
[1]	training's auc: 0.825545	valid_0's auc: 0.816535
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.825545	valid_0's auc: 0.816535
[3]	training's auc: 0.825576	valid_0's auc: 0.816557
[4]	training's auc: 0.825576	valid_0's auc: 0.816557
[5]	training's auc: 0.825576	valid_0's auc: 0.816557
[6]	training's auc: 0.825576	valid_0's auc: 0.816557
[7]	training's auc: 0.825576	valid_0's auc: 0.816557
[8]	training's auc: 0.825576	valid_0's auc: 0.816557
Early stopping, best iteration is:
[3]	training's auc: 0.825576	valid_0's auc: 0.816557
********************


voc_dayofweek_count_mean
[1]	training's auc: 0.856173	valid_0's auc: 0.826865
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.857037	valid_0's auc: 0.826537
[3]	training's auc: 0.859183	valid_0's auc: 0.822086
[4]	training's auc: 0.861672	valid_0's auc: 0.825108
[5]	trai

********************


sms_day
[1]	training's auc: 0.83682	valid_0's auc: 0.819007
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.836816	valid_0's auc: 0.819904
[3]	training's auc: 0.836942	valid_0's auc: 0.819411
[4]	training's auc: 0.836942	valid_0's auc: 0.819411
[5]	training's auc: 0.836954	valid_0's auc: 0.819403
[6]	training's auc: 0.836954	valid_0's auc: 0.819403
[7]	training's auc: 0.836965	valid_0's auc: 0.819479
Early stopping, best iteration is:
[2]	training's auc: 0.836816	valid_0's auc: 0.819904
********************


sms_hour
[1]	training's auc: 0.810397	valid_0's auc: 0.806184
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.810397	valid_0's auc: 0.806184
[3]	training's auc: 0.810397	valid_0's auc: 0.806184
[4]	training's auc: 0.810397	valid_0's auc: 0.806184
[5]	training's auc: 0.810397	valid_0's auc: 0.806184
[6]	training's auc: 0.810397	valid_0's auc: 0.806184
Early stopping, best iteration is:
[1]	tr

[4]	training's auc: 0.85887	valid_0's auc: 0.823454
[5]	training's auc: 0.858967	valid_0's auc: 0.824091
[6]	training's auc: 0.860497	valid_0's auc: 0.82343
Early stopping, best iteration is:
[1]	training's auc: 0.853887	valid_0's auc: 0.824814
********************


sms_dayofweek_count_std
[1]	training's auc: 0.865438	valid_0's auc: 0.846543
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.865525	valid_0's auc: 0.848343
[3]	training's auc: 0.866629	valid_0's auc: 0.848335
[4]	training's auc: 0.868051	valid_0's auc: 0.848875
[5]	training's auc: 0.868301	valid_0's auc: 0.849528
[6]	training's auc: 0.868749	valid_0's auc: 0.849607
[7]	training's auc: 0.86904	valid_0's auc: 0.848953
[8]	training's auc: 0.869479	valid_0's auc: 0.848485
[9]	training's auc: 0.870002	valid_0's auc: 0.849105
[10]	training's auc: 0.870022	valid_0's auc: 0.850427
Did not meet early stopping. Best iteration is:
[10]	training's auc: 0.870022	valid_0's auc: 0.850427
****************

[8]	training's auc: 0.831164	valid_0's auc: 0.796281
[9]	training's auc: 0.83214	valid_0's auc: 0.797099
[10]	training's auc: 0.832462	valid_0's auc: 0.797115
Did not meet early stopping. Best iteration is:
[10]	training's auc: 0.832462	valid_0's auc: 0.797115
********************


total_flow
[1]	training's auc: 0.825316	valid_0's auc: 0.810011
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.827839	valid_0's auc: 0.807508
[3]	training's auc: 0.828117	valid_0's auc: 0.806002
[4]	training's auc: 0.829023	valid_0's auc: 0.807064
[5]	training's auc: 0.829621	valid_0's auc: 0.806538
[6]	training's auc: 0.830228	valid_0's auc: 0.806073
Early stopping, best iteration is:
[1]	training's auc: 0.825316	valid_0's auc: 0.810011
********************


flow_max
[1]	training's auc: 0.822922	valid_0's auc: 0.799302
Training until validation scores don't improve for 5 rounds
[2]	training's auc: 0.823109	valid_0's auc: 0.801653
[3]	training's auc: 0.824046	valid_0's au

In [57]:
print(use_cols)

['arpu_202004', 'city_name', 'county_name', 'idcard_cnt', 'idcard_cnt*arpu_202004', 'city_name_count', 'county_name_count', 'idcard_cnt_count', 'city_name_idcard_cnt_max', 'city_name_idcard_cnt_min', 'city_name_idcard_cnt_median', 'county_name_idcard_cnt_max', 'county_name_idcard_cnt_min', 'county_name_idcard_cnt_median', 'calltype_id', 'call_dur', 'voc_day', 'voc_hour', 'voc_dayofweek', 'voc_day_count_max', 'voc_day_count_min', 'voc_day_count_mean', 'voc_day_count_std', 'voc_hour_count_max', 'voc_hour_count_min', 'voc_hour_count_mean', 'voc_hour_count_std', 'voc_dayofweek_count_max', 'voc_dayofweek_count_min', 'voc_dayofweek_count_mean', 'voc_dayofweek_count_std', 'voc_count', 'voc_count_mutual', 'voc_day_diff_count_max', 'voc_day_diff_count_min', 'voc_day_diff_count_mean', 'voc_day_diff_count_std', 'voc_hour_diff_count_max', 'voc_hour_diff_count_min', 'voc_hour_diff_count_mean', 'voc_hour_diff_count_std', 'calltype_id_sms', 'sms_day', 'sms_hour', 'sms_dayofweek', 'sms_day_count_max',

In [58]:
print(useless_cols)

[]


In [59]:
lgb_train = lgb.Dataset(X_train[use_cols].values, y_train) 

lgb_eval= lgb.Dataset(X_valid[use_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

# train

lgb_val_0 = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_eval, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

Start training...
Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.979093	valid_0's auc: 0.934074
[20]	training's auc: 0.99165	valid_0's auc: 0.94308
[30]	training's auc: 0.997122	valid_0's auc: 0.947472
[40]	training's auc: 0.99889	valid_0's auc: 0.948817
[50]	training's auc: 0.999451	valid_0's auc: 0.94998
[60]	training's auc: 0.999703	valid_0's auc: 0.950674
[70]	training's auc: 0.999819	valid_0's auc: 0.950188
[80]	training's auc: 0.999896	valid_0's auc: 0.949726
[90]	training's auc: 0.999942	valid_0's auc: 0.949097
[100]	training's auc: 0.999967	valid_0's auc: 0.949828
[110]	training's auc: 0.999975	valid_0's auc: 0.949701
[120]	training's auc: 0.99999	valid_0's auc: 0.94942
[130]	training's auc: 0.999998	valid_0's auc: 0.949205
[140]	training's auc: 1	valid_0's auc: 0.949046
[150]	training's auc: 1	valid_0's auc: 0.949377
[160]	training's auc: 1	valid_0's auc: 0.949639
Early stopping, best iteration is:
[64]	training's auc: 0.999746	valid_0's a

In [60]:
lgb_train_all = lgb.Dataset(df_train[use_cols].values, df_train['label'])   

print('Start training...')

# train

lgb_model = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_val_0.best_iteration + 20)

Start training...


In [61]:
df_train['prob'] = lgb_model.predict(df_train[use_cols])
df_train['pred'] = np.where(df_train['prob'] > 0.5, 1, 0)

f1 = np.round(f1_score(df_train['label'], df_train['pred']), 4)
auc = roc_auc_score(df_train['label'], df_train['prob'])

print('f1: ', f1)
print('auc: ', auc)

f1:  0.9964
auc:  0.9997677272619933


In [63]:
df_test['label'] = np.where(lgb_model.predict(df_test[use_cols]) > 0.5, 1, 0)
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1), index=False)