In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import LabelEncoder
from itertools import product
import lightgbm as lgb
from sklearn.metrics import f1_score, roc_auc_score
import gc
import warnings
warnings.filterwarnings('ignore')

pd.set_option('max_columns', None)
pd.set_option('max_rows', None)
pd.set_option('max_colwidth', 200)

In [2]:
# count编码
def count_coding(df, fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return df

# 交叉特征统计
def cross_cat_num(df, num_col, cat_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            feat = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median'
            })
            df = df.merge(feat, on=f1, how='left')
    return df

In [3]:
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg(['mean', 'size'])
        col_avg_y['size'] = prior_weight_func(col_avg_y['size'])
        col_avg_y[nf_name] = col_avg_y['size'] * prior + (1 - col_avg_y['size']) * col_avg_y['mean']
        col_avg_y.drop(['size', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new

### 用户表

In [4]:
train_user = pd.read_hdf('../input/train_user.h5')
test_user = pd.read_hdf('../input/test_user.h5')

In [5]:
df_user = pd.concat([train_user, test_user])

In [6]:
# 号码量*消费值
df_user['idcard_cnt*arpu_202004'] = df_user['idcard_cnt'] * df_user['arpu_202004']

# 消费值/号码量
df_user['arpu_202004/idcard_cnt'] = df_user['arpu_202004'] / df_user['idcard_cnt']

In [7]:
# count编码
count_list = ['city_name', 'county_name', 'idcard_cnt']

df_user = count_coding(df_user, count_list)

In [8]:
lbl = LabelEncoder()

for f in tqdm(['city_name', 'county_name']):
    df_user[f] = df_user[f].fillna('NA')
    df_user[f] = lbl.fit_transform(df_user[f].astype(str))

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 286.48it/s]


In [9]:
# 交叉特征
cross_cat = ['city_name', 'county_name']
cross_num = ['idcard_cnt', 'arpu_202004']
df_user = cross_cat_num(df_user, cross_num, cross_cat)

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 115.03it/s][A

100%|███████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 133.69it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 50.32it/s]


### 电话表

In [10]:
cols = ['phone_no_m', 'opposite_no_m', 'calltype_id', 'start_datetime', 'call_dur', 'imei_m']

train_voc = pd.read_hdf('../input/train_voc.h5', usecols=cols)
test_voc = pd.read_hdf('../input/test_voc.h5', usecols=cols)

In [11]:
df_voc = pd.concat([train_voc, test_voc])

In [12]:
df_voc['voc_day'] = df_voc['start_datetime'].astype('datetime64').dt.day
df_voc['voc_hour'] = df_voc['start_datetime'].astype('datetime64').dt.hour
df_voc['voc_dayofweek'] = df_voc['start_datetime'].astype('datetime64').dt.dayofweek
df_voc['voc_time'] = (df_voc['voc_day'] - df_voc['voc_day'].min()) * 24 + df_voc['voc_hour']

In [13]:
# 每个phone_no_m有几个独立的手机
df_voc['voc_phone_nunique'] = df_voc.groupby('phone_no_m')['imei_m'].transform('nunique')

# 每个phone_no_m在该月与多少个不同的人通话
df_voc['voc_opposite_nunique'] = df_voc.groupby('phone_no_m')['opposite_no_m'].transform('nunique')

# 每个phone_no_m在该月打了多少通电话
df_voc['voc_count'] = df_voc.groupby('phone_no_m')['phone_no_m'].transform('count')

# 每个phone_no_m在该月总的通话时长
df_voc['call_dur_sum'] = df_voc.groupby('phone_no_m')['call_dur'].transform('sum')

# 每个phone_no_m在该月与每个opposite_no_m打了多少通电话
df_voc['voc_count_mutual'] = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['phone_no_m'].transform('count')

# 每个phone_no_m在该月与每个opposite_no_m打了多久的电话
df_voc['call_dur_sum_mutual'] = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['call_dur'].transform('sum')

#### 通话次数

##### 每天通话次数统计 day

In [14]:
## 用户每个小时,每天出现的次数

# for time in ['day','time']:
#     print('user_id_'+time)
#     data['user_id_'+time] = data['user_id']  + '_' + data[time].astype(str)
#     dic_ = data['user_id_'+time].value_counts().to_dict()
#     data['user_id_'+time +'_count'] = data['user_id_'+time].apply(lambda x: dic_[x])
#     data.drop('user_id_'+time, axis=1,inplace = True)

In [15]:
# 每天通话次数统计

# 每个phone_no_m在每天的通话次数
df_voc['voc_day_count'] = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].transform('count')
# 每个phone_no_m每天的通话次数的最大值
df_voc['voc_day_count_max'] = df_voc.groupby('phone_no_m')['voc_day_count'].transform('max')
# df_voc['voc_day_count_min'] = df_voc.groupby('phone_no_m')['voc_day_count'].transform('min')
df_voc['voc_day_count_mean'] = df_voc.groupby('phone_no_m')['voc_day_count'].transform('mean')
df_voc['voc_day_count_std'] = df_voc.groupby('phone_no_m')['voc_day_count'].transform('std')


# 每天与不同的人通话次数统计
df_voc['voc_day_diff_count'] = df_voc.groupby(['phone_no_m', 'opposite_no_m', 'voc_day'])['phone_no_m'].transform('count')
df_voc['voc_day_diff_count_max'] = df_voc.groupby('phone_no_m')['voc_day_diff_count'].transform('max')
df_voc['voc_day_diff_count_min'] = df_voc.groupby('phone_no_m')['voc_day_diff_count'].transform('min')
df_voc['voc_day_diff_count_mean'] = df_voc.groupby('phone_no_m')['voc_day_diff_count'].transform('mean')
df_voc['voc_day_diff_count_std'] = df_voc.groupby('phone_no_m')['voc_day_diff_count'].transform('std')

del df_voc['voc_day_count'], df_voc['voc_day_diff_count']
gc.collect()

# 每个phone_no_m在几号的通话次数
voc_day_res = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()

for i in df_voc['voc_day'].unique():
    df_voc['voc_day{}_count'.format(i)] = df_voc['phone_no_m'].map(voc_day_res[i])

##### 小时通话次数统计 voc_time

In [16]:
# 小时通话次数统计
df_voc['voc_time_count'] = df_voc.groupby(['phone_no_m', 'voc_time'])['phone_no_m'].transform('count')
df_voc['voc_time_count_max'] = df_voc.groupby('phone_no_m')['voc_time_count'].transform('max')
# df_voc['voc_time_count_min'] = df_voc.groupby('phone_no_m')['voc_time_count'].transform('min')
df_voc['voc_time_count_mean'] = df_voc.groupby('phone_no_m')['voc_time_count'].transform('mean')
df_voc['voc_time_count_std'] = df_voc.groupby('phone_no_m')['voc_time_count'].transform('std')


# 小时与不同的人通话次数统计
df_voc['voc_time_diff_count'] = df_voc.groupby(['phone_no_m', 'opposite_no_m', 'voc_time'])['phone_no_m'].transform('count')
df_voc['voc_time_diff_count_max'] = df_voc.groupby('phone_no_m')['voc_time_diff_count'].transform('max')
# df_voc['voc_time_diff_count_min'] = df_voc.groupby('phone_no_m')['voc_time_diff_count'].transform('min')
df_voc['voc_time_diff_count_mean'] = df_voc.groupby('phone_no_m')['voc_time_diff_count'].transform('mean')
df_voc['voc_time_diff_count_std'] = df_voc.groupby('phone_no_m')['voc_time_diff_count'].transform('std')

del df_voc['voc_time_count'], df_voc['voc_time_diff_count']
gc.collect()

0

##### 每小时通话次数统计 hour

In [17]:
# 一小时内通话次数

df_voc['voc_hour_count'] = df_voc.groupby(['phone_no_m', 'voc_hour'])['phone_no_m'].transform('count')
df_voc['voc_hour_count_max'] = df_voc.groupby('phone_no_m')['voc_hour_count'].transform('max')
# df_voc['voc_hour_count_min'] = df_voc.groupby('phone_no_m')['voc_hour_count'].transform('min')
df_voc['voc_hour_count_mean'] = df_voc.groupby('phone_no_m')['voc_hour_count'].transform('mean')
df_voc['voc_hour_count_std'] = df_voc.groupby('phone_no_m')['voc_hour_count'].transform('std')


# 一小时内与不同的人通话次数

df_voc['voc_hour_diff_count'] = df_voc.groupby(['phone_no_m', 'opposite_no_m', 'voc_hour'])['phone_no_m'].transform('count')
df_voc['voc_hour_diff_count_max'] = df_voc.groupby('phone_no_m')['voc_hour_diff_count'].transform('max')
# df_voc['voc_hour_diff_count_min'] = df_voc.groupby('phone_no_m')['voc_hour_diff_count'].transform('min')
df_voc['voc_hour_diff_count_mean'] = df_voc.groupby('phone_no_m')['voc_hour_diff_count'].transform('mean')
df_voc['voc_hour_diff_count_std'] = df_voc.groupby('phone_no_m')['voc_hour_diff_count'].transform('std')

del df_voc['voc_hour_count'], df_voc['voc_hour_diff_count']
gc.collect()

0

##### 按周几通话次数统计

In [18]:
# 周几通话次数

df_voc['voc_dayofweek_count'] = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['phone_no_m'].transform('count')
df_voc['voc_dayofweek_count_max'] = df_voc.groupby('phone_no_m')['voc_dayofweek_count'].transform('max')
# df_voc['voc_dayofweek_count_min'] = df_voc.groupby('phone_no_m')['voc_dayofweek_count'].transform('min')
df_voc['voc_dayofweek_count_mean'] = df_voc.groupby('phone_no_m')['voc_dayofweek_count'].transform('mean')
df_voc['voc_dayofweek_count_std'] = df_voc.groupby('phone_no_m')['voc_dayofweek_count'].transform('std')


# 周几与不同的人通话次数

df_voc['voc_dayofweek_diff_count'] = df_voc.groupby(['phone_no_m', 'opposite_no_m', 'voc_dayofweek'])['phone_no_m'].transform('count')
df_voc['voc_dayofweek_diff_count_max'] = df_voc.groupby('phone_no_m')['voc_dayofweek_diff_count'].transform('max')
# df_voc['voc_dayofweek_diff_count_min'] = df_voc.groupby('phone_no_m')['voc_dayofweek_diff_count'].transform('min')
df_voc['voc_dayofweek_diff_count_mean'] = df_voc.groupby('phone_no_m')['voc_dayofweek_diff_count'].transform('mean')
df_voc['voc_dayofweek_diff_count_std'] = df_voc.groupby('phone_no_m')['voc_dayofweek_diff_count'].transform('std')

del df_voc['voc_dayofweek_count'], df_voc['voc_dayofweek_diff_count']
gc.collect()

# 每个phone_no_m在周几的通话次数
voc_dayofweek_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['phone_no_m'].count().unstack()

for i in df_voc['voc_dayofweek'].unique():
    df_voc['voc_dayofweek{}_count'.format(i)] = df_voc['phone_no_m'].map(voc_dayofweek_res[i])

#### 通话时长

##### 每天通话时长统计 voc_day

In [19]:
# 每天通话时长统计
df_voc['call_dur_day_sum'] = df_voc.groupby(['phone_no_m', 'voc_day'])['call_dur'].transform('sum')
df_voc['call_dur_day_sum_max'] = df_voc.groupby('phone_no_m')['call_dur_day_sum'].transform('max')
df_voc['call_dur_day_sum_min'] = df_voc.groupby('phone_no_m')['call_dur_day_sum'].transform('min')
df_voc['call_dur_day_sum_mean'] = df_voc.groupby('phone_no_m')['call_dur_day_sum'].transform('mean')
df_voc['call_dur_day_sum_std'] = df_voc.groupby('phone_no_m')['call_dur_day_sum'].transform('std')


# 每天与不同的人通话时长统计
df_voc['call_dur_day_diff_sum'] = df_voc.groupby(['phone_no_m', 'opposite_no_m', 'voc_day'])['call_dur'].transform('sum')
df_voc['call_dur_day_diff_sum_max'] = df_voc.groupby('phone_no_m')['call_dur_day_diff_sum'].transform('max')
df_voc['call_dur_day_diff_sum_min'] = df_voc.groupby('phone_no_m')['call_dur_day_diff_sum'].transform('min')
df_voc['call_dur_day_diff_sum_mean'] = df_voc.groupby('phone_no_m')['call_dur_day_diff_sum'].transform('mean')
df_voc['call_dur_day_diff_sum_std'] = df_voc.groupby('phone_no_m')['call_dur_day_diff_sum'].transform('std')

del df_voc['call_dur_day_sum'], df_voc['call_dur_day_diff_sum']
gc.collect()


# 每个phone_no_m在几号的通话时长
voc_day_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_day'])['call_dur'].sum().unstack()

for i in df_voc['voc_day'].unique():
    df_voc['voc_day{}_call_dur_sum'.format(i)] = df_voc['phone_no_m'].map(voc_day_call_dur_res[i])

##### 小时通话时长统计 voc_time

In [20]:
# 小时通话时长统计
df_voc['call_dur_time_sum'] = df_voc.groupby(['phone_no_m', 'voc_time'])['call_dur'].transform('sum')
df_voc['call_dur_time_sum_max'] = df_voc.groupby('phone_no_m')['call_dur_time_sum'].transform('max')
# df_voc['call_dur_time_sum_min'] = df_voc.groupby('phone_no_m')['call_dur_time_sum'].transform('min')
df_voc['call_dur_time_sum_mean'] = df_voc.groupby('phone_no_m')['call_dur_time_sum'].transform('mean')
df_voc['call_dur_time_sum_std'] = df_voc.groupby('phone_no_m')['call_dur_time_sum'].transform('std')


# 小时与不同的人通话时长统计
df_voc['call_dur_time_diff_sum'] = df_voc.groupby(['phone_no_m', 'opposite_no_m', 'voc_time'])['call_dur'].transform('sum')
df_voc['call_dur_time_diff_sum_max'] = df_voc.groupby('phone_no_m')['call_dur_time_diff_sum'].transform('max')
# df_voc['call_dur_time_diff_sum_min'] = df_voc.groupby('phone_no_m')['call_dur_time_diff_sum'].transform('min')
df_voc['call_dur_time_diff_sum_mean'] = df_voc.groupby('phone_no_m')['call_dur_time_diff_sum'].transform('mean')
df_voc['call_dur_time_diff_sum_std'] = df_voc.groupby('phone_no_m')['call_dur_time_diff_sum'].transform('std')

del df_voc['call_dur_time_sum'], df_voc['call_dur_time_diff_sum']
gc.collect()


# 每个phone_no_m在几号的通话时长
# voc_day_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_day'])['call_dur'].sum().unstack()

# for i in df_voc['voc_day'].unique():
#     df_voc['voc_day_call_dur{}_sum'.format(i)] = df_voc['phone_no_m'].map(voc_day_call_dur_res[i])

0

##### 每小时通话时长统计 hour

In [21]:
# 每小时通话时长统计
df_voc['call_dur_hour_sum'] = df_voc.groupby(['phone_no_m', 'voc_hour'])['call_dur'].transform('sum')
df_voc['call_dur_hour_sum_max'] = df_voc.groupby('phone_no_m')['call_dur_hour_sum'].transform('max')
df_voc['call_dur_hour_sum_min'] = df_voc.groupby('phone_no_m')['call_dur_hour_sum'].transform('min')
df_voc['call_dur_hour_sum_mean'] = df_voc.groupby('phone_no_m')['call_dur_hour_sum'].transform('mean')
df_voc['call_dur_hour_sum_std'] = df_voc.groupby('phone_no_m')['call_dur_hour_sum'].transform('std')


# 每小时与不同的人通话时长统计
df_voc['call_dur_hour_diff_sum'] = df_voc.groupby(['phone_no_m', 'opposite_no_m', 'voc_hour'])['call_dur'].transform('sum')
df_voc['call_dur_hour_diff_sum_max'] = df_voc.groupby('phone_no_m')['call_dur_hour_diff_sum'].transform('max')
df_voc['call_dur_hour_diff_sum_min'] = df_voc.groupby('phone_no_m')['call_dur_hour_diff_sum'].transform('min')
df_voc['call_dur_hour_diff_sum_mean'] = df_voc.groupby('phone_no_m')['call_dur_hour_diff_sum'].transform('mean')
df_voc['call_dur_hour_diff_sum_std'] = df_voc.groupby('phone_no_m')['call_dur_hour_diff_sum'].transform('std')

del df_voc['call_dur_hour_sum'], df_voc['call_dur_hour_diff_sum']
gc.collect()


# 每个phone_no_m在几时的通话时长
voc_hour_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['call_dur'].sum().unstack()

for i in df_voc['voc_hour'].unique():
    df_voc['voc_hour{}_call_dur_sum'.format(i)] = df_voc['phone_no_m'].map(voc_hour_call_dur_res[i])

##### 周几通话时长统计 dayofweek

In [22]:
# 周几通话时长统计
df_voc['call_dur_dayofweek_sum'] = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['call_dur'].transform('sum')
df_voc['call_dur_dayofweek_sum_max'] = df_voc.groupby('phone_no_m')['call_dur_dayofweek_sum'].transform('max')
df_voc['call_dur_dayofweek_sum_min'] = df_voc.groupby('phone_no_m')['call_dur_dayofweek_sum'].transform('min')
df_voc['call_dur_dayofweek_sum_mean'] = df_voc.groupby('phone_no_m')['call_dur_dayofweek_sum'].transform('mean')
df_voc['call_dur_dayofweek_sum_std'] = df_voc.groupby('phone_no_m')['call_dur_dayofweek_sum'].transform('std')


# 周几与不同的人通话时长统计
df_voc['call_dur_dayofweek_diff_sum'] = df_voc.groupby(['phone_no_m', 'opposite_no_m', 'voc_dayofweek'])['call_dur'].transform('sum')
df_voc['call_dur_dayofweek_diff_sum_max'] = df_voc.groupby('phone_no_m')['call_dur_dayofweek_diff_sum'].transform('max')
df_voc['call_dur_dayofweek_diff_sum_min'] = df_voc.groupby('phone_no_m')['call_dur_dayofweek_diff_sum'].transform('min')
df_voc['call_dur_dayofweek_diff_sum_mean'] = df_voc.groupby('phone_no_m')['call_dur_dayofweek_diff_sum'].transform('mean')
df_voc['call_dur_dayofweek_diff_sum_std'] = df_voc.groupby('phone_no_m')['call_dur_dayofweek_diff_sum'].transform('std')

del df_voc['call_dur_dayofweek_sum'], df_voc['call_dur_dayofweek_diff_sum']
gc.collect()


# 每个phone_no_m在周几的通话时长
voc_dayofweek_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['call_dur'].sum().unstack()

for i in df_voc['voc_dayofweek'].unique():
    df_voc['voc_dayofweek{}_call_dur_sum'.format(i)] = df_voc['phone_no_m'].map(voc_dayofweek_call_dur_res[i])

### 短信表

In [23]:
train_sms = pd.read_hdf('../input/train_sms.h5')
test_sms = pd.read_hdf('../input/test_sms.h5')

In [24]:
df_sms = pd.concat([train_sms, test_sms])

In [25]:
df_sms['sms_day'] = df_sms['request_datetime'].astype('datetime64').dt.day
df_sms['sms_hour'] = df_sms['request_datetime'].astype('datetime64').dt.hour
df_sms['sms_dayofweek'] = df_sms['request_datetime'].astype('datetime64').dt.dayofweek
df_sms['sms_time'] = (df_sms['sms_day'] - df_sms['sms_day'].min()) * 24 + df_sms['sms_hour']

In [26]:
df_sms.rename(columns={'calltype_id': 'calltype_id_sms'}, inplace=True)

In [27]:
# 该月总的短信次数
df_sms['sms_count'] = df_sms.groupby('phone_no_m')['phone_no_m'].transform('count')

# 相互发送短信次数
df_sms['sms_count_mutual'] = df_sms.groupby(['phone_no_m', 'opposite_no_m'])['phone_no_m'].transform('count')

In [28]:
# 一天内短信统计量

df_sms['sms_day_count'] = df_sms.groupby(['phone_no_m', 'sms_day'])['phone_no_m'].transform('count')
df_sms['sms_day_count_max'] = df_sms.groupby('phone_no_m')['sms_day_count'].transform('max')
df_sms['sms_day_count_min'] = df_sms.groupby('phone_no_m')['sms_day_count'].transform('min')
df_sms['sms_day_count_mean'] = df_sms.groupby('phone_no_m')['sms_day_count'].transform('mean')
df_sms['sms_day_count_std'] = df_sms.groupby('phone_no_m')['sms_day_count'].transform('std')

del df_sms['sms_day_count']


# 一天内与不同的人短信统计量
df_sms['sms_day_diff_count'] = df_sms.groupby(['phone_no_m', 'opposite_no_m', 'sms_day'])['phone_no_m'].transform('count')
df_sms['sms_day_diff_count_max'] = df_sms.groupby('phone_no_m')['sms_day_diff_count'].transform('max')
df_sms['sms_day_diff_count_min'] = df_sms.groupby('phone_no_m')['sms_day_diff_count'].transform('min')
df_sms['sms_day_diff_count_mean'] = df_sms.groupby('phone_no_m')['sms_day_diff_count'].transform('mean')
df_sms['sms_day_diff_count_std'] = df_sms.groupby('phone_no_m')['sms_day_diff_count'].transform('std')

del df_sms['sms_day_diff_count']
gc.collect()

# 几号短信次数统计
sms_day_res = df_sms.groupby(['phone_no_m', 'sms_day'])['phone_no_m'].count().unstack()

for i in df_sms['sms_day'].unique():
    df_sms['sms_day{}_count'.format(i)] = df_sms['phone_no_m'].map(sms_day_res[i])

In [29]:
# 一小时内短信统计量

df_sms['sms_hour_count'] = df_sms.groupby(['phone_no_m', 'sms_hour'])['phone_no_m'].transform('count')
df_sms['sms_hour_count_max'] = df_sms.groupby('phone_no_m')['sms_hour_count'].transform('max')
df_sms['sms_hour_count_min'] = df_sms.groupby('phone_no_m')['sms_hour_count'].transform('min')
df_sms['sms_hour_count_mean'] = df_sms.groupby('phone_no_m')['sms_hour_count'].transform('mean')
df_sms['sms_hour_count_std'] = df_sms.groupby('phone_no_m')['sms_hour_count'].transform('std')




# 一小时内与不同的人短信统计量
df_sms['sms_hour_diff_count'] = df_sms.groupby(['phone_no_m', 'opposite_no_m', 'sms_hour'])['phone_no_m'].transform('count')
df_sms['sms_hour_diff_count_max'] = df_sms.groupby('phone_no_m')['sms_hour_diff_count'].transform('max')
df_sms['sms_hour_diff_count_min'] = df_sms.groupby('phone_no_m')['sms_hour_diff_count'].transform('min')
df_sms['sms_hour_diff_count_mean'] = df_sms.groupby('phone_no_m')['sms_hour_diff_count'].transform('mean')
df_sms['sms_hour_diff_count_std'] = df_sms.groupby('phone_no_m')['sms_hour_diff_count'].transform('std')

del df_sms['sms_hour_count'], df_sms['sms_hour_diff_count']
gc.collect()

0

In [30]:
# 周几短信统计量

df_sms['sms_dayofweek_count'] = df_sms.groupby(['phone_no_m', 'sms_dayofweek'])['phone_no_m'].transform('count')
df_sms['sms_dayofweek_count_max'] = df_sms.groupby('phone_no_m')['sms_dayofweek_count'].transform('max')
df_sms['sms_dayofweek_count_min'] = df_sms.groupby('phone_no_m')['sms_dayofweek_count'].transform('min')
df_sms['sms_dayofweek_count_mean'] = df_sms.groupby('phone_no_m')['sms_dayofweek_count'].transform('mean')
df_sms['sms_dayofweek_count_std'] = df_sms.groupby('phone_no_m')['sms_dayofweek_count'].transform('std')

# 周几与不同的人短信统计量
df_sms['sms_dayodweek_diff_count'] = df_sms.groupby(['phone_no_m', 'opposite_no_m', 'sms_dayofweek'])['phone_no_m'].transform('count')
df_sms['sms_dayodweek_diff_count_max'] = df_sms.groupby('phone_no_m')['sms_dayodweek_diff_count'].transform('max')
df_sms['sms_dayodweek_diff_count_min'] = df_sms.groupby('phone_no_m')['sms_dayodweek_diff_count'].transform('min')
df_sms['sms_dayodweek_diff_count_mean'] = df_sms.groupby('phone_no_m')['sms_dayodweek_diff_count'].transform('mean')
df_sms['sms_dayodweek_diff_count_std'] = df_sms.groupby('phone_no_m')['sms_dayodweek_diff_count'].transform('std')

del df_sms['sms_dayofweek_count'], df_sms['sms_dayodweek_diff_count']
gc.collect()

# 周几短信次数统计
sms_dayofweek_res = df_sms.groupby(['phone_no_m', 'sms_dayofweek'])['phone_no_m'].count().unstack()

for i in df_sms['sms_dayofweek'].unique():
    df_sms['sms_dayofweek{}_count'.format(i)] = df_sms['phone_no_m'].map(sms_dayofweek_res[i])

### 应用表

In [31]:
train_app = pd.read_hdf('../input/train_app.h5')
test_app = pd.read_hdf('../input/test_app.h5')

In [32]:
df_app = pd.concat([train_app, test_app])

In [33]:
# 用户流量统计

df_app['total_flow'] = df_app.groupby('phone_no_m')['flow'].transform('sum')
df_app['flow_max'] = df_app.groupby('phone_no_m')['flow'].transform('max')
df_app['flow_min'] = df_app.groupby('phone_no_m')['flow'].transform('min')
df_app['flow_mean'] = df_app.groupby('phone_no_m')['flow'].transform('mean')
df_app['flow_std'] = df_app.groupby('phone_no_m')['flow'].transform('std')

# 应用数
df_app['app_count'] = df_app.groupby('phone_no_m')['phone_no_m'].transform('count')

df_app['total_flow/app_count'] = np.round(df_app['total_flow'] / df_app['app_count'], 2)

In [34]:
# count编码
app_count_list = ['busi_name']

df_app = count_coding(df_app, app_count_list)

In [35]:
lbl = LabelEncoder()

for f in tqdm(['busi_name']):
    df_app[f] = df_app[f].fillna('NA')
    df_app[f] = lbl.fit_transform(df_app[f].astype(str))

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.56it/s]


In [36]:
# 用户在每个app上的流量统计

df_app['busi_name_total_flow'] = df_app.groupby(['phone_no_m', 'busi_name'])['flow'].transform('sum')
df_app['busi_name_flow_max'] = df_app.groupby(['phone_no_m', 'busi_name'])['flow'].transform('max')
df_app['busi_name_flow_min'] = df_app.groupby(['phone_no_m', 'busi_name'])['flow'].transform('min')
df_app['busi_name_flow_mean'] = df_app.groupby(['phone_no_m', 'busi_name'])['flow'].transform('mean')
df_app['busi_name_flow_std'] = df_app.groupby(['phone_no_m', 'busi_name'])['flow'].transform('std')

### 合并数据

In [37]:
df_user.shape, df_voc.shape, df_sms.shape, df_app.shape

((8151, 23), (985666, 173), (1319506, 72), (512100, 17))

In [38]:
print(df_user.columns.tolist())

['phone_no_m', 'city_name', 'county_name', 'idcard_cnt', 'label', 'arpu_202004', 'idcard_cnt*arpu_202004', 'arpu_202004/idcard_cnt', 'city_name_count', 'county_name_count', 'idcard_cnt_count', 'city_name_idcard_cnt_max', 'city_name_idcard_cnt_min', 'city_name_idcard_cnt_median', 'city_name_arpu_202004_max', 'city_name_arpu_202004_min', 'city_name_arpu_202004_median', 'county_name_idcard_cnt_max', 'county_name_idcard_cnt_min', 'county_name_idcard_cnt_median', 'county_name_arpu_202004_max', 'county_name_arpu_202004_min', 'county_name_arpu_202004_median']


In [39]:
print(df_voc.columns.tolist())

['phone_no_m', 'opposite_no_m', 'calltype_id', 'start_datetime', 'call_dur', 'city_name', 'county_name', 'imei_m', 'voc_day', 'voc_hour', 'voc_dayofweek', 'voc_time', 'voc_phone_nunique', 'voc_opposite_nunique', 'voc_count', 'call_dur_sum', 'voc_count_mutual', 'call_dur_sum_mutual', 'voc_day_count_max', 'voc_day_count_mean', 'voc_day_count_std', 'voc_day_diff_count_max', 'voc_day_diff_count_min', 'voc_day_diff_count_mean', 'voc_day_diff_count_std', 'voc_day22_count', 'voc_day23_count', 'voc_day1_count', 'voc_day3_count', 'voc_day9_count', 'voc_day6_count', 'voc_day10_count', 'voc_day26_count', 'voc_day27_count', 'voc_day13_count', 'voc_day28_count', 'voc_day30_count', 'voc_day20_count', 'voc_day16_count', 'voc_day19_count', 'voc_day21_count', 'voc_day29_count', 'voc_day17_count', 'voc_day7_count', 'voc_day14_count', 'voc_day15_count', 'voc_day5_count', 'voc_day2_count', 'voc_day12_count', 'voc_day8_count', 'voc_day18_count', 'voc_day31_count', 'voc_day4_count', 'voc_day24_count', 'voc_

In [40]:
voc_cols = [i for i in df_voc.columns if i not in ['opposite_no_m', 'start_datetime', 'city_name', 'county_name', 'imei_m']]
df_voc = df_voc[voc_cols].drop_duplicates(subset=['phone_no_m'])

In [41]:
print(df_sms.columns.tolist())

['phone_no_m', 'opposite_no_m', 'calltype_id_sms', 'request_datetime', 'sms_day', 'sms_hour', 'sms_dayofweek', 'sms_time', 'sms_count', 'sms_count_mutual', 'sms_day_count_max', 'sms_day_count_min', 'sms_day_count_mean', 'sms_day_count_std', 'sms_day_diff_count_max', 'sms_day_diff_count_min', 'sms_day_diff_count_mean', 'sms_day_diff_count_std', 'sms_day1_count', 'sms_day2_count', 'sms_day3_count', 'sms_day4_count', 'sms_day5_count', 'sms_day6_count', 'sms_day7_count', 'sms_day8_count', 'sms_day9_count', 'sms_day10_count', 'sms_day11_count', 'sms_day12_count', 'sms_day13_count', 'sms_day14_count', 'sms_day15_count', 'sms_day16_count', 'sms_day17_count', 'sms_day18_count', 'sms_day19_count', 'sms_day20_count', 'sms_day21_count', 'sms_day22_count', 'sms_day23_count', 'sms_day24_count', 'sms_day25_count', 'sms_day26_count', 'sms_day27_count', 'sms_day28_count', 'sms_day29_count', 'sms_day30_count', 'sms_day31_count', 'sms_hour_count_max', 'sms_hour_count_min', 'sms_hour_count_mean', 'sms_ho

In [42]:
sms_cols = [i for i in df_sms.columns if i not in ['opposite_no_m', 'request_datetime']]
df_sms = df_sms[sms_cols].drop_duplicates(subset=['phone_no_m'])

In [43]:
print(df_app.columns.tolist())

['phone_no_m', 'busi_name', 'flow', 'month_id', 'total_flow', 'flow_max', 'flow_min', 'flow_mean', 'flow_std', 'app_count', 'total_flow/app_count', 'busi_name_count', 'busi_name_total_flow', 'busi_name_flow_max', 'busi_name_flow_min', 'busi_name_flow_mean', 'busi_name_flow_std']


In [44]:
app_cols = [i for i in df_app.columns if i not in ['month_id']]
df_app = df_app[app_cols].drop_duplicates(subset=['phone_no_m'])

In [45]:
df = pd.merge(df_user, df_voc, how='left', on='phone_no_m')
df = pd.merge(df, df_sms, how='left', on='phone_no_m')
df = pd.merge(df, df_app, how='left', on='phone_no_m')

In [46]:
del df_user, df_voc, df_sms, df_app
gc.collect()

20

In [47]:
# 通话费用=消费额-短信次数
df['arpu_202004-sms_count'] = df['arpu_202004'] - df['sms_count']

# 消费额/总流量
df['arpu_202004/total_flow'] = np.round(df['arpu_202004'] / df['total_flow'], 2)

In [48]:
# 通话费用/多少个不同的人
df['arpu_202004-sms_count/voc_opposite_nunique'] = np.round(df['arpu_202004-sms_count'] / df['voc_opposite_nunique'], 2)

# 每次通话平均通话费用
df['arpu_202004-sms_count/voc_count'] = np.round(df['arpu_202004-sms_count'] / df['voc_count'], 2)

# 每分钟通话费用
df['arpu_202004-sms_count/call_dur_sum'] = np.round(df['arpu_202004-sms_count'] / df['call_dur_sum'], 2)

In [49]:
df_train = df[df.label.notna()]
df_test = df[df.label.isna()]

df_train.shape, df_test.shape

((6106, 279), (2045, 279))

In [50]:
df_train_train = df_train[df_train['voc_day'] <= 23]
df_train_valid = df_train[df_train['voc_day'] > 23]

X_train = df_train_train.drop('label', axis=1)
y_train = df_train_train['label']
X_valid = df_train_valid.drop('label', axis=1)
y_valid = df_train_valid['label']

In [51]:
train_cols = [i for i in X_train if i not in ['phone_no_m', 'label']]

In [52]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0,
          'lambda_l2': 1,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [53]:
use_cols = []
useless_cols = []

for i in train_cols:
    print(i)
    
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_eval= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_eval, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.52:
        use_cols.append(i)
    else:
        useless_cols.append(i)
    print('*' * 20)
    print('\n')

city_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.646321	valid_0's auc: 0.597995
[40]	training's auc: 0.646321	valid_0's auc: 0.597995
Early stopping, best iteration is:
[1]	training's auc: 0.64515	valid_0's auc: 0.601982
*****
0.601982421875
********************


county_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.748006	valid_0's auc: 0.609027
[40]	training's auc: 0.753241	valid_0's auc: 0.610905
Early stopping, best iteration is:
[2]	training's auc: 0.731584	valid_0's auc: 0.614792
*****
0.6147916666666666
********************


idcard_cnt
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.689453	valid_0's auc: 0.653587
[40]	training's auc: 0.689453	valid_0's auc: 0.653587
Early stopping, best iteration is:
[3]	training's auc: 0.689453	valid_0's auc: 0.653587
*****
0.6535872395833333
********************


arpu_202004
Training until validation scores don't improv

[40]	training's auc: 0.563116	valid_0's auc: 0.5
Early stopping, best iteration is:
[1]	training's auc: 0.546379	valid_0's auc: 0.5
*****
0.5
********************


voc_hour
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.582403	valid_0's auc: 0.508031
[40]	training's auc: 0.582462	valid_0's auc: 0.508415
Early stopping, best iteration is:
[1]	training's auc: 0.574698	valid_0's auc: 0.51847
*****
0.5184700520833333
********************


voc_dayofweek
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.531814	valid_0's auc: 0.509134
[40]	training's auc: 0.531814	valid_0's auc: 0.509134
Early stopping, best iteration is:
[1]	training's auc: 0.523237	valid_0's auc: 0.524248
*****
0.524248046875
********************


voc_time
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.673209	valid_0's auc: 0.5
[40]	training's auc: 0.679583	valid_0's auc: 0.5
Early stopping, best iteration is:
[1]	

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.694287	valid_0's auc: 0.597881
[40]	training's auc: 0.694549	valid_0's auc: 0.596663
[60]	training's auc: 0.695961	valid_0's auc: 0.59805
Early stopping, best iteration is:
[14]	training's auc: 0.693701	valid_0's auc: 0.602028
*****
0.6020279947916667
********************


voc_day13_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.689505	valid_0's auc: 0.656032
[40]	training's auc: 0.689582	valid_0's auc: 0.655882
Early stopping, best iteration is:
[1]	training's auc: 0.681876	valid_0's auc: 0.65749
*****
0.657490234375
********************


voc_day28_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.690149	valid_0's auc: 0.643818
[40]	training's auc: 0.690381	valid_0's auc: 0.642399
[60]	training's auc: 0.691194	valid_0's auc: 0.644629
[80]	training's auc: 0.691253	valid_0's auc: 0.644844
[100]	training's auc: 0.692258	v

[45]	training's auc: 0.707742	valid_0's auc: 0.644368
*****
0.6443684895833334
********************


voc_day11_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.703522	valid_0's auc: 0.676055
[40]	training's auc: 0.703746	valid_0's auc: 0.675029
[60]	training's auc: 0.703916	valid_0's auc: 0.675856
Early stopping, best iteration is:
[21]	training's auc: 0.703519	valid_0's auc: 0.676126
*****
0.6761263020833334
********************


voc_time_count_max
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.782229	valid_0's auc: 0.727926
[40]	training's auc: 0.78231	valid_0's auc: 0.728027
Early stopping, best iteration is:
[2]	training's auc: 0.781536	valid_0's auc: 0.72945
*****
0.7294498697916667
********************


voc_time_count_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.858719	valid_0's auc: 0.739167
[40]	training's auc: 0.865208	valid_0's auc: 0.732965
Early stopp

[40]	training's auc: 0.745521	valid_0's auc: 0.662533
Early stopping, best iteration is:
[3]	training's auc: 0.737388	valid_0's auc: 0.665049
*****
0.665048828125
********************


voc_dayofweek3_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.757538	valid_0's auc: 0.619974
[40]	training's auc: 0.759539	valid_0's auc: 0.618503
Early stopping, best iteration is:
[2]	training's auc: 0.754127	valid_0's auc: 0.626322
*****
0.6263216145833334
********************


voc_dayofweek5_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.74351	valid_0's auc: 0.661227
[40]	training's auc: 0.745484	valid_0's auc: 0.662201
[60]	training's auc: 0.747183	valid_0's auc: 0.66251
Early stopping, best iteration is:
[17]	training's auc: 0.743467	valid_0's auc: 0.664577
*****
0.6645768229166666
********************


voc_dayofweek2_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.7582

[1]	training's auc: 0.714835	valid_0's auc: 0.617718
*****
0.6177180989583333
********************


voc_day20_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.721194	valid_0's auc: 0.64096
[40]	training's auc: 0.725916	valid_0's auc: 0.639814
Early stopping, best iteration is:
[1]	training's auc: 0.708569	valid_0's auc: 0.649014
*****
0.649013671875
********************


voc_day16_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.707899	valid_0's auc: 0.625638
[40]	training's auc: 0.713831	valid_0's auc: 0.621888
Early stopping, best iteration is:
[8]	training's auc: 0.7004	valid_0's auc: 0.631999
*****
0.6319986979166666
********************


voc_day19_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.717761	valid_0's auc: 0.601191
[40]	training's auc: 0.723473	valid_0's auc: 0.605208
[60]	training's auc: 0.729123	valid_0's auc: 0.600911
[80]

[40]	training's auc: 0.687692	valid_0's auc: 0.54346
Early stopping, best iteration is:
[2]	training's auc: 0.633367	valid_0's auc: 0.559258
*****
0.5592578125
********************


call_dur_hour_sum_max
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.718857	valid_0's auc: 0.609046
[40]	training's auc: 0.727234	valid_0's auc: 0.607074
Early stopping, best iteration is:
[6]	training's auc: 0.692338	valid_0's auc: 0.613499
*****
0.6134993489583334
********************


call_dur_hour_sum_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.703739	valid_0's auc: 0.601344
[40]	training's auc: 0.710442	valid_0's auc: 0.605381
[60]	training's auc: 0.715202	valid_0's auc: 0.606605
[80]	training's auc: 0.718264	valid_0's auc: 0.606585
[100]	training's auc: 0.720131	valid_0's auc: 0.605589
[120]	training's auc: 0.721941	valid_0's auc: 0.605088
Early stopping, best iteration is:
[70]	training's auc: 0.716599	valid_0's auc: 0

*****
0.6402473958333333
********************


voc_hour17_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.722072	valid_0's auc: 0.594902
[40]	training's auc: 0.730511	valid_0's auc: 0.591423
Early stopping, best iteration is:
[4]	training's auc: 0.703396	valid_0's auc: 0.601553
*****
0.601552734375
********************


voc_hour7_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.667367	valid_0's auc: 0.644736
[40]	training's auc: 0.672393	valid_0's auc: 0.644831
Early stopping, best iteration is:
[1]	training's auc: 0.658345	valid_0's auc: 0.649714
*****
0.6497135416666666
********************


voc_hour5_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.527833	valid_0's auc: 0.51738
[40]	training's auc: 0.530958	valid_0's auc: 0.518252
Early stopping, best iteration is:
[1]	training's auc: 0.516551	valid_0's auc: 0.524691
*****
0.524690755208

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.755478	valid_0's auc: 0.645456
[40]	training's auc: 0.761533	valid_0's auc: 0.641979
Early stopping, best iteration is:
[3]	training's auc: 0.740085	valid_0's auc: 0.646911
*****
0.6469108072916666
********************


voc_dayofweek3_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.76502	valid_0's auc: 0.636012
[40]	training's auc: 0.770177	valid_0's auc: 0.630781
[60]	training's auc: 0.774449	valid_0's auc: 0.628851
Early stopping, best iteration is:
[17]	training's auc: 0.763674	valid_0's auc: 0.637933
*****
0.6379329427083333
********************


voc_dayofweek5_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.75995	valid_0's auc: 0.640771
[40]	training's auc: 0.766214	valid_0's auc: 0.646143
[60]	training's auc: 0.76953	valid_0's auc: 0.647337
[80]	training's auc: 0.771685	valid_0's auc: 0.648522
[100]

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.672438	valid_0's auc: 0.628623
[40]	training's auc: 0.672631	valid_0's auc: 0.631481
[60]	training's auc: 0.673487	valid_0's auc: 0.630876
Early stopping, best iteration is:
[24]	training's auc: 0.672564	valid_0's auc: 0.631914
*****
0.6319140625
********************


sms_day6_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.678133	valid_0's auc: 0.590374
[40]	training's auc: 0.679143	valid_0's auc: 0.586048
Early stopping, best iteration is:
[9]	training's auc: 0.677141	valid_0's auc: 0.593568
*****
0.5935677083333334
********************


sms_day7_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.65019	valid_0's auc: 0.618988
[40]	training's auc: 0.65059	valid_0's auc: 0.619118
Early stopping, best iteration is:
[1]	training's auc: 0.646442	valid_0's auc: 0.624714
*****
0.6247135416666667
********************


sms_day8

[40]	training's auc: 0.793516	valid_0's auc: 0.626725
[60]	training's auc: 0.793578	valid_0's auc: 0.626178
Early stopping, best iteration is:
[22]	training's auc: 0.793459	valid_0's auc: 0.628577
*****
0.6285774739583333
********************


sms_day28_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.791787	valid_0's auc: 0.644811
[40]	training's auc: 0.791861	valid_0's auc: 0.642484
Early stopping, best iteration is:
[1]	training's auc: 0.7911	valid_0's auc: 0.649538
*****
0.6495377604166667
********************


sms_day29_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.663709	valid_0's auc: 0.554479
[40]	training's auc: 0.66373	valid_0's auc: 0.55487
Early stopping, best iteration is:
[1]	training's auc: 0.659324	valid_0's auc: 0.558633
*****
0.5586328125
********************


sms_day30_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.789752	valid_0's auc: 0

*****
0.585
********************


sms_dayofweek4_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.708637	valid_0's auc: 0.60832
[40]	training's auc: 0.710277	valid_0's auc: 0.606631
Early stopping, best iteration is:
[5]	training's auc: 0.705415	valid_0's auc: 0.611048
*****
0.6110481770833334
********************


sms_dayofweek5_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.715491	valid_0's auc: 0.583955
[40]	training's auc: 0.716599	valid_0's auc: 0.58363
Early stopping, best iteration is:
[1]	training's auc: 0.708147	valid_0's auc: 0.596071
*****
0.5960709635416667
********************


busi_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.616577	valid_0's auc: 0.579424
[40]	training's auc: 0.616802	valid_0's auc: 0.578851
[60]	training's auc: 0.616821	valid_0's auc: 0.578434
Early stopping, best iteration is:
[19]	training's auc: 0.616577	valid_0's auc: 0.

In [54]:
print(useless_cols)

['city_name_arpu_202004_min', 'call_dur', 'voc_day', 'voc_hour', 'voc_time', 'voc_day_diff_count_min', 'voc_hour1_call_dur_sum', 'voc_hour3_call_dur_sum', 'voc_hour4_call_dur_sum', 'sms_day', 'sms_dayofweek', 'sms_time', 'sms_day_diff_count_min', 'sms_hour_count_min', 'sms_hour_diff_count_min', 'sms_dayodweek_diff_count_min']


In [55]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'binary_logloss',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0,
          'lambda_l2': 1,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [56]:
lgb_train = lgb.Dataset(X_train[use_cols].values, y_train) 

lgb_eval= lgb.Dataset(X_valid[use_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

# train

lgb_val_0 = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_eval, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

Start training...
Training until validation scores don't improve for 100 rounds
[10]	training's binary_logloss: 0.243228	valid_0's binary_logloss: 0.295694
[20]	training's binary_logloss: 0.168116	valid_0's binary_logloss: 0.273783
[30]	training's binary_logloss: 0.12044	valid_0's binary_logloss: 0.253521
[40]	training's binary_logloss: 0.0875677	valid_0's binary_logloss: 0.238933
[50]	training's binary_logloss: 0.0636606	valid_0's binary_logloss: 0.230784
[60]	training's binary_logloss: 0.0466134	valid_0's binary_logloss: 0.225667
[70]	training's binary_logloss: 0.034559	valid_0's binary_logloss: 0.223033
[80]	training's binary_logloss: 0.025951	valid_0's binary_logloss: 0.218926
[90]	training's binary_logloss: 0.0198123	valid_0's binary_logloss: 0.219906
[100]	training's binary_logloss: 0.0152716	valid_0's binary_logloss: 0.222576
[110]	training's binary_logloss: 0.0120816	valid_0's binary_logloss: 0.225995
[120]	training's binary_logloss: 0.00966144	valid_0's binary_logloss: 0.22893

In [57]:
# 验证集结果
X_valid['prob'] = lgb_val_0.predict(X_valid[use_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.4735, 1, 0)

f1_04735 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc_04735 = roc_auc_score(y_valid, X_valid['prob'])

print('f1_04735: ', f1_04735)
print('auc_04735: ', auc_04735)

f1_04735:  0.5751
auc_04735:  0.8563541666666666


In [58]:
# 验证集结果
X_valid['prob'] = lgb_val_0.predict(X_valid[use_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.5, 1, 0)

f1_05 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc_05 = roc_auc_score(y_valid, X_valid['prob'])

print('f1_05: ', f1_05)
print('auc_05: ', auc_05)

f1_05:  0.5727
auc_05:  0.8563541666666666


In [59]:
lgb_train_all = lgb.Dataset(df_train[use_cols].values, df_train['label'])   

print('Start training...')

# train

lgb_model = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_val_0.best_iteration + 20)

Start training...


In [60]:
df_test['label'] = np.where(lgb_model.predict(df_test[use_cols]) > 0.5, 1, 0)
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_05), index=False)

In [61]:
df_test['label'] = np.where(lgb_model.predict(df_test[use_cols]) > 0.4735, 1, 0)
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_04735), index=False)