In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from feature_selector import FeatureSelector
from sklearn.decomposition import NMF

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg(['mean', 'size'])
        col_avg_y['size'] = prior_weight_func(col_avg_y['size'])
        col_avg_y[nf_name] = col_avg_y['size'] * prior + (1 - col_avg_y['size']) * col_avg_y['mean']
        col_avg_y.drop(['size', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new

In [4]:
def count_encode(df, cols):
    for col in tqdm(cols):
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')
    return df
        
        
def label_encode(df, cols):
    le = LabelEncoder()
    for col in tqdm(cols):
        df[col] = df[col].fillna('NA')
        df[col] = le.fit_transform(df[col].astype(str))
    return df


def cross_cat_num(df, cat_col, num_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            df_new = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_nunique'.format(f1, f2): 'nunique'
            })
            df = df.merge(df_new, on=f1, how='left')
            del df_new
            gc.collect()
    return df

In [5]:
train_user = reduce_mem_usage(pd.read_hdf('../input/train_user.h5'))
test_user = reduce_mem_usage(pd.read_hdf('../input/test_user.h5'))

Memory usage of dataframe is 219816.00 MB
Memory usage after optimization is: 219816.00 MB
Decreased by 0.0%
Memory usage of dataframe is 71575.00 MB
Memory usage after optimization is: 71575.00 MB
Decreased by 0.0%


In [6]:
# 将city_name和county_name拼接起来
train_user['city_name_county_name'] = train_user['city_name'].astype(str) + '_' + train_user['county_name'].astype(str)
test_user['city_name_county_name'] = test_user['city_name'].astype(str) + '_' + test_user['county_name'].astype(str)

In [7]:
cat_feat = ['city_name', 'county_name', 'city_name_county_name']
num_feat = ['idcard_cnt', 'arpu_202004', 'idcard_cnt*arpu_202004', 'arpu_202004/idcard_cnt']

In [8]:
y = train_user['label']

ME = MeanEncoder(categorical_features=cat_feat,
                 n_splits=3,
                 target_type='classification',
                 prior_weight_func=None)
X_data = ME.fit_transform(train_user.drop('label', axis=1), y)
X_test = ME.transform(test_user)

train_user = X_data.copy()
train_user['label'] = y
test_user = X_test.copy()

del X_data, X_test
gc.collect()

train_user.shape, test_user.shape

((6106, 13), (2045, 12))

In [9]:
df_user = pd.concat([train_user, test_user])

del train_user, test_user
gc.collect()

20

In [10]:
user_phone_no_m = df_user[['phone_no_m']].copy()

In [11]:
# 电话的数量*月消费额
df_user['idcard_cnt*arpu_202004'] = df_user['idcard_cnt'] * df_user['arpu_202004']

# 月消费额/电话的数量
df_user['arpu_202004/idcard_cnt'] = df_user['arpu_202004'] / (df_user['idcard_cnt'] + 0.0001)

In [12]:
df_user = cross_cat_num(df_user, cat_feat, num_feat)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 21.22it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:00,  5.20it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:00<00:00, 12.06it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 12.79it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:00<00:00,  4.26it/s]
  0%|                    

In [13]:
df_user = count_encode(df_user, cat_feat + ['idcard_cnt'])

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 272.61it/s]

city_name
county_name
city_name_county_name
idcard_cnt





In [14]:
df_user = label_encode(df_user, cat_feat)

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 273.46it/s]


In [15]:
df_user.to_hdf('../input/user_features.h5', 'df', index=False)

del df_user
gc.collect()

55

## voc表

In [16]:
train_voc = pd.read_hdf('../input/train_voc.h5')
test_voc = pd.read_hdf('../input/test_voc.h5')

In [17]:
df_voc = pd.concat([train_voc, test_voc])

del train_voc, test_voc
gc.collect()

128

In [18]:
df_voc['city_name_county_name'] = df_voc['city_name'] + '_' + df_voc['county_name']

In [19]:
df_voc = user_phone_no_m.merge(df_voc, on='phone_no_m', how='left')
df_voc = df_voc[df_voc['start_datetime'].notnull()]

len(set(df_voc['phone_no_m']))

6788

In [20]:
df_voc['city_name_county_name'] = df_voc['city_name'] + '_' + df_voc['county_name']

In [21]:
df_voc['voc_day'] = df_voc['start_datetime'].astype('datetime64').dt.day
df_voc['voc_hour'] = df_voc['start_datetime'].astype('datetime64').dt.hour
df_voc['voc_dayofweek'] = df_voc['start_datetime'].astype('datetime64').dt.dayofweek

### phone_no_m为主键

In [22]:
phone_no_m = df_voc[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [23]:
# 通话次数，通话人数
tmp = df_voc.groupby('phone_no_m')['opposite_no_m'].agg(opposite_cnt='count', opposite_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, how='left', on='phone_no_m')

# 人均通话次数
phone_no_m['voc_cnt_per_capita'] = phone_no_m['opposite_cnt'] / (phone_no_m['opposite_nunique'] + 0.0001)

del tmp
gc.collect()

70

In [24]:
"""
主叫通话
"""

df_calltype_id_1 = df_voc.loc[df_voc['calltype_id'] == 1, :].copy()

# 主叫通话次数，主叫通话使用的手机个数
tmp = df_calltype_id_1.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_1_cnt='count', imeis_1='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长
tmp = df_calltype_id_1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_call_dur_sum='sum')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长小于10s的次数
# tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 10]
# tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_10s_cnt='count')
# phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
# del tmp1, tmp2
# gc.collect()

# 主叫通话时长小于30s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 30]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_30s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长小于60s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 60]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_60s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于300s（5分钟）的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] > 300]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_300s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# # 主叫通话时长小于10s的次数的占比
# phone_no_m['voc_calltype_id_1_10s_rate'] = phone_no_m['voc_calltype_id_1_10s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长小于30s的次数的占比
phone_no_m['voc_calltype_id_1_30s_rate'] = phone_no_m['voc_calltype_id_1_30s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长小于60s的次数的占比
phone_no_m['voc_calltype_id_1_60s_rate'] = phone_no_m['voc_calltype_id_1_60s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长大于300s（5分钟）的次数的占比
phone_no_m['voc_calltype_id_1_300s_rate'] = phone_no_m['voc_calltype_id_1_300s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']


# 主叫通话次数的占比
phone_no_m["call_type_id_1_rate"] = phone_no_m['voc_calltype_id_1_cnt'] / phone_no_m['opposite_cnt']

# 主叫通话时所在地市变动的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时所在区县变动的个数
tmp = df_calltype_id_1.groupby("phone_no_m")['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
del tmp
gc.collect()

# 主叫通话时所在地市_区县变动的个数
tmp = df_calltype_id_1.groupby("phone_no_m")['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
del tmp
gc.collect()

# 线上效果变差，不用
# # 主叫通话每天的通话次数
# calltype_id_1_voc_day_cnt_res = df_calltype_id_1.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
# for i in df_calltype_id_1['voc_day'].unique():
#     phone_no_m['calltype_id_1_voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(calltype_id_1_voc_day_cnt_res[i])

# del calltype_id_1_voc_day_cnt_res
# gc.collect()

0

In [25]:
"""
被叫通话
"""

df_calltype_id_2 = df_voc.loc[df_voc['calltype_id'] == 2, :].copy()

# 被叫通话的次数
tmp = df_calltype_id_2.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_2_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 被叫通话次数的占比
phone_no_m["call_type_id_2_rate"] = phone_no_m['voc_calltype_id_2_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_2
gc.collect()

# 线上变差，删除
# 主叫次数/被叫次数
phone_no_m['voc_calltype_1/2'] = phone_no_m['voc_calltype_id_1_cnt'] / (phone_no_m['voc_calltype_id_2_cnt'] + 0.0001)

In [26]:
"""
呼叫转移
"""

df_calltype_id_3 = df_voc.loc[df_voc['calltype_id'] == 3, :].copy()

# 呼叫转移的次数
tmp = df_calltype_id_3.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_3_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 呼叫转移次数的占比
phone_no_m["call_type_id_3_rate"] = phone_no_m['voc_calltype_id_3_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_3
gc.collect()

# 线上变差，删除
# # 主叫次数/呼叫转移次数
# phone_no_m['voc_calltype_1/3'] = phone_no_m['voc_calltype_id_1_cnt'] / (phone_no_m['voc_calltype_id_3_cnt'] + 0.0001)

20

In [27]:
"""
与对端通话统计
"""

# 与对端通话次数，与对端通话时长
tmp = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['call_dur'].agg(call_count='count',
                                                                      call_sum='sum')

# 与对端通话次数的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_count'].agg(phone2opposite_cnt_mean='mean',
                                                             phone2opposite_cnt_median='median',
                                                             phone2opposite_cnt_min='min',
                                                             phone2opposite_cnt_max='max',
                                                             phone2opposite_cnt_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite
gc.collect()


# 与对端总通话时长的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_sum'].agg(phone2opposite_call_dur_mean='mean',
                                                           phone2opposite_call_dur_median='median',
                                                           phone2opposite_call_dur_min='min',
                                                           phone2opposite_call_dur_max='max',
                                                           phone2opposite_call_dur_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite, tmp
gc.collect()

0

In [28]:
"""
通话时长的统计
"""

# 通话时长的统计量
tmp = df_voc.groupby('phone_no_m')['call_dur'].agg(call_dur_mean='mean',
                                                   call_dur_median='median',
                                                   call_dur_max='max',
                                                   call_dur_min='min',
                                                   call_dur_std='std',
                                                   call_dur_sum='sum')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

20

In [29]:
"""
收费号码位置变动
"""

# 收费号码所在地市的个数
tmp = df_voc.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 收费号码所在区县的个数
tmp = df_voc.groupby('phone_no_m')['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how='left')

# 收费号码所在地市_区县的个数
tmp = df_voc.groupby('phone_no_m')['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how='left')

# 收费号码通话类型的个数
tmp = df_voc.groupby('phone_no_m')['calltype_id'].agg(calltype_id_unique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [30]:
"""
通话时间点的偏好
"""

# hour通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_hour'].agg(voc_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   voc_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   voc_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# day通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_day'].agg(voc_day_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                  voc_day_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                  voc_day_nunique='nunique')                           # 一个月有多少天打了电话
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()


# dayofweek通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_dayofweek'].agg(voc_dayofweek_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                        voc_dayofweek_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                        voc_dayofweek_nunique='nunique')                           
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [31]:
# 每天通话次数的均值、标准差
tmp1 = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].agg(voc_day_cnt='count')
tmp2 = tmp1.groupby('phone_no_m')['voc_day_cnt'].agg(voc_day_cnt_mean='mean',
                                                     voc_day_cnt_std='std')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')

del tmp1, tmp2
gc.collect()

20

In [32]:
# 每天的通话次数
voc_day_cnt_res = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_cnt_res[i])

    
# 每天的通话人数
voc_day_nunique_res = df_voc.groupby(['phone_no_m', 'voc_day'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_nunique_res[i])

    
# 每天的通话时长
voc_day_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_day'])['call_dur'].sum().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_call_dur_res[i])



# 每小时的通话次数
voc_hour_cnt_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['phone_no_m'].count().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_cnt_res[i])
    
# 每小时的通话人数
voc_hour_nunique_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_nunique_res[i])
    

# 每小时的通话时长
voc_hour_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['call_dur'].sum().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_call_dur_res[i])

    

# 每周几的通话次数
voc_dayofweek_cnt_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['phone_no_m'].count().unstack()
for i in df_voc['voc_dayofweek'].unique():
    phone_no_m['voc_dayofweek{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_dayofweek_cnt_res[i])
    
# 每周几的通话人数
voc_dayoffweek_nunique_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_dayofweek'].unique():
    phone_no_m['voc_dayofweek{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_dayoffweek_nunique_res[i])
    

# 每周几的通话时长
voc_dayofweek_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['call_dur'].sum().unstack()
for i in df_voc['voc_dayofweek'].unique():
    phone_no_m['voc_dayofweek{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_dayofweek_call_dur_res[i])

In [33]:
phone_no_m.to_hdf('../input/voc_features.h5', 'df', index=False)
del phone_no_m
gc.collect()

64

### opposite_no_m为主键

In [34]:
df_voc_opposite = user_phone_no_m.merge(df_voc, left_on='phone_no_m', right_on='opposite_no_m', how='left')
df_voc_opposite = df_voc_opposite[df_voc_opposite['start_datetime'].notnull()]

df_voc_opposite = user_phone_no_m.merge(df_voc, left_on='phone_no_m', right_on='opposite_no_m', how='left')
df_voc_opposite = df_voc_opposite[df_voc_opposite['start_datetime'].notnull()]
df_voc_opposite.drop(['phone_no_m_y'], axis=1, inplace=True)
df_voc_opposite.rename(columns={'phone_no_m_x': 'phone_no_m'}, inplace=True)

len(set(df_voc_opposite['opposite_no_m']))

62

In [35]:
opposite_no_m = df_voc_opposite[['opposite_no_m']].copy()
opposite_no_m = opposite_no_m.drop_duplicates(subset=['opposite_no_m'], keep='last')

In [36]:
# 通话次数，通话人数
tmp = df_voc_opposite.groupby('opposite_no_m')['phone_no_m'].agg(phone_cnt='count', phone_nunique='nunique')
opposite_no_m = opposite_no_m.merge(tmp, how='left', on='opposite_no_m')

# 人均通话次数
opposite_no_m['voc_opposite_cnt_per_capita'] = opposite_no_m['phone_cnt'] / (opposite_no_m['phone_nunique'] + 0.0001)

del tmp
gc.collect()

40

In [37]:
"""
被叫通话
"""

df_calltype_id_2 = df_voc_opposite.loc[df_voc_opposite['calltype_id'] == 2, :].copy()

# 主叫通话次数，主叫通话使用的手机个数
tmp = df_calltype_id_2.groupby('opposite_no_m')['imei_m'].agg(voc_calltype_id_2_cnt='count', imeis_2='nunique')
opposite_no_m = opposite_no_m.merge(tmp, on='opposite_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长
tmp = df_calltype_id_2.groupby('opposite_no_m')['call_dur'].agg(voc_calltype_id_2_call_dur_sum='sum')
opposite_no_m = opposite_no_m.merge(tmp, on='opposite_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长小于10s的次数
# tmp1 = df_calltype_id_2[df_calltype_id_2['call_dur'] < 10]
# tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_10s_cnt='count')
# phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
# del tmp1, tmp2
# gc.collect()

# 主叫通话时长小于30s的次数
tmp1 = df_calltype_id_2[df_calltype_id_2['call_dur'] < 30]
tmp2 = tmp1.groupby('opposite_no_m')['call_dur'].agg(voc_calltype_id_2_30s_cnt='count')
opposite_no_m = opposite_no_m.merge(tmp2, on='opposite_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长小于60s的次数
tmp1 = df_calltype_id_2[df_calltype_id_2['call_dur'] < 60]
tmp2 = tmp1.groupby('opposite_no_m')['call_dur'].agg(voc_calltype_id_2_60s_cnt='count')
opposite_no_m = opposite_no_m.merge(tmp2, on='opposite_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于300s（5分钟）的次数
tmp1 = df_calltype_id_2[df_calltype_id_2['call_dur'] > 300]
tmp2 = tmp1.groupby('opposite_no_m')['call_dur'].agg(voc_calltype_id_2_300s_cnt='count')
opposite_no_m = opposite_no_m.merge(tmp2, on='opposite_no_m', how='left')
del tmp1, tmp2
gc.collect()

# # 主叫通话时长小于10s的次数的占比
# opposite_no_m['voc_calltype_id_2_10s_rate'] = opposite_no_m['voc_calltype_id_2_10s_cnt'] / opposite_no_m['voc_calltype_id_2_cnt']

# 主叫通话时长小于30s的次数的占比
opposite_no_m['voc_calltype_id_2_30s_rate'] = opposite_no_m['voc_calltype_id_2_30s_cnt'] / opposite_no_m['voc_calltype_id_2_cnt']

# 主叫通话时长小于60s的次数的占比
opposite_no_m['voc_calltype_id_2_60s_rate'] = opposite_no_m['voc_calltype_id_2_60s_cnt'] / opposite_no_m['voc_calltype_id_2_cnt']

# 主叫通话时长大于300s（5分钟）的次数的占比
opposite_no_m['voc_calltype_id_2_300s_rate'] = opposite_no_m['voc_calltype_id_2_300s_cnt'] / opposite_no_m['voc_calltype_id_2_cnt']


# 主叫通话次数的占比
opposite_no_m["call_type_id_2_rate"] = opposite_no_m['voc_calltype_id_2_cnt'] / opposite_no_m['phone_cnt']

# # 主叫通话时所在地市变动的个数
# tmp = df_calltype_id_2.groupby('opposite_no_m')['city_name'].agg(city_name_nunique_2='nunique')
# opposite_no_m = opposite_no_m.merge(tmp, on='opposite_no_m', how='left')
# del tmp
# gc.collect()

# # 主叫通话时所在区县变动的个数
# tmp = df_calltype_id_2.groupby("opposite_no_m")['county_name'].agg(county_name_nunique_2='nunique')
# opposite_no_m = opposite_no_m.merge(tmp, on="opposite_no_m", how="left")
# del tmp
# gc.collect()

# # 主叫通话时所在地市_区县变动的个数
# tmp = df_calltype_id_2.groupby("opposite_no_m")['city_name_county_name'].agg(city_name_county_name_nunique_2='nunique')
# opposite_no_m = opposite_no_m.merge(tmp, on="opposite_no_m", how="left")
# del tmp
# gc.collect()

# 线上效果变差，不用
# # 主叫通话每天的通话次数
# calltype_id_1_voc_day_cnt_res = df_calltype_id_2.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
# for i in df_calltype_id_2['voc_day'].unique():
#     phone_no_m['calltype_id_1_voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(calltype_id_1_voc_day_cnt_res[i])

# del calltype_id_1_voc_day_cnt_res
# gc.collect()

In [38]:
opposite_no_m.to_hdf('../input/voc_opposite_features.h5', 'df', index=False)
del opposite_no_m, df_voc
gc.collect()

64

## sms表

In [39]:
train_sms = pd.read_hdf('../input/train_sms.h5')
test_sms = pd.read_hdf('../input/test_sms.h5')

In [40]:
df_sms = pd.concat([train_sms, test_sms])

del train_sms, test_sms
gc.collect()

106

In [41]:
df_sms['sms_day'] = df_sms['request_datetime'].astype('datetime64').dt.day
df_sms['sms_hour'] = df_sms['request_datetime'].astype('datetime64').dt.hour
df_sms['sms_dayofweek'] = df_sms['request_datetime'].astype('datetime64').dt.dayofweek

In [42]:
phone_no_m = df_sms[["phone_no_m"]].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [43]:
# 短信次数，短信人数
tmp = df_sms.groupby('phone_no_m')['opposite_no_m'].agg(sms_cnt='count', sms_nunique='nunique')

# 人均短信次数
tmp['sms_cnt_per_capita'] = tmp['sms_cnt'] / (tmp['sms_nunique'] + 0.0001)
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [44]:
"""
短信上行，短信下行
"""

# 短信上行
df_sms_calltype1 = df_sms[df_sms['calltype_id'] == 1].copy()
tmp = df_sms_calltype1.groupby('phone_no_m')['calltype_id'].agg(sms_calltype1_cnt="count")
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行比例
phone_no_m['sms_calltype1_rate'] = phone_no_m['sms_calltype1_cnt'] / phone_no_m['sms_cnt']

del tmp, df_sms_calltype1

# 短信下行
df_sms_calltype2 = df_sms[df_sms['calltype_id'] == 2].copy()
tmp = df_sms_calltype2.groupby('opposite_no_m')['calltype_id'].agg(sms_calltype2_cnt="count")
phone_no_m = phone_no_m.merge(tmp, left_on='phone_no_m', right_on='opposite_no_m', how='left')

# 短信下行比例
phone_no_m['sms_calltype2_rate'] = phone_no_m['sms_calltype2_cnt'] / phone_no_m['sms_cnt']

# 短信上行/短信下行
phone_no_m['type1_rate/type2_rate'] = phone_no_m['sms_calltype1_cnt'] / (phone_no_m['sms_calltype2_cnt'] + 0.00001)

# 删除，效果变差
# # 短信下行/短信上行
# phone_no_m['type2_rate/type1_rate'] = phone_no_m['sms_calltype2_cnt'] / (phone_no_m['sms_calltype1_cnt'] + 0.00001)

del tmp, df_sms_calltype2
gc.collect()

0

In [45]:
"""
短信时间点的偏好
"""

tmp = df_sms.groupby('phone_no_m')['sms_hour'].agg(sms_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   sms_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   sms_hour_nunique='nunique')                          # 一个月有多少个小时发了短信
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

tmp = df_sms.groupby('phone_no_m')['sms_day'].agg(sms_day_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                  sms_day_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                  sms_day_nunique='nunique')                            # 一个月有多少天发了短信
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()


tmp = df_sms.groupby('phone_no_m')['sms_dayofweek'].agg(sms_dayofweek_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                        sms_dayofweek_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                        sms_dayofweek_nunique='nunique')                            # 一个月有多少天发了短信
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [46]:
# 每天的短信次数
sms_day_res = df_sms.groupby(['phone_no_m', 'sms_day'])['phone_no_m'].count().unstack()
for i in df_sms['sms_day'].unique():
    phone_no_m['sms_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_day_res[i])


# 每小时的短信次数
sms_hour_res = df_sms.groupby(['phone_no_m', 'sms_hour'])['phone_no_m'].count().unstack()
for i in df_sms['sms_hour'].unique():
    phone_no_m['sms_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_hour_res[i])
    
    
# 每周几的短信次数
sms_dayofweek_res = df_sms.groupby(['phone_no_m', 'sms_dayofweek'])['phone_no_m'].count().unstack()
for i in df_sms['sms_dayofweek'].unique():
    phone_no_m['sms_weekofday{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_dayofweek_res[i])

In [47]:
phone_no_m.to_hdf('../input/sms_features.h5', 'df', index=False)

del phone_no_m, df_sms
gc.collect()

64

## 读取数据，建模

In [48]:
df_user = reduce_mem_usage(pd.read_hdf('../input/user_features.h5'))
df_voc = reduce_mem_usage(pd.read_hdf('../input/voc_features.h5'))
df_voc_opposite = reduce_mem_usage(pd.read_hdf('../input/voc_opposite_features.h5'))
df_sms = reduce_mem_usage(pd.read_hdf('../input/sms_features.h5'))
# df_app = reduce_mem_usage(pd.read_hdf('../input/app_features.h5'))

Memory usage of dataframe is 4042896.00 MB
Memory usage after optimization is: 1858428.00 MB
Decreased by 54.0%
Memory usage of dataframe is 13032960.00 MB
Memory usage after optimization is: 3346484.00 MB
Decreased by 74.3%
Memory usage of dataframe is 7440.00 MB
Memory usage after optimization is: 2480.00 MB
Decreased by 66.7%
Memory usage of dataframe is 4455000.00 MB
Memory usage after optimization is: 1292500.00 MB
Decreased by 71.0%


In [49]:
df_user.shape, df_voc.shape, df_voc_opposite.shape, df_sms.shape

((8151, 91), (6788, 239), (62, 14), (6875, 80))

In [50]:
df = df_user.merge(df_voc, on='phone_no_m', how='left')
df = df.merge(df_voc_opposite, left_on='phone_no_m', right_on='opposite_no_m', how='left')
df = df.merge(df_sms, on='phone_no_m', how='left')
# df = df.merge(df_app, on='phone_no_m', how='left')

del df_user, df_voc_opposite, df_voc, df_sms#, df_app
gc.collect()

0

In [51]:
df_train = df[df.label.notna()]
df_test = df[df.label.isna()]

df_train.shape, df_test.shape

((6106, 422), (2045, 422))

In [52]:
fs = FeatureSelector(data=df_train.drop(['phone_no_m', 'label'], axis=1), labels=df_train['label'])

fs.identify_all(selection_params={'missing_threshold': 0.98,
                                  'correlation_threshold': 0.98, 
                                  'task': 'classification',
                                  'eval_metric': 'auc', 
                                  'cumulative_importance': 0.99})

23 features with greater than 0.98 missing values.

3 features with a single unique value.

63 features with a correlation magnitude greater than 0.98.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[90]	valid_0's auc: 0.949007	valid_0's binary_logloss: 0.201007
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[103]	valid_0's auc: 0.951749	valid_0's binary_logloss: 0.21121
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[73]	valid_0's auc: 0.947612	valid_0's binary_logloss: 0.212846
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[95]	valid_0's auc: 0.963969	valid_0's binary_logloss: 0.163466
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[94]	valid_0's auc: 0.955859	valid_0's binary_logloss: 0.192491
Training

In [53]:
train_removed_all_once = fs.remove(methods='all')
# train_removed_all_once

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 156 features.


In [54]:
use_cols = train_removed_all_once.columns.to_list()

del train_removed_all_once
gc.collect()

42

In [55]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train[use_cols], df_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [56]:
train_cols = [i for i in X_train.columns if i not in ['phone_no_m', 'opposite_no_m', 'label']]

In [57]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0.1,
          'lambda_l2': 0,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [58]:
useful_cols = []
useless_cols = []

for i in train_cols:
    print(i)
    
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_valid= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_valid, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.52:
        useful_cols.append(i)
    else:
        useless_cols.append(i)
    print('*' * 20)
    print('\n')

city_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.684155	valid_0's auc: 0.671655
[40]	training's auc: 0.684156	valid_0's auc: 0.672408
[60]	training's auc: 0.684156	valid_0's auc: 0.672408
[80]	training's auc: 0.684158	valid_0's auc: 0.671655
Early stopping, best iteration is:
[37]	training's auc: 0.684156	valid_0's auc: 0.672408
*****
0.6724078657865786
********************


county_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.767358	valid_0's auc: 0.72344
[40]	training's auc: 0.772283	valid_0's auc: 0.728765
[60]	training's auc: 0.77374	valid_0's auc: 0.730307
[80]	training's auc: 0.774604	valid_0's auc: 0.730896
[100]	training's auc: 0.775263	valid_0's auc: 0.73182
[120]	training's auc: 0.775831	valid_0's auc: 0.731191
[140]	training's auc: 0.776325	valid_0's auc: 0.732008
Early stopping, best iteration is:
[90]	training's auc: 0.775065	valid_0's auc: 0.732017
*****
0.7320170875783231
***********

[100]	training's auc: 0.774547	valid_0's auc: 0.730086
[120]	training's auc: 0.774957	valid_0's auc: 0.730044
[140]	training's auc: 0.775215	valid_0's auc: 0.729886
Early stopping, best iteration is:
[101]	training's auc: 0.774606	valid_0's auc: 0.730427
*****
0.7304267111493759
********************


county_name_arpu_202004_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.759176	valid_0's auc: 0.72202
[40]	training's auc: 0.763247	valid_0's auc: 0.720864
[60]	training's auc: 0.764756	valid_0's auc: 0.720293
Early stopping, best iteration is:
[14]	training's auc: 0.755985	valid_0's auc: 0.723517
*****
0.7235166451427751
********************


county_name_arpu_202004_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.76044	valid_0's auc: 0.716641
[40]	training's auc: 0.765512	valid_0's auc: 0.720696
[60]	training's auc: 0.76765	valid_0's auc: 0.721446
[80]	training's auc: 0.768852	valid_0's auc: 0.721523
[10

[400]	training's auc: 0.762305	valid_0's auc: 0.717551
[420]	training's auc: 0.762329	valid_0's auc: 0.717566
[440]	training's auc: 0.762337	valid_0's auc: 0.717584
[460]	training's auc: 0.762357	valid_0's auc: 0.717551
[480]	training's auc: 0.762391	valid_0's auc: 0.717626
[500]	training's auc: 0.762459	valid_0's auc: 0.717605
[520]	training's auc: 0.762466	valid_0's auc: 0.717662
[540]	training's auc: 0.76247	valid_0's auc: 0.717566
[560]	training's auc: 0.762504	valid_0's auc: 0.717602
[580]	training's auc: 0.762535	valid_0's auc: 0.717883
[600]	training's auc: 0.762549	valid_0's auc: 0.717886
[620]	training's auc: 0.762554	valid_0's auc: 0.717916
[640]	training's auc: 0.762562	valid_0's auc: 0.717829
[660]	training's auc: 0.762574	valid_0's auc: 0.717949
Early stopping, best iteration is:
[626]	training's auc: 0.76256	valid_0's auc: 0.717949
*****
0.7179488329267709
********************


county_name_arpu_202004/idcard_cnt_mean
Training until validation scores don't improve for 50 

[40]	training's auc: 0.843913	valid_0's auc: 0.80508
[60]	training's auc: 0.846867	valid_0's auc: 0.804535
Early stopping, best iteration is:
[28]	training's auc: 0.841367	valid_0's auc: 0.80743
*****
0.8074299277753862
********************


voc_calltype_id_1_300s_rate
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.692583	valid_0's auc: 0.659096
[40]	training's auc: 0.69639	valid_0's auc: 0.660308
[60]	training's auc: 0.69847	valid_0's auc: 0.663771
[80]	training's auc: 0.700354	valid_0's auc: 0.663215
[100]	training's auc: 0.700994	valid_0's auc: 0.663561
Early stopping, best iteration is:
[56]	training's auc: 0.69826	valid_0's auc: 0.664002
*****
0.6640015903764289
********************


call_type_id_1_rate
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.882862	valid_0's auc: 0.844037
[40]	training's auc: 0.885226	valid_0's auc: 0.843858
[60]	training's auc: 0.886462	valid_0's auc: 0.843689
Early stopping, best

[60]	training's auc: 0.846369	valid_0's auc: 0.823189
Early stopping, best iteration is:
[17]	training's auc: 0.846121	valid_0's auc: 0.823404
*****
0.8234039436552351
********************


voc_hour_mode_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.872088	valid_0's auc: 0.854358
[40]	training's auc: 0.872424	valid_0's auc: 0.854655
[60]	training's auc: 0.87266	valid_0's auc: 0.855222
Early stopping, best iteration is:
[11]	training's auc: 0.87169	valid_0's auc: 0.855917
*****
0.855916977567322
********************


voc_hour_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.863728	valid_0's auc: 0.840329
[40]	training's auc: 0.86382	valid_0's auc: 0.840332
[60]	training's auc: 0.863851	valid_0's auc: 0.840332
[80]	training's auc: 0.863861	valid_0's auc: 0.840332
Early stopping, best iteration is:
[35]	training's auc: 0.863789	valid_0's auc: 0.840332
*****
0.8403315875065768
********************


vo

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.768572	valid_0's auc: 0.745259
[40]	training's auc: 0.76875	valid_0's auc: 0.743952
[60]	training's auc: 0.768852	valid_0's auc: 0.743982
Early stopping, best iteration is:
[22]	training's auc: 0.768583	valid_0's auc: 0.745262
*****
0.7452617544363132
********************


voc_day4_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.750421	valid_0's auc: 0.72825
[40]	training's auc: 0.751049	valid_0's auc: 0.727261
[60]	training's auc: 0.751133	valid_0's auc: 0.727176
[80]	training's auc: 0.751165	valid_0's auc: 0.727092
Early stopping, best iteration is:
[32]	training's auc: 0.750524	valid_0's auc: 0.72924
*****
0.7292399076864208
********************


voc_day1_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.735531	valid_0's auc: 0.732664
[40]	training's auc: 0.736397	valid_0's auc: 0.732053
[60]	training's auc: 0.73643	v

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.754203	valid_0's auc: 0.741199
[40]	training's auc: 0.754229	valid_0's auc: 0.741579
[60]	training's auc: 0.754293	valid_0's auc: 0.741334
[80]	training's auc: 0.754309	valid_0's auc: 0.741178
[100]	training's auc: 0.754398	valid_0's auc: 0.74119
Early stopping, best iteration is:
[57]	training's auc: 0.754286	valid_0's auc: 0.7416
*****
0.7415997034486057
********************


voc_day15_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.741999	valid_0's auc: 0.74826
[40]	training's auc: 0.742022	valid_0's auc: 0.748235
Early stopping, best iteration is:
[2]	training's auc: 0.741974	valid_0's auc: 0.748871
*****
0.7488714904099105
********************


voc_day21_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.757594	valid_0's auc: 0.738305
[40]	training's auc: 0.757597	valid_0's auc: 0.738298
Early stopping, best iter

[20]	training's auc: 0.768648	valid_0's auc: 0.730944
[40]	training's auc: 0.770325	valid_0's auc: 0.730947
Early stopping, best iteration is:
[1]	training's auc: 0.764104	valid_0's auc: 0.73348
*****
0.7334804132587172
********************


voc_day9_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.755926	valid_0's auc: 0.715077
[40]	training's auc: 0.757008	valid_0's auc: 0.713527
Early stopping, best iteration is:
[5]	training's auc: 0.753724	valid_0's auc: 0.719645
*****
0.7196453340986273
********************


voc_day26_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.780934	valid_0's auc: 0.736432
[40]	training's auc: 0.782367	valid_0's auc: 0.735594
Early stopping, best iteration is:
[1]	training's auc: 0.776765	valid_0's auc: 0.737561
*****
0.7375609843593055
********************


voc_day11_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's au

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.777128	valid_0's auc: 0.754094
[40]	training's auc: 0.778892	valid_0's auc: 0.752007
[60]	training's auc: 0.779923	valid_0's auc: 0.751493
Early stopping, best iteration is:
[21]	training's auc: 0.777229	valid_0's auc: 0.754175
*****
0.7541747381259866
********************


voc_day19_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.760056	valid_0's auc: 0.737232
[40]	training's auc: 0.762602	valid_0's auc: 0.737966
Early stopping, best iteration is:
[2]	training's auc: 0.755611	valid_0's auc: 0.738633
*****
0.7386326947912183
********************


voc_day6_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.75411	valid_0's auc: 0.718062
[40]	training's auc: 0.755645	valid_0's auc: 0.71685
Early stopping, best iteration is:
[9]	training's auc: 0.750698	valid_0's auc: 0.719801
*****
0.7198007844262687
**********

[4]	training's auc: 0.801576	valid_0's auc: 0.783773
*****
0.783773078394796
********************


voc_hour14_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.805312	valid_0's auc: 0.790164
[40]	training's auc: 0.805794	valid_0's auc: 0.790138
Early stopping, best iteration is:
[5]	training's auc: 0.804587	valid_0's auc: 0.790447
*****
0.7904469794805568
********************


voc_hour13_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.802877	valid_0's auc: 0.780256
[40]	training's auc: 0.802909	valid_0's auc: 0.78105
[60]	training's auc: 0.802933	valid_0's auc: 0.781029
[80]	training's auc: 0.803232	valid_0's auc: 0.781402
[100]	training's auc: 0.803249	valid_0's auc: 0.781444
[120]	training's auc: 0.803257	valid_0's auc: 0.781396
Early stopping, best iteration is:
[88]	training's auc: 0.803246	valid_0's auc: 0.781444
*****
0.7814443129095519
********************


voc_hour21_nunique
Training until 

[60]	training's auc: 0.789828	valid_0's auc: 0.745565
Early stopping, best iteration is:
[11]	training's auc: 0.785109	valid_0's auc: 0.745945
*****
0.7459448390491223
********************


voc_hour10_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.808841	valid_0's auc: 0.788585
[40]	training's auc: 0.811849	valid_0's auc: 0.787885
[60]	training's auc: 0.814009	valid_0's auc: 0.786724
Early stopping, best iteration is:
[12]	training's auc: 0.806013	valid_0's auc: 0.78922
*****
0.7892198187210026
********************


voc_hour15_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.790932	valid_0's auc: 0.781087
[40]	training's auc: 0.793316	valid_0's auc: 0.776808
[60]	training's auc: 0.794681	valid_0's auc: 0.776749
Early stopping, best iteration is:
[24]	training's auc: 0.791363	valid_0's auc: 0.781476
*****
0.7814757019180179
********************


voc_hour7_call_dur_sum
Training until vali

[60]	training's auc: 0.846452	valid_0's auc: 0.833913
Early stopping, best iteration is:
[19]	training's auc: 0.845862	valid_0's auc: 0.836111
*****
0.8361105132252356
********************


voc_dayofweek2_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.849686	valid_0's auc: 0.835169
[40]	training's auc: 0.850301	valid_0's auc: 0.834834
[60]	training's auc: 0.850682	valid_0's auc: 0.834493
Early stopping, best iteration is:
[26]	training's auc: 0.849784	valid_0's auc: 0.835227
*****
0.8352271368441192
********************


voc_dayofweek4_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.8487	valid_0's auc: 0.836408
[40]	training's auc: 0.848435	valid_0's auc: 0.836442
[60]	training's auc: 0.849219	valid_0's auc: 0.83604
Early stopping, best iteration is:
[27]	training's auc: 0.848417	valid_0's auc: 0.83668
*****
0.8366799995216913
********************


voc_dayofweek5_nunique
Training until validatio

[40]	training's auc: 0.811138	valid_0's auc: 0.786658
Early stopping, best iteration is:
[3]	training's auc: 0.810721	valid_0's auc: 0.787444
*****
0.7874440976706366
********************


sms_day3_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.839773	valid_0's auc: 0.829325
[40]	training's auc: 0.839847	valid_0's auc: 0.829054
Early stopping, best iteration is:
[1]	training's auc: 0.839554	valid_0's auc: 0.831579
*****
0.8315785382886115
********************


sms_day4_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.770287	valid_0's auc: 0.773856
[40]	training's auc: 0.770433	valid_0's auc: 0.773981
[60]	training's auc: 0.770485	valid_0's auc: 0.774253
[80]	training's auc: 0.770571	valid_0's auc: 0.774403
[100]	training's auc: 0.770587	valid_0's auc: 0.774427
[120]	training's auc: 0.770594	valid_0's auc: 0.774442
[140]	training's auc: 0.770621	valid_0's auc: 0.774516
[160]	training's auc: 0.770622	va

[20]	training's auc: 0.773557	valid_0's auc: 0.754412
[40]	training's auc: 0.774245	valid_0's auc: 0.754661
[60]	training's auc: 0.774399	valid_0's auc: 0.754672
Early stopping, best iteration is:
[25]	training's auc: 0.774084	valid_0's auc: 0.755384
*****
0.7553839623092744
********************


sms_day24_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.871818	valid_0's auc: 0.847497
[40]	training's auc: 0.871907	valid_0's auc: 0.846426
Early stopping, best iteration is:
[1]	training's auc: 0.870984	valid_0's auc: 0.848279
*****
0.8482789855072463
********************


sms_day25_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.849237	valid_0's auc: 0.821914
[40]	training's auc: 0.849334	valid_0's auc: 0.821976
[60]	training's auc: 0.849396	valid_0's auc: 0.821971
Early stopping, best iteration is:
[12]	training's auc: 0.849125	valid_0's auc: 0.822362
*****
0.8223621275170996
********************


sms_

[20]	training's auc: 0.831972	valid_0's auc: 0.814301
[40]	training's auc: 0.832531	valid_0's auc: 0.813359
[60]	training's auc: 0.832827	valid_0's auc: 0.811712
Early stopping, best iteration is:
[23]	training's auc: 0.832027	valid_0's auc: 0.814395
*****
0.8143952982254747
********************


sms_hour12_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.83786	valid_0's auc: 0.800333
[40]	training's auc: 0.838225	valid_0's auc: 0.798852
Early stopping, best iteration is:
[1]	training's auc: 0.836301	valid_0's auc: 0.803354
*****
0.8033538408188645
********************


sms_hour7_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.649455	valid_0's auc: 0.626466
[40]	training's auc: 0.649823	valid_0's auc: 0.626239
[60]	training's auc: 0.650124	valid_0's auc: 0.624496
[80]	training's auc: 0.65016	valid_0's auc: 0.624182
Early stopping, best iteration is:
[32]	training's auc: 0.649554	valid_0's auc: 0.627248

In [59]:
print(useless_cols)

['voc_hour3_call_dur_sum', 'sms_hour4_count']


In [60]:
lgb_train = lgb.Dataset(X_train[useful_cols].values, y_train) 

lgb_valid= lgb.Dataset(X_valid[useful_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

lgb_val_0 = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_valid, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

print('Done!')

Start training...
Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.989473	valid_0's auc: 0.945418
[20]	training's auc: 0.996768	valid_0's auc: 0.951355
[30]	training's auc: 0.998408	valid_0's auc: 0.954356
[40]	training's auc: 0.999538	valid_0's auc: 0.956339
[50]	training's auc: 0.999671	valid_0's auc: 0.955697
[60]	training's auc: 0.999807	valid_0's auc: 0.955284
[70]	training's auc: 0.999935	valid_0's auc: 0.955266
[80]	training's auc: 0.999999	valid_0's auc: 0.95431
[90]	training's auc: 1	valid_0's auc: 0.954614
[100]	training's auc: 1	valid_0's auc: 0.954032
[110]	training's auc: 1	valid_0's auc: 0.95407
[120]	training's auc: 1	valid_0's auc: 0.954112
[130]	training's auc: 1	valid_0's auc: 0.955171
[140]	training's auc: 1	valid_0's auc: 0.955251
Early stopping, best iteration is:
[40]	training's auc: 0.999538	valid_0's auc: 0.956339
Done!


In [61]:
# 验证集结果
X_valid['prob'] = lgb_val_0.predict(X_valid[useful_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.4735, 1, 0)

f1_04735 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc_04735 = roc_auc_score(y_valid, X_valid['prob'])

print('f1_04735: ', f1_04735)
print('auc_04735: ', auc_04735)

f1_04735:  0.8787
auc_04735:  0.9563393839383938


In [62]:
lgb_train_all = lgb.Dataset(df_train[useful_cols].values, df_train['label'])   

print('Start training...')

lgb_model = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_val_0.best_iteration + 20)

print('Done!')

Start training...
Done!


In [63]:
df_test['label'] = np.where(lgb_model.predict(df_test[useful_cols]) > 0.4735, 1, 0)
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_04735), index=False)