In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from feature_selector import FeatureSelector
from sklearn.decomposition import NMF

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

  import pandas.util.testing as tm


In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg(['mean', 'size'])
        col_avg_y['size'] = prior_weight_func(col_avg_y['size'])
        col_avg_y[nf_name] = col_avg_y['size'] * prior + (1 - col_avg_y['size']) * col_avg_y['mean']
        col_avg_y.drop(['size', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new

In [4]:
def count_encode(df, cols):
    for col in tqdm(cols):
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')
    return df
        
        
def label_encode(df, cols):
    le = LabelEncoder()
    for col in tqdm(cols):
        df[col] = df[col].fillna('NA')
        df[col] = le.fit_transform(df[col].astype(str))
    return df


def cross_cat_num(df, cat_col, num_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            df_new = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_nunique'.format(f1, f2): 'nunique'
            })
            df = df.merge(df_new, on=f1, how='left')
            del df_new
            gc.collect()
    return df

In [5]:
train_user = reduce_mem_usage(pd.read_hdf('../input/train_user.h5'))
test_user = reduce_mem_usage(pd.read_hdf('../input/test_user.h5'))

Memory usage of dataframe is 219816.00 MB
Memory usage after optimization is: 219816.00 MB
Decreased by 0.0%
Memory usage of dataframe is 71575.00 MB
Memory usage after optimization is: 71575.00 MB
Decreased by 0.0%


In [6]:
# 将city_name和county_name拼接起来
train_user['city_name_county_name'] = train_user['city_name'].astype(str) + '_' + train_user['county_name'].astype(str)
test_user['city_name_county_name'] = test_user['city_name'].astype(str) + '_' + test_user['county_name'].astype(str)

In [7]:
cat_feat = ['city_name', 'county_name', 'city_name_county_name']
num_feat = ['idcard_cnt', 'arpu_202004', 'idcard_cnt*arpu_202004', 'arpu_202004/idcard_cnt']

In [8]:
y = train_user['label']

ME = MeanEncoder(categorical_features=cat_feat,
                 n_splits=3,
                 target_type='classification',
                 prior_weight_func=None)
X_data = ME.fit_transform(train_user.drop('label', axis=1), y)
X_test = ME.transform(test_user)

train_user = X_data.copy()
train_user['label'] = y
test_user = X_test.copy()

del X_data, X_test
gc.collect()

train_user.shape, test_user.shape

((6106, 13), (2045, 12))

In [9]:
df_user = pd.concat([train_user, test_user])

del train_user, test_user
gc.collect()

20

In [10]:
user_phone_no_m = df_user[['phone_no_m']].copy()

In [11]:
# 电话的数量*月消费额
df_user['idcard_cnt*arpu_202004'] = df_user['idcard_cnt'] * df_user['arpu_202004']

# 月消费额/电话的数量
df_user['arpu_202004/idcard_cnt'] = df_user['arpu_202004'] / (df_user['idcard_cnt'] + 0.0001)

In [12]:
df_user = cross_cat_num(df_user, cat_feat, num_feat)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s]
 50%|██████████████████████████████████████████                                          | 2/4 [00:00<00:00, 11.12it/s]
 33%|████████████████████████████                                                        | 1/3 [00:00<00:00,  2.69it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s]
 25%|█████████████████████                                                               | 1/4 [00:00<00:00,  6.29it/s]
 50%|██████████████████████████████████████████                                          | 2/4 [00:00<00:00,  6.66it/s]
 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:00<00:00,  7.02it/s]
 67%|███████████████████████████████████

In [13]:
df_user = count_encode(df_user, cat_feat + ['idcard_cnt'])

  0%|                                                                                            | 0/4 [00:00<?, ?it/s]

city_name
county_name
city_name_county_name
idcard_cnt


100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 121.28it/s]


In [14]:
df_user = label_encode(df_user, cat_feat)

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 100.02it/s]


In [15]:
df_user.to_hdf('../input/user_features.h5', 'df', index=False)

del df_user
gc.collect()

77

## voc表

In [16]:
train_voc = pd.read_hdf('../input/train_voc.h5')
test_voc = pd.read_hdf('../input/test_voc.h5')

In [17]:
df_voc = pd.concat([train_voc, test_voc])

del train_voc, test_voc
gc.collect()

128

In [18]:
phone_no_m = user_phone_no_m.merge(df_voc, on='phone_no_m', how='left')
phone_no_m = phone_no_m[phone_no_m['start_datetime'].notnull()]

In [19]:
opposite_no_m = user_phone_no_m.merge(df_voc,
                                      left_on='phone_no_m',
                                      right_on='opposite_no_m',
                                      how='left')
opposite_no_m = opposite_no_m[opposite_no_m['start_datetime'].notnull()]

opposite_no_m.drop('phone_no_m_y',
                   axis=1,
                   inplace=True)
opposite_no_m.rename(columns={'opposite_no_m': 'phone_no_m',
                              'phone_no_m_x': 'opposite_no_m'},
                     inplace=True)

df_voc = pd.concat([phone_no_m, opposite_no_m])
print(df_voc.shape)

del phone_no_m, opposite_no_m
gc.collect()

(985955, 8)


0

In [20]:
df_voc['city_name_county_name'] = df_voc['city_name'] + '_' + df_voc['county_name']

In [21]:
df_voc['voc_day'] = df_voc['start_datetime'].astype('datetime64').dt.day
df_voc['voc_hour'] = df_voc['start_datetime'].astype('datetime64').dt.hour
df_voc['voc_dayofweek'] = df_voc['start_datetime'].astype('datetime64').dt.dayofweek

In [22]:
phone_no_m = df_voc[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'],
                                        keep='last')

In [23]:
# 通话次数，通话人数
tmp = df_voc.groupby('phone_no_m')['opposite_no_m'].agg(opposite_cnt='count',
                                                        opposite_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, how='left', on='phone_no_m')

# 人均通话次数
phone_no_m['voc_cnt_per_capita'] = phone_no_m['opposite_cnt'] / (phone_no_m['opposite_nunique'] + 0.0001)

del tmp
gc.collect()

20

In [24]:
"""
主叫通话
"""

df_calltype_id_1 = df_voc.loc[df_voc['calltype_id'] == 1, :].copy()

# 主叫通话次数，主叫通话使用的手机个数
tmp = df_calltype_id_1.groupby('phone_no_m')['imei_m'].agg(call_1_cnt='count',
                                                           imeis_1='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话人数
tmp = df_calltype_id_1.groupby('phone_no_m')['opposite_no_m'].agg(opposite_no_m_cnt='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()


# 主叫通话时长
tmp = df_calltype_id_1.groupby('phone_no_m')['call_dur'].agg(call_dur_1_sum='sum',
                                                             call_dur_1_max='max',
                                                             call_dur_1_min='min',
                                                             call_dur_1_std='std',
                                                             call_dur_1_median='median',
                                                             call_dur_1_mean='mean')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()


# 主叫通话时长小于等于30s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] <= 30]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(call_dur_30s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于30s，小于等于60s的次数
tmp1 = df_calltype_id_1[(df_calltype_id_1['call_dur'] > 30) & (df_calltype_id_1['call_dur'] <= 60)]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(call_dur_30_60s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于60s，小于120s的次数
tmp1 = df_calltype_id_1[(df_calltype_id_1['call_dur'] > 60) & (df_calltype_id_1['call_dur'] <= 120)]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(call_dur_60_120s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于120s，小于300s的次数
tmp1 = df_calltype_id_1[(df_calltype_id_1['call_dur'] > 120) & (df_calltype_id_1['call_dur'] <= 300)]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(call_dur_120_300s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于300s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] > 300]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(call_dur_300s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()


# 主叫通话时长小于等于30s的次数的占比
phone_no_m['call_dur_30s_cnt_rate'] = phone_no_m['call_dur_30s_cnt'] / phone_no_m['call_1_cnt']

# 主叫通话时长大于30s，小于60s的次数的占比
phone_no_m['call_dur_30_60s_cnt_rate'] = phone_no_m['call_dur_30_60s_cnt'] / phone_no_m['call_1_cnt']

# 主叫通话时长大于60s，小于120s的次数的占比
phone_no_m['call_dur_60_120s_cnt_rate'] = phone_no_m['call_dur_60_120s_cnt'] / phone_no_m['call_1_cnt']

# 主叫通话时长大于120s，小于300s的次数的占比
phone_no_m['call_dur_120_300s_cnt_rate'] = phone_no_m['call_dur_120_300s_cnt'] / phone_no_m['call_1_cnt']

# 主叫通话时长大于300s的次数的占比
phone_no_m['call_dur_300s_cnt_rate'] = phone_no_m['call_dur_300s_cnt'] / phone_no_m['call_1_cnt']



# 主叫通话次数的占比
phone_no_m["call_type_id_1_rate"] = phone_no_m['call_1_cnt'] / phone_no_m['opposite_cnt']

# 主叫通话时所在地市变动的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时所在区县变动的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时所在地市_区县变动的个数
tmp = df_calltype_id_1.groupby("phone_no_m")['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 线上效果变差，不用
# # 主叫通话每天的通话次数
# calltype_id_1_voc_day_cnt_res = df_calltype_id_1.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
# for i in df_calltype_id_1['voc_day'].unique():
#     phone_no_m['calltype_id_1_voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(calltype_id_1_voc_day_cnt_res[i])

# del calltype_id_1_voc_day_cnt_res
# gc.collect()

0

In [25]:
"""
被叫通话
"""

df_calltype_id_2 = df_voc.loc[df_voc['calltype_id'] == 2, :].copy()

# 被叫通话的次数
tmp = df_calltype_id_2.groupby('phone_no_m')['imei_m'].agg(call_2_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 被叫通话次数的占比
phone_no_m["call_type_id_2_rate"] = phone_no_m['call_2_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_2
gc.collect()

# 线上变差，删除
# 主叫次数/被叫次数
phone_no_m['voc_calltype_1/2'] = phone_no_m['call_1_cnt'] / (phone_no_m['call_2_cnt'] + 0.0001)

In [26]:
"""
呼叫转移
"""

df_calltype_id_3 = df_voc.loc[df_voc['calltype_id'] == 3, :].copy()

# 呼叫转移的次数
tmp = df_calltype_id_3.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_3_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 呼叫转移次数的占比
phone_no_m["call_type_id_3_rate"] = phone_no_m['voc_calltype_id_3_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_3
gc.collect()

# 线上变差，删除
# # 主叫次数/呼叫转移次数
# phone_no_m['voc_calltype_1/3'] = phone_no_m['voc_calltype_id_1_cnt'] / (phone_no_m['voc_calltype_id_3_cnt'] + 0.0001)

0

In [27]:
"""
与对端通话统计
"""

# 与对端通话次数，与对端通话时长
tmp = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['call_dur'].agg(call_count='count',
                                                                      call_sum='sum')

# 与对端通话次数的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_count'].agg(phone2opposite_cnt_mean='mean',
                                                             phone2opposite_cnt_median='median',
                                                             phone2opposite_cnt_min='min',
                                                             phone2opposite_cnt_max='max',
                                                             phone2opposite_cnt_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite
gc.collect()


# 与对端总通话时长的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_sum'].agg(phone2opposite_call_dur_mean='mean',
                                                           phone2opposite_call_dur_median='median',
                                                           phone2opposite_call_dur_min='min',
                                                           phone2opposite_call_dur_max='max',
                                                           phone2opposite_call_dur_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite, tmp
gc.collect()

0

In [28]:
"""
通话时长的统计
"""

# 通话时长的统计量
tmp = df_voc.groupby('phone_no_m')['call_dur'].agg(call_dur_mean='mean',
                                                   call_dur_median='median',
                                                   call_dur_max='max',
                                                   call_dur_min='min',
                                                   call_dur_std='std',
                                                   call_dur_sum='sum')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

0

In [29]:
"""
收费号码位置变动
"""

# 收费号码所在地市的个数
tmp = df_voc.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 收费号码所在区县的个数
tmp = df_voc.groupby('phone_no_m')['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 收费号码所在地市_区县的个数
tmp = df_voc.groupby('phone_no_m')['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 收费号码通话类型的个数
tmp = df_voc.groupby('phone_no_m')['calltype_id'].agg(calltype_id_unique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [30]:
"""
通话时间点的偏好
"""

# hour通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_hour'].agg(voc_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   voc_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   voc_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# day通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_day'].agg(voc_day_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                  voc_day_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                  voc_day_nunique='nunique')                           # 一个月有多少天打了电话
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()


# dayofweek通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_dayofweek'].agg(voc_dayofweek_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                        voc_dayofweek_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                        voc_dayofweek_nunique='nunique')                           
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [31]:
# 每天通话次数的均值、标准差
tmp1 = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].agg(voc_day_cnt='count')
tmp2 = tmp1.groupby('phone_no_m')['voc_day_cnt'].agg(voc_day_cnt_mean='mean',
                                                     voc_day_cnt_std='std')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')

del tmp1, tmp2
gc.collect()

20

In [32]:
# 每天的通话次数
voc_day_cnt_res = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_cnt_res[i])

    
# 每天的通话人数
voc_day_nunique_res = df_voc.groupby(['phone_no_m', 'voc_day'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_nunique_res[i])

    
# 每天的通话时长
voc_day_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_day'])['call_dur'].sum().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_call_dur_res[i])



# 每小时的通话次数
voc_hour_cnt_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['phone_no_m'].count().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_cnt_res[i])
    
# 每小时的通话人数
voc_hour_nunique_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_nunique_res[i])
    

# 每小时的通话时长
voc_hour_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['call_dur'].sum().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_call_dur_res[i])

    

# 每周几的通话次数
voc_dayofweek_cnt_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['phone_no_m'].count().unstack()
for i in df_voc['voc_dayofweek'].unique():
    phone_no_m['voc_dayofweek{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_dayofweek_cnt_res[i])
    
# 每周几的通话人数
voc_dayoffweek_nunique_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_dayofweek'].unique():
    phone_no_m['voc_dayofweek{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_dayoffweek_nunique_res[i])
    

# 每周几的通话时长
voc_dayofweek_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['call_dur'].sum().unstack()
for i in df_voc['voc_dayofweek'].unique():
    phone_no_m['voc_dayofweek{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_dayofweek_call_dur_res[i])

In [33]:
phone_no_m.to_hdf('../input/voc_features.h5', 'df', index=False)
del phone_no_m
gc.collect()

64

## sms表

In [34]:
train_sms = pd.read_hdf('../input/train_sms.h5')
test_sms = pd.read_hdf('../input/test_sms.h5')

In [35]:
df_sms = pd.concat([train_sms, test_sms])

del train_sms, test_sms
gc.collect()

106

In [36]:
df_sms['sms_day'] = df_sms['request_datetime'].astype('datetime64').dt.day
df_sms['sms_hour'] = df_sms['request_datetime'].astype('datetime64').dt.hour
df_sms['sms_dayofweek'] = df_sms['request_datetime'].astype('datetime64').dt.dayofweek

In [37]:
phone_no_m = df_sms[["phone_no_m"]].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [38]:
# 短信次数，短信人数
tmp = df_sms.groupby('phone_no_m')['opposite_no_m'].agg(sms_cnt='count', sms_nunique='nunique')

# 人均短信次数
tmp['sms_cnt_per_capita'] = tmp['sms_cnt'] / (tmp['sms_nunique'] + 0.0001)
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [39]:
"""
短信上行，短信下行
"""

# 短信上行
df_sms_calltype1 = df_sms[df_sms['calltype_id'] == 1].copy()
tmp = df_sms_calltype1.groupby('phone_no_m')['calltype_id'].agg(sms_calltype1_cnt="count")
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行比例
phone_no_m['sms_calltype1_rate'] = phone_no_m['sms_calltype1_cnt'] / phone_no_m['sms_cnt']

del tmp, df_sms_calltype1

# 短信下行
df_sms_calltype2 = df_sms[df_sms['calltype_id'] == 2].copy()
tmp = df_sms_calltype2.groupby('opposite_no_m')['calltype_id'].agg(sms_calltype2_cnt="count")
phone_no_m = phone_no_m.merge(tmp, left_on='phone_no_m', right_on='opposite_no_m', how='left')

# 短信下行比例
phone_no_m['sms_calltype2_rate'] = phone_no_m['sms_calltype2_cnt'] / phone_no_m['sms_cnt']

# 短信上行/短信下行
phone_no_m['type1_rate/type2_rate'] = phone_no_m['sms_calltype1_cnt'] / (phone_no_m['sms_calltype2_cnt'] + 0.00001)

# 删除，效果变差
# # 短信下行/短信上行
# phone_no_m['type2_rate/type1_rate'] = phone_no_m['sms_calltype2_cnt'] / (phone_no_m['sms_calltype1_cnt'] + 0.00001)

del tmp, df_sms_calltype2
gc.collect()

0

In [40]:
"""
短信时间点的偏好
"""

tmp = df_sms.groupby('phone_no_m')['sms_hour'].agg(sms_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   sms_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   sms_hour_nunique='nunique')                          # 一个月有多少个小时发了短信
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

tmp = df_sms.groupby('phone_no_m')['sms_day'].agg(sms_day_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                  sms_day_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                  sms_day_nunique='nunique')                            # 一个月有多少天发了短信
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()


tmp = df_sms.groupby('phone_no_m')['sms_dayofweek'].agg(sms_dayofweek_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                        sms_dayofweek_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                        sms_dayofweek_nunique='nunique')                            # 一个月有多少天发了短信
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [41]:
# 每天的短信次数
sms_day_res = df_sms.groupby(['phone_no_m', 'sms_day'])['phone_no_m'].count().unstack()
for i in df_sms['sms_day'].unique():
    phone_no_m['sms_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_day_res[i])


# 每小时的短信次数
sms_hour_res = df_sms.groupby(['phone_no_m', 'sms_hour'])['phone_no_m'].count().unstack()
for i in df_sms['sms_hour'].unique():
    phone_no_m['sms_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_hour_res[i])
    
    
# 每周几的短信次数
sms_dayofweek_res = df_sms.groupby(['phone_no_m', 'sms_dayofweek'])['phone_no_m'].count().unstack()
for i in df_sms['sms_dayofweek'].unique():
    phone_no_m['sms_weekofday{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_dayofweek_res[i])

In [42]:
phone_no_m.to_hdf('../input/sms_features.h5', 'df', index=False)

del phone_no_m, df_sms
gc.collect()

64

## 读取数据，建模

In [43]:
df_user = reduce_mem_usage(pd.read_hdf('../input/user_features.h5'))
df_voc = reduce_mem_usage(pd.read_hdf('../input/voc_features.h5'))
df_sms = reduce_mem_usage(pd.read_hdf('../input/sms_features.h5'))
# df_app = reduce_mem_usage(pd.read_hdf('../input/app_features.h5'))

Memory usage of dataframe is 4042896.00 MB
Memory usage after optimization is: 1858428.00 MB
Decreased by 54.0%
Memory usage of dataframe is 13576000.00 MB
Memory usage after optimization is: 3482244.00 MB
Decreased by 74.3%
Memory usage of dataframe is 4455000.00 MB
Memory usage after optimization is: 1292500.00 MB
Decreased by 71.0%


In [44]:
df_user.shape, df_voc.shape, df_sms.shape

((8151, 91), (6788, 249), (6875, 80))

In [45]:
df = df_user.merge(df_voc, on='phone_no_m', how='left')
df = df.merge(df_sms, on='phone_no_m', how='left')
# df = df.merge(df_app, on='phone_no_m', how='left')

del df_user, df_voc, df_sms#, df_app
gc.collect()

0

In [46]:
df_train = df[df.label.notna()]
df_test = df[df.label.isna()]

df_train.shape, df_test.shape

((6106, 418), (2045, 418))

In [47]:
fs = FeatureSelector(data=df_train.drop(['phone_no_m', 'label'], axis=1), labels=df_train['label'])

fs.identify_all(selection_params={'missing_threshold': 0.98,
                                  'correlation_threshold': 0.99, 
                                  'task': 'classification',
                                  'eval_metric': 'auc', 
                                  'cumulative_importance': 0.99})

9 features with greater than 0.98 missing values.

0 features with a single unique value.

42 features with a correlation magnitude greater than 0.99.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[109]	valid_0's auc: 0.96655	valid_0's binary_logloss: 0.169935
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[101]	valid_0's auc: 0.965576	valid_0's binary_logloss: 0.181879
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[87]	valid_0's auc: 0.952403	valid_0's binary_logloss: 0.212199
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[91]	valid_0's auc: 0.952731	valid_0's binary_logloss: 0.195562
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[93]	valid_0's auc: 0.953395	valid_0's binary_logloss: 0.221191
Training

In [48]:
train_removed_all_once = fs.remove(methods='all')
# train_removed_all_once

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 91 features.


In [49]:
use_cols = train_removed_all_once.columns.to_list()

del train_removed_all_once
gc.collect()

46

In [50]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train[use_cols], df_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [51]:
train_cols = [i for i in X_train.columns if i not in ['phone_no_m', 'opposite_no_m', 'label']]

In [52]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.12,
          'num_leaves': 31,
          'lambda_l1': 0.1,
          'lambda_l2': 0,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [53]:
useful_cols = []
useless_cols = []

for i in train_cols:
    print(i)
    
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_valid= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_valid, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.52:
        useful_cols.append(i)
    else:
        useless_cols.append(i)
    print('*' * 20)
    print('\n')

city_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.684155	valid_0's auc: 0.671655
[40]	training's auc: 0.684156	valid_0's auc: 0.672408
[60]	training's auc: 0.684158	valid_0's auc: 0.671655
[80]	training's auc: 0.684158	valid_0's auc: 0.671655
Early stopping, best iteration is:
[31]	training's auc: 0.684156	valid_0's auc: 0.672408
*****
0.6724078657865786
********************


county_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.769	valid_0's auc: 0.725784
[40]	training's auc: 0.773026	valid_0's auc: 0.729645
[60]	training's auc: 0.774292	valid_0's auc: 0.730264
[80]	training's auc: 0.775225	valid_0's auc: 0.731938
[100]	training's auc: 0.775908	valid_0's auc: 0.731651
[120]	training's auc: 0.776536	valid_0's auc: 0.731784
[140]	training's auc: 0.7769	valid_0's auc: 0.731993
[160]	training's auc: 0.777141	valid_0's auc: 0.731924
[180]	training's auc: 0.777322	valid_0's auc: 0.732627
[200]	training's 

0.6718413689195006
********************


county_name_idcard_cnt_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.760585	valid_0's auc: 0.726375
[40]	training's auc: 0.76268	valid_0's auc: 0.727183
[60]	training's auc: 0.763919	valid_0's auc: 0.726448
[80]	training's auc: 0.764713	valid_0's auc: 0.725953
Early stopping, best iteration is:
[42]	training's auc: 0.762833	valid_0's auc: 0.727863
*****
0.7278632754579806
********************


county_name_idcard_cnt_skew
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.765615	valid_0's auc: 0.722548
[40]	training's auc: 0.769772	valid_0's auc: 0.727762
[60]	training's auc: 0.771846	valid_0's auc: 0.7296
[80]	training's auc: 0.773203	valid_0's auc: 0.728352
[100]	training's auc: 0.774253	valid_0's auc: 0.72853
Early stopping, best iteration is:
[62]	training's auc: 0.771892	valid_0's auc: 0.729878
*****
0.7298781508585641
********************


county_name_idcard_cnt_

county_name_arpu_202004/idcard_cnt_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.754062	valid_0's auc: 0.707916
[40]	training's auc: 0.756969	valid_0's auc: 0.71085
[60]	training's auc: 0.758214	valid_0's auc: 0.712689
[80]	training's auc: 0.759194	valid_0's auc: 0.714019
[100]	training's auc: 0.760085	valid_0's auc: 0.714756
[120]	training's auc: 0.760433	valid_0's auc: 0.714819
[140]	training's auc: 0.760919	valid_0's auc: 0.714885
[160]	training's auc: 0.761187	valid_0's auc: 0.715426
[180]	training's auc: 0.76134	valid_0's auc: 0.71571
[200]	training's auc: 0.76157	valid_0's auc: 0.716346
[220]	training's auc: 0.761714	valid_0's auc: 0.716275
[240]	training's auc: 0.76181	valid_0's auc: 0.716385
[260]	training's auc: 0.761921	valid_0's auc: 0.716415
[280]	training's auc: 0.762041	valid_0's auc: 0.71681
[300]	training's auc: 0.762144	valid_0's auc: 0.717333
[320]	training's auc: 0.762169	valid_0's auc: 0.717273
[340]	training's auc: 0.762

0.716718682737839
********************


city_name_county_name_arpu_202004/idcard_cnt_skew
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.764172	valid_0's auc: 0.722228
[40]	training's auc: 0.768732	valid_0's auc: 0.725926
[60]	training's auc: 0.771932	valid_0's auc: 0.727908
[80]	training's auc: 0.773667	valid_0's auc: 0.728729
[100]	training's auc: 0.774696	valid_0's auc: 0.729602
[120]	training's auc: 0.775431	valid_0's auc: 0.729265
[140]	training's auc: 0.775842	valid_0's auc: 0.729747
[160]	training's auc: 0.776257	valid_0's auc: 0.729458
[180]	training's auc: 0.776617	valid_0's auc: 0.729007
Early stopping, best iteration is:
[142]	training's auc: 0.775878	valid_0's auc: 0.729959
*****
0.7299588654517626
********************


county_name_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.749138	valid_0's auc: 0.722068
[40]	training's auc: 0.750786	valid_0's auc: 0.725268
[60]	training's auc: 0.751275	va

call_dur_30_60s_cnt_rate
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.795175	valid_0's auc: 0.776364
[40]	training's auc: 0.799091	valid_0's auc: 0.77741
[60]	training's auc: 0.800857	valid_0's auc: 0.778038
[80]	training's auc: 0.801958	valid_0's auc: 0.778848
[100]	training's auc: 0.803432	valid_0's auc: 0.777455
[120]	training's auc: 0.804033	valid_0's auc: 0.777781
Early stopping, best iteration is:
[84]	training's auc: 0.802209	valid_0's auc: 0.779141
*****
0.7791409575740182
********************


call_dur_60_120s_cnt_rate
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.777638	valid_0's auc: 0.745012
[40]	training's auc: 0.780587	valid_0's auc: 0.746341
[60]	training's auc: 0.782526	valid_0's auc: 0.74806
[80]	training's auc: 0.783713	valid_0's auc: 0.747967
[100]	training's auc: 0.784637	valid_0's auc: 0.748429
[120]	training's auc: 0.785567	valid_0's auc: 0.747776
[140]	training's auc: 0.786163	valid_0's

[60]	training's auc: 0.84419	valid_0's auc: 0.834418
Early stopping, best iteration is:
[21]	training's auc: 0.842465	valid_0's auc: 0.836417
*****
0.8364169297364519
********************


call_dur_std
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.858338	valid_0's auc: 0.80205
[40]	training's auc: 0.86241	valid_0's auc: 0.802363
[60]	training's auc: 0.864349	valid_0's auc: 0.801867
Early stopping, best iteration is:
[25]	training's auc: 0.85985	valid_0's auc: 0.803348
*****
0.8033478619601091
********************


call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.878681	valid_0's auc: 0.828292
[40]	training's auc: 0.881238	valid_0's auc: 0.828272
Early stopping, best iteration is:
[1]	training's auc: 0.863442	valid_0's auc: 0.839296
*****
0.8392957502271966
********************


county_name_nunique_y
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.833327	valid_0's

[40]	training's auc: 0.765732	valid_0's auc: 0.742779
Early stopping, best iteration is:
[1]	training's auc: 0.765527	valid_0's auc: 0.743235
*****
0.7432349213182188
********************


voc_day12_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.756822	valid_0's auc: 0.733087
[40]	training's auc: 0.756899	valid_0's auc: 0.733147
[60]	training's auc: 0.757036	valid_0's auc: 0.732905
Early stopping, best iteration is:
[12]	training's auc: 0.7568	valid_0's auc: 0.733238
*****
0.7332382694791219
********************


voc_day7_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.748824	valid_0's auc: 0.740068
[40]	training's auc: 0.74892	valid_0's auc: 0.741177
[60]	training's auc: 0.749137	valid_0's auc: 0.741407
[80]	training's auc: 0.749149	valid_0's auc: 0.741434
[100]	training's auc: 0.749426	valid_0's auc: 0.741368
[120]	training's auc: 0.749443	valid_0's auc: 0.741362
Early stopping, best iteration is:


[20]	training's auc: 0.758839	valid_0's auc: 0.724346
[40]	training's auc: 0.758911	valid_0's auc: 0.723137
[60]	training's auc: 0.759161	valid_0's auc: 0.723155
Early stopping, best iteration is:
[27]	training's auc: 0.758853	valid_0's auc: 0.724535
*****
0.7245345458458889
********************


voc_day26_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.774696	valid_0's auc: 0.74396
[40]	training's auc: 0.774699	valid_0's auc: 0.74397
[60]	training's auc: 0.774702	valid_0's auc: 0.743984
[80]	training's auc: 0.774996	valid_0's auc: 0.743996
[100]	training's auc: 0.774996	valid_0's auc: 0.743999
[120]	training's auc: 0.774996	valid_0's auc: 0.744002
[140]	training's auc: 0.774996	valid_0's auc: 0.744002
[160]	training's auc: 0.774997	valid_0's auc: 0.743996
Early stopping, best iteration is:
[111]	training's auc: 0.774996	valid_0's auc: 0.744002
*****
0.744001709953604
********************


voc_day11_nunique
Training until validation scores 

[140]	training's auc: 0.770216	valid_0's auc: 0.753318
[160]	training's auc: 0.77022	valid_0's auc: 0.753318
[180]	training's auc: 0.770245	valid_0's auc: 0.753345
[200]	training's auc: 0.770245	valid_0's auc: 0.753345
[220]	training's auc: 0.770295	valid_0's auc: 0.752493
Early stopping, best iteration is:
[171]	training's auc: 0.770245	valid_0's auc: 0.753345
*****
0.7533451714736691
********************


voc_day19_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.761115	valid_0's auc: 0.754994
[40]	training's auc: 0.761153	valid_0's auc: 0.754764
[60]	training's auc: 0.761156	valid_0's auc: 0.754782
Early stopping, best iteration is:
[15]	training's auc: 0.76111	valid_0's auc: 0.755024
*****
0.7550237360692591
********************


voc_day3_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.745088	valid_0's auc: 0.724934
[40]	training's auc: 0.745261	valid_0's auc: 0.726871
[60]	training's auc: 0.74

[80]	training's auc: 0.754658	valid_0's auc: 0.726448
Early stopping, best iteration is:
[31]	training's auc: 0.752326	valid_0's auc: 0.727865
*****
0.7278647701726695
********************


voc_day5_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.755104	valid_0's auc: 0.719748
[40]	training's auc: 0.757108	valid_0's auc: 0.718704
Early stopping, best iteration is:
[4]	training's auc: 0.752355	valid_0's auc: 0.720071
*****
0.7200713277849524
********************


voc_day8_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.745847	valid_0's auc: 0.70907
[40]	training's auc: 0.747428	valid_0's auc: 0.70733
[60]	training's auc: 0.748292	valid_0's auc: 0.706913
Early stopping, best iteration is:
[16]	training's auc: 0.745538	valid_0's auc: 0.709759
*****
0.7097592911465059
********************


voc_day25_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's au

[1]	training's auc: 0.708052	valid_0's auc: 0.699697
*****
0.6996968718610992
********************


voc_hour23_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.568302	valid_0's auc: 0.566534
[40]	training's auc: 0.568304	valid_0's auc: 0.56653
Early stopping, best iteration is:
[1]	training's auc: 0.567943	valid_0's auc: 0.569308
*****
0.5693084254077582
********************


voc_hour22_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.646706	valid_0's auc: 0.635696
[40]	training's auc: 0.647892	valid_0's auc: 0.63707
[60]	training's auc: 0.647903	valid_0's auc: 0.637165
Early stopping, best iteration is:
[10]	training's auc: 0.646947	valid_0's auc: 0.640082
*****
0.6400816712105993
********************


voc_hour16_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.782637	valid_0's auc: 0.765118
[40]	training's auc: 0.783231	valid_0's auc: 0.763533
Early stopping, 

[60]	training's auc: 0.816228	valid_0's auc: 0.779349
Early stopping, best iteration is:
[14]	training's auc: 0.811176	valid_0's auc: 0.785546
*****
0.7855458100157842
********************


voc_hour13_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.806963	valid_0's auc: 0.781404
[40]	training's auc: 0.809222	valid_0's auc: 0.780057
[60]	training's auc: 0.810297	valid_0's auc: 0.779579
Early stopping, best iteration is:
[19]	training's auc: 0.806839	valid_0's auc: 0.781625
*****
0.781625173386904
********************


voc_hour21_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.748836	valid_0's auc: 0.712804
[40]	training's auc: 0.750622	valid_0's auc: 0.713036
Early stopping, best iteration is:
[8]	training's auc: 0.746832	valid_0's auc: 0.71403
*****
0.7140296910125795
********************


voc_hour18_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training

[40]	training's auc: 0.844938	valid_0's auc: 0.826743
[60]	training's auc: 0.845289	valid_0's auc: 0.826671
Early stopping, best iteration is:
[10]	training's auc: 0.843625	valid_0's auc: 0.827505
*****
0.8275054407614675
********************


voc_dayofweek5_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.842661	valid_0's auc: 0.817495
[40]	training's auc: 0.843451	valid_0's auc: 0.818338
[60]	training's auc: 0.84366	valid_0's auc: 0.818673
[80]	training's auc: 0.84374	valid_0's auc: 0.819963
[100]	training's auc: 0.843845	valid_0's auc: 0.820014
[120]	training's auc: 0.843899	valid_0's auc: 0.819799
Early stopping, best iteration is:
[73]	training's auc: 0.843678	valid_0's auc: 0.820101
*****
0.8201006241928541
********************


voc_dayofweek1_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.856707	valid_0's auc: 0.84136
[40]	training's auc: 0.857297	valid_0's auc: 0.841494
Early stopping, best 

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.850891	valid_0's auc: 0.80798
[40]	training's auc: 0.852891	valid_0's auc: 0.805757
[60]	training's auc: 0.853914	valid_0's auc: 0.804469
Early stopping, best iteration is:
[22]	training's auc: 0.851364	valid_0's auc: 0.808339
*****
0.8083387143062132
********************


sms_hour_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.880531	valid_0's auc: 0.85812
[40]	training's auc: 0.880537	valid_0's auc: 0.85821
Early stopping, best iteration is:
[1]	training's auc: 0.880204	valid_0's auc: 0.859477
*****
0.8594773879561869
********************


sms_day_mode
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.829988	valid_0's auc: 0.824639
[40]	training's auc: 0.829989	valid_0's auc: 0.824446
[60]	training's auc: 0.829989	valid_0's auc: 0.824446
Early stopping, best iteration is:
[11]	training's auc: 0.82995	valid_0's auc: 0.8255

[160]	training's auc: 0.840316	valid_0's auc: 0.835332
[180]	training's auc: 0.840317	valid_0's auc: 0.83535
[200]	training's auc: 0.840318	valid_0's auc: 0.835323
[220]	training's auc: 0.840318	valid_0's auc: 0.835329
[240]	training's auc: 0.840321	valid_0's auc: 0.835353
[260]	training's auc: 0.840321	valid_0's auc: 0.83538
[280]	training's auc: 0.840321	valid_0's auc: 0.835353
Early stopping, best iteration is:
[239]	training's auc: 0.840321	valid_0's auc: 0.83538
*****
0.8353795977423829
********************


sms_day16_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.801167	valid_0's auc: 0.790608
[40]	training's auc: 0.801588	valid_0's auc: 0.790716
[60]	training's auc: 0.802075	valid_0's auc: 0.792438
[80]	training's auc: 0.802176	valid_0's auc: 0.792561
[100]	training's auc: 0.802212	valid_0's auc: 0.792587
[120]	training's auc: 0.802634	valid_0's auc: 0.792245
[140]	training's auc: 0.802677	valid_0's auc: 0.792167
Early stopping, best i

*****
0.8256863729851246
********************


sms_hour17_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.82008	valid_0's auc: 0.799668
[40]	training's auc: 0.820546	valid_0's auc: 0.799628
Early stopping, best iteration is:
[4]	training's auc: 0.819503	valid_0's auc: 0.802544
*****
0.8025437054575023
********************


sms_hour11_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.856889	valid_0's auc: 0.820691
[40]	training's auc: 0.857407	valid_0's auc: 0.820887
[60]	training's auc: 0.857804	valid_0's auc: 0.821933
[80]	training's auc: 0.857977	valid_0's auc: 0.822441
[100]	training's auc: 0.85819	valid_0's auc: 0.821177
[120]	training's auc: 0.858411	valid_0's auc: 0.821604
Early stopping, best iteration is:
[73]	training's auc: 0.857949	valid_0's auc: 0.822776
*****
0.8227761634859138
********************


sms_hour9_count
Training until validation scores don't improve for 50 rounds
[20]	training'

[100]	training's auc: 0.861477	valid_0's auc: 0.833957
[120]	training's auc: 0.861747	valid_0's auc: 0.833706
Early stopping, best iteration is:
[82]	training's auc: 0.861193	valid_0's auc: 0.834387
*****
0.8343871071889798
********************


sms_weekofday4_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.858448	valid_0's auc: 0.840834
[40]	training's auc: 0.859323	valid_0's auc: 0.837789
[60]	training's auc: 0.85986	valid_0's auc: 0.836215
Early stopping, best iteration is:
[15]	training's auc: 0.858102	valid_0's auc: 0.841906
*****
0.8419055220739465
********************


sms_weekofday5_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.860853	valid_0's auc: 0.825108
[40]	training's auc: 0.861272	valid_0's auc: 0.825133
[60]	training's auc: 0.861583	valid_0's auc: 0.825359
[80]	training's auc: 0.861807	valid_0's auc: 0.825141
Early stopping, best iteration is:
[49]	training's auc: 0.861481	valid_0's 

In [54]:
print(useless_cols)

['voc_hour3_call_dur_sum', 'sms_hour4_count']


In [55]:
lgb_train = lgb.Dataset(X_train[useful_cols].values, y_train) 

lgb_valid= lgb.Dataset(X_valid[useful_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

lgb_val_0 = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_valid, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

print('Done!')

Start training...
Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.992065	valid_0's auc: 0.938889
[20]	training's auc: 0.998078	valid_0's auc: 0.950615
[30]	training's auc: 0.999342	valid_0's auc: 0.955048
[40]	training's auc: 0.999611	valid_0's auc: 0.95465
[50]	training's auc: 0.999769	valid_0's auc: 0.952124
[60]	training's auc: 0.999897	valid_0's auc: 0.954405
[70]	training's auc: 1	valid_0's auc: 0.953765
[80]	training's auc: 1	valid_0's auc: 0.95294
[90]	training's auc: 1	valid_0's auc: 0.953493
[100]	training's auc: 1	valid_0's auc: 0.954791
[110]	training's auc: 1	valid_0's auc: 0.954994
[120]	training's auc: 1	valid_0's auc: 0.955619
[130]	training's auc: 1	valid_0's auc: 0.955957
[140]	training's auc: 1	valid_0's auc: 0.956031
[150]	training's auc: 1	valid_0's auc: 0.956262
[160]	training's auc: 1	valid_0's auc: 0.956229
[170]	training's auc: 1	valid_0's auc: 0.956928
[180]	training's auc: 1	valid_0's auc: 0.956898
[190]	training's auc: 1	v

In [56]:
# 验证集结果
X_valid['prob'] = lgb_val_0.predict(X_valid[useful_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.4735, 1, 0)

f1_04735 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc_04735 = roc_auc_score(y_valid, X_valid['prob'])

print('f1_04735: ', f1_04735)
print('auc_04735: ', auc_04735)

f1_04735:  0.8804
auc_04735:  0.9574992825369492


In [57]:
lgb_train_all = lgb.Dataset(df_train[useful_cols].values, df_train['label'])   

print('Start training...')

lgb_model = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_val_0.best_iteration + 20)

print('Done!')

Start training...
Done!


In [58]:
df_test['label'] = np.where(lgb_model.predict(df_test[useful_cols]) > 0.4735, 1, 0)
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_04735), index=False)