In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from feature_selector import FeatureSelector
from sklearn.decomposition import NMF

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
class MeanEncoder:
    def __init__(self, categorical_features, n_splits=10, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
 
        :param n_splits: the number of splits used in mean encoding
 
        :param target_type: str, 'regression' or 'classification'
 
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        """
 
        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}
 
        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None
 
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))
 
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()
 
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression
        prior = X_train['pred_temp'].mean()
 
        col_avg_y = X_train.groupby(by=variable, axis=0)['pred_temp'].agg(['mean', 'size'])
        col_avg_y['size'] = prior_weight_func(col_avg_y['size'])
        col_avg_y[nf_name] = col_avg_y['size'] * prior + (1 - col_avg_y['size']) * col_avg_y['mean']
        col_avg_y.drop(['size', 'mean'], axis=1, inplace=True)
 
        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values
 
        return nf_train, nf_test, prior, col_avg_y
 
    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)
 
        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(X, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new
 
    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
 
        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
 
        return X_new

In [4]:
def count_encode(df, cols):
    for col in tqdm(cols):
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')
    return df
        
        
def label_encode(df, cols):
    le = LabelEncoder()
    for col in tqdm(cols):
        df[col] = df[col].fillna('NA')
        df[col] = le.fit_transform(df[col].astype(str))
    return df


def cross_cat_num(df, cat_col, num_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            df_new = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_nunique'.format(f1, f2): 'nunique'
            })
            df = df.merge(df_new, on=f1, how='left')
            del df_new
            gc.collect()
    return df

In [5]:
train_user = reduce_mem_usage(pd.read_hdf('../input/train_user.h5'))
test_user = reduce_mem_usage(pd.read_hdf('../input/test_user.h5'))

Memory usage of dataframe is 219816.00 MB
Memory usage after optimization is: 219816.00 MB
Decreased by 0.0%
Memory usage of dataframe is 71575.00 MB
Memory usage after optimization is: 71575.00 MB
Decreased by 0.0%


In [6]:
# 将city_name和county_name拼接起来
train_user['city_name_county_name'] = train_user['city_name'].astype(str) + '_' + train_user['county_name'].astype(str)
test_user['city_name_county_name'] = test_user['city_name'].astype(str) + '_' + test_user['county_name'].astype(str)

In [7]:
cat_feat = ['city_name', 'county_name', 'city_name_county_name']
num_feat = ['idcard_cnt', 'arpu_202004', 'idcard_cnt*arpu_202004', 'arpu_202004/idcard_cnt']

In [8]:
y = train_user['label']

ME = MeanEncoder(categorical_features=cat_feat,
                 n_splits=3,
                 target_type='classification',
                 prior_weight_func=None)
X_data = ME.fit_transform(train_user.drop('label', axis=1), y)
X_test = ME.transform(test_user)

train_user = X_data.copy()
train_user['label'] = y
test_user = X_test.copy()

del X_data, X_test
gc.collect()

train_user.shape, test_user.shape

((6106, 13), (2045, 12))

In [9]:
df_user = pd.concat([train_user, test_user])

del train_user, test_user
gc.collect()

20

In [10]:
user_phone_no_m = df_user[['phone_no_m']].copy()

In [11]:
# 电话的数量*月消费额
df_user['idcard_cnt*arpu_202004'] = df_user['idcard_cnt'] * df_user['arpu_202004']

# 月消费额/电话的数量
df_user['arpu_202004/idcard_cnt'] = df_user['arpu_202004'] / (df_user['idcard_cnt'] + 0.0001)

In [12]:
df_user = cross_cat_num(df_user, cat_feat, num_feat)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 21.62it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:00,  5.29it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:00<00:00, 13.16it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 13.21it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:00<00:00,  4.45it/s]
  0%|                    

In [13]:
df_user = count_encode(df_user, cat_feat + ['idcard_cnt'])

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 529.22it/s]

city_name
county_name
city_name_county_name
idcard_cnt





In [14]:
df_user = label_encode(df_user, cat_feat)

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 300.83it/s]


In [15]:
df_user.to_hdf('../input/user_features.h5', 'df', index=False)

del df_user
gc.collect()

66

## voc表

In [16]:
train_voc = pd.read_hdf('../input/train_voc.h5')
test_voc = pd.read_hdf('../input/test_voc.h5')

In [17]:
df_voc = pd.concat([train_voc, test_voc])

del train_voc, test_voc
gc.collect()

128

In [18]:
phone_no_m = user_phone_no_m.merge(df_voc, on='phone_no_m', how='left')
phone_no_m = phone_no_m[phone_no_m['start_datetime'].notnull()]

In [19]:
opposite_no_m = user_phone_no_m.merge(df_voc,
                                      left_on='phone_no_m',
                                      right_on='opposite_no_m',
                                      how='left')
opposite_no_m = opposite_no_m[opposite_no_m['start_datetime'].notnull()]

opposite_no_m.drop('phone_no_m_y',
                   axis=1,
                   inplace=True)
opposite_no_m.rename(columns={'opposite_no_m': 'phone_no_m',
                              'phone_no_m_x': 'opposite_no_m'},
                     inplace=True)

df_voc = pd.concat([phone_no_m, opposite_no_m])
print(df_voc.shape)

del phone_no_m, opposite_no_m
gc.collect()

(985955, 8)


0

In [20]:
df_voc['city_name_county_name'] = df_voc['city_name'] + '_' + df_voc['county_name']

In [21]:
df_voc['voc_day'] = df_voc['start_datetime'].astype('datetime64').dt.day
df_voc['voc_hour'] = df_voc['start_datetime'].astype('datetime64').dt.hour
df_voc['voc_dayofweek'] = df_voc['start_datetime'].astype('datetime64').dt.dayofweek

In [22]:
phone_no_m = df_voc[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'],
                                        keep='last')

In [23]:
# 通话次数，通话人数
tmp = df_voc.groupby('phone_no_m')['opposite_no_m'].agg(opposite_cnt='count',
                                                        opposite_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, how='left', on='phone_no_m')

# 人均通话次数
phone_no_m['voc_cnt_per_capita'] = phone_no_m['opposite_cnt'] / (phone_no_m['opposite_nunique'] + 0.0001)

del tmp
gc.collect()

20

In [24]:
"""
主叫通话
"""

df_calltype_id_1 = df_voc.loc[df_voc['calltype_id'] == 1, :].copy()

# 主叫通话次数，主叫通话使用的手机个数
tmp = df_calltype_id_1.groupby('phone_no_m')['imei_m'].agg(call_1_cnt='count',
                                                           imeis_1='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话人数
tmp = df_calltype_id_1.groupby('phone_no_m')['opposite_no_m'].agg(opposite_no_m_cnt='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()


# 主叫通话时长
tmp = df_calltype_id_1.groupby('phone_no_m')['call_dur'].agg(call_dur_1_sum='sum',
                                                             call_dur_1_max='max',
                                                             call_dur_1_min='min',
                                                             call_dur_1_std='std',
                                                             call_dur_1_median='median',
                                                             call_dur_1_mean='mean')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()


# 主叫通话时长小于等于30s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] <= 30]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(call_dur_30s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于30s，小于等于60s的次数
tmp1 = df_calltype_id_1[(df_calltype_id_1['call_dur'] > 30) & (df_calltype_id_1['call_dur'] <= 60)]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(call_dur_30_60s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于60s，小于120s的次数
tmp1 = df_calltype_id_1[(df_calltype_id_1['call_dur'] > 60) & (df_calltype_id_1['call_dur'] <= 120)]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(call_dur_60_120s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于120s，小于300s的次数
tmp1 = df_calltype_id_1[(df_calltype_id_1['call_dur'] > 120) & (df_calltype_id_1['call_dur'] <= 300)]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(call_dur_120_300s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于300s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] > 300]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(call_dur_300s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()


# 主叫通话时长小于等于30s的次数的占比
phone_no_m['call_dur_30s_cnt_rate'] = phone_no_m['call_dur_30s_cnt'] / phone_no_m['call_1_cnt']

# 主叫通话时长大于30s，小于60s的次数的占比
phone_no_m['call_dur_30_60s_cnt_rate'] = phone_no_m['call_dur_30_60s_cnt'] / phone_no_m['call_1_cnt']

# 主叫通话时长大于60s，小于120s的次数的占比
phone_no_m['call_dur_60_120s_cnt_rate'] = phone_no_m['call_dur_60_120s_cnt'] / phone_no_m['call_1_cnt']

# 主叫通话时长大于120s，小于300s的次数的占比
phone_no_m['call_dur_120_300s_cnt_rate'] = phone_no_m['call_dur_120_300s_cnt'] / phone_no_m['call_1_cnt']

# 主叫通话时长大于300s的次数的占比
phone_no_m['call_dur_300s_cnt_rate'] = phone_no_m['call_dur_300s_cnt'] / phone_no_m['call_1_cnt']



# 主叫通话次数的占比
phone_no_m["call_type_id_1_rate"] = phone_no_m['call_1_cnt'] / phone_no_m['opposite_cnt']

# 主叫通话时所在地市变动的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时所在区县变动的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时所在地市_区县变动的个数
tmp = df_calltype_id_1.groupby("phone_no_m")['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 线上效果变差，不用
# # 主叫通话每天的通话次数
# calltype_id_1_voc_day_cnt_res = df_calltype_id_1.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
# for i in df_calltype_id_1['voc_day'].unique():
#     phone_no_m['calltype_id_1_voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(calltype_id_1_voc_day_cnt_res[i])

# del calltype_id_1_voc_day_cnt_res
# gc.collect()

0

In [25]:
"""
被叫通话
"""

df_calltype_id_2 = df_voc.loc[df_voc['calltype_id'] == 2, :].copy()

# 被叫通话的次数
tmp = df_calltype_id_2.groupby('phone_no_m')['imei_m'].agg(call_2_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 被叫通话次数的占比
phone_no_m["call_type_id_2_rate"] = phone_no_m['call_2_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_2
gc.collect()

# 线上变差，删除
# 主叫次数/被叫次数
phone_no_m['voc_calltype_1/2'] = phone_no_m['call_1_cnt'] / (phone_no_m['call_2_cnt'] + 0.0001)

In [26]:
"""
呼叫转移
"""

df_calltype_id_3 = df_voc.loc[df_voc['calltype_id'] == 3, :].copy()

# 呼叫转移的次数
tmp = df_calltype_id_3.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_3_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 呼叫转移次数的占比
phone_no_m["call_type_id_3_rate"] = phone_no_m['voc_calltype_id_3_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_3
gc.collect()

# 线上变差，删除
# # 主叫次数/呼叫转移次数
# phone_no_m['voc_calltype_1/3'] = phone_no_m['voc_calltype_id_1_cnt'] / (phone_no_m['voc_calltype_id_3_cnt'] + 0.0001)

0

In [27]:
"""
与对端通话统计
"""

# 与对端通话次数，与对端通话时长
tmp = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['call_dur'].agg(call_count='count',
                                                                      call_sum='sum')

# 与对端通话次数的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_count'].agg(phone2opposite_cnt_mean='mean',
                                                             phone2opposite_cnt_median='median',
                                                             phone2opposite_cnt_min='min',
                                                             phone2opposite_cnt_max='max',
                                                             phone2opposite_cnt_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite
gc.collect()


# 与对端总通话时长的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_sum'].agg(phone2opposite_call_dur_mean='mean',
                                                           phone2opposite_call_dur_median='median',
                                                           phone2opposite_call_dur_min='min',
                                                           phone2opposite_call_dur_max='max',
                                                           phone2opposite_call_dur_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite, tmp
gc.collect()

0

In [28]:
"""
通话时长的统计
"""

# 通话时长的统计量
tmp = df_voc.groupby('phone_no_m')['call_dur'].agg(call_dur_mean='mean',
                                                   call_dur_median='median',
                                                   call_dur_max='max',
                                                   call_dur_min='min',
                                                   call_dur_std='std',
                                                   call_dur_sum='sum')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

0

In [29]:
"""
收费号码位置变动
"""

# 收费号码所在地市的个数
tmp = df_voc.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 收费号码所在区县的个数
tmp = df_voc.groupby('phone_no_m')['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 收费号码所在地市_区县的个数
tmp = df_voc.groupby('phone_no_m')['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 收费号码通话类型的个数
tmp = df_voc.groupby('phone_no_m')['calltype_id'].agg(calltype_id_unique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [30]:
"""
通话时间点的偏好
"""

# hour通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_hour'].agg(voc_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   voc_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   voc_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# day通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_day'].agg(voc_day_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                  voc_day_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                  voc_day_nunique='nunique')                           # 一个月有多少天打了电话
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()


# dayofweek通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_dayofweek'].agg(voc_dayofweek_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                        voc_dayofweek_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                        voc_dayofweek_nunique='nunique')                           
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [31]:
# 每天通话次数的均值、标准差
tmp1 = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].agg(voc_day_cnt='count')
tmp2 = tmp1.groupby('phone_no_m')['voc_day_cnt'].agg(voc_day_cnt_mean='mean',
                                                     voc_day_cnt_std='std')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')

del tmp1, tmp2
gc.collect()

20

In [32]:
# 每天的通话次数
voc_day_cnt_res = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_cnt_res[i])

    
# 每天的通话人数
voc_day_nunique_res = df_voc.groupby(['phone_no_m', 'voc_day'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_nunique_res[i])

    
# 每天的通话时长
voc_day_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_day'])['call_dur'].sum().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_call_dur_res[i])



# 每小时的通话次数
voc_hour_cnt_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['phone_no_m'].count().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_cnt_res[i])
    
# 每小时的通话人数
voc_hour_nunique_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_nunique_res[i])
    

# 每小时的通话时长
voc_hour_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['call_dur'].sum().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_call_dur_res[i])

    

# 每周几的通话次数
voc_dayofweek_cnt_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['phone_no_m'].count().unstack()
for i in df_voc['voc_dayofweek'].unique():
    phone_no_m['voc_dayofweek{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_dayofweek_cnt_res[i])
    
# 每周几的通话人数
voc_dayoffweek_nunique_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_dayofweek'].unique():
    phone_no_m['voc_dayofweek{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_dayoffweek_nunique_res[i])
    

# 每周几的通话时长
voc_dayofweek_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_dayofweek'])['call_dur'].sum().unstack()
for i in df_voc['voc_dayofweek'].unique():
    phone_no_m['voc_dayofweek{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_dayofweek_call_dur_res[i])

In [33]:
phone_no_m.to_hdf('../input/voc_features.h5', 'df', index=False)
del phone_no_m
gc.collect()

64

## sms表

In [34]:
train_sms = pd.read_hdf('../input/train_sms.h5')
test_sms = pd.read_hdf('../input/test_sms.h5')

In [35]:
df_sms = pd.concat([train_sms, test_sms])

del train_sms, test_sms
gc.collect()

106

In [36]:
df_sms['sms_day'] = df_sms['request_datetime'].astype('datetime64').dt.day
df_sms['sms_hour'] = df_sms['request_datetime'].astype('datetime64').dt.hour
df_sms['sms_dayofweek'] = df_sms['request_datetime'].astype('datetime64').dt.dayofweek

In [37]:
phone_no_m = df_sms[["phone_no_m"]].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [38]:
# 短信次数，短信人数
tmp = df_sms.groupby('phone_no_m')['opposite_no_m'].agg(sms_cnt='count', sms_nunique='nunique')

# 人均短信次数
tmp['sms_cnt_per_capita'] = tmp['sms_cnt'] / (tmp['sms_nunique'] + 0.0001)
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [39]:
"""
短信上行，短信下行
"""

# 短信上行
df_sms_calltype1 = df_sms[df_sms['calltype_id'] == 1].copy()
tmp = df_sms_calltype1.groupby('phone_no_m')['calltype_id'].agg(sms_calltype1_cnt="count")
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行比例
phone_no_m['sms_calltype1_rate'] = phone_no_m['sms_calltype1_cnt'] / phone_no_m['sms_cnt']

del tmp, df_sms_calltype1

# 短信下行
df_sms_calltype2 = df_sms[df_sms['calltype_id'] == 2].copy()
tmp = df_sms_calltype2.groupby('opposite_no_m')['calltype_id'].agg(sms_calltype2_cnt="count")
phone_no_m = phone_no_m.merge(tmp, left_on='phone_no_m', right_on='opposite_no_m', how='left')

# 短信下行比例
phone_no_m['sms_calltype2_rate'] = phone_no_m['sms_calltype2_cnt'] / phone_no_m['sms_cnt']

# 短信上行/短信下行
phone_no_m['type1_rate/type2_rate'] = phone_no_m['sms_calltype1_cnt'] / (phone_no_m['sms_calltype2_cnt'] + 0.00001)

# 删除，效果变差
# # 短信下行/短信上行
# phone_no_m['type2_rate/type1_rate'] = phone_no_m['sms_calltype2_cnt'] / (phone_no_m['sms_calltype1_cnt'] + 0.00001)

del tmp, df_sms_calltype2
gc.collect()

0

In [40]:
"""
短信时间点的偏好
"""

tmp = df_sms.groupby('phone_no_m')['sms_hour'].agg(sms_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   sms_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   sms_hour_nunique='nunique')                          # 一个月有多少个小时发了短信
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

tmp = df_sms.groupby('phone_no_m')['sms_day'].agg(sms_day_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                  sms_day_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                  sms_day_nunique='nunique')                            # 一个月有多少天发了短信
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()


tmp = df_sms.groupby('phone_no_m')['sms_dayofweek'].agg(sms_dayofweek_mode=lambda x: stats.mode(x)[0][0],          # 频次最高的元素
                                                        sms_dayofweek_mode_count=lambda x: stats.mode(x)[1][0],    # 频次最高的元素的频次
                                                        sms_dayofweek_nunique='nunique')                            # 一个月有多少天发了短信
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [41]:
# 每天的短信次数
sms_day_res = df_sms.groupby(['phone_no_m', 'sms_day'])['phone_no_m'].count().unstack()
for i in df_sms['sms_day'].unique():
    phone_no_m['sms_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_day_res[i])


# 每小时的短信次数
sms_hour_res = df_sms.groupby(['phone_no_m', 'sms_hour'])['phone_no_m'].count().unstack()
for i in df_sms['sms_hour'].unique():
    phone_no_m['sms_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_hour_res[i])
    
    
# 每周几的短信次数
sms_dayofweek_res = df_sms.groupby(['phone_no_m', 'sms_dayofweek'])['phone_no_m'].count().unstack()
for i in df_sms['sms_dayofweek'].unique():
    phone_no_m['sms_weekofday{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_dayofweek_res[i])

In [42]:
phone_no_m.to_hdf('../input/sms_features.h5', 'df', index=False)

del phone_no_m, df_sms
gc.collect()

64

## 读取数据，建模

In [43]:
df_user = reduce_mem_usage(pd.read_hdf('../input/user_features.h5'))
df_voc = reduce_mem_usage(pd.read_hdf('../input/voc_features.h5'))
df_sms = reduce_mem_usage(pd.read_hdf('../input/sms_features.h5'))
# df_app = reduce_mem_usage(pd.read_hdf('../input/app_features.h5'))

Memory usage of dataframe is 4042896.00 MB
Memory usage after optimization is: 1858428.00 MB
Decreased by 54.0%
Memory usage of dataframe is 13576000.00 MB
Memory usage after optimization is: 3482244.00 MB
Decreased by 74.3%
Memory usage of dataframe is 4455000.00 MB
Memory usage after optimization is: 1292500.00 MB
Decreased by 71.0%


In [44]:
df_user.shape, df_voc.shape, df_sms.shape

((8151, 91), (6788, 249), (6875, 80))

In [45]:
df = df_user.merge(df_voc, on='phone_no_m', how='left')
df = df.merge(df_sms, on='phone_no_m', how='left')
# df = df.merge(df_app, on='phone_no_m', how='left')

del df_user, df_voc, df_sms#, df_app
gc.collect()

0

In [46]:
df_train = df[df.label.notna()]
df_test = df[df.label.isna()]

df_train.shape, df_test.shape

((6106, 418), (2045, 418))

In [47]:
fs = FeatureSelector(data=df_train.drop(['phone_no_m', 'label'], axis=1), labels=df_train['label'])

fs.identify_all(selection_params={'missing_threshold': 0.98,
                                  'correlation_threshold': 0.98, 
                                  'task': 'classification',
                                  'eval_metric': 'auc', 
                                  'cumulative_importance': 0.99})

9 features with greater than 0.98 missing values.

0 features with a single unique value.

52 features with a correlation magnitude greater than 0.98.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[104]	valid_0's auc: 0.961251	valid_0's binary_logloss: 0.166908
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[90]	valid_0's auc: 0.950456	valid_0's binary_logloss: 0.200953
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[123]	valid_0's auc: 0.966156	valid_0's binary_logloss: 0.158895
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[102]	valid_0's auc: 0.958653	valid_0's binary_logloss: 0.201915
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[84]	valid_0's auc: 0.940635	valid_0's binary_logloss: 0.231894
Traini

In [48]:
train_removed_all_once = fs.remove(methods='all')
# train_removed_all_once

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 95 features.


In [49]:
use_cols = train_removed_all_once.columns.to_list()

del train_removed_all_once
gc.collect()

42

In [50]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train[use_cols], df_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [51]:
train_cols = [i for i in X_train.columns if i not in ['phone_no_m', 'opposite_no_m', 'label']]

In [52]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0.1,
          'lambda_l2': 0,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [53]:
useful_cols = []
useless_cols = []

for i in train_cols:
    print(i)
    
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_valid= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_valid, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.52:
        useful_cols.append(i)
    else:
        useless_cols.append(i)
    print('*' * 20)
    print('\n')

city_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.684155	valid_0's auc: 0.671655
[40]	training's auc: 0.684156	valid_0's auc: 0.672408
[60]	training's auc: 0.684156	valid_0's auc: 0.672408
[80]	training's auc: 0.684158	valid_0's auc: 0.671655
Early stopping, best iteration is:
[37]	training's auc: 0.684156	valid_0's auc: 0.672408
*****
0.6724078657865786
********************


county_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.767358	valid_0's auc: 0.72344
[40]	training's auc: 0.772283	valid_0's auc: 0.728765
[60]	training's auc: 0.77374	valid_0's auc: 0.730307
[80]	training's auc: 0.774604	valid_0's auc: 0.730896
[100]	training's auc: 0.775263	valid_0's auc: 0.73182
[120]	training's auc: 0.775831	valid_0's auc: 0.731191
[140]	training's auc: 0.776325	valid_0's auc: 0.732008
Early stopping, best iteration is:
[90]	training's auc: 0.775065	valid_0's auc: 0.732017
*****
0.7320170875783231
***********

[60]	training's auc: 0.77094	valid_0's auc: 0.729781
[80]	training's auc: 0.772231	valid_0's auc: 0.729415
[100]	training's auc: 0.773327	valid_0's auc: 0.72856
[120]	training's auc: 0.774232	valid_0's auc: 0.728727
Early stopping, best iteration is:
[71]	training's auc: 0.771813	valid_0's auc: 0.730194
*****
0.7301935356579137
********************


county_name_arpu_202004_max
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.767023	valid_0's auc: 0.726908
[40]	training's auc: 0.771569	valid_0's auc: 0.72833
[60]	training's auc: 0.773169	valid_0's auc: 0.729605
[80]	training's auc: 0.77389	valid_0's auc: 0.73028
[100]	training's auc: 0.774547	valid_0's auc: 0.730086
[120]	training's auc: 0.774957	valid_0's auc: 0.730044
[140]	training's auc: 0.775215	valid_0's auc: 0.729886
Early stopping, best iteration is:
[101]	training's auc: 0.774606	valid_0's auc: 0.730427
*****
0.7304267111493759
********************


county_name_arpu_202004_median
Training un

[180]	training's auc: 0.761058	valid_0's auc: 0.715115
[200]	training's auc: 0.761222	valid_0's auc: 0.715495
[220]	training's auc: 0.761359	valid_0's auc: 0.715626
[240]	training's auc: 0.761477	valid_0's auc: 0.715994
[260]	training's auc: 0.761644	valid_0's auc: 0.716388
[280]	training's auc: 0.761759	valid_0's auc: 0.7164
[300]	training's auc: 0.761881	valid_0's auc: 0.71655
[320]	training's auc: 0.761925	valid_0's auc: 0.71663
[340]	training's auc: 0.762033	valid_0's auc: 0.716625
[360]	training's auc: 0.762077	valid_0's auc: 0.716699
[380]	training's auc: 0.762127	valid_0's auc: 0.717013
[400]	training's auc: 0.762305	valid_0's auc: 0.717551
[420]	training's auc: 0.762329	valid_0's auc: 0.717566
[440]	training's auc: 0.762337	valid_0's auc: 0.717584
[460]	training's auc: 0.762357	valid_0's auc: 0.717551
[480]	training's auc: 0.762391	valid_0's auc: 0.717626
[500]	training's auc: 0.762459	valid_0's auc: 0.717605
[520]	training's auc: 0.762466	valid_0's auc: 0.717662
[540]	training

[20]	training's auc: 0.825115	valid_0's auc: 0.758926
[40]	training's auc: 0.829432	valid_0's auc: 0.759176
Early stopping, best iteration is:
[8]	training's auc: 0.819295	valid_0's auc: 0.760617
*****
0.7606169584349739
********************


call_dur_1_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.839871	valid_0's auc: 0.799759
[40]	training's auc: 0.844113	valid_0's auc: 0.799569
Early stopping, best iteration is:
[1]	training's auc: 0.823295	valid_0's auc: 0.801432
*****
0.8014316377289903
********************


call_dur_1_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.845217	valid_0's auc: 0.821214
[40]	training's auc: 0.849433	valid_0's auc: 0.820003
[60]	training's auc: 0.852201	valid_0's auc: 0.816906
Early stopping, best iteration is:
[12]	training's auc: 0.842676	valid_0's auc: 0.823093
*****
0.8230930429999521
********************


call_dur_30s_cnt
Training until validation scores don't i

[40]	training's auc: 0.887235	valid_0's auc: 0.844191
Early stopping, best iteration is:
[1]	training's auc: 0.87474	valid_0's auc: 0.848182
*****
0.8481818290524704
********************


phone2opposite_call_dur_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.887655	valid_0's auc: 0.862492
[40]	training's auc: 0.889181	valid_0's auc: 0.862201
[60]	training's auc: 0.890396	valid_0's auc: 0.861935
Early stopping, best iteration is:
[11]	training's auc: 0.886647	valid_0's auc: 0.864069
*****
0.8640691514803654
********************


phone2opposite_call_dur_max
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.858766	valid_0's auc: 0.790108
[40]	training's auc: 0.864187	valid_0's auc: 0.786825
Early stopping, best iteration is:
[4]	training's auc: 0.847622	valid_0's auc: 0.796335
*****
0.796334660639977
********************


phone2opposite_call_dur_std
Training until validation scores don't improve for 50 rounds
[2

[2]	training's auc: 0.762392	valid_0's auc: 0.743124
*****
0.7431243124312431
********************


voc_day9_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.755403	valid_0's auc: 0.724144
[40]	training's auc: 0.755412	valid_0's auc: 0.72392
Early stopping, best iteration is:
[1]	training's auc: 0.754753	valid_0's auc: 0.727164
*****
0.727163748983594
********************


voc_day26_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.771941	valid_0's auc: 0.741066
[40]	training's auc: 0.772023	valid_0's auc: 0.741691
[60]	training's auc: 0.77205	valid_0's auc: 0.741615
[80]	training's auc: 0.772065	valid_0's auc: 0.741624
Early stopping, best iteration is:
[36]	training's auc: 0.772019	valid_0's auc: 0.741694
*****
0.7416938704740039
********************


voc_day11_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.760194	valid_0's auc: 0.751193
[40]	training's auc: 

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.765289	valid_0's auc: 0.751442
[40]	training's auc: 0.766171	valid_0's auc: 0.75055
[60]	training's auc: 0.766507	valid_0's auc: 0.748993
Early stopping, best iteration is:
[27]	training's auc: 0.765255	valid_0's auc: 0.751667
*****
0.7516666068780791
********************


voc_day30_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.770458	valid_0's auc: 0.752275
[40]	training's auc: 0.770547	valid_0's auc: 0.752061
Early stopping, best iteration is:
[7]	training's auc: 0.769665	valid_0's auc: 0.752353
*****
0.752352680920266
********************


voc_day19_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.759874	valid_0's auc: 0.749203
[40]	training's auc: 0.760146	valid_0's auc: 0.747652
Early stopping, best iteration is:
[9]	training's auc: 0.759782	valid_0's auc: 0.749792
*****
0.7497922346582484
********************




[80]	training's auc: 0.752218	valid_0's auc: 0.741821
[100]	training's auc: 0.752219	valid_0's auc: 0.741815
Early stopping, best iteration is:
[59]	training's auc: 0.752216	valid_0's auc: 0.741831
*****
0.7418313842253791
********************


voc_day5_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.753711	valid_0's auc: 0.734515
[40]	training's auc: 0.754746	valid_0's auc: 0.734522
[60]	training's auc: 0.754746	valid_0's auc: 0.734534
[80]	training's auc: 0.754762	valid_0's auc: 0.734513
Early stopping, best iteration is:
[47]	training's auc: 0.754746	valid_0's auc: 0.734534
*****
0.7345341871143636
********************


voc_day25_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.769677	valid_0's auc: 0.755684
[40]	training's auc: 0.769717	valid_0's auc: 0.754814
Early stopping, best iteration is:
[6]	training's auc: 0.769607	valid_0's auc: 0.755873
*****
0.7558727340125316
********************




[40]	training's auc: 0.773892	valid_0's auc: 0.739953
[60]	training's auc: 0.774654	valid_0's auc: 0.738918
[80]	training's auc: 0.77538	valid_0's auc: 0.738528
Early stopping, best iteration is:
[36]	training's auc: 0.773546	valid_0's auc: 0.740039
*****
0.7400392213134357
********************


voc_day12_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.753372	valid_0's auc: 0.716131
[40]	training's auc: 0.754746	valid_0's auc: 0.715079
Early stopping, best iteration is:
[5]	training's auc: 0.751335	valid_0's auc: 0.719557
*****
0.7195571459319845
********************


voc_day7_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.750494	valid_0's auc: 0.730061
[40]	training's auc: 0.752572	valid_0's auc: 0.728857
[60]	training's auc: 0.75346	valid_0's auc: 0.727531
Early stopping, best iteration is:
[13]	training's auc: 0.749767	valid_0's auc: 0.731518
*****
0.7315178528722438
****************

[40]	training's auc: 0.737886	valid_0's auc: 0.712412
Early stopping, best iteration is:
[7]	training's auc: 0.736872	valid_0's auc: 0.713658
*****
0.7136575070550534
********************


voc_hour18_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.788957	valid_0's auc: 0.78148
[40]	training's auc: 0.789258	valid_0's auc: 0.781558
Early stopping, best iteration is:
[8]	training's auc: 0.788392	valid_0's auc: 0.782535
*****
0.7825354546324198
********************


voc_hour19_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.790118	valid_0's auc: 0.740352
[40]	training's auc: 0.790307	valid_0's auc: 0.739652
Early stopping, best iteration is:
[4]	training's auc: 0.789451	valid_0's auc: 0.741441
*****
0.7414412636915866
********************


voc_hour20_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.773466	valid_0's auc: 0.746704
[40]	training's auc: 0.773631	valid

[60]	training's auc: 0.569875	valid_0's auc: 0.571201
[80]	training's auc: 0.570741	valid_0's auc: 0.571201
Early stopping, best iteration is:
[33]	training's auc: 0.56986	valid_0's auc: 0.571383
*****
0.5713830893958961
********************


voc_hour22_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.646136	valid_0's auc: 0.63537
[40]	training's auc: 0.646137	valid_0's auc: 0.635376
Early stopping, best iteration is:
[5]	training's auc: 0.644691	valid_0's auc: 0.635896
*****
0.6358964700817907
********************


voc_hour16_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.795044	valid_0's auc: 0.772244
[40]	training's auc: 0.795065	valid_0's auc: 0.772361
Early stopping, best iteration is:
[3]	training's auc: 0.794865	valid_0's auc: 0.772889
*****
0.7728885660305161
********************


voc_hour11_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.816256

[40]	training's auc: 0.790224	valid_0's auc: 0.758693
Early stopping, best iteration is:
[1]	training's auc: 0.778215	valid_0's auc: 0.761795
*****
0.7617947936097957
********************


voc_hour11_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.82048	valid_0's auc: 0.777897
[40]	training's auc: 0.823579	valid_0's auc: 0.775147
Early stopping, best iteration is:
[2]	training's auc: 0.815333	valid_0's auc: 0.781619
*****
0.7816191945281484
********************


voc_hour9_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.800263	valid_0's auc: 0.753193
[40]	training's auc: 0.802524	valid_0's auc: 0.753285
Early stopping, best iteration is:
[1]	training's auc: 0.792803	valid_0's auc: 0.757009
*****
0.7570087171760654
********************


voc_hour0_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.539225	valid_0's auc: 0.531323
[40]	training's 

[20]	training's auc: 0.853524	valid_0's auc: 0.819835
[40]	training's auc: 0.855777	valid_0's auc: 0.819543
Early stopping, best iteration is:
[4]	training's auc: 0.846563	valid_0's auc: 0.821672
*****
0.8216715693308462
********************


voc_dayofweek4_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.84748	valid_0's auc: 0.830913
[40]	training's auc: 0.85052	valid_0's auc: 0.827976
Early stopping, best iteration is:
[4]	training's auc: 0.839155	valid_0's auc: 0.836994
*****
0.8369938896063519
********************


voc_dayofweek5_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.849467	valid_0's auc: 0.814321
[40]	training's auc: 0.851648	valid_0's auc: 0.812126
[60]	training's auc: 0.852834	valid_0's auc: 0.810425
Early stopping, best iteration is:
[13]	training's auc: 0.848073	valid_0's auc: 0.815564
*****
0.8155641651121633
********************


sms_cnt
Training until validation sco

[20]	training's auc: 0.830731	valid_0's auc: 0.813301
[40]	training's auc: 0.83089	valid_0's auc: 0.813412
[60]	training's auc: 0.831012	valid_0's auc: 0.813055
Early stopping, best iteration is:
[16]	training's auc: 0.830803	valid_0's auc: 0.814497
*****
0.8144969388243172
********************


sms_day10_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.837666	valid_0's auc: 0.799508
[40]	training's auc: 0.838101	valid_0's auc: 0.7983
[60]	training's auc: 0.837797	valid_0's auc: 0.799179
Early stopping, best iteration is:
[23]	training's auc: 0.837507	valid_0's auc: 0.79988
*****
0.7998801238819534
********************


sms_day11_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.810393	valid_0's auc: 0.794777
[40]	training's auc: 0.81044	valid_0's auc: 0.794758
Early stopping, best iteration is:
[5]	training's auc: 0.810262	valid_0's auc: 0.795048
*****
0.7950477112928684
********************


sms_day12

[60]	training's auc: 0.864243	valid_0's auc: 0.835806
Early stopping, best iteration is:
[21]	training's auc: 0.863997	valid_0's auc: 0.836423
*****
0.8364229085952073
********************


sms_hour10_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.840865	valid_0's auc: 0.802379
[40]	training's auc: 0.841822	valid_0's auc: 0.803316
[60]	training's auc: 0.842539	valid_0's auc: 0.803325
Early stopping, best iteration is:
[27]	training's auc: 0.841209	valid_0's auc: 0.804532
*****
0.8045316759936864
********************


sms_hour14_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.822721	valid_0's auc: 0.804301
[40]	training's auc: 0.824165	valid_0's auc: 0.807316
[60]	training's auc: 0.82493	valid_0's auc: 0.806279
[80]	training's auc: 0.825467	valid_0's auc: 0.805047
[100]	training's auc: 0.82567	valid_0's auc: 0.804476
Early stopping, best iteration is:
[52]	training's auc: 0.824797	valid_0's auc: 0.807

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.525782	valid_0's auc: 0.520192
[40]	training's auc: 0.525786	valid_0's auc: 0.520223
[60]	training's auc: 0.525813	valid_0's auc: 0.52124
[80]	training's auc: 0.525815	valid_0's auc: 0.521234
Early stopping, best iteration is:
[30]	training's auc: 0.525806	valid_0's auc: 0.521244
*****
0.5212443798727698
********************


sms_weekofday6_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.855056	valid_0's auc: 0.830762
[40]	training's auc: 0.855798	valid_0's auc: 0.828459
Early stopping, best iteration is:
[8]	training's auc: 0.853946	valid_0's auc: 0.83152
*****
0.8315202444157459
********************


sms_weekofday0_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.851037	valid_0's auc: 0.825507
[40]	training's auc: 0.852627	valid_0's auc: 0.822582
Early stopping, best iteration is:
[1]	training's auc: 0.847094	valid_0'

In [54]:
print(useless_cols)

['voc_hour3_call_dur_sum']


In [55]:
lgb_train = lgb.Dataset(X_train[useful_cols].values, y_train) 

lgb_valid= lgb.Dataset(X_valid[useful_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

lgb_val_0 = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_valid, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

print('Done!')

Start training...
Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.991314	valid_0's auc: 0.944598
[20]	training's auc: 0.997333	valid_0's auc: 0.950612
[30]	training's auc: 0.998759	valid_0's auc: 0.951126
[40]	training's auc: 0.999498	valid_0's auc: 0.951525
[50]	training's auc: 0.999664	valid_0's auc: 0.951102
[60]	training's auc: 0.999798	valid_0's auc: 0.951114
[70]	training's auc: 0.999884	valid_0's auc: 0.950307
[80]	training's auc: 0.999999	valid_0's auc: 0.951532
[90]	training's auc: 1	valid_0's auc: 0.951864
[100]	training's auc: 1	valid_0's auc: 0.951299
[110]	training's auc: 1	valid_0's auc: 0.952163
[120]	training's auc: 1	valid_0's auc: 0.95251
[130]	training's auc: 1	valid_0's auc: 0.952955
[140]	training's auc: 1	valid_0's auc: 0.951933
[150]	training's auc: 1	valid_0's auc: 0.953168
[160]	training's auc: 1	valid_0's auc: 0.953691
[170]	training's auc: 1	valid_0's auc: 0.953676
[180]	training's auc: 1	valid_0's auc: 0.954127
[190]	trai

In [56]:
# 验证集结果
X_valid['prob'] = lgb_val_0.predict(X_valid[useful_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.4735, 1, 0)

f1_04735 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc_04735 = roc_auc_score(y_valid, X_valid['prob'])

print('f1_04735: ', f1_04735)
print('auc_04735: ', auc_04735)

f1_04735:  0.8804
auc_04735:  0.9549074472664658


In [57]:
lgb_train_all = lgb.Dataset(df_train[useful_cols].values, df_train['label'])   

print('Start training...')

lgb_model = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_val_0.best_iteration + 20)

print('Done!')

Start training...
Done!


In [58]:
df_test['label'] = np.where(lgb_model.predict(df_test[useful_cols]) > 0.4735, 1, 0)
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_04735), index=False)