In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from feature_selector import FeatureSelector

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
def count_encode(df, cols):
    for col in tqdm(cols):
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')
    return df
        
        
def label_encode(df, cols):
    le = LabelEncoder()
    for col in tqdm(cols):
        df[col] = df[col].fillna('NA')
        df[col] = le.fit_transform(df[col].astype(str))
    return df


def cross_cat_num(df, cat_col, num_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            df_new = g[f2].agg({
                '{}_{}_sum'.format(f1, f2): 'sum',
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_nunique'.format(f1, f2): 'nunique'
            })
            df = df.merge(df_new, on=f1, how='left')
            del df_new
            gc.collect()
    return df

## user表

In [4]:
train_user = pd.read_hdf('../input/train_user.h5')
test_user = pd.read_hdf('../input/test_user.h5')

In [5]:
# 将city_name和county_name拼接起来
train_user['city_name_county_name'] = train_user['city_name'].astype(str) + '_' + train_user['county_name'].astype(str)
test_user['city_name_county_name'] = test_user['city_name'].astype(str) + '_' + test_user['county_name'].astype(str)

In [6]:
cat_feat = ['city_name', 'county_name', 'city_name_county_name']
num_feat = ['idcard_cnt', 'arpu_202005', 'idcard_cnt*arpu_202005', 'arpu_202005/idcard_cnt', 'idcard_cnt/arpu_202005']

In [7]:
# y = train_user['label']

In [8]:
# ME = MeanEncoder(categorical_features=cat_feat,
#                  n_splits=3,
#                  target_type='classification',
#                  prior_weight_func=None)
# X_data = ME.fit_transform(train_user.drop('label', axis=1), y)
# X_test = ME.transform(test_user)

In [9]:
# train_user = X_data.copy()
# train_user['label'] = y
# test_user = X_test.copy()

# train_user.shape, test_user.shape

In [10]:
df_user = pd.concat([train_user, test_user])

del train_user, test_user
gc.collect()

0

In [11]:
# 电话的数量*月消费额
df_user['idcard_cnt*arpu_202005'] = df_user['idcard_cnt'] * df_user['arpu_202005']

# 月消费额/电话的数量
df_user['arpu_202005/idcard_cnt'] = df_user['arpu_202005'] / (df_user['idcard_cnt'] + 0.0001)

# 电话的数量/月消费额
df_user['idcard_cnt/arpu_202005'] = df_user['idcard_cnt'] / (df_user['arpu_202005'] + 0.0001)

In [12]:
df_user = cross_cat_num(df_user, cat_feat, num_feat)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 60%|██████████████████████████████████████████████████▍                                 | 3/5 [00:00<00:00, 22.28it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 20.72it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:00,  4.09it/s]
  0%|                                                                                            | 0/5 [00:00<?, ?it/s][A
 40%|█████████████████████████████████▌                                                  | 2/5 [00:00<00:00, 13.02it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 12.66it/s][A
 67%|█████████████████

In [13]:
df_user = count_encode(df_user, cat_feat + ['idcard_cnt'])

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 300.27it/s]

city_name
county_name
city_name_county_name
idcard_cnt





In [14]:
df_user = label_encode(df_user, cat_feat)

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 334.24it/s]


In [15]:
df_user.to_hdf('../input/user_features.h5', 'df', index=False)

del df_user
gc.collect()

55

## voc表

In [16]:
train_voc = pd.read_hdf('../input/train_voc.h5')
test_voc = pd.read_hdf('../input/test_voc.h5')

In [17]:
df_voc = pd.concat([train_voc, test_voc])

del train_voc, test_voc
gc.collect()

128

In [18]:
df_voc['city_name_county_name'] = df_voc['city_name'] + '_' + df_voc['county_name']

In [19]:
df_voc['voc_day'] = df_voc['start_datetime'].astype('datetime64').dt.day
df_voc['voc_hour'] = df_voc['start_datetime'].astype('datetime64').dt.hour
# df_voc['voc_dayofweek'] = df_voc['start_datetime'].astype('datetime64').dt.dayofweek

In [20]:
phone_no_m = df_voc[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [21]:
# 通话次数，通话人数
tmp = df_voc.groupby('phone_no_m')['opposite_no_m'].agg(opposite_cnt='count',
                                                        opposite_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, how='left', on='phone_no_m')

# 人均通话次数
phone_no_m['voc_cnt_per_capita'] = phone_no_m['opposite_cnt'] / (phone_no_m['opposite_nunique'] + 0.0001)

del tmp
gc.collect()

0

In [22]:
"""
主叫通话
"""

df_calltype_id_1 = df_voc.loc[df_voc['calltype_id'] == 1, :].copy()
print('主叫通话的数据量：', df_calltype_id_1.shape[0])

# 主叫通话次数，主叫通话使用的手机个数
tmp = df_calltype_id_1.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_1_cnt='count',
                                                           imeis='nunique')
# 每个手机的平均主叫通话次数
tmp['voc_calltype_id_1_cnt/imeis'] = tmp['voc_calltype_id_1_cnt'] / tmp['imeis']
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长的统计量
tmp = df_calltype_id_1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_call_dur_sum='sum',
                                                             voc_calltype_id_1_call_dur_mean='mean',
                                                             voc_calltype_id_1_call_dur_median='median',
                                                             voc_calltype_id_1_call_dur_max='max',
                                                             voc_calltype_id_1_call_dur_min='min',
                                                             voc_calltype_id_1_call_dur_std='std',
                                                             voc_calltype_id_1_call_dur_skew='skew')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长小于30s的通话时长的统计量
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 30]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_30s_cnt='count',
                                                  voc_calltype_id_1_30s_sum='sum',
                                                  voc_calltype_id_1_30s_mean='mean',
                                                  voc_calltype_id_1_30s_median='median',
                                                  voc_calltype_id_1_30s_max='max',
                                                  voc_calltype_id_1_30s_min='min',
                                                  voc_calltype_id_1_30s_std='std',
                                                  voc_calltype_id_1_30s_skew='skew')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长小于60s的通话时长的统计量
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 60]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_60s_cnt='count',
                                                  voc_calltype_id_1_60s_sum='sum',
                                                  voc_calltype_id_1_60s_mean='mean',
                                                  voc_calltype_id_1_60s_median='median',
                                                  voc_calltype_id_1_60s_max='max',
                                                  voc_calltype_id_1_60s_min='min',
                                                  voc_calltype_id_1_60s_std='std',
                                                  voc_calltype_id_1_60s_skew='skew')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于300s（5分钟）的通话时长的统计量
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] > 300]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_300s_cnt='count',
                                                  voc_calltype_id_1_300s_sum='sum',
                                                  voc_calltype_id_1_300s_mean='mean',
                                                  voc_calltype_id_1_300s_median='median',
                                                  voc_calltype_id_1_300s_max='max',
                                                  voc_calltype_id_1_300s_min='min',
                                                  voc_calltype_id_1_300s_std='std',
                                                  voc_calltype_id_1_300s_skew='skew')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长小于30s的次数的占比
phone_no_m['voc_calltype_id_1_30s_rate'] = phone_no_m['voc_calltype_id_1_30s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长小于60s的次数的占比
phone_no_m['voc_calltype_id_1_60s_rate'] = phone_no_m['voc_calltype_id_1_60s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长大于300s（5分钟）的次数的占比
phone_no_m['voc_calltype_id_1_300s_rate'] = phone_no_m['voc_calltype_id_1_300s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']


# 主叫通话次数的占比
phone_no_m["call_type_id_1_rate"] = phone_no_m['voc_calltype_id_1_cnt'] / phone_no_m['opposite_cnt']

# 主叫通话时所在地市变动的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['city_name'].agg(calltype_id_1_city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时所在区县变动的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['county_name'].agg(calltype_id_1_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
del tmp
gc.collect()

# 主叫通话时所在地市_区县变动的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['city_name_county_name'].agg(calltype_id_1_city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
del tmp
gc.collect()

主叫通话的数据量： 557812


0

In [23]:
"""
被叫通话
"""

df_calltype_id_2 = df_voc.loc[df_voc['calltype_id'] == 2, :].copy()
print('被叫通话的数据量：', df_calltype_id_2.shape[0])
# 被叫通话的次数
tmp = df_calltype_id_2.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_2_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 被叫通话次数的占比
phone_no_m["call_type_id_2_rate"] = phone_no_m['voc_calltype_id_2_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_2
gc.collect()

被叫通话的数据量： 298137


20

In [24]:
"""
呼叫转移
"""

df_calltype_id_3 = df_voc.loc[df_voc['calltype_id'] == 3, :].copy()

# 呼叫转移的次数
tmp = df_calltype_id_3.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_3_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 呼叫转移次数的占比
phone_no_m['call_type_id_3_rate'] = phone_no_m['voc_calltype_id_3_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_3
gc.collect()

0

In [25]:
"""
与对端通话统计
"""

# 与对端通话次数，与对端通话时长
# tmp = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['call_dur'].agg(p2o_call_count='count',
#                                                                       p2o_call_sum='sum',
#                                                                       p2o_call_mean='mean',
#                                                                       p2o_call_median='median',
#                                                                       p2o_call_max='max',
#                                                                       p2o_call_min='min',
#                                                                       p2o_call_std='std')
tmp = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['call_dur'].agg(p2o_call_count='count',
                                                                      p2o_call_sum='sum',
                                                                      p2o_call_mean='mean',
                                                                      p2o_call_median='median',
                                                                      p2o_call_max='max',
                                                                      p2o_call_min='min',
                                                                      p2o_call_std='std')

# 与对端通话次数的统计量
p2o = tmp.groupby('phone_no_m')['p2o_call_count'].agg(p2o_call_count_std='std',
                                                      p2o_call_count_median='median')
phone_no_m = phone_no_m.merge(p2o, on='phone_no_m', how='left')
del p2o
gc.collect()

p2o = tmp.groupby('phone_no_m')['p2o_call_sum'].agg(p2o_call_sum_std='std',
                                                    p2o_call_sum_median='median')
phone_no_m = phone_no_m.merge(p2o, on='phone_no_m', how='left')
del p2o
gc.collect()

p2o = tmp.groupby('phone_no_m')['p2o_call_mean'].agg(p2o_call_mean_std='std',
                                                     p2o_call_mean_median='median')
phone_no_m = phone_no_m.merge(p2o, on='phone_no_m', how='left')
del p2o
gc.collect()

p2o = tmp.groupby('phone_no_m')['p2o_call_median'].agg(p2o_call_median_std='std',
                                                       p2o_call_median_median='median')
phone_no_m = phone_no_m.merge(p2o, on='phone_no_m', how='left')
del p2o
gc.collect()

p2o = tmp.groupby('phone_no_m')['p2o_call_max'].agg(p2o_call_max_std='std',
                                                    p2o_call_max_median='median')
phone_no_m = phone_no_m.merge(p2o, on='phone_no_m', how='left')
del p2o
gc.collect()

p2o = tmp.groupby('phone_no_m')['p2o_call_min'].agg(p2o_call_min_std='std',
                                                    p2o_call_min_median='median')
phone_no_m = phone_no_m.merge(p2o, on='phone_no_m', how='left')
del p2o
gc.collect()

p2o = tmp.groupby('phone_no_m')['p2o_call_std'].agg(p2o_call_std_std='std',
                                                    p2o_call_std_median='median')
phone_no_m = phone_no_m.merge(p2o, on='phone_no_m', how='left')
del p2o
gc.collect()

0

In [26]:
"""
通话时长的统计
"""

# 通话时长的统计量
tmp = df_voc.groupby('phone_no_m')['call_dur'].agg(call_dur_mean='mean',
                                                   call_dur_median='median',
                                                   call_dur_max='max',
                                                   call_dur_min='min',
                                                   call_dur_std='std',
                                                   call_dur_sum='sum',
                                                   call_dur_skew='skew')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

20

In [27]:
"""
收费号码位置变动
"""

# 收费号码所在地市的个数
tmp = df_voc.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 收费号码所在区县的个数
tmp = df_voc.groupby('phone_no_m')['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how='left')
del tmp
gc.collect()

# 收费号码所在地市_区县的个数
tmp = df_voc.groupby('phone_no_m')['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how='left')
del tmp
gc.collect()

# 收费号码通话类型的个数
tmp = df_voc.groupby('phone_no_m')['calltype_id'].agg(voc_calltype_id_unique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 收费号码与对端的通话类型的个数
# tmp = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['calltype_id'].agg(p2o_calltype_id_unique='nunique')
# phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
# del tmp
# gc.collect()

0

In [28]:
"""
通话时间点的偏好
"""

# hour通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_hour'].agg(voc_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   voc_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   voc_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# day通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_day'].agg(voc_day_mode=lambda x: stats.mode(x)[0][0],
                                                  voc_day_mode_count=lambda x: stats.mode(x)[1][0],
                                                  voc_day_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [29]:
# 每天的通话次数
voc_day_cnt_res = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_cnt_res[i])

    
# 每天的通话人数
voc_day_nunique_res = df_voc.groupby(['phone_no_m', 'voc_day'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_nunique_res[i])

    
# 每天的通话时长
voc_day_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_day'])['call_dur'].sum().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_call_dur_res[i])



# 每小时的通话次数
voc_hour_cnt_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['phone_no_m'].count().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_cnt_res[i])
    
# 每小时的通话人数
voc_hour_nunique_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_nunique_res[i])
    

# 每小时的通话时长
voc_hour_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['call_dur'].sum().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_call_dur_res[i])

In [30]:
phone_no_m.to_hdf('../input/voc_features.h5', 'df', index=False)
del phone_no_m
gc.collect()

75

## sms表

In [31]:
train_sms = pd.read_hdf('../input/train_sms.h5')
test_sms = pd.read_hdf('../input/test_sms.h5')

In [32]:
df_sms = pd.concat([train_sms, test_sms])

del train_sms, test_sms
gc.collect()

106

In [33]:
df_sms['sms_day'] = df_sms['request_datetime'].astype('datetime64').dt.day
df_sms['sms_hour'] = df_sms['request_datetime'].astype('datetime64').dt.hour
# df_sms['sms_dayofweek'] = df_sms['request_datetime'].astype('datetime64').dt.dayofweek

In [34]:
phone_no_m = df_sms[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [35]:
# 短信次数，短信人数
tmp = df_sms.groupby('phone_no_m')['opposite_no_m'].agg(sms_cnt='count',
                                                        sms_nunique='nunique')

# 人均短信次数
tmp['sms_cnt_per_capita'] = tmp['sms_cnt'] / (tmp['sms_nunique'] + 0.0001)
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [36]:
# 收费号码短信类型的个数
tmp = df_sms.groupby('phone_no_m')['calltype_id'].agg(sms_calltype_id_unique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 收费号码与对端的短信类型的个数
# tmp = df_sms.groupby(['phone_no_m', 'opposite_no_m'])['calltype_id'].agg(p2o_calltype_id_unique='nunique')
# phone_no_m = phone_no_m.merge(tmp, on=['phone_no_m', 'opposite_no_m'], how='left')
# del tmp
# gc.collect()

20

In [37]:
"""
短信上行，短信下行
"""

# 短信上行
df_sms_calltype1 = df_sms[df_sms['calltype_id'] == 1].copy()
tmp = df_sms_calltype1.groupby('phone_no_m')['calltype_id'].agg(sms_calltype1_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行比例
phone_no_m['sms_calltype1_rate'] = phone_no_m['sms_calltype1_cnt'] / phone_no_m['sms_cnt']

del tmp, df_sms_calltype1

# 短信下行
df_sms_calltype2 = df_sms[df_sms['calltype_id'] == 2].copy()
tmp = df_sms_calltype2.groupby('phone_no_m')['calltype_id'].agg(sms_calltype2_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行/短信下行
phone_no_m['type1_rate/type2_rate'] = phone_no_m['sms_calltype1_cnt'] / (phone_no_m['sms_calltype2_cnt'] + 0.00001)

# 删除，效果变差
# # 短信下行/短信上行
# phone_no_m['type2_rate/type1_rate'] = phone_no_m['sms_calltype2_cnt'] / (phone_no_m['sms_calltype1_cnt'] + 0.00001)

del tmp, df_sms_calltype2
gc.collect()

0

In [38]:
"""
短信时间点的偏好
"""

tmp = df_sms.groupby('phone_no_m')['sms_hour'].agg(sms_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   sms_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   sms_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp

tmp = df_sms.groupby('phone_no_m')['sms_day'].agg(sms_day_mode=lambda x: stats.mode(x)[0][0],
                                                  sms_day_mode_count=lambda x: stats.mode(x)[1][0],
                                                  sms_day_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [39]:
# 每天的短信次数
sms_day_res = df_sms.groupby(['phone_no_m', 'sms_day'])['phone_no_m'].count().unstack()
for i in df_sms['sms_day'].unique():
    phone_no_m['sms_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_day_res[i])


# 每小时的短信次数
sms_hour_res = df_sms.groupby(['phone_no_m', 'sms_hour'])['phone_no_m'].count().unstack()
for i in df_sms['sms_hour'].unique():
    phone_no_m['sms_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_hour_res[i])

In [40]:
phone_no_m.to_hdf('../input/sms_features.h5', 'df', index=False)

del phone_no_m
gc.collect()

64

## app表

In [41]:
train_app = pd.read_hdf('../input/train_app.h5')
test_app = pd.read_hdf('../input/test_app.h5')

In [42]:
df_app = pd.concat([train_app, test_app])

del train_app, test_app
gc.collect()

117

In [43]:
phone_no_m = df_app[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [44]:
# APP数
tmp = df_app.groupby('phone_no_m')['busi_name'].agg(busi_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

40

In [45]:
"""
流量统计
"""
tmp = df_app.groupby("phone_no_m")["flow"].agg(flow_mean='mean',
                                               flow_median='median',
                                               flow_min='min',
                                               flow_max='max',
                                               flow_std='std',
                                               flow_sum='sum',
                                               flow_skew='skew')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

20

In [46]:
phone_no_m.to_hdf('../input/app_features.h5', 'df', index=False)

del phone_no_m
gc.collect()

75

## 读取数据，建模

In [47]:
df_user = reduce_mem_usage(pd.read_hdf('../input/user_features.h5'))
df_voc = reduce_mem_usage(pd.read_hdf('../input/voc_features.h5'))
df_sms = reduce_mem_usage(pd.read_hdf('../input/sms_features.h5'))
# df_app = reduce_mem_usage(pd.read_hdf('../input/app_features.h5'))

Memory usage of dataframe is 6619056.00 MB
Memory usage after optimization is: 1949448.00 MB
Decreased by 70.5%
Memory usage of dataframe is 12088080.00 MB
Memory usage after optimization is: 3151800.00 MB
Decreased by 73.9%
Memory usage of dataframe is 3555112.00 MB
Memory usage after optimization is: 932591.00 MB
Decreased by 73.8%


In [48]:
df_user.shape, df_voc.shape, df_sms.shape #, df_app.shape

((7556, 119), (6180, 245), (6259, 70))

In [49]:
df = df_user.merge(df_voc, on='phone_no_m', how='left')
df = df.merge(df_sms, on='phone_no_m', how='left')
# df = df.merge(df_app, on='phone_no_m', how='left')

del df_user, df_voc, df_sms#, df_app
gc.collect()

0

In [50]:
df.shape

(7556, 432)

In [51]:
# astype('category')会掉分，不做此处理
# for i in tqdm(cat_feat):
#     print(i)
#     df[i] = df[i].astype('category')

In [52]:
df_train = df[df.label.notna()]
df_test = df[df.label.isna()]

df_train.shape, df_test.shape

((6106, 432), (1450, 432))

## 特征筛选

### 缺失值

In [53]:
fs = FeatureSelector(data=df_train.drop(['phone_no_m', 'label'], axis=1), labels=df_train['label'])

In [54]:
fs.identify_missing(missing_threshold=0.98)
missing_features = fs.ops['missing']
print(missing_features)

2 features with greater than 0.98 missing values.

['voc_calltype_id_3_cnt', 'call_type_id_3_rate']


In [55]:
fs.missing_stats.head(15)

Unnamed: 0,missing_fraction
voc_calltype_id_3_cnt,0.982312
call_type_id_3_rate,0.982312
voc_hour3_nunique,0.969211
voc_hour3_count,0.969211
voc_hour3_call_dur_sum,0.969211
voc_hour4_count,0.966099
voc_hour4_call_dur_sum,0.966099
voc_hour4_nunique,0.966099
voc_hour2_call_dur_sum,0.959548
voc_hour2_nunique,0.959548


In [56]:
df_train.drop(missing_features, axis=1, inplace=True)
df_test.drop(missing_features, axis=1, inplace=True)

### 唯一值

In [57]:
fs.identify_single_unique()

0 features with a single unique value.



In [58]:
single_unique = fs.ops['single_unique']
print(single_unique)

[]


In [59]:
df_train.drop(single_unique, axis=1, inplace=True)
df_test.drop(single_unique, axis=1, inplace=True)

### 单特征AUC

In [60]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train.drop(['phone_no_m', 'label'], axis=1), df_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [61]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0.1,
          'lambda_l2': 0,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [62]:
train_cols = [i for i in df_train.columns if i not in ['phone_no_m', 'label']]

In [63]:
useful_cols = {}
useless_cols = {}

for i in train_cols:
    print(i)
    
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_valid= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_valid, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.52:
        useful_cols[i] = lgb_test.best_score['valid_0']['auc']
    else:
        useless_cols[i] = lgb_test.best_score['valid_0']['auc']
    print('*' * 20)
    print('\n')

city_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.684155	valid_0's auc: 0.671655
[40]	training's auc: 0.684156	valid_0's auc: 0.672408
[60]	training's auc: 0.684156	valid_0's auc: 0.672408
[80]	training's auc: 0.684158	valid_0's auc: 0.671655
Early stopping, best iteration is:
[37]	training's auc: 0.684156	valid_0's auc: 0.672408
*****
0.6724078657865786
********************


county_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.767358	valid_0's auc: 0.72344
[40]	training's auc: 0.772283	valid_0's auc: 0.728765
[60]	training's auc: 0.77374	valid_0's auc: 0.730307
[80]	training's auc: 0.774604	valid_0's auc: 0.730896
[100]	training's auc: 0.775263	valid_0's auc: 0.73182
[120]	training's auc: 0.775831	valid_0's auc: 0.731191
[140]	training's auc: 0.776325	valid_0's auc: 0.732008
Early stopping, best iteration is:
[90]	training's auc: 0.775065	valid_0's auc: 0.732017
*****
0.7320170875783231
***********

[40]	training's auc: 0.684058	valid_0's auc: 0.671655
Early stopping, best iteration is:
[1]	training's auc: 0.684054	valid_0's auc: 0.671828
*****
0.6718279164873009
********************


city_name_idcard_cnt*arpu_202005_max
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.684055	valid_0's auc: 0.671668
[40]	training's auc: 0.684058	valid_0's auc: 0.671655
Early stopping, best iteration is:
[1]	training's auc: 0.684055	valid_0's auc: 0.671668
*****
0.6716679820155929
********************


city_name_idcard_cnt*arpu_202005_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.545636	valid_0's auc: 0.519147
[40]	training's auc: 0.54568	valid_0's auc: 0.51892
Early stopping, best iteration is:
[1]	training's auc: 0.545636	valid_0's auc: 0.519147
*****
0.519147295164299
********************


city_name_idcard_cnt*arpu_202005_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.68359

[40]	training's auc: 0.612673	valid_0's auc: 0.595793
Early stopping, best iteration is:
[1]	training's auc: 0.612673	valid_0's auc: 0.595793
*****
0.5957932749796718
********************


county_name_idcard_cnt_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.574299	valid_0's auc: 0.567609
[40]	training's auc: 0.574723	valid_0's auc: 0.565202
Early stopping, best iteration is:
[1]	training's auc: 0.574299	valid_0's auc: 0.567609
*****
0.5676089348065241
********************


county_name_idcard_cnt_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.761347	valid_0's auc: 0.724206
[40]	training's auc: 0.764948	valid_0's auc: 0.726244
[60]	training's auc: 0.76706	valid_0's auc: 0.72767
[80]	training's auc: 0.768275	valid_0's auc: 0.728908
[100]	training's auc: 0.769326	valid_0's auc: 0.728555
[120]	training's auc: 0.769897	valid_0's auc: 0.728806
[140]	training's auc: 0.770367	valid_0's auc: 0.728585
[160]	t

[60]	training's auc: 0.763476	valid_0's auc: 0.713309
[80]	training's auc: 0.764425	valid_0's auc: 0.714179
[100]	training's auc: 0.764957	valid_0's auc: 0.714608
[120]	training's auc: 0.76549	valid_0's auc: 0.71529
[140]	training's auc: 0.765742	valid_0's auc: 0.71598
[160]	training's auc: 0.766007	valid_0's auc: 0.715897
[180]	training's auc: 0.76614	valid_0's auc: 0.715947
[200]	training's auc: 0.766271	valid_0's auc: 0.716004
[220]	training's auc: 0.766357	valid_0's auc: 0.716248
[240]	training's auc: 0.766417	valid_0's auc: 0.716236
[260]	training's auc: 0.766477	valid_0's auc: 0.716358
[280]	training's auc: 0.766533	valid_0's auc: 0.716335
Early stopping, best iteration is:
[232]	training's auc: 0.766393	valid_0's auc: 0.716439
*****
0.7164391710910222
********************


county_name_idcard_cnt*arpu_202005_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.7615	valid_0's auc: 0.726962
[40]	training's auc: 0.766494	valid_0's auc: 0.728113
[

[80]	training's auc: 0.762088	valid_0's auc: 0.717702
[100]	training's auc: 0.762786	valid_0's auc: 0.718272
[120]	training's auc: 0.763084	valid_0's auc: 0.717958
[140]	training's auc: 0.763389	valid_0's auc: 0.718089
Early stopping, best iteration is:
[101]	training's auc: 0.762807	valid_0's auc: 0.718436
*****
0.7184361099153393
********************


county_name_idcard_cnt/arpu_202005_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.763087	valid_0's auc: 0.718674
[40]	training's auc: 0.768359	valid_0's auc: 0.722023
[60]	training's auc: 0.770882	valid_0's auc: 0.723228
[80]	training's auc: 0.772455	valid_0's auc: 0.724481
[100]	training's auc: 0.773411	valid_0's auc: 0.724995
[120]	training's auc: 0.774083	valid_0's auc: 0.725357
[140]	training's auc: 0.774551	valid_0's auc: 0.725871
[160]	training's auc: 0.775004	valid_0's auc: 0.72591
[180]	training's auc: 0.77533	valid_0's auc: 0.726545
[200]	training's auc: 0.775624	valid_0's auc: 0.72640

[80]	training's auc: 0.763762	valid_0's auc: 0.723436
[100]	training's auc: 0.764562	valid_0's auc: 0.72395
[120]	training's auc: 0.7657	valid_0's auc: 0.724865
[140]	training's auc: 0.766319	valid_0's auc: 0.72456
[160]	training's auc: 0.766924	valid_0's auc: 0.725044
[180]	training's auc: 0.767399	valid_0's auc: 0.724993
[200]	training's auc: 0.767741	valid_0's auc: 0.725297
Early stopping, best iteration is:
[162]	training's auc: 0.767042	valid_0's auc: 0.725651
*****
0.7256510977184675
********************


city_name_county_name_arpu_202005_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.763888	valid_0's auc: 0.723479
[40]	training's auc: 0.769968	valid_0's auc: 0.724176
[60]	training's auc: 0.772313	valid_0's auc: 0.724481
[80]	training's auc: 0.773785	valid_0's auc: 0.725032
[100]	training's auc: 0.774708	valid_0's auc: 0.725576
[120]	training's auc: 0.7754	valid_0's auc: 0.725618
[140]	training's auc: 0.775968	valid_0's auc: 0.726161
[16

[40]	training's auc: 0.770072	valid_0's auc: 0.726105
[60]	training's auc: 0.772452	valid_0's auc: 0.72941
[80]	training's auc: 0.774072	valid_0's auc: 0.73028
[100]	training's auc: 0.775109	valid_0's auc: 0.730241
[120]	training's auc: 0.775658	valid_0's auc: 0.730954
[140]	training's auc: 0.776219	valid_0's auc: 0.732023
[160]	training's auc: 0.776804	valid_0's auc: 0.732388
[180]	training's auc: 0.777191	valid_0's auc: 0.733048
[200]	training's auc: 0.777466	valid_0's auc: 0.733471
[220]	training's auc: 0.777763	valid_0's auc: 0.733699
[240]	training's auc: 0.778097	valid_0's auc: 0.734072
[260]	training's auc: 0.77825	valid_0's auc: 0.734655
[280]	training's auc: 0.778463	valid_0's auc: 0.734338
[300]	training's auc: 0.778696	valid_0's auc: 0.734691
[320]	training's auc: 0.778743	valid_0's auc: 0.734542
[340]	training's auc: 0.778801	valid_0's auc: 0.73464
[360]	training's auc: 0.778885	valid_0's auc: 0.734778
[380]	training's auc: 0.778917	valid_0's auc: 0.734766
[400]	training's 

[180]	training's auc: 0.777135	valid_0's auc: 0.728025
[200]	training's auc: 0.777443	valid_0's auc: 0.727331
[220]	training's auc: 0.777629	valid_0's auc: 0.727935
[240]	training's auc: 0.777797	valid_0's auc: 0.728114
[260]	training's auc: 0.77793	valid_0's auc: 0.72812
[280]	training's auc: 0.778114	valid_0's auc: 0.728183
[300]	training's auc: 0.77823	valid_0's auc: 0.728285
[320]	training's auc: 0.77834	valid_0's auc: 0.728617
[340]	training's auc: 0.778401	valid_0's auc: 0.728641
[360]	training's auc: 0.778492	valid_0's auc: 0.728829
[380]	training's auc: 0.778569	valid_0's auc: 0.729008
[400]	training's auc: 0.778607	valid_0's auc: 0.729188
[420]	training's auc: 0.778661	valid_0's auc: 0.729639
[440]	training's auc: 0.778723	valid_0's auc: 0.729475
[460]	training's auc: 0.778761	valid_0's auc: 0.729286
[480]	training's auc: 0.778783	valid_0's auc: 0.729322
Early stopping, best iteration is:
[434]	training's auc: 0.778684	valid_0's auc: 0.729794
*****
0.7297944468359879
*********

[60]	training's auc: 0.859812	valid_0's auc: 0.795649
Early stopping, best iteration is:
[15]	training's auc: 0.851358	valid_0's auc: 0.800532
*****
0.8005318194862965
********************


voc_calltype_id_1_call_dur_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.83708	valid_0's auc: 0.834849
[40]	training's auc: 0.837959	valid_0's auc: 0.835244
[60]	training's auc: 0.83846	valid_0's auc: 0.834798
Early stopping, best iteration is:
[29]	training's auc: 0.837389	valid_0's auc: 0.835554
*****
0.8355544793609796
********************


voc_calltype_id_1_call_dur_std
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.823893	valid_0's auc: 0.755799
[40]	training's auc: 0.828583	valid_0's auc: 0.758808
[60]	training's auc: 0.830767	valid_0's auc: 0.758914
[80]	training's auc: 0.832292	valid_0's auc: 0.757759
[100]	training's auc: 0.833301	valid_0's auc: 0.756913
Early stopping, best iteration is:
[50]	training's auc: 0

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.672243	valid_0's auc: 0.643197
[40]	training's auc: 0.674317	valid_0's auc: 0.642436
[60]	training's auc: 0.676061	valid_0's auc: 0.641887
Early stopping, best iteration is:
[20]	training's auc: 0.672243	valid_0's auc: 0.643197
*****
0.643196656622184
********************


voc_calltype_id_1_300s_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.671023	valid_0's auc: 0.6515
[40]	training's auc: 0.673158	valid_0's auc: 0.654006
[60]	training's auc: 0.674406	valid_0's auc: 0.653598
[80]	training's auc: 0.675223	valid_0's auc: 0.653743
Early stopping, best iteration is:
[48]	training's auc: 0.673857	valid_0's auc: 0.654307
*****
0.6543068709044818
********************


voc_calltype_id_1_300s_max
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.684812	valid_0's auc: 0.649373
[40]	training's auc: 0.686779	valid_0's auc: 0.648992
[60

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.847269	valid_0's auc: 0.790925
[40]	training's auc: 0.850357	valid_0's auc: 0.789094
[60]	training's auc: 0.852279	valid_0's auc: 0.787901
Early stopping, best iteration is:
[16]	training's auc: 0.846268	valid_0's auc: 0.792677
*****
0.7926770937963361
********************


p2o_call_max_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.86897	valid_0's auc: 0.8204
[40]	training's auc: 0.872359	valid_0's auc: 0.818302
Early stopping, best iteration is:
[1]	training's auc: 0.859364	valid_0's auc: 0.823531
*****
0.8235309944037882
********************


p2o_call_min_std
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.846611	valid_0's auc: 0.783154
[40]	training's auc: 0.850676	valid_0's auc: 0.781104
Early stopping, best iteration is:
[2]	training's auc: 0.832645	valid_0's auc: 0.789873
*****
0.7898730090400344
*******************

[40]	training's auc: 0.836381	valid_0's auc: 0.818014
Early stopping, best iteration is:
[1]	training's auc: 0.836357	valid_0's auc: 0.818918
*****
0.8189183048739657
********************


voc_day_mode_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.902743	valid_0's auc: 0.886563
[40]	training's auc: 0.90309	valid_0's auc: 0.886305
[60]	training's auc: 0.903416	valid_0's auc: 0.885358
Early stopping, best iteration is:
[13]	training's auc: 0.90241	valid_0's auc: 0.888309
*****
0.8883089395896111
********************


voc_day_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.87395	valid_0's auc: 0.859841
[40]	training's auc: 0.87395	valid_0's auc: 0.859841
[60]	training's auc: 0.87395	valid_0's auc: 0.859841
Early stopping, best iteration is:
[18]	training's auc: 0.873926	valid_0's auc: 0.86012
*****
0.8601201152723968
********************


voc_day22_count
Training until validation scores don't improv

[120]	training's auc: 0.754341	valid_0's auc: 0.739761
Early stopping, best iteration is:
[87]	training's auc: 0.754074	valid_0's auc: 0.744774
*****
0.7447744774477447
********************


voc_day15_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.744287	valid_0's auc: 0.742653
[40]	training's auc: 0.744396	valid_0's auc: 0.743885
[60]	training's auc: 0.744418	valid_0's auc: 0.74394
[80]	training's auc: 0.744422	valid_0's auc: 0.743979
[100]	training's auc: 0.744424	valid_0's auc: 0.743942
[120]	training's auc: 0.744428	valid_0's auc: 0.743972
Early stopping, best iteration is:
[70]	training's auc: 0.74442	valid_0's auc: 0.744018
*****
0.7440181518151815
********************


voc_day5_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.755184	valid_0's auc: 0.725379
[40]	training's auc: 0.755289	valid_0's auc: 0.725561
Early stopping, best iteration is:
[2]	training's auc: 0.75493	valid_0's auc: 0.726963

[40]	training's auc: 0.754229	valid_0's auc: 0.741579
[60]	training's auc: 0.754293	valid_0's auc: 0.741334
[80]	training's auc: 0.754309	valid_0's auc: 0.741178
[100]	training's auc: 0.754398	valid_0's auc: 0.74119
Early stopping, best iteration is:
[57]	training's auc: 0.754286	valid_0's auc: 0.7416
*****
0.7415997034486057
********************


voc_day28_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.761732	valid_0's auc: 0.745088
[40]	training's auc: 0.761741	valid_0's auc: 0.74658
[60]	training's auc: 0.761757	valid_0's auc: 0.747169
[80]	training's auc: 0.761945	valid_0's auc: 0.746822
[100]	training's auc: 0.761978	valid_0's auc: 0.746731
Early stopping, best iteration is:
[69]	training's auc: 0.761769	valid_0's auc: 0.747181
*****
0.7471809680968097
********************


voc_day30_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.769902	valid_0's auc: 0.752362
[40]	training's auc: 0.76998	v

[40]	training's auc: 0.769709	valid_0's auc: 0.754829
Early stopping, best iteration is:
[7]	training's auc: 0.769602	valid_0's auc: 0.755885
*****
0.7558846917300426
********************


voc_day11_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.760833	valid_0's auc: 0.750407
[40]	training's auc: 0.761089	valid_0's auc: 0.752235
[60]	training's auc: 0.76109	valid_0's auc: 0.75225
[80]	training's auc: 0.761092	valid_0's auc: 0.752259
[100]	training's auc: 0.761094	valid_0's auc: 0.75227
[120]	training's auc: 0.761094	valid_0's auc: 0.75227
Early stopping, best iteration is:
[87]	training's auc: 0.761092	valid_0's auc: 0.75227
*****
0.7522704716123786
********************


voc_day22_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.768648	valid_0's auc: 0.730944
[40]	training's auc: 0.770325	valid_0's auc: 0.730947
Early stopping, best iteration is:
[1]	training's auc: 0.764104	valid_0's auc: 0.

[2]	training's auc: 0.747535	valid_0's auc: 0.74329
*****
0.7432902257617066
********************


voc_day5_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.754448	valid_0's auc: 0.719021
[40]	training's auc: 0.75596	valid_0's auc: 0.719163
Early stopping, best iteration is:
[4]	training's auc: 0.752176	valid_0's auc: 0.719659
*****
0.719658786530827
********************


voc_day2_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.738893	valid_0's auc: 0.686581
[40]	training's auc: 0.740483	valid_0's auc: 0.685391
Early stopping, best iteration is:
[2]	training's auc: 0.735712	valid_0's auc: 0.689086
*****
0.6890858922848807
********************


voc_day12_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.753372	valid_0's auc: 0.716068
[40]	training's auc: 0.754746	valid_0's auc: 0.715012
Early stopping, best iteration is:
[5]	training's auc: 0

[1]	training's auc: 0.782016	valid_0's auc: 0.76589
*****
0.7658903118572726
********************


voc_hour17_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.795209	valid_0's auc: 0.765747
[40]	training's auc: 0.79544	valid_0's auc: 0.7667
[60]	training's auc: 0.795512	valid_0's auc: 0.766573
[80]	training's auc: 0.795558	valid_0's auc: 0.766373
Early stopping, best iteration is:
[42]	training's auc: 0.795443	valid_0's auc: 0.766724
*****
0.7667243626536566
********************


voc_hour7_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.708253	valid_0's auc: 0.69893
[40]	training's auc: 0.708556	valid_0's auc: 0.698845
Early stopping, best iteration is:
[1]	training's auc: 0.708049	valid_0's auc: 0.699691
*****
0.6996908930023437
********************


voc_hour5_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.529595	valid_0's auc: 0.521165
[40]	training's auc: 

[80]	training's auc: 0.766501	valid_0's auc: 0.738265
Early stopping, best iteration is:
[36]	training's auc: 0.766706	valid_0's auc: 0.73947
*****
0.73946973501698
********************


voc_hour16_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.795072	valid_0's auc: 0.772292
[40]	training's auc: 0.795089	valid_0's auc: 0.772386
Early stopping, best iteration is:
[3]	training's auc: 0.794898	valid_0's auc: 0.772918
*****
0.7729184603242933
********************


voc_hour17_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.807673	valid_0's auc: 0.778842
[40]	training's auc: 0.807519	valid_0's auc: 0.779826
Early stopping, best iteration is:
[3]	training's auc: 0.806134	valid_0's auc: 0.780249
*****
0.7802485411584636
********************


voc_hour7_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.709691	valid_0's auc: 0.700804
[40]	training's auc: 0.709718	v

[80]	training's auc: 0.652826	valid_0's auc: 0.631813
[100]	training's auc: 0.65465	valid_0's auc: 0.62871
Early stopping, best iteration is:
[57]	training's auc: 0.651787	valid_0's auc: 0.631953
*****
0.6319534127325777
********************


voc_hour8_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.786715	valid_0's auc: 0.745208
[40]	training's auc: 0.788678	valid_0's auc: 0.744937
[60]	training's auc: 0.789828	valid_0's auc: 0.745565
Early stopping, best iteration is:
[11]	training's auc: 0.785109	valid_0's auc: 0.745945
*****
0.7459448390491223
********************


voc_hour16_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.787861	valid_0's auc: 0.759756
[40]	training's auc: 0.790278	valid_0's auc: 0.758795
Early stopping, best iteration is:
[1]	training's auc: 0.777837	valid_0's auc: 0.761943
*****
0.7619427703639929
********************


voc_hour17_call_dur_sum
Training until valid

[20]	training's auc: 0.880531	valid_0's auc: 0.85812
[40]	training's auc: 0.880536	valid_0's auc: 0.858246
Early stopping, best iteration is:
[1]	training's auc: 0.880204	valid_0's auc: 0.859477
*****
0.8594773879561869
********************


sms_day_mode
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.829988	valid_0's auc: 0.824639
[40]	training's auc: 0.829989	valid_0's auc: 0.824718
[60]	training's auc: 0.829989	valid_0's auc: 0.824446
Early stopping, best iteration is:
[13]	training's auc: 0.82995	valid_0's auc: 0.825559
*****
0.8255593222365715
********************


sms_day_mode_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.879702	valid_0's auc: 0.857365
[40]	training's auc: 0.880186	valid_0's auc: 0.857477
[60]	training's auc: 0.880501	valid_0's auc: 0.856633
[80]	training's auc: 0.880652	valid_0's auc: 0.856546
Early stopping, best iteration is:
[31]	training's auc: 0.880007	valid_0's auc: 0.858001


[40]	training's auc: 0.834877	valid_0's auc: 0.825036
Early stopping, best iteration is:
[4]	training's auc: 0.834647	valid_0's auc: 0.82589
*****
0.8258896541828096
********************


sms_day18_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.816939	valid_0's auc: 0.797693
[40]	training's auc: 0.817099	valid_0's auc: 0.795744
[60]	training's auc: 0.817175	valid_0's auc: 0.794363
Early stopping, best iteration is:
[14]	training's auc: 0.816878	valid_0's auc: 0.79811
*****
0.798110381690343
********************


sms_day19_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.852865	valid_0's auc: 0.826189
[40]	training's auc: 0.852946	valid_0's auc: 0.825909
Early stopping, best iteration is:
[2]	training's auc: 0.852679	valid_0's auc: 0.827292
*****
0.8272916965609605
********************


sms_day20_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.807959	valid_0's

*****
0.8230721169943082
********************


sms_hour9_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.824555	valid_0's auc: 0.787942
[40]	training's auc: 0.82512	valid_0's auc: 0.787507
[60]	training's auc: 0.825551	valid_0's auc: 0.787017
Early stopping, best iteration is:
[18]	training's auc: 0.82451	valid_0's auc: 0.788218
*****
0.7882183598794662
********************


sms_hour0_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.560302	valid_0's auc: 0.54383
[40]	training's auc: 0.561164	valid_0's auc: 0.541803
Early stopping, best iteration is:
[3]	training's auc: 0.559975	valid_0's auc: 0.544502
*****
0.5445021404314344
********************


sms_hour20_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.790886	valid_0's auc: 0.75664
[40]	training's auc: 0.790954	valid_0's auc: 0.756182
[60]	training's auc: 0.791129	valid_0's auc: 0.756705
[80]	training's auc

In [64]:
for key, val in useless_cols.items():
    print(key, ':', val)

city_name_idcard_cnt*arpu_202005_min : 0.519147295164299
voc_hour1_count : 0.509878569378677
voc_hour3_count : 0.5081716052039986
voc_hour2_count : 0.5142610728464151
voc_hour4_count : 0.5153566987133495
voc_hour1_nunique : 0.5102911106328024
voc_hour3_nunique : 0.5055244654900273
voc_hour2_nunique : 0.5119278232171043
voc_hour4_nunique : 0.5162086860859999
voc_hour1_call_dur_sum : 0.5041822116994308
voc_hour3_call_dur_sum : 0.5055543597838045
voc_hour2_call_dur_sum : 0.5092597574974889
voc_hour4_call_dur_sum : 0.5162804323910652
sms_hour3_count : 0.5002929640790166
sms_hour5_count : 0.5097799182092122
sms_hour4_count : 0.5133313603099441


In [65]:
df_train.drop(list(useless_cols.keys()), axis=1, inplace=True)
df_test.drop(list(useless_cols.keys()), axis=1, inplace=True)

### 高共线

In [66]:
def correlation(df, threshold):
    """
    去除特征相关系数大于阈值的特征
    :param df:
    :param threshold:
    :return:
    """
    col_corr = set()
    corr_matrix = df.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colName_i = corr_matrix.columns[i]
                colName_j = corr_matrix.columns[j]
                if useful_cols[colName_i] >= useful_cols[colName_j]:
                    col_corr.add(colName_j)
                else:
                    col_corr.add(colName_i)

    return col_corr


col = correlation(df_train.drop(['phone_no_m', 'label'], axis=1), 0.98)
print('Correlated columns: ', col)

Correlated columns:  {'county_name_arpu_202005_min', 'county_name_idcard_cnt/arpu_202005_max', 'voc_calltype_id_1_60s_min', 'county_name_arpu_202005/idcard_cnt_nunique', 'city_name_idcard_cnt/arpu_202005_nunique', 'city_name_county_name_arpu_202005/idcard_cnt_max', 'city_name_idcard_cnt*arpu_202005_sum', 'city_name_county_name_arpu_202005_median', 'call_type_id_2_rate', 'calltype_id_1_county_name_nunique', 'voc_calltype_id_1_300s_max', 'county_name_arpu_202005_nunique', 'city_name_arpu_202005_sum', 'county_name_arpu_202005/idcard_cnt_skew', 'county_name_arpu_202005_mean', 'city_name_idcard_cnt_sum', 'city_name_arpu_202005_nunique', 'county_name_idcard_cnt*arpu_202005_nunique', 'county_name_idcard_cnt*arpu_202005_min', 'county_name_idcard_cnt*arpu_202005_sum', 'city_name_county_name_idcard_cnt/arpu_202005_nunique', 'city_name_county_name_arpu_202005_nunique', 'county_name_idcard_cnt_sum', 'county_name_arpu_202005_max', 'county_name_idcard_cnt_nunique', 'city_name', 'city_name_arpu_20200

In [67]:
df_train.drop(list(col), axis=1, inplace=True)
df_test.drop(list(col), axis=1, inplace=True)

In [68]:
feature_names = [i for i in df_train.columns if i not in ['phone_no_m', 'label']]
ycol = 'label'

In [69]:
# oof = []
# prediction = df_test[['phone_no_m', 'arpu_202004']]
# prediction['label'] = 0
# df_importance_list = []

# kfold = StratifiedKFold(n_splits=5)
# for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train[ycol])):
#     print('\nFold_{} Training ================================\n'.format(fold_id+1))
    
#     X_train = df_train.iloc[trn_idx][feature_names]
#     Y_train = df_train.iloc[trn_idx][ycol]

#     X_val = df_train.iloc[val_idx][feature_names]
#     Y_val = df_train.iloc[val_idx][ycol]
    
#     lgb_train = lgb.Dataset(X_train, Y_train) 
#     lgb_valid= lgb.Dataset(X_val, Y_val, reference=lgb_train)
    
#     lgb_model = lgb.train(params,
#                           lgb_train,
#                           num_boost_round=10000,
#                           valid_sets=[lgb_valid, lgb_train],
#                           early_stopping_rounds=100,
#                           verbose_eval=10)

#     pred_val = lgb_model.predict(X_val)
#     df_oof = df_train.iloc[val_idx][['phone_no_m', ycol]].copy()
#     df_oof['pred'] = pred_val
#     oof.append(df_oof)

#     pred_test = lgb_model.predict(df_test[feature_names])
#     prediction['label_{}'.format(fold_id)] = pred_test

#     importance = lgb_model.feature_importance(importance_type='gain')
#     feature_name = lgb_model.feature_name()
#     df_importance = pd.DataFrame({
#         'feature_name': feature_name,
#         'importance': importance
#     })
#     df_importance_list.append(df_importance)

#     del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
#     gc.collect()

In [70]:
# df_importance = pd.concat(df_importance_list)
# df_importance = df_importance.groupby(['feature_name'])['importance'].agg(
#     'mean').sort_values(ascending=False).reset_index()
# df_importance.head(20)

In [71]:
# df_importance.tail(10)

In [72]:
# df_importance['normalized_importance'] = df_importance['importance'] / df_importance['importance'].sum()
# df_importance['cumulative_importance'] = np.cumsum(df_importance['normalized_importance'])
# record_low_importance = df_importance[df_importance['cumulative_importance'] > 0.99]
# to_drop = list(record_low_importance['feature_name'])
# print(to_drop)

In [73]:
# df_train.drop(to_drop, axis=1, inplace=True)
# df_test.drop(to_drop, axis=1, inplace=True)

In [74]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train.drop(['phone_no_m', 'label'], axis=1), df_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [75]:
use_cols = [i for i in df_train.columns if i not in ['phone_no_m', 'label']]

In [76]:
lgb_train = lgb.Dataset(X_train[use_cols].values, y_train) 
lgb_valid= lgb.Dataset(X_valid[use_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

lgb_val_0 = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_valid, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

print('Done!')

Start training...
Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.988639	valid_0's auc: 0.952863
[20]	training's auc: 0.995998	valid_0's auc: 0.952127
[30]	training's auc: 0.998807	valid_0's auc: 0.95457
[40]	training's auc: 0.999488	valid_0's auc: 0.955653
[50]	training's auc: 0.999738	valid_0's auc: 0.955643
[60]	training's auc: 0.9998	valid_0's auc: 0.955114
[70]	training's auc: 0.99992	valid_0's auc: 0.955688
[80]	training's auc: 0.999964	valid_0's auc: 0.955135
[90]	training's auc: 0.999997	valid_0's auc: 0.955275
[100]	training's auc: 1	valid_0's auc: 0.955407
[110]	training's auc: 1	valid_0's auc: 0.955933
[120]	training's auc: 1	valid_0's auc: 0.956298
[130]	training's auc: 1	valid_0's auc: 0.956106
[140]	training's auc: 1	valid_0's auc: 0.956525
[150]	training's auc: 1	valid_0's auc: 0.957661
[160]	training's auc: 1	valid_0's auc: 0.957646
[170]	training's auc: 1	valid_0's auc: 0.958103
[180]	training's auc: 1	valid_0's auc: 0.958127
[190]	

In [77]:
# 验证集结果
X_valid['prob'] = lgb_val_0.predict(X_valid[use_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.5, 1, 0)

f1_05 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc = roc_auc_score(y_valid, X_valid['prob'])

print('f1_05: ', f1_05)
print('auc: ', auc)

f1_05:  0.8836
auc:  0.9597353757114842


In [78]:
lgb_train_all = lgb.Dataset(df_train[use_cols].values, df_train['label'])   

print('Start training...')

lgb_model = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_val_0.best_iteration + 20)

print('Done!')

Start training...
Done!


In [79]:
df_test['label'] = np.where(lgb_model.predict(df_test[use_cols]) > 0.5, 1, 0)
# df_test.loc[df_test['arpu_202004'] <= 0, 'label'] = 1
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_05), index=False)