In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from feature_selector import FeatureSelector

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
def count_encode(df, cols):
    for col in tqdm(cols):
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')
    return df
        
        
def label_encode(df, cols):
    le = LabelEncoder()
    for col in tqdm(cols):
        df[col] = df[col].fillna('NA')
        df[col] = le.fit_transform(df[col].astype(str))
    return df


def cross_cat_num(df, cat_col, num_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            df_new = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_nunique'.format(f1, f2): 'nunique'
            })
            df = df.merge(df_new, on=f1, how='left')
            del df_new
            gc.collect()
    return df

## user表

In [4]:
train_user = pd.read_hdf('../input/train_user.h5')
test_user = pd.read_hdf('../input/test_user.h5')

In [5]:
# 将city_name和county_name拼接起来
train_user['city_name_county_name'] = train_user['city_name'].astype(str) + '_' + train_user['county_name'].astype(str)
test_user['city_name_county_name'] = test_user['city_name'].astype(str) + '_' + test_user['county_name'].astype(str)

In [6]:
cat_feat = ['city_name', 'county_name', 'city_name_county_name']
num_feat = ['idcard_cnt', 'arpu_202004', 'idcard_cnt*arpu_202004', 'arpu_202004/idcard_cnt']

In [7]:
# y = train_user['label']

In [8]:
# ME = MeanEncoder(categorical_features=cat_feat,
#                  n_splits=3,
#                  target_type='classification',
#                  prior_weight_func=None)
# X_data = ME.fit_transform(train_user.drop('label', axis=1), y)
# X_test = ME.transform(test_user)

In [9]:
# train_user = X_data.copy()
# train_user['label'] = y
# test_user = X_test.copy()

# train_user.shape, test_user.shape

In [10]:
df_user = pd.concat([train_user, test_user])

del train_user, test_user
gc.collect()

0

In [11]:
# 电话的数量*月消费额
df_user['idcard_cnt*arpu_202004'] = df_user['idcard_cnt'] * df_user['arpu_202004']

# 月消费额/电话的数量
df_user['arpu_202004/idcard_cnt'] = df_user['arpu_202004'] / (df_user['idcard_cnt'] + 0.0001)

In [12]:
df_user = cross_cat_num(df_user, cat_feat, num_feat)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 22.53it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:00,  5.51it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:00<00:00, 13.45it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 13.55it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:00<00:00,  4.61it/s]
  0%|                    

In [13]:
df_user = count_encode(df_user, cat_feat + ['idcard_cnt'])

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 334.23it/s]

city_name
county_name
city_name_county_name
idcard_cnt





In [14]:
df_user = label_encode(df_user, cat_feat)

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 334.28it/s]


In [15]:
df_user.to_hdf('../input/user_features.h5', 'df', index=False)

del df_user
gc.collect()

77

## voc表

In [16]:
train_voc = pd.read_hdf('../input/train_voc.h5')
test_voc = pd.read_hdf('../input/test_voc.h5')

In [17]:
df_voc = pd.concat([train_voc, test_voc])

del train_voc, test_voc
gc.collect()

128

In [18]:
df_voc['city_name_county_name'] = df_voc['city_name'] + '_' + df_voc['county_name']

In [19]:
df_voc['voc_day'] = df_voc['start_datetime'].astype('datetime64').dt.day
df_voc['voc_hour'] = df_voc['start_datetime'].astype('datetime64').dt.hour
# df_voc['voc_dayofweek'] = df_voc['start_datetime'].astype('datetime64').dt.dayofweek

In [20]:
phone_no_m = df_voc[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [21]:
# 通话次数，通话人数
tmp = df_voc.groupby('phone_no_m')['opposite_no_m'].agg(opposite_cnt='count', opposite_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, how='left', on='phone_no_m')

# 人均通话次数
phone_no_m['voc_cnt_per_capita'] = phone_no_m['opposite_cnt'] / (phone_no_m['opposite_nunique'] + 0.0001)

del tmp
gc.collect()

0

In [22]:
"""
主叫通话
"""

df_calltype_id_1 = df_voc.loc[df_voc['calltype_id'] == 1, :].copy()

# 主叫通话次数，主叫通话使用的手机个数
tmp = df_calltype_id_1.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_1_cnt='count', imeis='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长
tmp = df_calltype_id_1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_call_dur_sum='sum')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长小于30s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 30]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_30s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长小于60s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 60]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_60s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于300s（5分钟）的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] > 300]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_300s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长小于30s的次数的占比
phone_no_m['voc_calltype_id_1_30s_rate'] = phone_no_m['voc_calltype_id_1_30s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长小于60s的次数的占比
phone_no_m['voc_calltype_id_1_60s_rate'] = phone_no_m['voc_calltype_id_1_60s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长大于300s（5分钟）的次数的占比
phone_no_m['voc_calltype_id_1_300s_rate'] = phone_no_m['voc_calltype_id_1_300s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']


# 主叫通话次数的占比
phone_no_m["call_type_id_1_rate"] = phone_no_m['voc_calltype_id_1_cnt'] / phone_no_m['opposite_cnt']

# 主叫通话时所在地市变动的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时所在区县变动的个数
tmp = df_calltype_id_1.groupby("phone_no_m")['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
del tmp
gc.collect()

# 主叫通话时所在地市_区县变动的个数
tmp = df_calltype_id_1.groupby("phone_no_m")['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
del tmp
gc.collect()

0

In [23]:
"""
被叫通话
"""

df_calltype_id_2 = df_voc.loc[df_voc['calltype_id'] == 2, :].copy()

# 被叫通话的次数
tmp = df_calltype_id_2.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_2_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 被叫通话次数的占比
phone_no_m["call_type_id_2_rate"] = phone_no_m['voc_calltype_id_2_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_2
gc.collect()

20

In [24]:
"""
呼叫转移
"""

df_calltype_id_3 = df_voc.loc[df_voc['calltype_id'] == 3, :].copy()

# 呼叫转移的次数
tmp = df_calltype_id_3.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_3_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 呼叫转移次数的占比
phone_no_m["call_type_id_3_rate"] = phone_no_m['voc_calltype_id_3_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_3
gc.collect()

20

In [25]:
"""
与对端通话统计
"""

# 与对端通话次数，与对端通话时长
tmp = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['call_dur'].agg(call_count='count',
                                                                      call_sum='sum')

# 与对端通话次数的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_count'].agg(phone2opposite_cnt_mean='mean',
                                                             phone2opposite_cnt_median='median',
                                                             phone2opposite_cnt_min='min',
                                                             phone2opposite_cnt_max='max',
                                                             phone2opposite_cnt_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite
gc.collect()


# 与对端总通话时长的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_sum'].agg(phone2opposite_call_dur_mean='mean',
                                                           phone2opposite_call_dur_median='median',
                                                           phone2opposite_call_dur_min='min',
                                                           phone2opposite_call_dur_max='max',
                                                           phone2opposite_call_dur_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite, tmp
gc.collect()

0

In [26]:
"""
通话时长的统计
"""

# 通话时长的统计量
tmp = df_voc.groupby('phone_no_m')['call_dur'].agg(call_dur_mean='mean',
                                                   call_dur_median='median',
                                                   call_dur_max='max',
                                                   call_dur_min='min',
                                                   call_dur_std='std',
                                                   call_dur_sum='sum')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

0

In [27]:
"""
收费号码位置变动
"""

# 收费号码所在地市的个数
tmp = df_voc.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 收费号码所在区县的个数
tmp = df_voc.groupby('phone_no_m')['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how='left')

# 收费号码所在地市_区县的个数
tmp = df_voc.groupby('phone_no_m')['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how='left')

# 收费号码通话类型的个数
tmp = df_voc.groupby('phone_no_m')['calltype_id'].agg(calltype_id_unique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [28]:
"""
通话时间点的偏好
"""

# hour通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_hour'].agg(voc_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   voc_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   voc_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# day通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_day'].agg(voc_day_mode=lambda x: stats.mode(x)[0][0],
                                                  voc_day_mode_count=lambda x: stats.mode(x)[1][0],
                                                  voc_day_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [29]:
# 每天的通话次数
voc_day_cnt_res = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_cnt_res[i])

    
# 每天的通话人数
voc_day_nunique_res = df_voc.groupby(['phone_no_m', 'voc_day'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_nunique_res[i])

    
# 每天的通话时长
voc_day_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_day'])['call_dur'].sum().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_call_dur_res[i])



# 每小时的通话次数
voc_hour_cnt_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['phone_no_m'].count().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_cnt_res[i])
    
# 每小时的通话人数
voc_hour_nunique_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_nunique_res[i])
    

# 每小时的通话时长
voc_hour_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['call_dur'].sum().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_call_dur_res[i])

In [30]:
phone_no_m.to_hdf('../input/voc_features.h5', 'df', index=False)
del phone_no_m
gc.collect()

75

## sms表

In [31]:
train_sms = pd.read_hdf('../input/train_sms.h5')
test_sms = pd.read_hdf('../input/test_sms.h5')

In [32]:
df_sms = pd.concat([train_sms, test_sms])

del train_sms, test_sms
gc.collect()

106

In [33]:
df_sms['sms_day'] = df_sms['request_datetime'].astype('datetime64').dt.day
df_sms['sms_hour'] = df_sms['request_datetime'].astype('datetime64').dt.hour
# df_sms['sms_dayofweek'] = df_sms['request_datetime'].astype('datetime64').dt.dayofweek

In [34]:
phone_no_m = df_sms[["phone_no_m"]].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [35]:
# 短信次数，短信人数
tmp = df_sms.groupby('phone_no_m')['opposite_no_m'].agg(sms_cnt='count', sms_nunique='nunique')

# 人均短信次数
tmp['sms_cnt_per_capita'] = tmp['sms_cnt'] / (tmp['sms_nunique'] + 0.0001)
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [36]:
"""
短信上行，短信下行
"""

# 短信上行
df_sms_calltype1 = df_sms[df_sms['calltype_id'] == 1].copy()
tmp = df_sms_calltype1.groupby('phone_no_m')['calltype_id'].agg(sms_calltype1_cnt="count")
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行比例
phone_no_m['sms_calltype1_rate'] = phone_no_m['sms_calltype1_cnt'] / phone_no_m['sms_cnt']

del tmp, df_sms_calltype1

# 短信下行
df_sms_calltype2 = df_sms[df_sms['calltype_id'] == 2].copy()
tmp = df_sms_calltype2.groupby('phone_no_m')['calltype_id'].agg(sms_calltype2_cnt="count")
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行/短信下行
phone_no_m['type1_rate/type2_rate'] = phone_no_m['sms_calltype1_cnt'] / (phone_no_m['sms_calltype2_cnt'] + 0.00001)

# 删除，效果变差
# # 短信下行/短信上行
# phone_no_m['type2_rate/type1_rate'] = phone_no_m['sms_calltype2_cnt'] / (phone_no_m['sms_calltype1_cnt'] + 0.00001)

del tmp, df_sms_calltype2
gc.collect()

0

In [37]:
"""
短信时间点的偏好
"""

tmp = df_sms.groupby('phone_no_m')['sms_hour'].agg(sms_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   sms_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   sms_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp

tmp = df_sms.groupby('phone_no_m')['sms_day'].agg(sms_day_mode=lambda x: stats.mode(x)[0][0],
                                                  sms_day_mode_count=lambda x: stats.mode(x)[1][0],
                                                  sms_day_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [38]:
# 每天的短信次数
sms_day_res = df_sms.groupby(['phone_no_m', 'sms_day'])['phone_no_m'].count().unstack()
for i in df_sms['sms_day'].unique():
    phone_no_m['sms_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_day_res[i])


# 每小时的短信次数
sms_hour_res = df_sms.groupby(['phone_no_m', 'sms_hour'])['phone_no_m'].count().unstack()
for i in df_sms['sms_hour'].unique():
    phone_no_m['sms_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_hour_res[i])

In [39]:
phone_no_m.to_hdf('../input/sms_features.h5', 'df', index=False)

del phone_no_m
gc.collect()

64

## app表

In [40]:
train_app = pd.read_hdf('../input/train_app.h5')
test_app = pd.read_hdf('../input/test_app.h5')

In [41]:
df_app = pd.concat([train_app, test_app])

del train_app, test_app
gc.collect()

117

In [42]:
phone_no_m = df_app[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [43]:
# APP数
tmp = df_app.groupby('phone_no_m')['busi_name'].agg(busi_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

40

In [44]:
"""
流量统计
"""
tmp = df_app.groupby("phone_no_m")["flow"].agg(flow_mean='mean',
                                               flow_median='median',
                                               flow_min='min',
                                               flow_max='max',
                                               flow_std='std',
                                               flow_sum='sum',
                                               flow_skew='skew')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

20

In [45]:
phone_no_m.to_hdf('../input/app_features.h5', 'df', index=False)

del phone_no_m
gc.collect()

75

## 读取数据，建模

In [46]:
df_user = reduce_mem_usage(pd.read_hdf('../input/user_features.h5'))
df_voc = reduce_mem_usage(pd.read_hdf('../input/voc_features.h5'))
df_sms = reduce_mem_usage(pd.read_hdf('../input/sms_features.h5'))
# df_app = reduce_mem_usage(pd.read_hdf('../input/app_features.h5'))

Memory usage of dataframe is 3445536.00 MB
Memory usage after optimization is: 1692544.00 MB
Decreased by 50.9%
Memory usage of dataframe is 10456560.00 MB
Memory usage after optimization is: 2756280.00 MB
Decreased by 73.6%
Memory usage of dataframe is 3505040.00 MB
Memory usage after optimization is: 926332.00 MB
Decreased by 73.6%


In [47]:
df_user.shape, df_voc.shape, df_sms.shape #, df_app.shape

((7556, 86), (6180, 212), (6259, 69))

In [48]:
df = df_user.merge(df_voc, on='phone_no_m', how='left')
df = df.merge(df_sms, on='phone_no_m', how='left')
# df = df.merge(df_app, on='phone_no_m', how='left')

del df_user, df_voc, df_sms#, df_app
gc.collect()

0

In [49]:
df.shape

(7556, 365)

In [50]:
# astype('category')会掉分，不做此处理
# for i in tqdm(cat_feat):
#     print(i)
#     df[i] = df[i].astype('category')

In [51]:
df_train = df[df.label.notna()]
df_test = df[df.label.isna()]

df_train.shape, df_test.shape

((6106, 365), (1450, 365))

## 特征筛选

### 缺失值

In [52]:
fs = FeatureSelector(data=df_train.drop(['phone_no_m', 'label'], axis=1), labels=df_train['label'])

In [53]:
fs.identify_missing(missing_threshold=0.98)
missing_features = fs.ops['missing']
print(missing_features)

6 features with greater than 0.98 missing values.

['city_name_arpu_202004_skew', 'city_name_idcard_cnt*arpu_202004_skew', 'county_name_idcard_cnt*arpu_202004_skew', 'city_name_county_name_idcard_cnt*arpu_202004_skew', 'voc_calltype_id_3_cnt', 'call_type_id_3_rate']


In [54]:
fs.missing_stats.head(20)

Unnamed: 0,missing_fraction
city_name_arpu_202004_skew,1.0
city_name_idcard_cnt*arpu_202004_skew,1.0
city_name_county_name_idcard_cnt*arpu_202004_skew,0.989027
county_name_idcard_cnt*arpu_202004_skew,0.989027
voc_calltype_id_3_cnt,0.982312
call_type_id_3_rate,0.982312
voc_hour3_count,0.969211
voc_hour3_call_dur_sum,0.969211
voc_hour3_nunique,0.969211
voc_hour4_call_dur_sum,0.966099


In [55]:
df_train.drop(missing_features, axis=1, inplace=True)
df_test.drop(missing_features, axis=1, inplace=True)

### 唯一值

In [56]:
fs.identify_single_unique()

1 features with a single unique value.



In [57]:
single_unique = fs.ops['single_unique']
print(single_unique)

['arpu_202005']


In [58]:
df_train.drop(single_unique, axis=1, inplace=True)
df_test.drop(single_unique, axis=1, inplace=True)

### 单特征AUC

In [59]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train.drop(['phone_no_m', 'label'], axis=1), df_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [60]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0.1,
          'lambda_l2': 0,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [61]:
train_cols = [i for i in df_train.columns if i not in ['phone_no_m', 'label']]

In [62]:
useful_cols = {}
useless_cols = {}

for i in train_cols:
    print(i)
    
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_valid= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_valid, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.52:
        useful_cols[i] = lgb_test.best_score['valid_0']['auc']
    else:
        useless_cols[i] = lgb_test.best_score['valid_0']['auc']
    print('*' * 20)
    print('\n')

city_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.684155	valid_0's auc: 0.671655
[40]	training's auc: 0.684156	valid_0's auc: 0.672408
[60]	training's auc: 0.684156	valid_0's auc: 0.672408
[80]	training's auc: 0.684158	valid_0's auc: 0.671655
Early stopping, best iteration is:
[37]	training's auc: 0.684156	valid_0's auc: 0.672408
*****
0.6724078657865786
********************


county_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.767358	valid_0's auc: 0.72344
[40]	training's auc: 0.772283	valid_0's auc: 0.728765
[60]	training's auc: 0.77374	valid_0's auc: 0.730307
[80]	training's auc: 0.774604	valid_0's auc: 0.730896
[100]	training's auc: 0.775263	valid_0's auc: 0.73182
[120]	training's auc: 0.775831	valid_0's auc: 0.731191
[140]	training's auc: 0.776325	valid_0's auc: 0.732008
Early stopping, best iteration is:
[90]	training's auc: 0.775065	valid_0's auc: 0.732017
*****
0.7320170875783231
***********

[40]	training's auc: 0.684125	valid_0's auc: 0.671655
Early stopping, best iteration is:
[1]	training's auc: 0.684108	valid_0's auc: 0.671947
*****
0.6719474936624097
********************


city_name_arpu_202004/idcard_cnt_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.571119	valid_0's auc: 0.549704
[40]	training's auc: 0.571448	valid_0's auc: 0.543211
Early stopping, best iteration is:
[1]	training's auc: 0.571119	valid_0's auc: 0.549704
*****
0.5497037475486679
********************


city_name_arpu_202004/idcard_cnt_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.683719	valid_0's auc: 0.671399
[40]	training's auc: 0.683719	valid_0's auc: 0.671399
Early stopping, best iteration is:
[1]	training's auc: 0.683719	valid_0's auc: 0.671399
*****
0.6713989333715981
********************


city_name_arpu_202004/idcard_cnt_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.6

[80]	training's auc: 0.772875	valid_0's auc: 0.724905
[100]	training's auc: 0.773368	valid_0's auc: 0.72591
[120]	training's auc: 0.773882	valid_0's auc: 0.726518
[140]	training's auc: 0.774129	valid_0's auc: 0.72679
[160]	training's auc: 0.774405	valid_0's auc: 0.72508
[180]	training's auc: 0.774529	valid_0's auc: 0.725397
Early stopping, best iteration is:
[140]	training's auc: 0.774129	valid_0's auc: 0.72679
*****
0.726790070311379
********************


county_name_idcard_cnt*arpu_202004_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.608074	valid_0's auc: 0.560044
[40]	training's auc: 0.608054	valid_0's auc: 0.556575
[60]	training's auc: 0.608298	valid_0's auc: 0.55689
[80]	training's auc: 0.608983	valid_0's auc: 0.556585
Early stopping, best iteration is:
[33]	training's auc: 0.608204	valid_0's auc: 0.560395
*****
0.5603954417180849
********************


county_name_idcard_cnt*arpu_202004_median
Training until validation scores don't impro

[40]	training's auc: 0.61342	valid_0's auc: 0.597121
Early stopping, best iteration is:
[1]	training's auc: 0.61342	valid_0's auc: 0.597121
*****
0.5971205816233798
********************


city_name_county_name_idcard_cnt_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.584501	valid_0's auc: 0.577387
[40]	training's auc: 0.584501	valid_0's auc: 0.577387
Early stopping, best iteration is:
[1]	training's auc: 0.584501	valid_0's auc: 0.577387
*****
0.5773873583010475
********************


city_name_county_name_idcard_cnt_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.761058	valid_0's auc: 0.725718
[40]	training's auc: 0.764799	valid_0's auc: 0.727651
[60]	training's auc: 0.767596	valid_0's auc: 0.730336
[80]	training's auc: 0.769232	valid_0's auc: 0.731289
[100]	training's auc: 0.770667	valid_0's auc: 0.730881
[120]	training's auc: 0.771352	valid_0's auc: 0.731476
[140]	training's auc: 0.77169	valid_0's au

[160]	training's auc: 0.770424	valid_0's auc: 0.727376
[180]	training's auc: 0.770676	valid_0's auc: 0.727739
[200]	training's auc: 0.770946	valid_0's auc: 0.727934
[220]	training's auc: 0.771193	valid_0's auc: 0.728358
[240]	training's auc: 0.771383	valid_0's auc: 0.728255
[260]	training's auc: 0.771486	valid_0's auc: 0.728303
[280]	training's auc: 0.77162	valid_0's auc: 0.728288
[300]	training's auc: 0.771705	valid_0's auc: 0.728271
[320]	training's auc: 0.771818	valid_0's auc: 0.727775
Early stopping, best iteration is:
[279]	training's auc: 0.771622	valid_0's auc: 0.728494
*****
0.7284940450566796
********************


city_name_county_name_idcard_cnt*arpu_202004_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.765579	valid_0's auc: 0.726086
[40]	training's auc: 0.770161	valid_0's auc: 0.731044
[60]	training's auc: 0.77184	valid_0's auc: 0.730469
[80]	training's auc: 0.77283	valid_0's auc: 0.730773
[100]	training's auc: 0.773484	valid_0's au

[140]	training's auc: 0.751219	valid_0's auc: 0.737138
[160]	training's auc: 0.751243	valid_0's auc: 0.737216
[180]	training's auc: 0.751243	valid_0's auc: 0.737216
[200]	training's auc: 0.751287	valid_0's auc: 0.737595
[220]	training's auc: 0.751301	valid_0's auc: 0.738095
[240]	training's auc: 0.751314	valid_0's auc: 0.738095
[260]	training's auc: 0.751314	valid_0's auc: 0.738095
Early stopping, best iteration is:
[218]	training's auc: 0.751301	valid_0's auc: 0.738095
*****
0.7380945975032286
********************


idcard_cnt_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.728639	valid_0's auc: 0.722937
[40]	training's auc: 0.728657	valid_0's auc: 0.722704
Early stopping, best iteration is:
[1]	training's auc: 0.728639	valid_0's auc: 0.722937
*****
0.7229366958434974
********************


opposite_cnt
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.875081	valid_0's auc: 0.835057
[40]	training's auc: 0.8783

[40]	training's auc: 0.867137	valid_0's auc: 0.8259
[60]	training's auc: 0.867858	valid_0's auc: 0.824646
Early stopping, best iteration is:
[27]	training's auc: 0.866524	valid_0's auc: 0.826957
*****
0.8269568804706557
********************


phone2opposite_cnt_std
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.889576	valid_0's auc: 0.844216
[40]	training's auc: 0.893908	valid_0's auc: 0.842427
[60]	training's auc: 0.895728	valid_0's auc: 0.841799
Early stopping, best iteration is:
[25]	training's auc: 0.891286	valid_0's auc: 0.846914
*****
0.846914310996317
********************


phone2opposite_call_dur_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.877718	valid_0's auc: 0.814936
[40]	training's auc: 0.881614	valid_0's auc: 0.812117
Early stopping, best iteration is:
[2]	training's auc: 0.86986	valid_0's auc: 0.817817
*****
0.8178167001482757
********************


phone2opposite_call_dur_median
Training un

[20]	training's auc: 0.902743	valid_0's auc: 0.886563
[40]	training's auc: 0.90309	valid_0's auc: 0.886305
[60]	training's auc: 0.903416	valid_0's auc: 0.885358
Early stopping, best iteration is:
[13]	training's auc: 0.90241	valid_0's auc: 0.888309
*****
0.8883089395896111
********************


voc_day_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.87395	valid_0's auc: 0.859841
[40]	training's auc: 0.87395	valid_0's auc: 0.859841
[60]	training's auc: 0.87395	valid_0's auc: 0.859841
Early stopping, best iteration is:
[18]	training's auc: 0.873926	valid_0's auc: 0.86012
*****
0.8601201152723968
********************


voc_day22_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.762886	valid_0's auc: 0.741375
[40]	training's auc: 0.763122	valid_0's auc: 0.740825
Early stopping, best iteration is:
[2]	training's auc: 0.762234	valid_0's auc: 0.743686
*****
0.7436863251542546
********************


voc_day23_

[20]	training's auc: 0.744287	valid_0's auc: 0.742653
[40]	training's auc: 0.744396	valid_0's auc: 0.743885
[60]	training's auc: 0.744418	valid_0's auc: 0.74394
[80]	training's auc: 0.744422	valid_0's auc: 0.743979
[100]	training's auc: 0.744424	valid_0's auc: 0.743942
[120]	training's auc: 0.744428	valid_0's auc: 0.743972
Early stopping, best iteration is:
[70]	training's auc: 0.74442	valid_0's auc: 0.744018
*****
0.7440181518151815
********************


voc_day5_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.755184	valid_0's auc: 0.725379
[40]	training's auc: 0.755289	valid_0's auc: 0.725561
Early stopping, best iteration is:
[2]	training's auc: 0.75493	valid_0's auc: 0.726963
*****
0.7269634572152868
********************


voc_day2_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.740368	valid_0's auc: 0.698743
[40]	training's auc: 0.740437	valid_0's auc: 0.698594
[60]	training's auc: 0.740504	valid_

[100]	training's auc: 0.761978	valid_0's auc: 0.746731
Early stopping, best iteration is:
[69]	training's auc: 0.761769	valid_0's auc: 0.747181
*****
0.7471809680968097
********************


voc_day30_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.769902	valid_0's auc: 0.752362
[40]	training's auc: 0.76998	valid_0's auc: 0.752794
[60]	training's auc: 0.770022	valid_0's auc: 0.752717
[80]	training's auc: 0.770027	valid_0's auc: 0.752735
[100]	training's auc: 0.770157	valid_0's auc: 0.752903
[120]	training's auc: 0.770157	valid_0's auc: 0.752903
[140]	training's auc: 0.770197	valid_0's auc: 0.752971
[160]	training's auc: 0.770219	valid_0's auc: 0.753046
[180]	training's auc: 0.770219	valid_0's auc: 0.753046
[200]	training's auc: 0.770221	valid_0's auc: 0.753046
[220]	training's auc: 0.770246	valid_0's auc: 0.753073
[240]	training's auc: 0.770246	valid_0's auc: 0.753073
Early stopping, best iteration is:
[208]	training's auc: 0.770246	valid_0'

[40]	training's auc: 0.770325	valid_0's auc: 0.730947
Early stopping, best iteration is:
[1]	training's auc: 0.764104	valid_0's auc: 0.73348
*****
0.7334804132587172
********************


voc_day23_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.774361	valid_0's auc: 0.731767
[40]	training's auc: 0.775916	valid_0's auc: 0.731258
[60]	training's auc: 0.777294	valid_0's auc: 0.731388
Early stopping, best iteration is:
[14]	training's auc: 0.773543	valid_0's auc: 0.732403
*****
0.732402723968049
********************


voc_day1_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.736102	valid_0's auc: 0.721919
[40]	training's auc: 0.737709	valid_0's auc: 0.722627
[60]	training's auc: 0.738257	valid_0's auc: 0.722234
[80]	training's auc: 0.739056	valid_0's auc: 0.722508
Early stopping, best iteration is:
[43]	training's auc: 0.736963	valid_0's auc: 0.726826
*****
0.7268259434639116
****************

[20]	training's auc: 0.753372	valid_0's auc: 0.716068
[40]	training's auc: 0.754746	valid_0's auc: 0.715012
Early stopping, best iteration is:
[5]	training's auc: 0.751335	valid_0's auc: 0.719583
*****
0.7195825560816951
********************


voc_day8_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.745784	valid_0's auc: 0.709708
[40]	training's auc: 0.746994	valid_0's auc: 0.707928
[60]	training's auc: 0.747952	valid_0's auc: 0.70726
Early stopping, best iteration is:
[13]	training's auc: 0.745266	valid_0's auc: 0.709969
*****
0.7099685512029463
********************


voc_day18_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.764157	valid_0's auc: 0.733144
[40]	training's auc: 0.766299	valid_0's auc: 0.731122
Early stopping, best iteration is:
[1]	training's auc: 0.759518	valid_0's auc: 0.734095
*****
0.7340947409958387
********************


voc_day31_call_dur_sum
Training until validatio

[20]	training's auc: 0.708253	valid_0's auc: 0.69893
[40]	training's auc: 0.708556	valid_0's auc: 0.698845
Early stopping, best iteration is:
[1]	training's auc: 0.708049	valid_0's auc: 0.699691
*****
0.6996908930023437
********************


voc_hour5_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.529595	valid_0's auc: 0.521165
[40]	training's auc: 0.529596	valid_0's auc: 0.521156
[60]	training's auc: 0.529296	valid_0's auc: 0.522969
[80]	training's auc: 0.529896	valid_0's auc: 0.519355
[100]	training's auc: 0.529895	valid_0's auc: 0.51931
Early stopping, best iteration is:
[53]	training's auc: 0.529296	valid_0's auc: 0.522969
*****
0.5229692806237145
********************


voc_hour1_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.518739	valid_0's auc: 0.509859
[40]	training's auc: 0.518745	valid_0's auc: 0.509879
[60]	training's auc: 0.518745	valid_0's auc: 0.50984
[80]	training's auc: 0.518745	valid

[3]	training's auc: 0.794898	valid_0's auc: 0.772918
*****
0.7729184603242933
********************


voc_hour17_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.807673	valid_0's auc: 0.778842
[40]	training's auc: 0.807519	valid_0's auc: 0.779826
Early stopping, best iteration is:
[3]	training's auc: 0.806134	valid_0's auc: 0.780249
*****
0.7802485411584636
********************


voc_hour7_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.709691	valid_0's auc: 0.700804
[40]	training's auc: 0.709718	valid_0's auc: 0.700607
Early stopping, best iteration is:
[5]	training's auc: 0.709674	valid_0's auc: 0.700879
*****
0.7008791911799875
********************


voc_hour5_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.529543	valid_0's auc: 0.521503
[40]	training's auc: 0.529847	valid_0's auc: 0.519696
Early stopping, best iteration is:
[4]	training's auc: 0.529543	v

[20]	training's auc: 0.786715	valid_0's auc: 0.745208
[40]	training's auc: 0.788678	valid_0's auc: 0.744937
[60]	training's auc: 0.789828	valid_0's auc: 0.745565
Early stopping, best iteration is:
[11]	training's auc: 0.785109	valid_0's auc: 0.745945
*****
0.7459448390491223
********************


voc_hour16_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.787861	valid_0's auc: 0.759756
[40]	training's auc: 0.790278	valid_0's auc: 0.758795
Early stopping, best iteration is:
[1]	training's auc: 0.777837	valid_0's auc: 0.761943
*****
0.7619427703639929
********************


voc_hour17_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.801503	valid_0's auc: 0.760529
[40]	training's auc: 0.803946	valid_0's auc: 0.760149
[60]	training's auc: 0.80532	valid_0's auc: 0.759508
Early stopping, best iteration is:
[14]	training's auc: 0.80056	valid_0's auc: 0.76142
*****
0.7614196202228919
**************

[20]	training's auc: 0.879702	valid_0's auc: 0.857365
[40]	training's auc: 0.880186	valid_0's auc: 0.857477
[60]	training's auc: 0.880501	valid_0's auc: 0.856633
[80]	training's auc: 0.880652	valid_0's auc: 0.856546
Early stopping, best iteration is:
[31]	training's auc: 0.880007	valid_0's auc: 0.858001
*****
0.858000609843593
********************


sms_day_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.90171	valid_0's auc: 0.868528
[40]	training's auc: 0.901742	valid_0's auc: 0.867266
Early stopping, best iteration is:
[7]	training's auc: 0.901688	valid_0's auc: 0.868603
*****
0.8686026211316784
********************


sms_day1_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.815883	valid_0's auc: 0.785885
[40]	training's auc: 0.8159	valid_0's auc: 0.786227
[60]	training's auc: 0.815935	valid_0's auc: 0.785991
[80]	training's auc: 0.815932	valid_0's auc: 0.786171
Early stopping, best iteration is:
[45

Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.852865	valid_0's auc: 0.826189
[40]	training's auc: 0.852946	valid_0's auc: 0.825909
Early stopping, best iteration is:
[2]	training's auc: 0.852679	valid_0's auc: 0.827292
*****
0.8272916965609605
********************


sms_day20_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.807959	valid_0's auc: 0.766336
[40]	training's auc: 0.808019	valid_0's auc: 0.766147
Early stopping, best iteration is:
[2]	training's auc: 0.807732	valid_0's auc: 0.768407
*****
0.7684074113933133
********************


sms_day21_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.763335	valid_0's auc: 0.734957
[40]	training's auc: 0.763367	valid_0's auc: 0.734784
Early stopping, best iteration is:
[7]	training's auc: 0.76332	valid_0's auc: 0.735057
*****
0.7350573372554646
********************


sms_day22_count
Training until validation scores don't

[40]	training's auc: 0.790954	valid_0's auc: 0.756182
[60]	training's auc: 0.791129	valid_0's auc: 0.756705
[80]	training's auc: 0.791151	valid_0's auc: 0.756959
[100]	training's auc: 0.791246	valid_0's auc: 0.758033
[120]	training's auc: 0.791278	valid_0's auc: 0.757949
[140]	training's auc: 0.791318	valid_0's auc: 0.75774
Early stopping, best iteration is:
[103]	training's auc: 0.791275	valid_0's auc: 0.758036
*****
0.7580355861673124
********************


sms_hour8_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.758293	valid_0's auc: 0.733125
[40]	training's auc: 0.759019	valid_0's auc: 0.732993
Early stopping, best iteration is:
[2]	training's auc: 0.757752	valid_0's auc: 0.738477
*****
0.7384772444635768
********************


sms_hour21_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.762414	valid_0's auc: 0.734663
[40]	training's auc: 0.762514	valid_0's auc: 0.734767
[60]	training's auc: 0.762612

In [63]:
for key, val in useless_cols.items():
    print(key, val)

county_name_arpu_202004_skew 0.5082194360740422
city_name_county_name_arpu_202004_skew 0.5082194360740422
voc_hour1_count 0.509878569378677
voc_hour3_count 0.5081716052039986
voc_hour2_count 0.5142610728464151
voc_hour4_count 0.5153566987133495
voc_hour1_nunique 0.5102911106328024
voc_hour3_nunique 0.5055244654900273
voc_hour2_nunique 0.5119278232171043
voc_hour4_nunique 0.5162086860859999
voc_hour1_call_dur_sum 0.5041822116994308
voc_hour3_call_dur_sum 0.5055543597838045
voc_hour2_call_dur_sum 0.5092597574974889
voc_hour4_call_dur_sum 0.5162804323910652
sms_hour3_count 0.5002929640790166
sms_hour5_count 0.5097799182092122
sms_hour4_count 0.5133313603099441


In [64]:
df_train.drop(list(useless_cols.keys()), axis=1, inplace=True)
df_test.drop(list(useless_cols.keys()), axis=1, inplace=True)

### 高共线

In [65]:
def correlation(df, threshold):
    """
    去除特征相关系数大于阈值的特征
    :param df:
    :param threshold:
    :return:
    """
    col_corr = set()
    corr_matrix = df.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colName_i = corr_matrix.columns[i]
                colName_j = corr_matrix.columns[j]
                if useful_cols[colName_i] >= useful_cols[colName_j]:
                    col_corr.add(colName_j)
                else:
                    col_corr.add(colName_i)

    return col_corr


col = correlation(df_train.drop(['phone_no_m', 'label'], axis=1), 0.98)
print('Correlated columns: ', col)

Correlated columns:  {'county_name_arpu_202004_nunique', 'county_name_idcard_cnt_nunique', 'county_name_idcard_cnt*arpu_202004_nunique', 'city_name_county_name_arpu_202004/idcard_cnt_nunique', 'call_type_id_2_rate', 'county_name_nunique_y', 'city_name_arpu_202004/idcard_cnt_nunique', 'county_name_idcard_cnt_max', 'county_name_arpu_202004/idcard_cnt_skew', 'city_name', 'county_name_nunique_x', 'city_name_county_name_arpu_202004/idcard_cnt_median', 'city_name_idcard_cnt*arpu_202004_nunique', 'county_name_arpu_202004/idcard_cnt_max', 'county_name_idcard_cnt*arpu_202004_median', 'county_name_count', 'city_name_county_name_arpu_202004_nunique', 'county_name_arpu_202004/idcard_cnt_min', 'county_name_arpu_202004_min', 'city_name_arpu_202004_min', 'city_name_idcard_cnt*arpu_202004_max', 'county_name_arpu_202004/idcard_cnt_nunique', 'county_name_arpu_202004_max', 'city_name_arpu_202004_nunique', 'county_name_arpu_202004/idcard_cnt_mean', 'county_name_idcard_cnt*arpu_202004_min', 'voc_calltype_i

In [66]:
df_train.drop(list(col), axis=1, inplace=True)
df_test.drop(list(col), axis=1, inplace=True)

In [67]:
feature_names = [i for i in df_train.columns if i not in ['phone_no_m', 'label']]
ycol = 'label'

In [68]:
oof = []
prediction = df_test[['phone_no_m', 'arpu_202004']]
prediction['label'] = 0
df_importance_list = []

kfold = StratifiedKFold(n_splits=5)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train[ycol])):
    print('\nFold_{} Training ================================\n'.format(fold_id+1))
    
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]
    
    lgb_train = lgb.Dataset(X_train, Y_train) 
    lgb_valid= lgb.Dataset(X_val, Y_val, reference=lgb_train)
    
    lgb_model = lgb.train(params,
                          lgb_train,
                          num_boost_round=10000,
                          valid_sets=[lgb_valid, lgb_train],
                          early_stopping_rounds=100,
                          verbose_eval=10)

    pred_val = lgb_model.predict(X_val)
    df_oof = df_train.iloc[val_idx][['phone_no_m', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    pred_test = lgb_model.predict(df_test[feature_names])
    prediction['label_{}'.format(fold_id)] = pred_test

    importance = lgb_model.feature_importance(importance_type='gain')
    feature_name = lgb_model.feature_name()
    df_importance = pd.DataFrame({
        'feature_name': feature_name,
        'importance': importance
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.987644	valid_0's auc: 0.944023
[20]	training's auc: 0.996809	valid_0's auc: 0.949554
[30]	training's auc: 0.998786	valid_0's auc: 0.952951
[40]	training's auc: 0.999539	valid_0's auc: 0.953334
[50]	training's auc: 0.999754	valid_0's auc: 0.954742
[60]	training's auc: 0.999877	valid_0's auc: 0.954957
[70]	training's auc: 0.999947	valid_0's auc: 0.956052
[80]	training's auc: 1	valid_0's auc: 0.955515
[90]	training's auc: 1	valid_0's auc: 0.955699
[100]	training's auc: 1	valid_0's auc: 0.955963
[110]	training's auc: 1	valid_0's auc: 0.956798
[120]	training's auc: 1	valid_0's auc: 0.956633
[130]	training's auc: 1	valid_0's auc: 0.956939
[140]	training's auc: 1	valid_0's auc: 0.957259
[150]	training's auc: 1	valid_0's auc: 0.956912
[160]	training's auc: 1	valid_0's auc: 0.957461
[170]	training's auc: 1	valid_0's auc: 0.957289
[180]	training's auc: 1	valid_0's auc: 0.957609
[190]	training's auc: 1	valid_0'

In [69]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['feature_name'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance.head(20)

Unnamed: 0,feature_name,importance
0,voc_day_mode_count,22209.1415
1,idcard_cnt*arpu_202004,1609.172232
2,sms_day31_count,1096.076836
3,sms_cnt_per_capita,966.903712
4,sms_day30_count,918.881257
5,sms_day26_count,833.563315
6,sms_day27_count,592.911115
7,sms_nunique,506.756037
8,voc_day_nunique,359.870899
9,sms_day_mode_count,353.65129


In [70]:
df_importance.tail(10)

Unnamed: 0,feature_name,importance
294,county_name_idcard_cnt_min,2.2789
295,calltype_id_unique,2.2004
296,city_name_idcard_cnt_min,1.143556
297,city_name_county_name_idcard_cnt_median,0.940848
298,phone2opposite_cnt_min,0.486505
299,city_name_county_name_idcard_cnt_min,0.042024
300,city_name_idcard_cnt_median,0.027063
301,voc_hour5_nunique,0.0
302,voc_hour5_count,0.0
303,voc_hour5_call_dur_sum,0.0


In [71]:
df_importance['normalized_importance'] = df_importance['importance'] / df_importance['importance'].sum()
df_importance['cumulative_importance'] = np.cumsum(df_importance['normalized_importance'])
record_low_importance = df_importance[df_importance['cumulative_importance'] > 0.99]
to_drop = list(record_low_importance['feature_name'])
print(to_drop)

['voc_hour7_count', 'voc_hour14_count', 'voc_hour_mode_count', 'voc_day3_nunique', 'voc_day14_nunique', 'voc_day2_nunique', 'sms_hour6_count', 'voc_hour20_count', 'voc_calltype_id_1_300s_cnt', 'voc_day9_count', 'voc_day20_count', 'city_name_idcard_cnt*arpu_202004_median', 'sms_day18_count', 'voc_hour19_nunique', 'voc_hour13_nunique', 'voc_day8_count', 'voc_hour19_count', 'voc_day10_nunique', 'voc_hour14_nunique', 'voc_hour17_nunique', 'voc_day18_nunique', 'voc_day6_nunique', 'city_name_arpu_202004/idcard_cnt_min', 'voc_hour21_nunique', 'voc_hour15_nunique', 'city_name_arpu_202004_max', 'voc_day17_nunique', 'voc_day20_nunique', 'city_name_idcard_cnt*arpu_202004_min', 'voc_hour22_nunique', 'call_dur_sum', 'voc_day1_count', 'voc_hour6_nunique', 'city_name_county_name_arpu_202004_min', 'voc_hour0_nunique', 'voc_hour9_count', 'city_name_nunique_y', 'voc_day1_nunique', 'voc_hour0_count', 'voc_hour6_count', 'voc_day16_nunique', 'county_name_idcard_cnt_median', 'city_name_nunique_x', 'city_nam

In [72]:
df_train.drop(to_drop, axis=1, inplace=True)
df_test.drop(to_drop, axis=1, inplace=True)

In [73]:
use_cols = [i for i in df_train.columns if i not in ['phone_no_m', 'label']]

In [74]:
lgb_train_all = lgb.Dataset(df_train[use_cols].values, df_train['label'])   

print('Start training...')

lgb_model = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=100)

print('Done!')

Start training...
Done!


In [75]:
df_test['label'] = np.where(lgb_model.predict(df_test[use_cols]) > 0.4735, 1, 0)
# df_test.loc[df_test['arpu_202004'] <= 0, 'label'] = 1
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}.csv'.format(time.strftime('%Y%m%d')), index=False)