In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from feature_selector import FeatureSelector

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
def count_encode(df, cols):
    for col in tqdm(cols):
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')
    return df
        
        
def label_encode(df, cols):
    le = LabelEncoder()
    for col in tqdm(cols):
        df[col] = df[col].fillna('NA')
        df[col] = le.fit_transform(df[col].astype(str))
    return df


def cross_cat_num(df, cat_col, num_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            df_new = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_nunique'.format(f1, f2): 'nunique'
            })
            df = df.merge(df_new, on=f1, how='left')
            del df_new
            gc.collect()
    return df

## user表

In [4]:
train_user = pd.read_hdf('../input/train_user.h5')
test_user = pd.read_hdf('../input/test_user.h5')

In [5]:
# 将city_name和county_name拼接起来
train_user['city_name_county_name'] = train_user['city_name'].astype(str) + '_' + train_user['county_name'].astype(str)
test_user['city_name_county_name'] = test_user['city_name'].astype(str) + '_' + test_user['county_name'].astype(str)

In [6]:
cat_feat = ['city_name', 'county_name', 'city_name_county_name']
num_feat = ['idcard_cnt', 'arpu_202004', 'idcard_cnt*arpu_202004', 'arpu_202004/idcard_cnt']

In [7]:
# y = train_user['label']

In [8]:
# ME = MeanEncoder(categorical_features=cat_feat,
#                  n_splits=3,
#                  target_type='classification',
#                  prior_weight_func=None)
# X_data = ME.fit_transform(train_user.drop('label', axis=1), y)
# X_test = ME.transform(test_user)

In [9]:
# train_user = X_data.copy()
# train_user['label'] = y
# test_user = X_test.copy()

# train_user.shape, test_user.shape

In [10]:
df_user = pd.concat([train_user, test_user])

del train_user, test_user
gc.collect()

0

In [11]:
# 电话的数量*月消费额
df_user['idcard_cnt*arpu_202004'] = df_user['idcard_cnt'] * df_user['arpu_202004']

# 月消费额/电话的数量
df_user['arpu_202004/idcard_cnt'] = df_user['arpu_202004'] / (df_user['idcard_cnt'] + 0.0001)

In [12]:
df_user = cross_cat_num(df_user, cat_feat, num_feat)

  0%|                                                                                            | 0/3 [00:00<?, ?it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 23.04it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:00,  5.56it/s]
  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:00<00:00, 13.55it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 13.74it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:00<00:00,  4.66it/s]
  0%|                    

In [13]:
df_user = count_encode(df_user, cat_feat + ['idcard_cnt'])

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 298.74it/s]

city_name
county_name
city_name_county_name
idcard_cnt





In [14]:
df_user = label_encode(df_user, cat_feat)

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 250.69it/s]


In [15]:
df_user.to_hdf('../input/user_features.h5', 'df', index=False)

del df_user
gc.collect()

66

## voc表

In [16]:
train_voc = pd.read_hdf('../input/train_voc.h5')
test_voc = pd.read_hdf('../input/test_voc.h5')

In [17]:
df_voc = pd.concat([train_voc, test_voc])

del train_voc, test_voc
gc.collect()

128

In [18]:
df_voc['city_name_county_name'] = df_voc['city_name'] + '_' + df_voc['county_name']

In [19]:
df_voc['voc_day'] = df_voc['start_datetime'].astype('datetime64').dt.day
df_voc['voc_hour'] = df_voc['start_datetime'].astype('datetime64').dt.hour
# df_voc['voc_dayofweek'] = df_voc['start_datetime'].astype('datetime64').dt.dayofweek

In [20]:
phone_no_m = df_voc[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [21]:
# 通话次数，通话人数
tmp = df_voc.groupby('phone_no_m')['opposite_no_m'].agg(opposite_cnt='count', opposite_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, how='left', on='phone_no_m')

# 人均通话次数
phone_no_m['voc_cnt_per_capita'] = phone_no_m['opposite_cnt'] / (phone_no_m['opposite_nunique'] + 0.0001)

del tmp
gc.collect()

0

In [22]:
"""
主叫通话
"""

df_calltype_id_1 = df_voc.loc[df_voc['calltype_id'] == 1, :].copy()

# 主叫通话次数，主叫通话使用的手机个数
tmp = df_calltype_id_1.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_1_cnt='count', imeis='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长
tmp = df_calltype_id_1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_call_dur_sum='sum')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时长小于30s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 30]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_30s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长小于60s的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] < 60]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_60s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长大于300s（5分钟）的次数
tmp1 = df_calltype_id_1[df_calltype_id_1['call_dur'] > 300]
tmp2 = tmp1.groupby('phone_no_m')['call_dur'].agg(voc_calltype_id_1_300s_cnt='count')
phone_no_m = phone_no_m.merge(tmp2, on='phone_no_m', how='left')
del tmp1, tmp2
gc.collect()

# 主叫通话时长小于30s的次数的占比
phone_no_m['voc_calltype_id_1_30s_rate'] = phone_no_m['voc_calltype_id_1_30s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长小于60s的次数的占比
phone_no_m['voc_calltype_id_1_60s_rate'] = phone_no_m['voc_calltype_id_1_60s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']

# 主叫通话时长大于300s（5分钟）的次数的占比
phone_no_m['voc_calltype_id_1_300s_rate'] = phone_no_m['voc_calltype_id_1_300s_cnt'] / phone_no_m['voc_calltype_id_1_cnt']


# 主叫通话次数的占比
phone_no_m["call_type_id_1_rate"] = phone_no_m['voc_calltype_id_1_cnt'] / phone_no_m['opposite_cnt']

# 主叫通话时所在地市变动的个数
tmp = df_calltype_id_1.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

# 主叫通话时所在区县变动的个数
tmp = df_calltype_id_1.groupby("phone_no_m")['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
del tmp
gc.collect()

# 主叫通话时所在地市_区县变动的个数
tmp = df_calltype_id_1.groupby("phone_no_m")['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
del tmp
gc.collect()

0

In [23]:
"""
被叫通话
"""

df_calltype_id_2 = df_voc.loc[df_voc['calltype_id'] == 2, :].copy()

# 被叫通话的次数
tmp = df_calltype_id_2.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_2_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 被叫通话次数的占比
phone_no_m["call_type_id_2_rate"] = phone_no_m['voc_calltype_id_2_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_2
gc.collect()

20

In [24]:
"""
呼叫转移
"""

df_calltype_id_3 = df_voc.loc[df_voc['calltype_id'] == 3, :].copy()

# 呼叫转移的次数
tmp = df_calltype_id_3.groupby('phone_no_m')['imei_m'].agg(voc_calltype_id_3_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 呼叫转移次数的占比
phone_no_m["call_type_id_3_rate"] = phone_no_m['voc_calltype_id_3_cnt'] / phone_no_m['opposite_cnt']

del tmp, df_calltype_id_3
gc.collect()

20

In [25]:
"""
与对端通话统计
"""

# 与对端通话次数，与对端通话时长
tmp = df_voc.groupby(['phone_no_m', 'opposite_no_m'])['call_dur'].agg(call_count='count',
                                                                      call_sum='sum')

# 与对端通话次数的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_count'].agg(phone2opposite_cnt_mean='mean',
                                                             phone2opposite_cnt_median='median',
                                                             phone2opposite_cnt_min='min',
                                                             phone2opposite_cnt_max='max',
                                                             phone2opposite_cnt_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite
gc.collect()


# 与对端总通话时长的统计量
phone2opposite = tmp.groupby('phone_no_m')['call_sum'].agg(phone2opposite_call_dur_mean='mean',
                                                           phone2opposite_call_dur_median='median',
                                                           phone2opposite_call_dur_min='min',
                                                           phone2opposite_call_dur_max='max',
                                                           phone2opposite_call_dur_std='std')
phone_no_m = phone_no_m.merge(phone2opposite, on='phone_no_m', how='left')
del phone2opposite, tmp
gc.collect()

0

In [26]:
"""
通话时长的统计
"""

# 通话时长的统计量
tmp = df_voc.groupby('phone_no_m')['call_dur'].agg(call_dur_mean='mean',
                                                   call_dur_median='median',
                                                   call_dur_max='max',
                                                   call_dur_min='min',
                                                   call_dur_std='std',
                                                   call_dur_sum='sum')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

0

In [27]:
"""
收费号码位置变动
"""

# 收费号码所在地市的个数
tmp = df_voc.groupby('phone_no_m')['city_name'].agg(city_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 收费号码所在区县的个数
tmp = df_voc.groupby('phone_no_m')['county_name'].agg(county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how='left')

# 收费号码所在地市_区县的个数
tmp = df_voc.groupby('phone_no_m')['city_name_county_name'].agg(city_name_county_name_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how='left')

# 收费号码通话类型的个数
tmp = df_voc.groupby('phone_no_m')['calltype_id'].agg(calltype_id_unique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [28]:
"""
通话时间点的偏好
"""

# hour通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_hour'].agg(voc_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   voc_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   voc_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# day通话次数最高
tmp = df_voc.groupby('phone_no_m')['voc_day'].agg(voc_day_mode=lambda x: stats.mode(x)[0][0],
                                                  voc_day_mode_count=lambda x: stats.mode(x)[1][0],
                                                  voc_day_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [29]:
# 每天的通话次数
voc_day_cnt_res = df_voc.groupby(['phone_no_m', 'voc_day'])['phone_no_m'].count().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_cnt_res[i])

    
# 每天的通话人数
voc_day_nunique_res = df_voc.groupby(['phone_no_m', 'voc_day'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_nunique_res[i])

    
# 每天的通话时长
voc_day_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_day'])['call_dur'].sum().unstack()
for i in df_voc['voc_day'].unique():
    phone_no_m['voc_day{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_day_call_dur_res[i])



# 每小时的通话次数
voc_hour_cnt_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['phone_no_m'].count().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_cnt_res[i])
    
# 每小时的通话人数
voc_hour_nunique_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['opposite_no_m'].nunique().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_nunique'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_nunique_res[i])
    

# 每小时的通话时长
voc_hour_call_dur_res = df_voc.groupby(['phone_no_m', 'voc_hour'])['call_dur'].sum().unstack()
for i in df_voc['voc_hour'].unique():
    phone_no_m['voc_hour{}_call_dur_sum'.format(i)] = phone_no_m['phone_no_m'].map(voc_hour_call_dur_res[i])

In [30]:
phone_no_m.to_hdf('../input/voc_features.h5', 'df', index=False)
del phone_no_m
gc.collect()

75

## sms表

In [31]:
train_sms = pd.read_hdf('../input/train_sms.h5')
test_sms = pd.read_hdf('../input/test_sms.h5')

In [32]:
df_sms = pd.concat([train_sms, test_sms])

del train_sms, test_sms
gc.collect()

106

In [33]:
df_sms['sms_day'] = df_sms['request_datetime'].astype('datetime64').dt.day
df_sms['sms_hour'] = df_sms['request_datetime'].astype('datetime64').dt.hour
# df_sms['sms_dayofweek'] = df_sms['request_datetime'].astype('datetime64').dt.dayofweek

In [34]:
phone_no_m = df_sms[["phone_no_m"]].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [35]:
# 短信次数，短信人数
tmp = df_sms.groupby('phone_no_m')['opposite_no_m'].agg(sms_cnt='count', sms_nunique='nunique')

# 人均短信次数
tmp['sms_cnt_per_capita'] = tmp['sms_cnt'] / (tmp['sms_nunique'] + 0.0001)
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [36]:
"""
短信上行，短信下行
"""

# 短信上行
df_sms_calltype1 = df_sms[df_sms['calltype_id'] == 1].copy()
tmp = df_sms_calltype1.groupby('phone_no_m')['calltype_id'].agg(sms_calltype1_cnt="count")
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行比例
phone_no_m['sms_calltype1_rate'] = phone_no_m['sms_calltype1_cnt'] / phone_no_m['sms_cnt']

del tmp, df_sms_calltype1

# 短信下行
df_sms_calltype2 = df_sms[df_sms['calltype_id'] == 2].copy()
tmp = df_sms_calltype2.groupby('phone_no_m')['calltype_id'].agg(sms_calltype2_cnt="count")
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行/短信下行
phone_no_m['type1_rate/type2_rate'] = phone_no_m['sms_calltype1_cnt'] / (phone_no_m['sms_calltype2_cnt'] + 0.00001)

# 删除，效果变差
# # 短信下行/短信上行
# phone_no_m['type2_rate/type1_rate'] = phone_no_m['sms_calltype2_cnt'] / (phone_no_m['sms_calltype1_cnt'] + 0.00001)

del tmp, df_sms_calltype2
gc.collect()

0

In [37]:
"""
短信时间点的偏好
"""

tmp = df_sms.groupby('phone_no_m')['sms_hour'].agg(sms_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   sms_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   sms_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp

tmp = df_sms.groupby('phone_no_m')['sms_day'].agg(sms_day_mode=lambda x: stats.mode(x)[0][0],
                                                  sms_day_mode_count=lambda x: stats.mode(x)[1][0],
                                                  sms_day_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [38]:
# 每天的短信次数
sms_day_res = df_sms.groupby(['phone_no_m', 'sms_day'])['phone_no_m'].count().unstack()
for i in df_sms['sms_day'].unique():
    phone_no_m['sms_day{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_day_res[i])


# 每小时的短信次数
sms_hour_res = df_sms.groupby(['phone_no_m', 'sms_hour'])['phone_no_m'].count().unstack()
for i in df_sms['sms_hour'].unique():
    phone_no_m['sms_hour{}_count'.format(i)] = phone_no_m['phone_no_m'].map(sms_hour_res[i])

In [39]:
phone_no_m.to_hdf('../input/sms_features.h5', 'df', index=False)

del phone_no_m
gc.collect()

64

## app表

In [40]:
train_app = pd.read_hdf('../input/train_app.h5')
test_app = pd.read_hdf('../input/test_app.h5')

In [41]:
df_app = pd.concat([train_app, test_app])

del train_app, test_app
gc.collect()

106

In [42]:
phone_no_m = df_app[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [43]:
# APP数
tmp = df_app.groupby('phone_no_m')['busi_name'].agg(busi_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

40

In [44]:
"""
流量统计
"""
tmp = df_app.groupby("phone_no_m")["flow"].agg(flow_mean='mean',
                                               flow_median='median',
                                               flow_min='min',
                                               flow_max='max',
                                               flow_std='std',
                                               flow_sum='sum',
                                               flow_skew='skew')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

20

In [45]:
phone_no_m.to_hdf('../input/app_features.h5', 'df', index=False)

del phone_no_m
gc.collect()

75

## 读取数据，建模

In [46]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import LabelEncoder
from scipy import stats
from feature_selector import FeatureSelector

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

In [47]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [48]:
df_user = reduce_mem_usage(pd.read_hdf('../input/user_features.h5'))
df_voc = reduce_mem_usage(pd.read_hdf('../input/voc_features.h5'))
df_sms = reduce_mem_usage(pd.read_hdf('../input/sms_features.h5'))
# df_app = reduce_mem_usage(pd.read_hdf('../input/app_features.h5'))

Memory usage of dataframe is 3651648.00 MB
Memory usage after optimization is: 1760616.00 MB
Decreased by 51.8%
Memory usage of dataframe is 11485296.00 MB
Memory usage after optimization is: 2979932.00 MB
Decreased by 74.1%
Memory usage of dataframe is 3850000.00 MB
Memory usage after optimization is: 1031250.00 MB
Decreased by 73.2%


In [49]:
df_user.shape, df_voc.shape, df_sms.shape #, df_app.shape

((8151, 85), (6788, 212), (6875, 69))

In [50]:
df = df_user.merge(df_voc, on='phone_no_m', how='left')
df = df.merge(df_sms, on='phone_no_m', how='left')
# df = df.merge(df_app, on='phone_no_m', how='left')

del df_user, df_voc, df_sms#, df_app
gc.collect()

0

In [51]:
df.shape

(8151, 364)

In [52]:
# astype('category')会掉分，不做此处理
# for i in tqdm(cat_feat):
#     print(i)
#     df[i] = df[i].astype('category')

In [53]:
df_train = df[df.label.notna()]
df_test = df[df.label.isna()]

df_train.shape, df_test.shape

((6106, 364), (2045, 364))

## 特征筛选

### 缺失值

In [54]:
fs = FeatureSelector(data=df_train.drop(['phone_no_m', 'label'], axis=1), labels=df_train['label'])

In [55]:
fs.identify_missing(missing_threshold=0.98)
missing_features = fs.ops['missing']
print(missing_features)

6 features with greater than 0.98 missing values.

['city_name_arpu_202004_skew', 'city_name_idcard_cnt*arpu_202004_skew', 'county_name_idcard_cnt*arpu_202004_skew', 'city_name_county_name_idcard_cnt*arpu_202004_skew', 'voc_calltype_id_3_cnt', 'call_type_id_3_rate']


In [56]:
fs.missing_stats.head(20)

Unnamed: 0,missing_fraction
city_name_idcard_cnt*arpu_202004_skew,1.0
city_name_arpu_202004_skew,1.0
city_name_county_name_idcard_cnt*arpu_202004_skew,0.99132
county_name_idcard_cnt*arpu_202004_skew,0.99132
voc_calltype_id_3_cnt,0.982312
call_type_id_3_rate,0.982312
voc_hour3_count,0.969211
voc_hour3_nunique,0.969211
voc_hour3_call_dur_sum,0.969211
voc_hour4_count,0.966099


In [57]:
df_train.drop(missing_features, axis=1, inplace=True)
df_test.drop(missing_features, axis=1, inplace=True)

### 唯一值

In [58]:
fs.identify_single_unique()

0 features with a single unique value.



In [59]:
single_unique = fs.ops['single_unique']
print(single_unique)

[]


In [60]:
df_train.drop(single_unique, axis=1, inplace=True)
df_test.drop(single_unique, axis=1, inplace=True)

### 单特征AUC

In [61]:
X_train, X_valid, y_train, y_valid = train_test_split(df_train.drop(['phone_no_m', 'label'], axis=1), df_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [62]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0.1,
          'lambda_l2': 0,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [63]:
train_cols = [i for i in df_train.columns if i not in ['phone_no_m', 'label']]

In [64]:
useful_cols = {}
useless_cols = {}

for i in train_cols:
    print(i)
    
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_valid= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_valid, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.52:
        useful_cols[i] = lgb_test.best_score['valid_0']['auc']
    else:
        useless_cols[i] = lgb_test.best_score['valid_0']['auc']
    print('*' * 20)
    print('\n')

city_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.684155	valid_0's auc: 0.671655
[40]	training's auc: 0.684156	valid_0's auc: 0.672408
[60]	training's auc: 0.684156	valid_0's auc: 0.672408
[80]	training's auc: 0.684158	valid_0's auc: 0.671655
Early stopping, best iteration is:
[37]	training's auc: 0.684156	valid_0's auc: 0.672408
*****
0.6724078657865786
********************


county_name
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.767358	valid_0's auc: 0.72344
[40]	training's auc: 0.772283	valid_0's auc: 0.728765
[60]	training's auc: 0.77374	valid_0's auc: 0.730307
[80]	training's auc: 0.774604	valid_0's auc: 0.730896
[100]	training's auc: 0.775263	valid_0's auc: 0.73182
[120]	training's auc: 0.775831	valid_0's auc: 0.731191
[140]	training's auc: 0.776325	valid_0's auc: 0.732008
Early stopping, best iteration is:
[90]	training's auc: 0.775065	valid_0's auc: 0.732017
*****
0.7320170875783231
***********

[20]	training's auc: 0.684116	valid_0's auc: 0.671947
[40]	training's auc: 0.684125	valid_0's auc: 0.671655
[60]	training's auc: 0.684125	valid_0's auc: 0.671655
[80]	training's auc: 0.684125	valid_0's auc: 0.671655
Early stopping, best iteration is:
[42]	training's auc: 0.684117	valid_0's auc: 0.672001
*****
0.6720013033912087
********************


city_name_arpu_202004/idcard_cnt_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.578299	valid_0's auc: 0.561841
[40]	training's auc: 0.578299	valid_0's auc: 0.561841
Early stopping, best iteration is:
[1]	training's auc: 0.578299	valid_0's auc: 0.561841
*****
0.5618408308222127
********************


city_name_arpu_202004/idcard_cnt_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.683867	valid_0's auc: 0.670468
[40]	training's auc: 0.683867	valid_0's auc: 0.670468
Early stopping, best iteration is:
[1]	training's auc: 0.683867	valid_0's auc: 0.670468
*****
0.

[80]	training's auc: 0.772927	valid_0's auc: 0.726343
[100]	training's auc: 0.773571	valid_0's auc: 0.727213
[120]	training's auc: 0.774159	valid_0's auc: 0.727829
[140]	training's auc: 0.774432	valid_0's auc: 0.72799
[160]	training's auc: 0.774678	valid_0's auc: 0.728283
[180]	training's auc: 0.774914	valid_0's auc: 0.728609
[200]	training's auc: 0.775093	valid_0's auc: 0.72895
[220]	training's auc: 0.775222	valid_0's auc: 0.728884
[240]	training's auc: 0.775373	valid_0's auc: 0.729282
[260]	training's auc: 0.775445	valid_0's auc: 0.729339
[280]	training's auc: 0.77551	valid_0's auc: 0.729216
Early stopping, best iteration is:
[245]	training's auc: 0.775401	valid_0's auc: 0.729494
*****
0.729494009183527
********************


county_name_idcard_cnt*arpu_202004_min
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.598225	valid_0's auc: 0.555203
[40]	training's auc: 0.599398	valid_0's auc: 0.560246
[60]	training's auc: 0.59977	valid_0's auc: 0.562061
[

[300]	training's auc: 0.776218	valid_0's auc: 0.734234
[320]	training's auc: 0.776297	valid_0's auc: 0.733953
Early stopping, best iteration is:
[288]	training's auc: 0.77616	valid_0's auc: 0.734343
*****
0.7343428636341895
********************


county_name_arpu_202004/idcard_cnt_skew
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.760753	valid_0's auc: 0.719007
[40]	training's auc: 0.766226	valid_0's auc: 0.722786
[60]	training's auc: 0.768889	valid_0's auc: 0.724419
[80]	training's auc: 0.770585	valid_0's auc: 0.724952
[100]	training's auc: 0.772333	valid_0's auc: 0.726204
[120]	training's auc: 0.772965	valid_0's auc: 0.727244
[140]	training's auc: 0.773575	valid_0's auc: 0.727422
[160]	training's auc: 0.774061	valid_0's auc: 0.727395
[180]	training's auc: 0.774384	valid_0's auc: 0.727428
Early stopping, best iteration is:
[132]	training's auc: 0.77336	valid_0's auc: 0.727966
*****
0.7279664107715119
********************


county_name_arpu_202004/

[100]	training's auc: 0.775115	valid_0's auc: 0.727316
[120]	training's auc: 0.775744	valid_0's auc: 0.728363
[140]	training's auc: 0.776071	valid_0's auc: 0.728285
[160]	training's auc: 0.776339	valid_0's auc: 0.729019
[180]	training's auc: 0.776563	valid_0's auc: 0.729279
[200]	training's auc: 0.776757	valid_0's auc: 0.729413
[220]	training's auc: 0.776916	valid_0's auc: 0.729661
[240]	training's auc: 0.777064	valid_0's auc: 0.729709
[260]	training's auc: 0.777142	valid_0's auc: 0.729769
[280]	training's auc: 0.777198	valid_0's auc: 0.729877
[300]	training's auc: 0.777244	valid_0's auc: 0.729859
[320]	training's auc: 0.7773	valid_0's auc: 0.729812
[340]	training's auc: 0.777359	valid_0's auc: 0.730252
[360]	training's auc: 0.777415	valid_0's auc: 0.73027
[380]	training's auc: 0.777449	valid_0's auc: 0.730231
Early stopping, best iteration is:
[341]	training's auc: 0.777357	valid_0's auc: 0.730374
*****
0.7303743961352657
********************


city_name_county_name_idcard_cnt*arpu_20

[160]	training's auc: 0.775543	valid_0's auc: 0.729744
[180]	training's auc: 0.775995	valid_0's auc: 0.729956
[200]	training's auc: 0.776308	valid_0's auc: 0.729365
[220]	training's auc: 0.7766	valid_0's auc: 0.729058
Early stopping, best iteration is:
[171]	training's auc: 0.775798	valid_0's auc: 0.73018
*****
0.7301800832257139
********************


city_name_county_name_arpu_202004/idcard_cnt_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.745711	valid_0's auc: 0.721786
[40]	training's auc: 0.748131	valid_0's auc: 0.721621
Early stopping, best iteration is:
[1]	training's auc: 0.734531	valid_0's auc: 0.725431
*****
0.725431374659205
********************


city_name_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.676216	valid_0's auc: 0.668351
[40]	training's auc: 0.67623	valid_0's auc: 0.668381
[60]	training's auc: 0.67623	valid_0's auc: 0.668381
Early stopping, best iteration is:
[25]	training's 

[20]	training's auc: 0.809253	valid_0's auc: 0.760203
[40]	training's auc: 0.812339	valid_0's auc: 0.758868
Early stopping, best iteration is:
[9]	training's auc: 0.807325	valid_0's auc: 0.761934
*****
0.7619338020758598
********************


call_type_id_2_rate
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.840311	valid_0's auc: 0.798118
[40]	training's auc: 0.842953	valid_0's auc: 0.797746
Early stopping, best iteration is:
[5]	training's auc: 0.835857	valid_0's auc: 0.800735
*****
0.8007351006839815
********************


phone2opposite_cnt_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.912874	valid_0's auc: 0.862673
[40]	training's auc: 0.915995	valid_0's auc: 0.86524
Early stopping, best iteration is:
[1]	training's auc: 0.90085	valid_0's auc: 0.866538
*****
0.8665384201463625
********************


phone2opposite_cnt_median
Training until validation scores don't improve for 50 rounds
[20]	training's a

[40]	training's auc: 0.825684	valid_0's auc: 0.808216
Early stopping, best iteration is:
[1]	training's auc: 0.825684	valid_0's auc: 0.808216
*****
0.8082161477017267
********************


voc_hour_mode
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.846148	valid_0's auc: 0.822881
[40]	training's auc: 0.846345	valid_0's auc: 0.822926
[60]	training's auc: 0.846369	valid_0's auc: 0.823189
Early stopping, best iteration is:
[17]	training's auc: 0.846121	valid_0's auc: 0.823404
*****
0.8234039436552351
********************


voc_hour_mode_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.872088	valid_0's auc: 0.854358
[40]	training's auc: 0.872424	valid_0's auc: 0.854655
[60]	training's auc: 0.87266	valid_0's auc: 0.855222
Early stopping, best iteration is:
[11]	training's auc: 0.87169	valid_0's auc: 0.855917
*****
0.855916977567322
********************


voc_hour_nunique
Training until validation scores don't imp

[20]	training's auc: 0.761157	valid_0's auc: 0.737866
[40]	training's auc: 0.761484	valid_0's auc: 0.735977
Early stopping, best iteration is:
[7]	training's auc: 0.760881	valid_0's auc: 0.738262
*****
0.738262005548381
********************


voc_day7_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.74877	valid_0's auc: 0.739713
[40]	training's auc: 0.748915	valid_0's auc: 0.741171
[60]	training's auc: 0.74908	valid_0's auc: 0.741299
[80]	training's auc: 0.749142	valid_0's auc: 0.741407
[100]	training's auc: 0.74942	valid_0's auc: 0.741332
[120]	training's auc: 0.749432	valid_0's auc: 0.741371
Early stopping, best iteration is:
[84]	training's auc: 0.749143	valid_0's auc: 0.741443
*****
0.7414427584062754
********************


voc_day14_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.75402	valid_0's auc: 0.744502
[40]	training's auc: 0.754056	valid_0's auc: 0.743837
[60]	training's auc: 0.754073	valid_0

[100]	training's auc: 0.756047	valid_0's auc: 0.740317
[120]	training's auc: 0.756795	valid_0's auc: 0.740317
Early stopping, best iteration is:
[78]	training's auc: 0.756037	valid_0's auc: 0.740338
*****
0.7403381642512077
********************


voc_day26_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.774687	valid_0's auc: 0.743957
[40]	training's auc: 0.774691	valid_0's auc: 0.743955
[60]	training's auc: 0.774694	valid_0's auc: 0.743966
Early stopping, best iteration is:
[15]	training's auc: 0.774683	valid_0's auc: 0.743975
*****
0.7439748050892045
********************


voc_day27_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.770616	valid_0's auc: 0.735344
[40]	training's auc: 0.770637	valid_0's auc: 0.736001
Early stopping, best iteration is:
[1]	training's auc: 0.770478	valid_0's auc: 0.737062
*****
0.7370617496532262
********************


voc_day13_nunique
Training until validation scores d

[40]	training's auc: 0.766669	valid_0's auc: 0.747487
[60]	training's auc: 0.766684	valid_0's auc: 0.747493
Early stopping, best iteration is:
[11]	training's auc: 0.766607	valid_0's auc: 0.748339
*****
0.7483393719806763
********************


voc_day25_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.76967	valid_0's auc: 0.755684
[40]	training's auc: 0.769709	valid_0's auc: 0.754829
Early stopping, best iteration is:
[7]	training's auc: 0.769602	valid_0's auc: 0.755885
*****
0.7558846917300426
********************


voc_day11_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.760833	valid_0's auc: 0.750407
[40]	training's auc: 0.761089	valid_0's auc: 0.752235
[60]	training's auc: 0.76109	valid_0's auc: 0.75225
[80]	training's auc: 0.761092	valid_0's auc: 0.752259
[100]	training's auc: 0.761094	valid_0's auc: 0.75227
[120]	training's auc: 0.761094	valid_0's auc: 0.75227
Early stopping, best iteration i

[20]	training's auc: 0.750619	valid_0's auc: 0.740725
[40]	training's auc: 0.75187	valid_0's auc: 0.741646
Early stopping, best iteration is:
[2]	training's auc: 0.747535	valid_0's auc: 0.74329
*****
0.7432902257617066
********************


voc_day5_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.754448	valid_0's auc: 0.719021
[40]	training's auc: 0.75596	valid_0's auc: 0.719163
Early stopping, best iteration is:
[4]	training's auc: 0.752176	valid_0's auc: 0.719659
*****
0.719658786530827
********************


voc_day2_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.738893	valid_0's auc: 0.686581
[40]	training's auc: 0.740483	valid_0's auc: 0.685391
Early stopping, best iteration is:
[2]	training's auc: 0.735712	valid_0's auc: 0.689086
*****
0.6890858922848807
********************


voc_day12_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0

[1]	training's auc: 0.764636	valid_0's auc: 0.73654
*****
0.736540094226814
********************


voc_hour16_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.782568	valid_0's auc: 0.765074
[40]	training's auc: 0.782708	valid_0's auc: 0.764889
Early stopping, best iteration is:
[1]	training's auc: 0.782016	valid_0's auc: 0.76589
*****
0.7658903118572726
********************


voc_hour17_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.795209	valid_0's auc: 0.765747
[40]	training's auc: 0.79544	valid_0's auc: 0.7667
[60]	training's auc: 0.795512	valid_0's auc: 0.766573
[80]	training's auc: 0.795558	valid_0's auc: 0.766373
Early stopping, best iteration is:
[42]	training's auc: 0.795443	valid_0's auc: 0.766724
*****
0.7667243626536566
********************


voc_hour7_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.708253	valid_0's auc: 0.69893
[40]	training's auc: 0

[88]	training's auc: 0.803246	valid_0's auc: 0.781444
*****
0.7814443129095519
********************


voc_hour22_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.646136	valid_0's auc: 0.63537
[40]	training's auc: 0.646137	valid_0's auc: 0.635376
Early stopping, best iteration is:
[5]	training's auc: 0.644691	valid_0's auc: 0.635896
*****
0.6358964700817907
********************


voc_hour8_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.766687	valid_0's auc: 0.739465
[40]	training's auc: 0.766707	valid_0's auc: 0.739419
[60]	training's auc: 0.766734	valid_0's auc: 0.739419
[80]	training's auc: 0.766501	valid_0's auc: 0.738265
Early stopping, best iteration is:
[36]	training's auc: 0.766706	valid_0's auc: 0.73947
*****
0.73946973501698
********************


voc_hour16_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.795072	valid_0's auc: 0.772292
[40]	trainin

[20]	training's auc: 0.805556	valid_0's auc: 0.781108
[40]	training's auc: 0.808302	valid_0's auc: 0.779414
[60]	training's auc: 0.80977	valid_0's auc: 0.780479
Early stopping, best iteration is:
[12]	training's auc: 0.804395	valid_0's auc: 0.782775
*****
0.7827746089826374
********************


voc_hour22_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.649204	valid_0's auc: 0.628768
[40]	training's auc: 0.652113	valid_0's auc: 0.630652
[60]	training's auc: 0.65188	valid_0's auc: 0.631729
[80]	training's auc: 0.652826	valid_0's auc: 0.631813
[100]	training's auc: 0.65465	valid_0's auc: 0.62871
Early stopping, best iteration is:
[57]	training's auc: 0.651787	valid_0's auc: 0.631953
*****
0.6319534127325777
********************


voc_hour8_call_dur_sum
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.786715	valid_0's auc: 0.745208
[40]	training's auc: 0.788678	valid_0's auc: 0.744937
[60]	training's auc:

[20]	training's auc: 0.880531	valid_0's auc: 0.85812
[40]	training's auc: 0.880536	valid_0's auc: 0.858246
Early stopping, best iteration is:
[1]	training's auc: 0.880204	valid_0's auc: 0.859477
*****
0.8594773879561869
********************


sms_day_mode
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.829988	valid_0's auc: 0.824639
[40]	training's auc: 0.829989	valid_0's auc: 0.824718
[60]	training's auc: 0.829989	valid_0's auc: 0.824446
Early stopping, best iteration is:
[13]	training's auc: 0.82995	valid_0's auc: 0.825559
*****
0.8255593222365715
********************


sms_day_mode_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.879702	valid_0's auc: 0.857365
[40]	training's auc: 0.880186	valid_0's auc: 0.857477
[60]	training's auc: 0.880501	valid_0's auc: 0.856633
[80]	training's auc: 0.880652	valid_0's auc: 0.856546
Early stopping, best iteration is:
[31]	training's auc: 0.880007	valid_0's auc: 0.858001


*****
0.7926397259291147
********************


sms_day17_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.834824	valid_0's auc: 0.825027
[40]	training's auc: 0.834877	valid_0's auc: 0.825036
Early stopping, best iteration is:
[4]	training's auc: 0.834647	valid_0's auc: 0.82589
*****
0.8258896541828096
********************


sms_day18_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.816939	valid_0's auc: 0.797693
[40]	training's auc: 0.817099	valid_0's auc: 0.795744
[60]	training's auc: 0.817175	valid_0's auc: 0.794363
Early stopping, best iteration is:
[14]	training's auc: 0.816878	valid_0's auc: 0.79811
*****
0.798110381690343
********************


sms_day19_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.852865	valid_0's auc: 0.826189
[40]	training's auc: 0.852946	valid_0's auc: 0.825909
Early stopping, best iteration is:
[2]	training's auc: 0.852679	valid_0's

[20]	training's auc: 0.824555	valid_0's auc: 0.787942
[40]	training's auc: 0.82512	valid_0's auc: 0.787507
[60]	training's auc: 0.825551	valid_0's auc: 0.787017
Early stopping, best iteration is:
[18]	training's auc: 0.82451	valid_0's auc: 0.788218
*****
0.7882183598794662
********************


sms_hour0_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.560302	valid_0's auc: 0.54383
[40]	training's auc: 0.561164	valid_0's auc: 0.541803
Early stopping, best iteration is:
[3]	training's auc: 0.559975	valid_0's auc: 0.544502
*****
0.5445021404314344
********************


sms_hour20_count
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.790886	valid_0's auc: 0.75664
[40]	training's auc: 0.790954	valid_0's auc: 0.756182
[60]	training's auc: 0.791129	valid_0's auc: 0.756705
[80]	training's auc: 0.791151	valid_0's auc: 0.756959
[100]	training's auc: 0.791246	valid_0's auc: 0.758033
[120]	training's auc: 0.791278	vali

In [65]:
for key, val in useless_cols.items():
    print(key, val)

voc_hour1_count 0.509878569378677
voc_hour3_count 0.5081716052039986
voc_hour2_count 0.5142610728464151
voc_hour4_count 0.5153566987133495
voc_hour1_nunique 0.5102911106328024
voc_hour3_nunique 0.5055244654900273
voc_hour2_nunique 0.5119278232171043
voc_hour4_nunique 0.5162086860859999
voc_hour1_call_dur_sum 0.5041822116994308
voc_hour3_call_dur_sum 0.5055543597838045
voc_hour2_call_dur_sum 0.5092597574974889
voc_hour4_call_dur_sum 0.5162804323910652
sms_hour3_count 0.5002929640790166
sms_hour5_count 0.5097799182092122
sms_hour4_count 0.5133313603099441


In [66]:
df_train.drop(list(useless_cols.keys()), axis=1, inplace=True)
df_test.drop(list(useless_cols.keys()), axis=1, inplace=True)

### 高共线

In [67]:
def correlation(df, threshold):
    """
    去除特征相关系数大于阈值的特征
    :param df:
    :param threshold:
    :return:
    """
    col_corr = set()
    corr_matrix = df.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colName_i = corr_matrix.columns[i]
                colName_j = corr_matrix.columns[j]
                if useful_cols[colName_i] >= useful_cols[colName_j]:
                    col_corr.add(colName_j)
                else:
                    col_corr.add(colName_i)

    return col_corr


col = correlation(df_train.drop(['phone_no_m', 'label'], axis=1), 0.98)
print('Correlated columns: ', col)

Correlated columns:  {'county_name_arpu_202004/idcard_cnt_nunique', 'county_name_arpu_202004_max', 'city_name_idcard_cnt*arpu_202004_nunique', 'county_name_arpu_202004_skew', 'city_name_county_name_arpu_202004/idcard_cnt_median', 'voc_calltype_id_1_60s_cnt', 'county_name_idcard_cnt*arpu_202004_min', 'county_name_idcard_cnt_skew', 'county_name_nunique_x', 'call_type_id_2_rate', 'county_name_arpu_202004/idcard_cnt_min', 'city_name_county_name_count', 'voc_cnt_per_capita', 'county_name_arpu_202004_median', 'city_name', 'county_name_nunique_y', 'city_name_county_name_arpu_202004_min', 'county_name_idcard_cnt_max', 'county_name_idcard_cnt*arpu_202004_mean', 'county_name_idcard_cnt*arpu_202004_nunique', 'city_name_arpu_202004_nunique', 'voc_day12_count', 'city_name_count', 'city_name_arpu_202004_min', 'county_name_arpu_202004_mean', 'voc_calltype_id_1_30s_cnt', 'county_name_idcard_cnt_nunique', 'county_name_arpu_202004/idcard_cnt_max', 'city_name_county_name_arpu_202004/idcard_cnt_mean', 'co

In [68]:
df_train.drop(list(col), axis=1, inplace=True)
df_test.drop(list(col), axis=1, inplace=True)

In [69]:
feature_names = [i for i in df_train.columns if i not in ['phone_no_m', 'label']]
ycol = 'label'

In [70]:
feature_names[:5]

['county_name',
 'idcard_cnt',
 'arpu_202004',
 'city_name_county_name',
 'idcard_cnt*arpu_202004']

In [71]:
oof = []
prediction = df_test[['phone_no_m', 'arpu_202004']]
prediction['label'] = 0
df_importance_list = []

kfold = StratifiedKFold(n_splits=5)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(df_train[feature_names], df_train[ycol])):
    print('\nFold_{} Training ================================\n'.format(fold_id+1))
    
    X_train = df_train.iloc[trn_idx][feature_names]
    Y_train = df_train.iloc[trn_idx][ycol]

    X_val = df_train.iloc[val_idx][feature_names]
    Y_val = df_train.iloc[val_idx][ycol]
    
    lgb_train = lgb.Dataset(X_train, Y_train) 
    lgb_valid= lgb.Dataset(X_val, Y_val, reference=lgb_train)
    
    lgb_model = lgb.train(params,
                          lgb_train,
                          num_boost_round=10000,
                          valid_sets=[lgb_valid, lgb_train],
                          early_stopping_rounds=100,
                          verbose_eval=10)

    pred_val = lgb_model.predict(X_val)
    df_oof = df_train.iloc[val_idx][['phone_no_m', ycol]].copy()
    df_oof['pred'] = pred_val
    oof.append(df_oof)

    pred_test = lgb_model.predict(df_test[feature_names])
    prediction['label_{}'.format(fold_id)] = pred_test

    importance = lgb_model.feature_importance(importance_type='gain')
    feature_name = lgb_model.feature_name()
    df_importance = pd.DataFrame({
        'feature_name': feature_name,
        'importance': importance
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()



Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.986831	valid_0's auc: 0.9447
[20]	training's auc: 0.996363	valid_0's auc: 0.947231
[30]	training's auc: 0.998752	valid_0's auc: 0.9511
[40]	training's auc: 0.999494	valid_0's auc: 0.953153
[50]	training's auc: 0.999699	valid_0's auc: 0.951919
[60]	training's auc: 0.999858	valid_0's auc: 0.951789
[70]	training's auc: 0.999966	valid_0's auc: 0.952354
[80]	training's auc: 1	valid_0's auc: 0.951626
[90]	training's auc: 1	valid_0's auc: 0.95189
[100]	training's auc: 1	valid_0's auc: 0.95259
[110]	training's auc: 1	valid_0's auc: 0.953152
[120]	training's auc: 1	valid_0's auc: 0.953014
[130]	training's auc: 1	valid_0's auc: 0.953293
[140]	training's auc: 1	valid_0's auc: 0.953198
[150]	training's auc: 1	valid_0's auc: 0.953661
[160]	training's auc: 1	valid_0's auc: 0.953704
[170]	training's auc: 1	valid_0's auc: 0.953849
[180]	training's auc: 1	valid_0's auc: 0.953824
[190]	training's auc: 1	valid_0's auc:

In [72]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['feature_name'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance.head(20)

Unnamed: 0,feature_name,importance
0,voc_day_mode_count,22109.243473
1,idcard_cnt*arpu_202004,1610.558125
2,sms_cnt_per_capita,1129.920307
3,sms_day31_count,1080.055842
4,sms_day30_count,907.317977
5,sms_day26_count,788.427765
6,sms_day27_count,642.002648
7,sms_nunique,554.225234
8,sms_day_mode_count,394.546472
9,opposite_nunique,338.772997


In [73]:
df_importance.tail(10)

Unnamed: 0,feature_name,importance
296,voc_day1_nunique,1.643011
297,county_name_arpu_202004_min,1.382758
298,city_name_county_name_idcard_cnt_median,1.028929
299,county_name_idcard_cnt_median,0.691549
300,voc_day16_nunique,0.547588
301,voc_hour5_call_dur_sum,0.017112
302,voc_hour5_nunique,0.003453
303,city_name_county_name_arpu_202004_skew,0.0
304,voc_hour5_count,0.0
305,city_name_idcard_cnt_min,0.0


In [74]:
use_cols = df_importance.loc[df_importance['importance'] > 1, 'feature_name'].to_list()
len(use_cols)

299

In [75]:
lgb_train_all = lgb.Dataset(df_train[use_cols].values, df_train['label'])   

print('Start training...')

lgb_model = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=100)

print('Done!')

Start training...
Done!


In [76]:
df_test['label'] = np.where(lgb_model.predict(df_test[use_cols]) > 0.4735, 1, 0)
df_test.loc[df_test['arpu_202004'] <= 0, 'label'] = 1
df_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}.csv'.format(time.strftime('%Y%m%d')), index=False)