In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from scipy import stats
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
train_user = reduce_mem_usage(pd.read_csv('../input/train/train_user.csv', usecols=['phone_no_m', 'label']))
test_user = reduce_mem_usage(pd.read_csv('../input/test/test_user.csv', usecols=['phone_no_m']))

train_sms = reduce_mem_usage(pd.read_csv('../input/train/train_sms.csv'))
test_sms = reduce_mem_usage(pd.read_csv('../input/test/test_sms.csv'))

Memory usage of dataframe is 97824.00 MB
Memory usage after optimization is: 55082.00 MB
Decreased by 43.7%
Memory usage of dataframe is 16488.00 MB
Memory usage after optimization is: 16488.00 MB
Decreased by 0.0%
Memory usage of dataframe is 219152416.00 MB
Memory usage after optimization is: 171212853.00 MB
Decreased by 21.9%
Memory usage of dataframe is 13084928.00 MB
Memory usage after optimization is: 10222628.00 MB
Decreased by 21.9%


In [4]:
df_user = pd.concat([train_user, test_user])

del train_user, test_user
gc.collect()

20

In [5]:
# 只取最后一个月的数据
train_sms = train_sms[train_sms['request_datetime'] >= '2020-03-01 00:00:00']

In [6]:
train_sms.columns

Index(['phone_no_m', 'opposite_no_m', 'calltype_id', 'request_datetime'], dtype='object')

In [7]:
df_sms = pd.concat([train_sms, test_sms])

del train_sms, test_sms
gc.collect()

60

In [8]:
df_sms['sms_day'] = df_sms['request_datetime'].astype('datetime64').dt.day
df_sms['sms_hour'] = df_sms['request_datetime'].astype('datetime64').dt.hour
df_sms['sms_dayofweek'] = df_sms['request_datetime'].astype('datetime64').dt.dayofweek

In [9]:
phone_no_m = df_sms[["phone_no_m"]].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [10]:
# 短信次数，短信人数
tmp = df_sms.groupby('phone_no_m')['opposite_no_m'].agg(sms_cnt='count', sms_nunique='nunique')

# 对端平均收到短信次数
tmp['sms_avg'] = tmp['sms_cnt'] / tmp['sms_nunique']
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [11]:
"""
短信上行，短信下行
"""

# 短信上行
df_sms_calltype1 = df_sms[df_sms['calltype_id'] == 1].copy()
tmp = df_sms_calltype1.groupby('phone_no_m')['calltype_id'].agg(sms_calltype1_cnt="count")
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行比例
phone_no_m['sms_calltype1_rate'] = phone_no_m['sms_calltype1_cnt'] / phone_no_m['sms_cnt']

del tmp, df_sms_calltype1

# 短信下行
df_sms_calltype2 = df_sms[df_sms['calltype_id'] == 2].copy()
tmp = df_sms_calltype2.groupby('phone_no_m')['calltype_id'].agg(sms_calltype2_cnt="count")
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

# 短信上行/短信下行
phone_no_m['type1_rate/type2_rate'] = phone_no_m['sms_calltype1_cnt'] / (phone_no_m['sms_calltype2_cnt'] + 0.00001)

# 删除，效果变差
# # 短信下行/短信上行
# phone_no_m['type2_rate/type1_rate'] = phone_no_m['sms_calltype2_cnt'] / (phone_no_m['sms_calltype1_cnt'] + 0.00001)

del tmp, df_sms_calltype2
gc.collect()

0

In [12]:
"""
短信时间点的偏好
"""

tmp = df_sms.groupby('phone_no_m')['sms_hour'].agg(sms_hour_mode=lambda x: stats.mode(x)[0][0],        # 频次最高的元素
                                                   sms_hour_mode_count=lambda x: stats.mode(x)[1][0],  # 频次最高的元素的频次
                                                   sms_hour_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp

tmp = df_sms.groupby('phone_no_m')['sms_day'].agg(sms_day_mode=lambda x: stats.mode(x)[0][0],
                                                  sms_day_mode_count=lambda x: stats.mode(x)[1][0],
                                                  sms_day_nunique='nunique')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

0

In [13]:
phone_no_m.columns

Index(['phone_no_m', 'sms_cnt', 'sms_nunique', 'sms_avg', 'sms_calltype1_cnt', 'sms_calltype1_rate', 'sms_calltype2_cnt', 'type1_rate/type2_rate', 'sms_hour_mode', 'sms_hour_mode_count', 'sms_hour_nunique', 'sms_day_mode', 'sms_day_mode_count', 'sms_day_nunique'], dtype='object')

In [14]:
df_sms = df_user.merge(phone_no_m, how='left', on='phone_no_m')

del df_user, phone_no_m
gc.collect()

40

In [15]:
df_sms_train = df_sms[df_sms.label.notna()]
df_sms_test = df_sms[df_sms.label.isna()]

df_sms_train.shape, df_sms_test.shape

((6106, 15), (2045, 15))

In [16]:
X_train, X_valid, y_train, y_valid = train_test_split(df_sms_train.drop('label', axis=1), df_sms_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [17]:
train_cols = [i for i in X_train if i not in ['phone_no_m', 'label']]

In [18]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0,
          'lambda_l2': 1,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [19]:
useful_cols = []
useless_cols = []

for i in train_cols:
    print(i)
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_eval= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_eval, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.50:
        useful_cols.append(i)
    else:
        useless_cols.append(i)
    print('*' * 20)
    print('\n')

sms_cnt
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.864712	valid_0's auc: 0.808768
[40]	training's auc: 0.869101	valid_0's auc: 0.80392
Early stopping, best iteration is:
[1]	training's auc: 0.854611	valid_0's auc: 0.81233
*****
0.8123296025254699
********************


sms_nunique
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.851239	valid_0's auc: 0.811787
[40]	training's auc: 0.853713	valid_0's auc: 0.806982
Early stopping, best iteration is:
[2]	training's auc: 0.846908	valid_0's auc: 0.815001
*****
0.8150006576744631
********************


sms_avg
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.868621	valid_0's auc: 0.809397
[40]	training's auc: 0.872506	valid_0's auc: 0.809999
[60]	training's auc: 0.875699	valid_0's auc: 0.808645
Early stopping, best iteration is:
[22]	training's auc: 0.869392	valid_0's auc: 0.810546
*****
0.8105464079016598
********************


sms_

In [20]:
print(useful_cols)
print(len(useful_cols))

['sms_cnt', 'sms_nunique', 'sms_avg', 'sms_calltype1_cnt', 'sms_calltype1_rate', 'sms_calltype2_cnt', 'type1_rate/type2_rate', 'sms_hour_mode', 'sms_hour_mode_count', 'sms_hour_nunique', 'sms_day_mode', 'sms_day_mode_count', 'sms_day_nunique']
13


In [21]:
print(useless_cols)
print(len(useless_cols))

[]
0


In [22]:
lgb_train = lgb.Dataset(X_train[useful_cols].values, y_train) 

lgb_eval= lgb.Dataset(X_valid[useful_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

lgb_valid = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_eval, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

Start training...
Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.957449	valid_0's auc: 0.909208
[20]	training's auc: 0.965684	valid_0's auc: 0.908881
[30]	training's auc: 0.973426	valid_0's auc: 0.907595
[40]	training's auc: 0.97813	valid_0's auc: 0.907803
[50]	training's auc: 0.981671	valid_0's auc: 0.906407
[60]	training's auc: 0.984219	valid_0's auc: 0.905882
[70]	training's auc: 0.985923	valid_0's auc: 0.904749
[80]	training's auc: 0.987153	valid_0's auc: 0.903195
[90]	training's auc: 0.987905	valid_0's auc: 0.903485
[100]	training's auc: 0.988709	valid_0's auc: 0.902298
[110]	training's auc: 0.989256	valid_0's auc: 0.902719
Early stopping, best iteration is:
[13]	training's auc: 0.960342	valid_0's auc: 0.911685


In [23]:
# 验证集结果
X_valid['prob'] = lgb_valid.predict(X_valid[useful_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.5, 1, 0)

f1_05 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc_05 = roc_auc_score(y_valid, X_valid['prob'])

print('f1_05: ', f1_05)
print('auc_05: ', auc_05)

f1_05:  0.821
auc_05:  0.9116847826086957


In [24]:
lgb_train_all = lgb.Dataset(df_sms_train[useful_cols].values, df_sms_train['label'])   

print('Start training...')

lgb_train = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_valid.best_iteration + 20,
                      verbose_eval=10)

Start training...


In [25]:
df_sms_test['label'] = np.where(lgb_train.predict(df_sms_test[useful_cols]) > 0.5, 1, 0)
df_sms_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_05), index=False)