In [1]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from scipy import stats
import gc
import warnings
from tqdm import tqdm
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from itertools import product
from mlxtend.feature_selection import SequentialFeatureSelector

warnings.filterwarnings('ignore')

pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth", 100)
pd.set_option('display.width', 1000)

In [2]:
# 节省内存读文件
def reduce_mem_usage(df):
    """
    iterate through all the columns of a dataframe and modify the data type to reduce memory usage.
    @param df:
    @return:
    """
    start_mem = df.memory_usage().sum()
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('str')

    end_mem = df.memory_usage().sum()
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
train_user = reduce_mem_usage(pd.read_csv('../input/train/train_user.csv', usecols=['phone_no_m', 'label']))
test_user = reduce_mem_usage(pd.read_csv('../input/test/test_user.csv', usecols=['phone_no_m']))

train_app = reduce_mem_usage(pd.read_csv('../input/train/train_app.csv'))
test_app = reduce_mem_usage(pd.read_csv('../input/test/test_app.csv'))

Memory usage of dataframe is 97824.00 MB
Memory usage after optimization is: 55082.00 MB
Decreased by 43.7%
Memory usage of dataframe is 16488.00 MB
Memory usage after optimization is: 16488.00 MB
Decreased by 0.0%
Memory usage of dataframe is 105075392.00 MB
Memory usage after optimization is: 91940984.00 MB
Decreased by 12.5%
Memory usage of dataframe is 4142624.00 MB
Memory usage after optimization is: 3365906.00 MB
Decreased by 18.7%


In [4]:
df_user = pd.concat([train_user, test_user])

del train_user, test_user
gc.collect()

20

In [5]:
train_app = train_app[train_app['month_id'] == '2020-03']

In [6]:
train_app.columns

Index(['phone_no_m', 'busi_name', 'flow', 'month_id'], dtype='object')

In [7]:
df_app = pd.concat([train_app, test_app])

del train_app, test_app
gc.collect()

60

In [8]:
phone_no_m = df_app[['phone_no_m']].copy()
phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

In [9]:
# APP数
tmp = df_app.groupby('phone_no_m')['busi_name'].agg(busi_cnt='count')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')
del tmp
gc.collect()

40

In [10]:
"""
流量统计
"""
tmp = df_app.groupby("phone_no_m")["flow"].agg(flow_mean='mean',
                                               flow_median='median',
                                               flow_min='min',
                                               flow_max='max',
                                               flow_std='std',
                                               flow_sum='sum')
phone_no_m = phone_no_m.merge(tmp, on='phone_no_m', how='left')

del tmp
gc.collect()

20

In [11]:
df_app = df_user.merge(phone_no_m, how='left', on='phone_no_m')

del df_user, phone_no_m
gc.collect()

20

In [12]:
df_app_train = df_app[df_app.label.notna()]
df_app_test = df_app[df_app.label.isna()]

df_app_train.shape, df_app_test.shape

((6106, 9), (2045, 9))

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(df_app_train.drop('label', axis=1), df_app_train['label'],
                                                      test_size=0.2,
                                                      random_state=2020)

In [14]:
train_cols = [i for i in X_train if i not in ['phone_no_m', 'label']]

In [15]:
params = {'objective': 'binary',
          'boosting': 'gbdt',
          'metric': 'auc',
          'learning_rate': 0.1,
          'num_leaves': 31,
          'lambda_l1': 0,
          'lambda_l2': 1,
          'min_data_in_leaf': 20,
          'is_unbalance': True,
          'max_depth': -1,
          'seed': 2020}

In [16]:
useful_cols = []
useless_cols = []

for i in train_cols:
    print(i)
    lgb_train = lgb.Dataset(X_train[[i]].values, y_train) 
    lgb_eval= lgb.Dataset(X_valid[[i]].values, y_valid, reference=lgb_train)
    lgb_test = lgb.train(params,
                         lgb_train,
                         num_boost_round=1000,
                         valid_sets=[lgb_eval, lgb_train],
                         early_stopping_rounds=50,
                         verbose_eval=20)
    
    print('*' * 5)
    print(lgb_test.best_score['valid_0']['auc'])
    if lgb_test.best_score['valid_0']['auc'] > 0.50:
        useful_cols.append(i)
    else:
        useless_cols.append(i)
    print('*' * 20)
    print('\n')

busi_cnt
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.837256	valid_0's auc: 0.813981
[40]	training's auc: 0.839626	valid_0's auc: 0.813101
Early stopping, best iteration is:
[3]	training's auc: 0.832706	valid_0's auc: 0.81444
*****
0.8144401396661405
********************


flow_mean
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.828839	valid_0's auc: 0.777879
[40]	training's auc: 0.832328	valid_0's auc: 0.777234
[60]	training's auc: 0.834198	valid_0's auc: 0.776827
Early stopping, best iteration is:
[16]	training's auc: 0.827556	valid_0's auc: 0.778441
*****
0.7784414310996317
********************


flow_median
Training until validation scores don't improve for 50 rounds
[20]	training's auc: 0.82826	valid_0's auc: 0.77835
[40]	training's auc: 0.831198	valid_0's auc: 0.776839
Early stopping, best iteration is:
[9]	training's auc: 0.82564	valid_0's auc: 0.778903
*****
0.7789032979384894
********************


flo

In [17]:
print(useful_cols)
print(len(useful_cols))

['busi_cnt', 'flow_mean', 'flow_median', 'flow_min', 'flow_max', 'flow_std', 'flow_sum']
7


In [18]:
print(useless_cols)
print(len(useless_cols))

[]
0


In [19]:
lgb_train = lgb.Dataset(X_train[useful_cols].values, y_train) 

lgb_eval= lgb.Dataset(X_valid[useful_cols].values, y_valid, reference=lgb_train)  

print('Start training...')

lgb_valid = lgb.train(params,
                      lgb_train,
                      num_boost_round=10000,
                      valid_sets=[lgb_eval, lgb_train],
                      early_stopping_rounds=100,
                      verbose_eval=10)

Start training...
Training until validation scores don't improve for 100 rounds
[10]	training's auc: 0.864065	valid_0's auc: 0.822678
[20]	training's auc: 0.877054	valid_0's auc: 0.822009
[30]	training's auc: 0.885291	valid_0's auc: 0.817942
[40]	training's auc: 0.889476	valid_0's auc: 0.816549
[50]	training's auc: 0.892871	valid_0's auc: 0.813603
[60]	training's auc: 0.894944	valid_0's auc: 0.811201
[70]	training's auc: 0.897043	valid_0's auc: 0.809134
[80]	training's auc: 0.898777	valid_0's auc: 0.807923
[90]	training's auc: 0.90041	valid_0's auc: 0.805573
[100]	training's auc: 0.901481	valid_0's auc: 0.806593
[110]	training's auc: 0.90249	valid_0's auc: 0.804883
Early stopping, best iteration is:
[18]	training's auc: 0.874763	valid_0's auc: 0.822882


In [20]:
# 验证集结果
X_valid['prob'] = lgb_valid.predict(X_valid[useful_cols])
X_valid['pred'] = np.where(X_valid['prob'] > 0.5, 1, 0)

f1_05 = np.round(f1_score(y_valid, X_valid['pred']), 4)
auc_05 = roc_auc_score(y_valid, X_valid['prob'])

print('f1_05: ', f1_05)
print('auc_05: ', auc_05)

f1_05:  0.7221
auc_05:  0.8228822882288228


In [21]:
lgb_train_all = lgb.Dataset(df_app_train[useful_cols].values, df_app_train['label'])   

print('Start training...')

lgb_train = lgb.train(params,
                      lgb_train_all,
                      num_boost_round=lgb_valid.best_iteration + 20,
                      verbose_eval=10)

Start training...


In [22]:
df_app_test['label'] = np.where(lgb_train.predict(df_app_test[useful_cols]) > 0.5, 1, 0)
df_app_test[['phone_no_m', 'label']].to_csv('../sub/sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), f1_05), index=False)