# 1 初始化

## 1.1 导入包

In [25]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier
from tqdm import tqdm
import logging

log_fmt = "[%(asctime)s] %(levelname)s in %(module)s: %(message)s"
logging.basicConfig(format=log_fmt, level=logging.INFO)

import warnings
warnings.filterwarnings('ignore')

## 1.2 超参数定义

In [2]:
base_path = '/home/zengrui/datasets/190829_Kanshan_zjfx'

## 1.3 函数定义

In [3]:
def extract_day(s):
    return s.apply(lambda x: int(x.split('-')[0][1:]))


def extract_hour(s):
    return s.apply(lambda x: int(x.split('-')[1][1:]))

# 2 加载、处理数据

## 2.1 加载邀请回答数据

In [4]:
train = pd.read_csv(f'{base_path}/invite_info_0926.txt', sep='\t', header=None)
train.columns = ['qid', 'uid', 'dt', 'label']
logging.info("invite %s", train.shape)

test = pd.read_csv(f'{base_path}/invite_info_evaluate_1_0926.txt', sep='\t', header=None)
test.columns = ['qid', 'uid', 'dt']
logging.info("test %s", test.shape)

sub = test.copy()

sub_size = len(sub)

train['day'] = extract_day(train['dt'])
train['hour'] = extract_hour(train['dt'])

test['day'] = extract_day(test['dt'])
test['hour'] = extract_hour(test['dt'])

del train['dt'], test['dt']

[2019-12-05 21:11:21,885] INFO in <ipython-input-4-7a2c6478cb9e>: invite (9489162, 4)
[2019-12-05 21:11:22,998] INFO in <ipython-input-4-7a2c6478cb9e>: test (1141683, 3)


## 2.2 加载问题

In [5]:
ques = pd.read_csv(f'{base_path}/question_info_0926.txt', header=None, sep='\t')
ques.columns = ['qid', 'q_dt', 'title_t1', 'title_t2', 'desc_t1', 'desc_t2', 'topic']
del ques['title_t1'], ques['title_t2'], ques['desc_t1'], ques['desc_t2']
logging.info("ques %s", ques.shape)

ques['q_day'] = extract_day(ques['q_dt'])
ques['q_hour'] = extract_hour(ques['q_dt'])
del ques['q_dt']

[2019-12-05 21:11:53,656] INFO in <ipython-input-5-958902b92fc5>: ques (1829900, 3)


## 2.3 加载回答

In [6]:
ans = pd.read_csv(f'{base_path}/answer_info_0926.txt', header=None, sep='\t')
ans.columns = ['aid', 'qid', 'uid', 'ans_dt', 'ans_t1', 'ans_t2', 'is_good', 'is_rec', 'is_dest', 'has_img',
               'has_video', 'word_count', 'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
               'reci_xxx', 'reci_no_help', 'reci_dis']
del ans['ans_t1'], ans['ans_t2']
logging.info("ans %s", ans.shape)

ans['a_day'] = extract_day(ans['ans_dt'])
ans['a_hour'] = extract_hour(ans['ans_dt'])
del ans['ans_dt']

ans = pd.merge(ans, ques, on='qid')
del ques

[2019-12-05 21:12:51,116] INFO in <ipython-input-6-935311f5f359>: ans (4513735, 18)


## 2.4 处理时间数据

回答距提问的天数。

In [7]:
ans['diff_qa_days'] = ans['a_day'] - ans['q_day']

时间窗口划分。

In [8]:
train_start = 3838
train_end = 3867

val_start = 3868
val_end = 3874

label_end = 3867
label_start = label_end - 6

train_label_feature_end = label_end - 7
train_label_feature_start = train_label_feature_end - 22

train_ans_feature_end = label_end - 7
train_ans_feature_start = train_ans_feature_end - 50

val_label_feature_end = val_start - 1
val_label_feature_start = val_label_feature_end - 22

val_ans_feature_end = val_start - 1
val_ans_feature_start = val_ans_feature_end - 50

train_label_feature = train[(train['day'] >= train_label_feature_start) & (train['day'] <= train_label_feature_end)]
logging.info("train_label_feature %s", train_label_feature.shape)

val_label_feature = train[(train['day'] >= val_label_feature_start) & (train['day'] <= val_label_feature_end)]
logging.info("val_label_feature %s", val_label_feature.shape)

train_label = train[(train['day'] > train_label_feature_end)]

logging.info("train feature start %s end %s, label start %s end %s", train_label_feature['day'].min(),
             train_label_feature['day'].max(), train_label['day'].min(), train_label['day'].max())

logging.info("test feature start %s end %s, label start %s end %s", val_label_feature['day'].min(),
             val_label_feature['day'].max(), test['day'].min(), test['day'].max())

[2019-12-05 21:13:04,850] INFO in <ipython-input-8-5c7aac178bca>: train_label_feature (6895493, 5)
[2019-12-05 21:13:05,455] INFO in <ipython-input-8-5c7aac178bca>: val_label_feature (7583553, 5)
[2019-12-05 21:13:05,704] INFO in <ipython-input-8-5c7aac178bca>: train feature start 3838 end 3860, label start 3861 end 3867
[2019-12-05 21:13:05,732] INFO in <ipython-input-8-5c7aac178bca>: test feature start 3845 end 3867, label start 3868 end 3874


确定ans的时间范围。

In [9]:
train_ans_feature = ans[(ans['a_day'] >= train_ans_feature_start) & (ans['a_day'] <= train_ans_feature_end)]

val_ans_feature = ans[(ans['a_day'] >= val_ans_feature_start) & (ans['a_day'] <= val_ans_feature_end)]

logging.info("train ans feature %s, start %s end %s", train_ans_feature.shape, train_ans_feature['a_day'].min(),
             train_ans_feature['a_day'].max())

logging.info("val ans feature %s, start %s end %s", val_ans_feature.shape, val_ans_feature['a_day'].min(),
             val_ans_feature['a_day'].max())

fea_cols = ['is_good', 'is_rec', 'is_dest', 'has_img', 'has_video', 'word_count',
            'reci_cheer', 'reci_uncheer', 'reci_comment', 'reci_mark', 'reci_tks',
            'reci_xxx', 'reci_no_help', 'reci_dis', 'diff_qa_days']

[2019-12-05 21:13:08,676] INFO in <ipython-input-9-502ad99fdbbb>: train ans feature (3700178, 23), start 3810 end 3860
[2019-12-05 21:13:08,687] INFO in <ipython-input-9-502ad99fdbbb>: val ans feature (3992334, 23), start 3817 end 3867


## 2.5 特征提取

In [10]:
def extract_feature1(target, label_feature, ans_feature):
    # 问题特征
    t1 = label_feature.groupby('qid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['qid', 'q_inv_mean', 'q_inv_sum', 'q_inv_std', 'q_inv_count']
    target = pd.merge(target, t1, on='qid', how='left')

    # 用户特征
    t1 = label_feature.groupby('uid')['label'].agg(['mean', 'sum', 'std', 'count']).reset_index()
    t1.columns = ['uid', 'u_inv_mean', 'u_inv_sum', 'u_inv_std', 'u_inv_count']
    target = pd.merge(target, t1, on='uid', how='left')
    #
    # train_size = len(train)
    # data = pd.concat((train, test), sort=True)

    # 回答部分特征

    t1 = ans_feature.groupby('qid')['aid'].count().reset_index()
    t1.columns = ['qid', 'q_ans_count']
    target = pd.merge(target, t1, on='qid', how='left')

    t1 = ans_feature.groupby('uid')['aid'].count().reset_index()
    t1.columns = ['uid', 'u_ans_count']
    target = pd.merge(target, t1, on='uid', how='left')

    for col in fea_cols:
        t1 = ans_feature.groupby('uid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['uid', f'u_{col}_sum', f'u_{col}_max', f'u_{col}_mean']
        target = pd.merge(target, t1, on='uid', how='left')

        t1 = ans_feature.groupby('qid')[col].agg(['sum', 'max', 'mean']).reset_index()
        t1.columns = ['qid', f'q_{col}_sum', f'q_{col}_max', f'q_{col}_mean']
        target = pd.merge(target, t1, on='qid', how='left')
        logging.info("extract %s", col)
    return target


train_label = extract_feature1(train_label, train_label_feature, train_ans_feature)
test = extract_feature1(test, val_label_feature, val_ans_feature)

[2019-12-05 21:13:43,123] INFO in <ipython-input-10-565ad55c944e>: extract is_good
[2019-12-05 21:13:53,744] INFO in <ipython-input-10-565ad55c944e>: extract is_rec
[2019-12-05 21:14:04,387] INFO in <ipython-input-10-565ad55c944e>: extract is_dest
[2019-12-05 21:14:15,242] INFO in <ipython-input-10-565ad55c944e>: extract has_img
[2019-12-05 21:14:26,314] INFO in <ipython-input-10-565ad55c944e>: extract has_video
[2019-12-05 21:14:37,566] INFO in <ipython-input-10-565ad55c944e>: extract word_count
[2019-12-05 21:14:48,907] INFO in <ipython-input-10-565ad55c944e>: extract reci_cheer
[2019-12-05 21:15:00,437] INFO in <ipython-input-10-565ad55c944e>: extract reci_uncheer
[2019-12-05 21:15:12,017] INFO in <ipython-input-10-565ad55c944e>: extract reci_comment
[2019-12-05 21:15:23,737] INFO in <ipython-input-10-565ad55c944e>: extract reci_mark
[2019-12-05 21:15:35,594] INFO in <ipython-input-10-565ad55c944e>: extract reci_tks
[2019-12-05 21:15:47,683] INFO in <ipython-input-10-565ad55c944e>: 

特征提取结束。

In [11]:
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)
assert len(test) == sub_size

[2019-12-05 21:19:13,905] INFO in <ipython-input-11-6bad7a600272>: train shape (2593669, 105), test shape (1141683, 104)


## 2.6 加载用户

In [12]:
user = pd.read_csv(f'{base_path}/member_info_0926.txt', header=None, sep='\t')
user.columns = ['uid', 'gender', 'creat_keyword', 'level', 'hot', 'reg_type', 'reg_plat', 'freq', 'uf_b1', 'uf_b2',
                'uf_b3', 'uf_b4', 'uf_b5', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5', 'score', 'follow_topic',
                'inter_topic']
del user['follow_topic'], user['inter_topic']
logging.info("user %s", user.shape)

unq = user.nunique()
logging.info("user unq %s", unq)

for x in unq[unq == 1].index:
    del user[x]
    logging.info('del unq==1 %s', x)

t = user.dtypes
cats = [x for x in t[t == 'object'].index if x not in ['follow_topic', 'inter_topic', 'uid']]
logging.info("user cat %s", cats)

for d in cats:
    lb = LabelEncoder()
    user[d] = lb.fit_transform(user[d])
    logging.info('encode %s', d)

q_lb = LabelEncoder()
q_lb.fit(list(train_label['qid'].astype(str).values) + list(test['qid'].astype(str).values))
train_label['qid_enc'] = q_lb.transform(train_label['qid'])
test['qid_enc'] = q_lb.transform(test['qid'])

u_lb = LabelEncoder()
u_lb.fit(user['uid'])
train_label['uid_enc'] = u_lb.transform(train_label['uid'])
test['uid_enc'] = u_lb.transform(test['uid'])

[2019-12-05 21:19:20,760] INFO in <ipython-input-12-f4f8e8f0ac9b>: user (1931654, 19)
[2019-12-05 21:19:23,601] INFO in <ipython-input-12-f4f8e8f0ac9b>: user unq uid              1931654
gender                 3
creat_keyword          1
level                  1
hot                    1
reg_type               1
reg_plat               1
freq                   5
uf_b1                  2
uf_b2                  2
uf_b3                  2
uf_b4                  2
uf_b5                  2
uf_c1               2561
uf_c2                291
uf_c3                428
uf_c4               1556
uf_c5                  2
score                732
dtype: int64
[2019-12-05 21:19:23,655] INFO in <ipython-input-12-f4f8e8f0ac9b>: del unq==1 creat_keyword
[2019-12-05 21:19:23,660] INFO in <ipython-input-12-f4f8e8f0ac9b>: del unq==1 level
[2019-12-05 21:19:23,661] INFO in <ipython-input-12-f4f8e8f0ac9b>: del unq==1 hot
[2019-12-05 21:19:23,849] INFO in <ipython-input-12-f4f8e8f0ac9b>: del unq==1 reg_type
[2019

merge user

In [None]:
train_label = pd.merge(train_label, user, on='uid', how='left')
test = pd.merge(test, user, on='uid', how='left')
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

In [23]:
data = pd.concat((train_label, test), axis=0, sort=True)
# del train_label, test

count编码。

In [24]:
count_fea = ['uid_enc', 'qid_enc', 'gender', 'freq', 'uf_c1', 'uf_c2', 'uf_c3', 'uf_c4', 'uf_c5']
with tqdm(total=len(count_fea)) as pbar:
    for feat in count_fea:
        col_name = '{}_count'.format(feat)
        data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
        data.loc[data[col_name] < 2, feat] = -1
        data[feat] += 1
        data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
        data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())
        pbar.update(1)

## 2.7 处理数据

压缩数据。

In [27]:
t = data.dtypes
for x in t[t == 'int64'].index:
    data[x] = data[x].astype('int32')

for x in t[t == 'float64'].index:
    data[x] = data[x].astype('float32')

data['wk'] = data['day'] % 7

feature_cols = [x for x in data.columns if x not in ('label', 'uid', 'qid', 'dt', 'day')]

target编码。

In [16]:
train_label

Unnamed: 0,qid,uid,label,day,hour,q_inv_mean,q_inv_sum,q_inv_std,q_inv_count,u_inv_mean,...,uf_b2,uf_b3,uf_b4,uf_b5,uf_c1,uf_c2,uf_c3,uf_c4,uf_c5,score
0,Q2166419046,M401693808,0,3865,22,,,,,0.000000,...,1,0,0,0,2113,190,261,927,1,297
1,Q604029601,M2317670257,0,3862,15,,,,,0.090909,...,0,0,0,0,1519,229,0,506,1,415
2,Q2443223942,M3544409350,0,3867,4,0.375000,57.0,0.485723,152.0,0.000000,...,0,0,0,0,551,226,188,815,1,296
3,Q795459266,M2818659842,0,3861,20,0.166667,1.0,0.408248,6.0,0.285714,...,0,0,0,0,1519,229,0,506,1,380
4,Q110462128,M848334644,1,3862,8,,,,,0.634146,...,0,0,0,0,2161,31,396,1438,1,719
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2593664,Q3119693157,M1523603883,0,3864,20,,,,,0.000000,...,0,0,1,0,1190,130,421,758,1,318
2593665,Q3341450521,M479879245,0,3862,15,,,,,0.083333,...,0,0,0,0,1190,130,315,758,1,360
2593666,Q1458223535,M479879245,1,3867,18,,,,,0.083333,...,0,0,0,0,1190,130,315,758,1,360
2593667,Q3516644442,M4285896253,1,3862,12,,,,,0.314286,...,0,0,0,0,449,48,0,1311,1,586


In [28]:
data

Unnamed: 0,day,freq,freq_count,gender,gender_count,hour,label,q_ans_count,q_diff_qa_days_max,q_diff_qa_days_mean,...,uf_c3,uf_c3_count,uf_c4,uf_c4_count,uf_c5,uf_c5_count,uid,uid_enc,uid_enc_count,wk
0,3865,5,1.000000,3,1.000000,22,0.0,,,,...,262,0.220357,928,0.063075,2,1.000000,M401693808,1508099,0.000000,1
1,3862,5,1.000000,3,1.000000,15,0.0,,,,...,1,0.444174,507,0.004092,2,1.000000,M2317670257,657986,0.000003,5
2,3867,2,0.726702,3,1.000000,4,0.0,32.0,13.0,9.531250,...,189,0.403767,816,0.044312,2,1.000000,M3544409350,1272354,0.000015,3
3,3861,1,0.979042,2,0.487400,20,0.0,3.0,234.0,222.333328,...,1,0.444174,507,0.004092,2,1.000000,M2818659842,909155,0.000003,4
4,3862,5,1.000000,1,0.178696,8,1.0,,,,...,397,0.514839,1439,0.125055,2,1.000000,M848334644,1856020,0.000027,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141678,3869,6,0.325453,4,0.000000,20,,,,,...,81,0.020067,760,0.434533,3,0.346068,M2010778235,504330,0.000000,5
1141679,3872,3,0.118357,4,0.000000,21,,,,,...,121,0.094249,760,0.434533,3,0.346068,M3131383616,1066119,0.000003,1
1141680,3871,2,0.726702,4,0.000000,15,,,,,...,313,0.056923,760,0.434533,3,0.346068,M1872860897,435862,0.000000,0
1141681,3871,6,0.325453,4,0.000000,8,,,,,...,8,0.442846,1375,0.031122,3,0.346068,M3574631517,1287652,0.000015,0


In [29]:
logging.info("feature size %s", len(feature_cols))

X_train_all = data.iloc[:len(train_label)][feature_cols]
y_train_all = data.iloc[:len(train_label)]['label']
test = data.iloc[len(train_label):]
# del data
assert len(test) == sub_size

[2019-12-05 21:35:36,305] INFO in <ipython-input-29-a8e75023efc9>: feature size 126


# 3 训练网络

In [36]:
model_lgb

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=2000, n_jobs=-1, num_leaves=31, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=1000,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

In [None]:
logging.info("train shape %s, test shape %s", train_label.shape, test.shape)

fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for index, (train_idx, val_idx) in enumerate(fold.split(X=X_train_all, y=y_train_all)):
    break

X_train, X_val, y_train, y_val = X_train_all.iloc[train_idx][feature_cols], X_train_all.iloc[val_idx][feature_cols], \
                                 y_train_all.iloc[train_idx], \
                                 y_train_all.iloc[val_idx]
# del X_train_all

In [30]:
model_lgb = LGBMClassifier(n_estimators=2000, 
                           n_jobs=6, 
                           objective='binary', 
                           seed=1000, 
                           silent=True)
model_lgb.fit(X_train, y_train,
              eval_metric=['logloss', 'auc'],
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50)

[2019-12-05 21:35:48,701] INFO in <ipython-input-30-b5ffee06c4e2>: train shape (2593669, 120), test shape (1141683, 130)


[1]	valid_0's auc: 0.753468	valid_0's binary_logloss: 0.427513
Training until validation scores don't improve for 50 rounds
[2]	valid_0's auc: 0.766018	valid_0's binary_logloss: 0.418101
[3]	valid_0's auc: 0.772021	valid_0's binary_logloss: 0.410615
[4]	valid_0's auc: 0.774905	valid_0's binary_logloss: 0.404363
[5]	valid_0's auc: 0.778247	valid_0's binary_logloss: 0.398762
[6]	valid_0's auc: 0.780496	valid_0's binary_logloss: 0.393881
[7]	valid_0's auc: 0.782302	valid_0's binary_logloss: 0.389754
[8]	valid_0's auc: 0.784272	valid_0's binary_logloss: 0.386325
[9]	valid_0's auc: 0.785887	valid_0's binary_logloss: 0.383083
[10]	valid_0's auc: 0.787088	valid_0's binary_logloss: 0.380393
[11]	valid_0's auc: 0.788498	valid_0's binary_logloss: 0.378056
[12]	valid_0's auc: 0.78943	valid_0's binary_logloss: 0.375943
[13]	valid_0's auc: 0.79039	valid_0's binary_logloss: 0.374114
[14]	valid_0's auc: 0.791517	valid_0's binary_logloss: 0.372395
[15]	valid_0's auc: 0.792708	valid_0's binary_logloss:

[129]	valid_0's auc: 0.818318	valid_0's binary_logloss: 0.343502
[130]	valid_0's auc: 0.818357	valid_0's binary_logloss: 0.343477
[131]	valid_0's auc: 0.818391	valid_0's binary_logloss: 0.343453
[132]	valid_0's auc: 0.818404	valid_0's binary_logloss: 0.343445
[133]	valid_0's auc: 0.818427	valid_0's binary_logloss: 0.343426
[134]	valid_0's auc: 0.818468	valid_0's binary_logloss: 0.343393
[135]	valid_0's auc: 0.818494	valid_0's binary_logloss: 0.343374
[136]	valid_0's auc: 0.818536	valid_0's binary_logloss: 0.343338
[137]	valid_0's auc: 0.818567	valid_0's binary_logloss: 0.343312
[138]	valid_0's auc: 0.81861	valid_0's binary_logloss: 0.343281
[139]	valid_0's auc: 0.81863	valid_0's binary_logloss: 0.343266
[140]	valid_0's auc: 0.818705	valid_0's binary_logloss: 0.343178
[141]	valid_0's auc: 0.818757	valid_0's binary_logloss: 0.343138
[142]	valid_0's auc: 0.818802	valid_0's binary_logloss: 0.343092
[143]	valid_0's auc: 0.818813	valid_0's binary_logloss: 0.343085
[144]	valid_0's auc: 0.8188

[256]	valid_0's auc: 0.821451	valid_0's binary_logloss: 0.341031
[257]	valid_0's auc: 0.821468	valid_0's binary_logloss: 0.341017
[258]	valid_0's auc: 0.821497	valid_0's binary_logloss: 0.340995
[259]	valid_0's auc: 0.821499	valid_0's binary_logloss: 0.340993
[260]	valid_0's auc: 0.82152	valid_0's binary_logloss: 0.340975
[261]	valid_0's auc: 0.821545	valid_0's binary_logloss: 0.340959
[262]	valid_0's auc: 0.821546	valid_0's binary_logloss: 0.340958
[263]	valid_0's auc: 0.821567	valid_0's binary_logloss: 0.34094
[264]	valid_0's auc: 0.821594	valid_0's binary_logloss: 0.34092
[265]	valid_0's auc: 0.821634	valid_0's binary_logloss: 0.340894
[266]	valid_0's auc: 0.821664	valid_0's binary_logloss: 0.340872
[267]	valid_0's auc: 0.821701	valid_0's binary_logloss: 0.34084
[268]	valid_0's auc: 0.821728	valid_0's binary_logloss: 0.340817
[269]	valid_0's auc: 0.821744	valid_0's binary_logloss: 0.340804
[270]	valid_0's auc: 0.821756	valid_0's binary_logloss: 0.340795
[271]	valid_0's auc: 0.821762

[384]	valid_0's auc: 0.82302	valid_0's binary_logloss: 0.339812
[385]	valid_0's auc: 0.82302	valid_0's binary_logloss: 0.339812
[386]	valid_0's auc: 0.823025	valid_0's binary_logloss: 0.339807
[387]	valid_0's auc: 0.823041	valid_0's binary_logloss: 0.339792
[388]	valid_0's auc: 0.823046	valid_0's binary_logloss: 0.339787
[389]	valid_0's auc: 0.823054	valid_0's binary_logloss: 0.339781
[390]	valid_0's auc: 0.823053	valid_0's binary_logloss: 0.339781
[391]	valid_0's auc: 0.823054	valid_0's binary_logloss: 0.339781
[392]	valid_0's auc: 0.823052	valid_0's binary_logloss: 0.339783
[393]	valid_0's auc: 0.823051	valid_0's binary_logloss: 0.339782
[394]	valid_0's auc: 0.823055	valid_0's binary_logloss: 0.339778
[395]	valid_0's auc: 0.823089	valid_0's binary_logloss: 0.33975
[396]	valid_0's auc: 0.823119	valid_0's binary_logloss: 0.339726
[397]	valid_0's auc: 0.823149	valid_0's binary_logloss: 0.339702
[398]	valid_0's auc: 0.823157	valid_0's binary_logloss: 0.339697
[399]	valid_0's auc: 0.82315

[511]	valid_0's auc: 0.824233	valid_0's binary_logloss: 0.338856
[512]	valid_0's auc: 0.824252	valid_0's binary_logloss: 0.338842
[513]	valid_0's auc: 0.824258	valid_0's binary_logloss: 0.338836
[514]	valid_0's auc: 0.824262	valid_0's binary_logloss: 0.338832
[515]	valid_0's auc: 0.824276	valid_0's binary_logloss: 0.338822
[516]	valid_0's auc: 0.82428	valid_0's binary_logloss: 0.338818
[517]	valid_0's auc: 0.824292	valid_0's binary_logloss: 0.338808
[518]	valid_0's auc: 0.824299	valid_0's binary_logloss: 0.338803
[519]	valid_0's auc: 0.82431	valid_0's binary_logloss: 0.338789
[520]	valid_0's auc: 0.824324	valid_0's binary_logloss: 0.338777
[521]	valid_0's auc: 0.824332	valid_0's binary_logloss: 0.33877
[522]	valid_0's auc: 0.824334	valid_0's binary_logloss: 0.338769
[523]	valid_0's auc: 0.824336	valid_0's binary_logloss: 0.338766
[524]	valid_0's auc: 0.824332	valid_0's binary_logloss: 0.33877
[525]	valid_0's auc: 0.824338	valid_0's binary_logloss: 0.338764
[526]	valid_0's auc: 0.82435	

[638]	valid_0's auc: 0.825075	valid_0's binary_logloss: 0.338171
[639]	valid_0's auc: 0.825079	valid_0's binary_logloss: 0.338168
[640]	valid_0's auc: 0.82508	valid_0's binary_logloss: 0.338169
[641]	valid_0's auc: 0.825078	valid_0's binary_logloss: 0.33817
[642]	valid_0's auc: 0.82508	valid_0's binary_logloss: 0.338168
[643]	valid_0's auc: 0.825087	valid_0's binary_logloss: 0.338161
[644]	valid_0's auc: 0.825091	valid_0's binary_logloss: 0.338158
[645]	valid_0's auc: 0.8251	valid_0's binary_logloss: 0.33815
[646]	valid_0's auc: 0.825108	valid_0's binary_logloss: 0.338144
[647]	valid_0's auc: 0.825115	valid_0's binary_logloss: 0.338139
[648]	valid_0's auc: 0.825118	valid_0's binary_logloss: 0.338136
[649]	valid_0's auc: 0.825131	valid_0's binary_logloss: 0.338125
[650]	valid_0's auc: 0.825132	valid_0's binary_logloss: 0.338124
[651]	valid_0's auc: 0.82514	valid_0's binary_logloss: 0.338118
[652]	valid_0's auc: 0.825145	valid_0's binary_logloss: 0.338114
[653]	valid_0's auc: 0.825149	va

[767]	valid_0's auc: 0.825638	valid_0's binary_logloss: 0.337698
[768]	valid_0's auc: 0.825647	valid_0's binary_logloss: 0.33769
[769]	valid_0's auc: 0.825653	valid_0's binary_logloss: 0.33768
[770]	valid_0's auc: 0.825662	valid_0's binary_logloss: 0.337674
[771]	valid_0's auc: 0.825667	valid_0's binary_logloss: 0.337669
[772]	valid_0's auc: 0.825674	valid_0's binary_logloss: 0.337664
[773]	valid_0's auc: 0.825674	valid_0's binary_logloss: 0.337665
[774]	valid_0's auc: 0.825678	valid_0's binary_logloss: 0.337662
[775]	valid_0's auc: 0.825678	valid_0's binary_logloss: 0.337662
[776]	valid_0's auc: 0.825675	valid_0's binary_logloss: 0.337664
[777]	valid_0's auc: 0.825679	valid_0's binary_logloss: 0.337657
[778]	valid_0's auc: 0.82569	valid_0's binary_logloss: 0.337649
[779]	valid_0's auc: 0.825693	valid_0's binary_logloss: 0.337647
[780]	valid_0's auc: 0.825695	valid_0's binary_logloss: 0.337648
[781]	valid_0's auc: 0.825688	valid_0's binary_logloss: 0.337651
[782]	valid_0's auc: 0.8257	

[895]	valid_0's auc: 0.826236	valid_0's binary_logloss: 0.337226
[896]	valid_0's auc: 0.826238	valid_0's binary_logloss: 0.337225
[897]	valid_0's auc: 0.826239	valid_0's binary_logloss: 0.337224
[898]	valid_0's auc: 0.826243	valid_0's binary_logloss: 0.337221
[899]	valid_0's auc: 0.826241	valid_0's binary_logloss: 0.337222
[900]	valid_0's auc: 0.826246	valid_0's binary_logloss: 0.337217
[901]	valid_0's auc: 0.826245	valid_0's binary_logloss: 0.337217
[902]	valid_0's auc: 0.82625	valid_0's binary_logloss: 0.337212
[903]	valid_0's auc: 0.826259	valid_0's binary_logloss: 0.337206
[904]	valid_0's auc: 0.826263	valid_0's binary_logloss: 0.337202
[905]	valid_0's auc: 0.826265	valid_0's binary_logloss: 0.337201
[906]	valid_0's auc: 0.826266	valid_0's binary_logloss: 0.337201
[907]	valid_0's auc: 0.826282	valid_0's binary_logloss: 0.337186
[908]	valid_0's auc: 0.826286	valid_0's binary_logloss: 0.337182
[909]	valid_0's auc: 0.82629	valid_0's binary_logloss: 0.337179
[910]	valid_0's auc: 0.8263

[1022]	valid_0's auc: 0.826777	valid_0's binary_logloss: 0.336789
[1023]	valid_0's auc: 0.826803	valid_0's binary_logloss: 0.336767
[1024]	valid_0's auc: 0.82681	valid_0's binary_logloss: 0.336761
[1025]	valid_0's auc: 0.826808	valid_0's binary_logloss: 0.336762
[1026]	valid_0's auc: 0.826804	valid_0's binary_logloss: 0.336765
[1027]	valid_0's auc: 0.826807	valid_0's binary_logloss: 0.336763
[1028]	valid_0's auc: 0.826817	valid_0's binary_logloss: 0.336754
[1029]	valid_0's auc: 0.826823	valid_0's binary_logloss: 0.33675
[1030]	valid_0's auc: 0.826831	valid_0's binary_logloss: 0.336744
[1031]	valid_0's auc: 0.826837	valid_0's binary_logloss: 0.336739
[1032]	valid_0's auc: 0.82684	valid_0's binary_logloss: 0.336737
[1033]	valid_0's auc: 0.82684	valid_0's binary_logloss: 0.336737
[1034]	valid_0's auc: 0.826839	valid_0's binary_logloss: 0.336737
[1035]	valid_0's auc: 0.826844	valid_0's binary_logloss: 0.336733
[1036]	valid_0's auc: 0.826845	valid_0's binary_logloss: 0.336732
[1037]	valid_0

[1148]	valid_0's auc: 0.827262	valid_0's binary_logloss: 0.336408
[1149]	valid_0's auc: 0.827266	valid_0's binary_logloss: 0.336405
[1150]	valid_0's auc: 0.827264	valid_0's binary_logloss: 0.336406
[1151]	valid_0's auc: 0.827268	valid_0's binary_logloss: 0.336403
[1152]	valid_0's auc: 0.827271	valid_0's binary_logloss: 0.336401
[1153]	valid_0's auc: 0.827285	valid_0's binary_logloss: 0.336386
[1154]	valid_0's auc: 0.827285	valid_0's binary_logloss: 0.336386
[1155]	valid_0's auc: 0.827283	valid_0's binary_logloss: 0.336387
[1156]	valid_0's auc: 0.827284	valid_0's binary_logloss: 0.336387
[1157]	valid_0's auc: 0.827287	valid_0's binary_logloss: 0.336384
[1158]	valid_0's auc: 0.827285	valid_0's binary_logloss: 0.336386
[1159]	valid_0's auc: 0.827283	valid_0's binary_logloss: 0.336388
[1160]	valid_0's auc: 0.827285	valid_0's binary_logloss: 0.336387
[1161]	valid_0's auc: 0.827284	valid_0's binary_logloss: 0.336387
[1162]	valid_0's auc: 0.827285	valid_0's binary_logloss: 0.336386
[1163]	val

[1274]	valid_0's auc: 0.827593	valid_0's binary_logloss: 0.336134
[1275]	valid_0's auc: 0.827593	valid_0's binary_logloss: 0.336134
[1276]	valid_0's auc: 0.827596	valid_0's binary_logloss: 0.336132
[1277]	valid_0's auc: 0.827601	valid_0's binary_logloss: 0.336128
[1278]	valid_0's auc: 0.827607	valid_0's binary_logloss: 0.336123
[1279]	valid_0's auc: 0.827613	valid_0's binary_logloss: 0.336119
[1280]	valid_0's auc: 0.827615	valid_0's binary_logloss: 0.336118
[1281]	valid_0's auc: 0.827613	valid_0's binary_logloss: 0.336118
[1282]	valid_0's auc: 0.827616	valid_0's binary_logloss: 0.336115
[1283]	valid_0's auc: 0.827617	valid_0's binary_logloss: 0.336114
[1284]	valid_0's auc: 0.827616	valid_0's binary_logloss: 0.336114
[1285]	valid_0's auc: 0.827616	valid_0's binary_logloss: 0.336113
[1286]	valid_0's auc: 0.827615	valid_0's binary_logloss: 0.336115
[1287]	valid_0's auc: 0.827613	valid_0's binary_logloss: 0.336115
[1288]	valid_0's auc: 0.827613	valid_0's binary_logloss: 0.336115
[1289]	val

[1399]	valid_0's auc: 0.827915	valid_0's binary_logloss: 0.335878
[1400]	valid_0's auc: 0.827912	valid_0's binary_logloss: 0.33588
[1401]	valid_0's auc: 0.827912	valid_0's binary_logloss: 0.33588
[1402]	valid_0's auc: 0.82791	valid_0's binary_logloss: 0.335882
[1403]	valid_0's auc: 0.827909	valid_0's binary_logloss: 0.335883
[1404]	valid_0's auc: 0.827916	valid_0's binary_logloss: 0.335877
[1405]	valid_0's auc: 0.827923	valid_0's binary_logloss: 0.33587
[1406]	valid_0's auc: 0.82792	valid_0's binary_logloss: 0.335872
[1407]	valid_0's auc: 0.827929	valid_0's binary_logloss: 0.335866
[1408]	valid_0's auc: 0.827926	valid_0's binary_logloss: 0.335868
[1409]	valid_0's auc: 0.827938	valid_0's binary_logloss: 0.335861
[1410]	valid_0's auc: 0.827941	valid_0's binary_logloss: 0.335859
[1411]	valid_0's auc: 0.827943	valid_0's binary_logloss: 0.335858
[1412]	valid_0's auc: 0.827943	valid_0's binary_logloss: 0.335858
[1413]	valid_0's auc: 0.827941	valid_0's binary_logloss: 0.33586
[1414]	valid_0's

[1525]	valid_0's auc: 0.828348	valid_0's binary_logloss: 0.33554
[1526]	valid_0's auc: 0.828351	valid_0's binary_logloss: 0.335539
[1527]	valid_0's auc: 0.828351	valid_0's binary_logloss: 0.335538
[1528]	valid_0's auc: 0.828351	valid_0's binary_logloss: 0.335538
[1529]	valid_0's auc: 0.828355	valid_0's binary_logloss: 0.335535
[1530]	valid_0's auc: 0.828365	valid_0's binary_logloss: 0.335528
[1531]	valid_0's auc: 0.828367	valid_0's binary_logloss: 0.335527
[1532]	valid_0's auc: 0.828367	valid_0's binary_logloss: 0.335528
[1533]	valid_0's auc: 0.828368	valid_0's binary_logloss: 0.335526
[1534]	valid_0's auc: 0.828371	valid_0's binary_logloss: 0.335525
[1535]	valid_0's auc: 0.828377	valid_0's binary_logloss: 0.335521
[1536]	valid_0's auc: 0.828376	valid_0's binary_logloss: 0.335522
[1537]	valid_0's auc: 0.828381	valid_0's binary_logloss: 0.335518
[1538]	valid_0's auc: 0.828383	valid_0's binary_logloss: 0.335517
[1539]	valid_0's auc: 0.828384	valid_0's binary_logloss: 0.335516
[1540]	vali

[1651]	valid_0's auc: 0.828744	valid_0's binary_logloss: 0.335261
[1652]	valid_0's auc: 0.828745	valid_0's binary_logloss: 0.33526
[1653]	valid_0's auc: 0.828746	valid_0's binary_logloss: 0.33526
[1654]	valid_0's auc: 0.828744	valid_0's binary_logloss: 0.335261
[1655]	valid_0's auc: 0.828745	valid_0's binary_logloss: 0.335262
[1656]	valid_0's auc: 0.828746	valid_0's binary_logloss: 0.33526
[1657]	valid_0's auc: 0.82875	valid_0's binary_logloss: 0.335256
[1658]	valid_0's auc: 0.828747	valid_0's binary_logloss: 0.335258
[1659]	valid_0's auc: 0.828746	valid_0's binary_logloss: 0.335259
[1660]	valid_0's auc: 0.82875	valid_0's binary_logloss: 0.335255
[1661]	valid_0's auc: 0.828756	valid_0's binary_logloss: 0.335251
[1662]	valid_0's auc: 0.828755	valid_0's binary_logloss: 0.335252
[1663]	valid_0's auc: 0.828754	valid_0's binary_logloss: 0.335252
[1664]	valid_0's auc: 0.828757	valid_0's binary_logloss: 0.33525
[1665]	valid_0's auc: 0.828762	valid_0's binary_logloss: 0.335245
[1666]	valid_0's

[1776]	valid_0's auc: 0.829136	valid_0's binary_logloss: 0.334926
[1777]	valid_0's auc: 0.829143	valid_0's binary_logloss: 0.33492
[1778]	valid_0's auc: 0.829142	valid_0's binary_logloss: 0.334921
[1779]	valid_0's auc: 0.829143	valid_0's binary_logloss: 0.33492
[1780]	valid_0's auc: 0.829152	valid_0's binary_logloss: 0.334914
[1781]	valid_0's auc: 0.829157	valid_0's binary_logloss: 0.334912
[1782]	valid_0's auc: 0.829153	valid_0's binary_logloss: 0.334914
[1783]	valid_0's auc: 0.829156	valid_0's binary_logloss: 0.334911
[1784]	valid_0's auc: 0.829155	valid_0's binary_logloss: 0.334911
[1785]	valid_0's auc: 0.829151	valid_0's binary_logloss: 0.334914
[1786]	valid_0's auc: 0.829157	valid_0's binary_logloss: 0.33491
[1787]	valid_0's auc: 0.829159	valid_0's binary_logloss: 0.334907
[1788]	valid_0's auc: 0.829163	valid_0's binary_logloss: 0.334904
[1789]	valid_0's auc: 0.829158	valid_0's binary_logloss: 0.334906
[1790]	valid_0's auc: 0.829158	valid_0's binary_logloss: 0.334907
[1791]	valid_

[1901]	valid_0's auc: 0.829305	valid_0's binary_logloss: 0.334795
[1902]	valid_0's auc: 0.829303	valid_0's binary_logloss: 0.334794
[1903]	valid_0's auc: 0.829303	valid_0's binary_logloss: 0.334794
[1904]	valid_0's auc: 0.829307	valid_0's binary_logloss: 0.334792
[1905]	valid_0's auc: 0.829312	valid_0's binary_logloss: 0.334788
[1906]	valid_0's auc: 0.829314	valid_0's binary_logloss: 0.334787
[1907]	valid_0's auc: 0.829316	valid_0's binary_logloss: 0.334785
[1908]	valid_0's auc: 0.829321	valid_0's binary_logloss: 0.334782
[1909]	valid_0's auc: 0.82932	valid_0's binary_logloss: 0.334783
[1910]	valid_0's auc: 0.829321	valid_0's binary_logloss: 0.334781
[1911]	valid_0's auc: 0.829322	valid_0's binary_logloss: 0.334781
[1912]	valid_0's auc: 0.829322	valid_0's binary_logloss: 0.33478
[1913]	valid_0's auc: 0.829319	valid_0's binary_logloss: 0.334782
[1914]	valid_0's auc: 0.829331	valid_0's binary_logloss: 0.334772
[1915]	valid_0's auc: 0.829343	valid_0's binary_logloss: 0.334765
[1916]	valid

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=2000, n_jobs=-1, num_leaves=31, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, seed=1000,
        silent=True, subsample=1.0, subsample_for_bin=200000,
        subsample_freq=0)

# 4 计算结果

In [31]:
sub['label'] = model_lgb.predict_proba(test[feature_cols])[:, 1]

In [32]:
sub.to_csv('./storage/result.txt', index=None, header=None, sep='\t')