In [1]:
import gc
import pandas as pd
import numpy as np
import os
import xgboost as xgb
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.externals import joblib
from imblearn.over_sampling import SMOTE 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve 

In [2]:
# ---- Training set ----
# DATA_PATH is the directory that holds the competition CSV files; it is reused
# by the test-set cell below.
DATA_PATH = "datasets"
csv_path = os.path.join(DATA_PATH, "atec_anti_fraud_train.csv")
data = pd.read_csv(csv_path)

In [3]:
# ---- Test set ----
# Reuses DATA_PATH from the training-set cell; csv_path is overwritten here.
csv_path = os.path.join(DATA_PATH, "atec_anti_fraud_test_b.csv")
data_test = pd.read_csv(csv_path)

In [4]:
# Recode the target column in a single simultaneous mapping:
#   original  0        -> 1
#   original  1 or -1  -> 0
#   original  2        -> 0   (2 was only ever a temporary value in the old
#                              step-by-step version, but a real 2 ends up as 0 too)
# Any other value is left unchanged.
# NOTE: this cell is not idempotent -- running it a second time swaps 0 and 1 back.
data['label'] = data['label'].replace({-1: 0, 1: 0, 2: 0, 0: 1})
data['label'].value_counts()

1    977884
0     16847
Name: label, dtype: int64

In [5]:
# Per-column count of missing values.
feature_nan = data.isnull().sum()
# Names of the columns in which more than half of the rows are missing.
drop_list = feature_nan[(feature_nan / data.shape[0]) > 0.5].index.tolist()

In [6]:
# Remove the mostly-missing columns from both frames so they keep the same
# feature layout.
data = data.drop(columns=drop_list)
data_test = data_test.drop(columns=drop_list)

In [7]:
# Time-based hold-out: rows dated up to and including 20171031 are used for
# training, rows dated after that for validation.
on_train = data.loc[data['date'].le(20171031)]
on_val = data.loc[data['date'].gt(20171031)]
print(on_train.shape, on_val.shape)

(911606, 288) (83125, 288)


In [8]:
# The full frame has already been split into on_train / on_val above, so drop
# the name and trigger a garbage-collection pass.
del data
gc.collect()

2007

In [9]:
# ---- Dataset split (online) ----
# Five overlapping training subsets of on_train; each keeps only the rows that
# fall outside one date window:
#   on_train1 keeps date >  20170915
#   on_train2 keeps date <= 20170915 or date > 20170927
#   on_train3 keeps date <= 20170927 or date > 20171009
#   on_train4 keeps date <= 20171009 or date > 20171020
#   on_train5 keeps date <= 20171020
on_train1 = on_train.loc[on_train['date'].gt(20170915)]

on_train2 = on_train.loc[on_train['date'].le(20170915) | on_train['date'].gt(20170927)]

on_train3 = on_train.loc[on_train['date'].le(20170927) | on_train['date'].gt(20171009)]

on_train4 = on_train.loc[on_train['date'].le(20171009) | on_train['date'].gt(20171020)]

on_train5 = on_train.loc[on_train['date'].le(20171020)]

# One shape per line. A single print call is used so that no loop variable is
# left behind holding a reference to one of the subsets.
print(
    *(subset.shape for subset in (on_train1, on_train2, on_train3, on_train4, on_train5)),
    sep="\n"
)

(731486, 288)
(723597, 288)
(729032, 288)
(728626, 288)
(733683, 288)


In [10]:
# Turn every frame into plain NumPy arrays for LightGBM.
# Features are all columns from 'f1' to the last column; the target is 'label'.
# `.values` is used instead of `DataFrame.as_matrix()`: as_matrix() was
# deprecated in pandas 0.23 and removed in pandas 1.0, while `.values` returns
# the same array on both old and new pandas.
# NOTE(review): the 'f1': slice assumes every non-feature column (id, label,
# date) sits before 'f1' in the frame -- confirm against the CSV layout.
X_train1 = on_train1.loc[:, 'f1':].values
y_train1 = on_train1.loc[:, 'label'].values
print(X_train1.shape, y_train1.shape)

X_train2 = on_train2.loc[:, 'f1':].values
y_train2 = on_train2.loc[:, 'label'].values
print(X_train2.shape, y_train2.shape)

X_train3 = on_train3.loc[:, 'f1':].values
y_train3 = on_train3.loc[:, 'label'].values
print(X_train3.shape, y_train3.shape)

X_train4 = on_train4.loc[:, 'f1':].values
y_train4 = on_train4.loc[:, 'label'].values
print(X_train4.shape, y_train4.shape)

X_train5 = on_train5.loc[:, 'f1':].values
y_train5 = on_train5.loc[:, 'label'].values
print(X_train5.shape, y_train5.shape)

# Validation split (shared by all five models).
X_val = on_val.loc[:, 'f1':].values
y_val = on_val.loc[:, 'label'].values
print(X_val.shape, y_val.shape)

# Test features only; no 'label' column is read from the test frame.
X_test = data_test.loc[:, 'f1':].values
print(X_test.shape)

(731486, 285) (731486,)
(723597, 285) (723597,)
(729032, 285) (729032,)
(728626, 285) (728626,)
(733683, 285) (733683,)
(83125, 285) (83125,)
(500538, 285)


In [11]:
# The per-subset frames have been converted to X_train*/y_train* arrays above,
# so drop their names and trigger a garbage-collection pass.
del on_train1, on_train2, on_train3, on_train4, on_train5
gc.collect()

115

In [13]:
# One LightGBM training Dataset per training subset.
lgb_train1, lgb_train2, lgb_train3, lgb_train4, lgb_train5 = [
    lgb.Dataset(features, label=labels)
    for features, labels in (
        (X_train1, y_train1),
        (X_train2, y_train2),
        (X_train3, y_train3),
        (X_train4, y_train4),
        (X_train5, y_train5),
    )
]

# The validation data is the same for every model, but each validation Dataset
# is tied to its own training Dataset through `reference`.
lgb_eval1, lgb_eval2, lgb_eval3, lgb_eval4, lgb_eval5 = [
    lgb.Dataset(X_val, label=y_val, reference=train_set)
    for train_set in (lgb_train1, lgb_train2, lgb_train3, lgb_train4, lgb_train5)
]

In [15]:
def LGB_RF_training(lgb_train, lgb_eval, random_state,
                    num_boost_round=200, early_stopping_rounds=20):
    """Train a LightGBM model in random-forest mode, early-stopped on validation AUC.

    Parameters
    ----------
    lgb_train : lgb.Dataset
        Training data.
    lgb_eval : lgb.Dataset
        Validation data. It is the only entry in ``valid_sets``, so it is the
        set that drives early stopping.
    random_state : int
        Seed passed to LightGBM as ``random_state``.
    num_boost_round : int, default 200
        Maximum number of boosting rounds.
    early_stopping_rounds : int, default 20
        Stop when the validation metric has not improved for this many rounds.

    Returns
    -------
    The trained model object returned by ``lgb.train``.
    """
    params = {
        'task': 'train',
        'boosting_type': 'rf',
        'objective': 'binary',
        'metric': {'auc'},
        'num_leaves': 255,
        # NOTE(review): presumably has no effect in 'rf' mode -- confirm against
        # the LightGBM parameter documentation before tuning it.
        'learning_rate': 0.05,
        'feature_fraction': 0.4,
        'bagging_fraction': 0.5,
        'max_bin':2000,
        'bagging_freq': 5,
        'random_state': random_state,
        'min_data_in_leaf' : 10,
        'n_jobs': 8,
        'verbose': 0
    }
    # The `verbose_eval=` / `early_stopping_rounds=` keyword arguments of
    # lgb.train were removed in LightGBM 4.0; the callbacks below are what those
    # keywords installed internally, and they work on old and new releases.
    # `log_evaluation` replaced `print_evaluation` in newer releases, so fall
    # back to the old name when the new one is missing.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=num_boost_round,
        valid_sets=[lgb_eval],
        callbacks=[
            lgb.early_stopping(early_stopping_rounds),
            log_evaluation(1),  # log the validation metric every round
        ])
    return model

In [16]:
# Model 1: trained on subset 1, seed 20.
model1 = LGB_RF_training(lgb_train=lgb_train1, lgb_eval=lgb_eval1, random_state=20)

[1]	valid_0's auc: 0.951065
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.966218
[3]	valid_0's auc: 0.968654
[4]	valid_0's auc: 0.969332
[5]	valid_0's auc: 0.969484
[6]	valid_0's auc: 0.97125
[7]	valid_0's auc: 0.971328
[8]	valid_0's auc: 0.970657
[9]	valid_0's auc: 0.971381
[10]	valid_0's auc: 0.97174
[11]	valid_0's auc: 0.971354
[12]	valid_0's auc: 0.971604
[13]	valid_0's auc: 0.971473
[14]	valid_0's auc: 0.971814
[15]	valid_0's auc: 0.97152
[16]	valid_0's auc: 0.971356
[17]	valid_0's auc: 0.97131
[18]	valid_0's auc: 0.971321
[19]	valid_0's auc: 0.971904
[20]	valid_0's auc: 0.971953
[21]	valid_0's auc: 0.972196
[22]	valid_0's auc: 0.972368
[23]	valid_0's auc: 0.972385
[24]	valid_0's auc: 0.972469
[25]	valid_0's auc: 0.972389
[26]	valid_0's auc: 0.972379
[27]	valid_0's auc: 0.972525
[28]	valid_0's auc: 0.972547
[29]	valid_0's auc: 0.97246
[30]	valid_0's auc: 0.9725
[31]	valid_0's auc: 0.97263
[32]	valid_0's auc: 0.97282
[33]	valid_0's auc: 0.97287


In [17]:
# Model 2: trained on subset 2, seed 21.
model2 = LGB_RF_training(lgb_train=lgb_train2, lgb_eval=lgb_eval2, random_state=21)

[1]	valid_0's auc: 0.951238
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.964811
[3]	valid_0's auc: 0.967325
[4]	valid_0's auc: 0.967502
[5]	valid_0's auc: 0.968139
[6]	valid_0's auc: 0.969484
[7]	valid_0's auc: 0.970548
[8]	valid_0's auc: 0.970885
[9]	valid_0's auc: 0.971166
[10]	valid_0's auc: 0.971276
[11]	valid_0's auc: 0.971019
[12]	valid_0's auc: 0.971088
[13]	valid_0's auc: 0.971255
[14]	valid_0's auc: 0.971542
[15]	valid_0's auc: 0.971402
[16]	valid_0's auc: 0.971326
[17]	valid_0's auc: 0.971325
[18]	valid_0's auc: 0.971237
[19]	valid_0's auc: 0.971686
[20]	valid_0's auc: 0.971662
[21]	valid_0's auc: 0.971798
[22]	valid_0's auc: 0.971984
[23]	valid_0's auc: 0.972215
[24]	valid_0's auc: 0.972318
[25]	valid_0's auc: 0.972284
[26]	valid_0's auc: 0.972276
[27]	valid_0's auc: 0.9724
[28]	valid_0's auc: 0.972356
[29]	valid_0's auc: 0.972216
[30]	valid_0's auc: 0.972179
[31]	valid_0's auc: 0.972202
[32]	valid_0's auc: 0.972431
[33]	valid_0's auc: 0

In [18]:
# Model 3: trained on subset 3, seed 22.
model3 = LGB_RF_training(lgb_train=lgb_train3, lgb_eval=lgb_eval3, random_state=22)

[1]	valid_0's auc: 0.953307
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.965336
[3]	valid_0's auc: 0.966914
[4]	valid_0's auc: 0.967205
[5]	valid_0's auc: 0.967352
[6]	valid_0's auc: 0.969053
[7]	valid_0's auc: 0.969341
[8]	valid_0's auc: 0.969589
[9]	valid_0's auc: 0.96997
[10]	valid_0's auc: 0.970765
[11]	valid_0's auc: 0.970644
[12]	valid_0's auc: 0.970747
[13]	valid_0's auc: 0.970835
[14]	valid_0's auc: 0.971194
[15]	valid_0's auc: 0.971057
[16]	valid_0's auc: 0.971565
[17]	valid_0's auc: 0.971498
[18]	valid_0's auc: 0.971419
[19]	valid_0's auc: 0.97171
[20]	valid_0's auc: 0.971687
[21]	valid_0's auc: 0.971786
[22]	valid_0's auc: 0.972025
[23]	valid_0's auc: 0.972226
[24]	valid_0's auc: 0.97227
[25]	valid_0's auc: 0.972232
[26]	valid_0's auc: 0.972197
[27]	valid_0's auc: 0.972278
[28]	valid_0's auc: 0.972219
[29]	valid_0's auc: 0.972197
[30]	valid_0's auc: 0.972155
[31]	valid_0's auc: 0.972216
[32]	valid_0's auc: 0.972209
[33]	valid_0's auc: 0.

In [19]:
# Model 4: trained on subset 4, seed 23.
model4 = LGB_RF_training(lgb_train=lgb_train4, lgb_eval=lgb_eval4, random_state=23)

[1]	valid_0's auc: 0.951585
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.963093
[3]	valid_0's auc: 0.966241
[4]	valid_0's auc: 0.967775
[5]	valid_0's auc: 0.967703
[6]	valid_0's auc: 0.969567
[7]	valid_0's auc: 0.970853
[8]	valid_0's auc: 0.971256
[9]	valid_0's auc: 0.971558
[10]	valid_0's auc: 0.971849
[11]	valid_0's auc: 0.971637
[12]	valid_0's auc: 0.97154
[13]	valid_0's auc: 0.971776
[14]	valid_0's auc: 0.97202
[15]	valid_0's auc: 0.971746
[16]	valid_0's auc: 0.97185
[17]	valid_0's auc: 0.971731
[18]	valid_0's auc: 0.971532
[19]	valid_0's auc: 0.97181
[20]	valid_0's auc: 0.971863
[21]	valid_0's auc: 0.972003
[22]	valid_0's auc: 0.972183
[23]	valid_0's auc: 0.972306
[24]	valid_0's auc: 0.972404
[25]	valid_0's auc: 0.972359
[26]	valid_0's auc: 0.972379
[27]	valid_0's auc: 0.972535
[28]	valid_0's auc: 0.972472
[29]	valid_0's auc: 0.972382
[30]	valid_0's auc: 0.972417
[31]	valid_0's auc: 0.972482
[32]	valid_0's auc: 0.972619
[33]	valid_0's auc: 0.9

In [20]:
# Model 5: trained on subset 5, seed 24.
model5 = LGB_RF_training(lgb_train=lgb_train5, lgb_eval=lgb_eval5, random_state=24)

[1]	valid_0's auc: 0.950773
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.961921
[3]	valid_0's auc: 0.960898
[4]	valid_0's auc: 0.963765
[5]	valid_0's auc: 0.964842
[6]	valid_0's auc: 0.964957
[7]	valid_0's auc: 0.965352
[8]	valid_0's auc: 0.965723
[9]	valid_0's auc: 0.965942
[10]	valid_0's auc: 0.966623
[11]	valid_0's auc: 0.96661
[12]	valid_0's auc: 0.966443
[13]	valid_0's auc: 0.966845
[14]	valid_0's auc: 0.967018
[15]	valid_0's auc: 0.966851
[16]	valid_0's auc: 0.966567
[17]	valid_0's auc: 0.966507
[18]	valid_0's auc: 0.966507
[19]	valid_0's auc: 0.967118
[20]	valid_0's auc: 0.967208
[21]	valid_0's auc: 0.967417
[22]	valid_0's auc: 0.967376
[23]	valid_0's auc: 0.967516
[24]	valid_0's auc: 0.967639
[25]	valid_0's auc: 0.96773
[26]	valid_0's auc: 0.9677
[27]	valid_0's auc: 0.967677
[28]	valid_0's auc: 0.967624
[29]	valid_0's auc: 0.967494
[30]	valid_0's auc: 0.967475
[31]	valid_0's auc: 0.967528
[32]	valid_0's auc: 0.967599
[33]	valid_0's auc: 0.9

In [21]:
# Score the test set with each model. The models were fit on the recoded label
# (original 0 became 1), so each prediction is subtracted from 1 to turn it
# back into a score for the other class.
y_pred1, y_pred2, y_pred3, y_pred4, y_pred5 = [
    1 - booster.predict(X_test)
    for booster in (model1, model2, model3, model4, model5)
]

In [22]:
# Show the first ten scores of every model, one model per printed block.
print(
    y_pred1[:10],
    y_pred2[:10],
    y_pred3[:10],
    y_pred4[:10],
    y_pred5[:10],
    sep="\n"
)

[0.11959613 0.11959613 0.11959613 0.13885224 0.13971564 0.11971309
 0.14449931 0.11959613 0.11959613 0.16002588]
[0.11969078 0.11969078 0.11969078 0.13078475 0.13475383 0.1197389
 0.13153146 0.11969078 0.11969078 0.14827593]
[0.11968547 0.11968547 0.11968547 0.1268364  0.1341897  0.11973836
 0.12594032 0.11968547 0.11968547 0.13718371]
[0.11971122 0.11971122 0.11971122 0.13028632 0.1387086  0.11974522
 0.12567915 0.11971122 0.11971122 0.142727  ]
[0.11974429 0.11971944 0.1197559  0.12833011 0.14923644 0.11975607
 0.12451651 0.1197559  0.1197559  0.14841213]


In [23]:
# Equal-weight blend of the five models (weight 0.2 each). The terms are added
# in the same left-to-right order as a written-out sum.
y_pred = sum(0.2 * scores for scores in (y_pred1, y_pred2, y_pred3, y_pred4, y_pred5))

In [24]:
# Build the submission table: one row per test id with its blended score.
# `.values` replaces `Series.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0; `.values` yields the same array on old and new pandas.
result = pd.DataFrame({'id': data_test['id'].values, 'score': y_pred})
result.to_csv("submission1.csv", index=False)

In [30]:
# Sanity check: how many of model 3's scores fall below 0.1.
# A vectorized comparison replaces the element-by-element Python loop; the
# printed value is the same.
count = int((y_pred3 < 0.1).sum())
print(count)

0
