In [1]:
import gc
import pandas as pd
import numpy as np
import os
# import xgboost as xgb
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score  
from sklearn.metrics import confusion_matrix  
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.model_selection import ParameterGrid

In [2]:
DATA_PATH = "../"
csv_path = os.path.join(DATA_PATH, "atec_anti_fraud_train.csv")
data = pd.read_csv(csv_path)

In [3]:
data.loc[data['label'] == -1, 'label'] = 1
# data_to_label = data[data['label'] == -1]
# data = data[data.label.isin([0, 1])]

In [4]:
data.loc[data['label'] == 1, 'label'] = 2
data.loc[data['label'] == 0, 'label'] = 1
data.loc[data['label'] == 2, 'label'] = 0

In [5]:
feature_nan = data.isnull().sum()
drop_list = feature_nan[feature_nan / data.shape[0] > 0.5].reset_index()['index'].tolist()

In [6]:
data = data.drop(drop_list, axis=1)

In [7]:
csv_path = os.path.join(DATA_PATH, "atec_anti_fraud_test_b.csv")
data_test = pd.read_csv(csv_path)

In [8]:
data_test = data_test.drop(drop_list, axis=1)

In [9]:
# data.nunique()

In [10]:
# data.fillna(0, inplace=True)
# data_test.fillna(0, inplace=True)

In [11]:
data_train = data[data['date'] <= 20171021]
data_val = data[data['date'] > 20171021]

print(data.shape)
print(data_train.shape)
print(data_val.shape)

(994731, 288)
(749690, 288)
(245041, 288)


In [12]:
X_train = data_train.loc[:,'f1':].as_matrix()
y_train = data_train.loc[:,'label'].as_matrix()

X_val = data_val.loc[:,'f1':].as_matrix()
y_val = data_val.loc[:,'label'].as_matrix()

X_test = data_test.loc[:,'f1':].as_matrix()

In [13]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)
print(X_test.shape)

(749690, 285)
(749690,)
(245041, 285)
(245041,)
(500538, 285)


In [19]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

In [20]:
# def eval_func(y_pred, y_true):
#     y_true = y_true.get_label()
#     fpr, tpr, thresholds = metrics.roc_curve(1-y_true, 1-y_pred, pos_label=1)
#     score = 0.4 * tpr[np.argwhere(fpr < 0.001)[-1]] + 0.3 * tpr[np.argwhere(
#         fpr < 0.005)[-1]] + 0.3 * tpr[np.argwhere(fpr < 0.01)[-1]]
#     return 'score', score, True

def eval_func(y_pred, y_true):
    fpr, tpr, thresholds = metrics.roc_curve(y_pred, y_true, pos_label=1)
    score = 0.4 * tpr[np.argwhere(fpr < 0.001)[-1]] + 0.3 * tpr[np.argwhere(
        fpr < 0.005)[-1]] + 0.3 * tpr[np.argwhere(fpr < 0.01)[-1]]
    return 'score', score, True

In [21]:
params = {
    'task': 'train',
    'boosting_type': 'rf',
    'objective': 'binary',
    'metric': {'auc'},
    'num_leaves': 255,
    'learning_rate': 0.05,
    'feature_fraction': 0.4,
    'bagging_fraction': 0.5,
    'max_bin':2000,
    'bagging_freq': 5,
    'random_state': 20,
    'min_data_in_leaf' : 10,
    'n_jobs': 8,
    'verbose': 0
}

In [22]:
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=200,
    valid_sets=[lgb_eval],
#     categorical_feature=[1,2,3,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19],
#                 categorical_feature=[5],
#                 feval=eval_func,
    verbose_eval=True,
    early_stopping_rounds=20)

[1]	valid_0's auc: 0.942743
Training until validation scores don't improve for 20 rounds.
[2]	valid_0's auc: 0.960102
[3]	valid_0's auc: 0.962861
[4]	valid_0's auc: 0.963992
[5]	valid_0's auc: 0.96432
[6]	valid_0's auc: 0.965931
[7]	valid_0's auc: 0.966604
[8]	valid_0's auc: 0.966955
[9]	valid_0's auc: 0.966466
[10]	valid_0's auc: 0.966919
[11]	valid_0's auc: 0.966493
[12]	valid_0's auc: 0.966438
[13]	valid_0's auc: 0.966887
[14]	valid_0's auc: 0.967236
[15]	valid_0's auc: 0.96704
[16]	valid_0's auc: 0.966849
[17]	valid_0's auc: 0.96676
[18]	valid_0's auc: 0.96675
[19]	valid_0's auc: 0.967229
[20]	valid_0's auc: 0.967293
[21]	valid_0's auc: 0.967445
[22]	valid_0's auc: 0.967693
[23]	valid_0's auc: 0.967821
[24]	valid_0's auc: 0.967876
[25]	valid_0's auc: 0.967821
[26]	valid_0's auc: 0.967941
[27]	valid_0's auc: 0.967978
[28]	valid_0's auc: 0.967859
[29]	valid_0's auc: 0.967729
[30]	valid_0's auc: 0.967666
[31]	valid_0's auc: 0.967775
[32]	valid_0's auc: 0.967848
[33]	valid_0's auc: 0.9

In [23]:
print(params)
print(eval_func(1-y_train, 1-model.predict(X_train, num_iteration=model.best_iteration)))
print(eval_func(1-y_val, 1-model.predict(X_val, num_iteration=model.best_iteration)))

{'task': 'train', 'boosting_type': 'rf', 'objective': 'binary', 'metric': {'auc'}, 'num_leaves': 255, 'learning_rate': 0.05, 'feature_fraction': 0.4, 'bagging_fraction': 0.5, 'max_bin': 2000, 'bagging_freq': 5, 'random_state': 20, 'min_data_in_leaf': 10, 'n_jobs': 8, 'verbose': 0}
('score', array([0.54831514]), True)
('score', array([0.40378121]), True)


In [54]:
print(params)
print(eval_func(1-y_train, 1-model.predict(X_train, num_iteration=model.best_iteration)))
print(eval_func(1-y_val, 1-model.predict(X_val, num_iteration=model.best_iteration)))

{'task': 'train', 'boosting_type': 'rf', 'objective': 'binary', 'metric': {'auc'}, 'num_leaves': 255, 'learning_rate': 0.05, 'feature_fraction': 0.4, 'bagging_fraction': 0.5, 'max_bin': 2000, 'bagging_freq': 5, 'random_state': 20, 'n_jobs': 8, 'verbose': 0}
('score', array([0.53188578]), True)
('score', array([0.40085612]), True)


In [24]:
y_pred = 1 - model.predict(X_test)
result = pd.DataFrame({'id':data_test['id'].as_matrix(), 'score':y_pred})
result.to_csv("submission.csv", index=False)