In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns


import numpy as np
import pandas as pd
import tensorflow as tf

import atecml.data

from contextlib import contextmanager
from tqdm import tqdm
from time import strftime,time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

plt.style.use('ggplot')

In [2]:
@contextmanager
def timer(func_name: str):
    """Elapsed Time
    """
    start = time()
    print('[{}][{}] Begin ...'.format(strftime('%Y-%m-%d %H:%M:%S'), func_name))
    yield
    print('[{}][{}] End   ...[Elapsed: {:.2f}s]'.format(strftime('%Y-%m-%d %H:%M:%S'), func_name, time()-start))

In [3]:
train_df,test_df = atecml.data.load()

In [4]:
from sklearn.metrics import log_loss
from MLFeatureSelection import FeatureSelection as FS
import lightgbm as lgb

In [5]:
predictors = [x for x in train_df.columns if x not in atecml.data.NOT_FEATURE_COLUMNS]


In [6]:
with timer('PreProcessing: fillna'):
    for idx in tqdm(range(len(predictors))):
        item = predictors[idx]
        train_df[item].fillna(train_df[item].min(), inplace=True)

  3%|▎         | 8/297 [00:00<00:03, 74.54it/s]

[2018-06-21 20:41:45][PreProcessing: fillna] Begin ...


100%|██████████| 297/297 [00:05<00:00, 57.89it/s]

[2018-06-21 20:41:50][PreProcessing: fillna] End   ...[Elapsed: 5.13s]





In [13]:
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold

def score(y_test,y_predict_proba):
    """
    基于ROC的模型性能测量，并根据蚂蚁金服评分标准输出分数
    """
    fpr, tpr, thresholds = roc_curve(y_test,y_predict_proba)
    #roc_auc = auc(fpr, tpr)
    score = 0.4 * tpr[np.where(fpr >= 0.001)[0][0]] + \
            0.3 * tpr[np.where(fpr >= 0.005)[0][0]] + \
            0.3 * tpr[np.where(fpr >= 0.01)[0][0]]
    return score

def validation(X, Y, features, clf, lossfunction):
    totaltest = []
    kf = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
    for train_index, test_index in kf.split(X, Y):
        X_train, X_test = X.ix[train_index,:][features], X.ix[test_index,:][features]
        y_train, y_test = Y[train_index], Y[test_index]
        clf.fit(X_train, y_train)
        totaltest.append(lossfunction(y_test, clf.predict(X_test)))
    return np.mean(totaltest)

In [14]:
import lightgbm as lgb

sf = FS.Select(Sequence = True, Random = False, Cross = False) #初始化选择器，选择你需要的流程
sf.ImportDF(train_df, label ='Fraud') #导入数据集以及目标标签
#sf.ImportCrossMethod(CrossMethod)
sf.ImportLossFunction(score, direction = 'descend') #导入评价函数以及优化方向
sf.InitialNonTrainableFeatures(atecml.data.NOT_FEATURE_COLUMNS) #初始化不能用的特征
sf.InitialFeatures(predictors) #初始化其实特征组合
sf.GenerateCol() #生成特征库 （具体该函数变量请参考根目录下的readme）
sf.SetSample(1, samplemode = 1) #初始化抽样比例和随机过程
sf.SetTimeLimit(240) #设置算法运行最长时间，以分钟为单位
sf.clf = lgb.LGBMClassifier(random_state=10, num_leaves =15, n_estimators=200, max_depth=5, learning_rate = 0.1, n_jobs=-1) #设定模型
sf.SetLogFile('record.log') #初始化日志文件
sf.run(validation) #输入检验函数并开始运行

Features Quantity Limit: inf
Time Limit: 240 min(s)
test performance of initial features combination
Mean loss: 0.7909591235412201
--------------------start greedy--------------------
f1
f2
f3
f4
f5
f6
f7
f8
f9
f10
f11
f12
f13
f14
f15
f16
f17
f18
f19
f20
f21
f22
f23
f24
f25
f26
f27
f28
f29
f30
f31
f32
f33
f34
f35
f36
f37
f38
f39
f40
f41
f42
f43
f44
f45
f46
f47
f48
f49
f50
f51
f52
f53
f54
f55
f56
f57
f58
f59
f60
f61
f62
f63
f64
f65
f66
f67
f68
f69
f70
f71
f72
f73
f74
f75
f76
f77
f78
f79
f80
f81
f82
f83
f84
f85
f86
f87
f88
f89
f90
f91
f92
f93
f94
f95
f96
f97
f98
f99
f100
f101
f102
f103
f104
f105
f106
f107
f108
f109
f110
f111
f112
f113
f114
f115
f116
f117
f118
f119
f120
f121
f122
f123
f124
f125
f126
f127
f128
f129
f130
f131
f132
f133
f134
f135
f136
f137
f138
f139
f140
f141
f142
f143
f144
f145
f146
f147
f148
f149
f150
f151
f152
f153
f154
f155
f156
f157
f158
f159
f160
f161
f162
f163
f164
f165
f166
f167
f168
f169
f170
f171
f172
f173
f174
f175
f176
f177
f178
f179
f180
f181
f182
f183
f184
f185

KeyboardInterrupt: 

In [None]:
import lightgbm as lgb

sf = FS.Select(Sequence = True, Random = False, Cross = False)
sf.ImportDF(train_df, label ='Fraud') 
sf.ImportLossFunction(score, direction = 'ascend')
sf.InitialNonTrainableFeatures(atecml.data.NOT_FEATURE_COLUMNS) 
sf.InitialFeatures(predictors) 
sf.GenerateCol() 
sf.SetSample(1, samplemode = 1) 
sf.SetTimeLimit(720)
sf.clf = lgb.LGBMClassifier(random_state=10, num_leaves =50, n_estimators=1000, max_depth=10, learning_rate = 0.1, n_jobs=-1)
sf.SetLogFile('record.log') 
sf.run(validation) 