In [30]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import os

import numpy as np
import pandas as pd
import tensorflow as tf

import atecml.data

from contextlib import contextmanager
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from imblearn.over_sampling import SMOTE, ADASYN

plt.style.use('ggplot')

In [None]:
@contextmanager
def timer(func_name: str):
    """Print begin/end banners around a ``with`` block and report its elapsed time.

    Args:
        func_name: Label embedded in both log lines.

    Yields:
        None. The body of the ``with`` statement runs between the two banners.

    The "End" banner is printed even when the body raises; the exception is
    then re-raised unchanged.
    """
    # Imported locally: the notebook's import cell does not bring these names
    # into scope, so the helper would otherwise fail with NameError.
    from time import perf_counter, strftime

    start = perf_counter()
    print('[{}][{}] Begin ...'.format(strftime('%Y-%m-%d %H:%M:%S'), func_name))
    try:
        yield
    finally:
        print('[{}][{}] End   ...[Elapsed: {:.2f}s]'.format(
            strftime('%Y-%m-%d %H:%M:%S'), func_name, perf_counter() - start))

In [2]:
# Load the train and test frames through the project's data helper.
train_df, test_df = atecml.data.load()

In [3]:
# Label column used for evaluation further down.
target = 'Fraud'
# Feature columns: every training column not listed in NOT_FEATURE_COLUMNS.
predictors = [
    column
    for column in train_df.columns
    if column not in atecml.data.NOT_FEATURE_COLUMNS
]

In [4]:
# Time-based split:
#   2017-09-05 .. 2017-10-15  -> training
#   2017-10-15 .. November    -> verification
train_df = atecml.data.filter_date(
    train_df,
    start_date='2017-09-05',
    end_date='2017-10-15',
)

In [7]:
# Fill missing feature values with each column's minimum.
# The filled column is assigned back to the frame: calling
# ``fillna(..., inplace=True)`` on ``train_df[item]`` mutates a temporary
# Series, which pandas deprecates and which does not update the frame when
# copy-on-write is active.
with atecml.data.timer('PreProcessing: fillna'):
    for item in tqdm(predictors):
        column = train_df[item]
        train_df[item] = column.fillna(column.min())

  0%|          | 1/297 [00:00<00:39,  7.45it/s]

[2018-06-22 12:23:53][PreProcessing: fillna] Begin ...


100%|██████████| 297/297 [00:27<00:00, 10.83it/s]

[2018-06-22 12:24:21][PreProcessing: fillna] End   ...[Elapsed: 27.44s]





In [8]:
# Disabled step: the StandardScaler normalization below is wrapped in a string
# literal, so none of it is executed.
'''
with timer('PreProcessing: Normalization'):
    scaled_features = StandardScaler().fit_transform(train_df[predictors].values)
    scaled_features_df = pd.DataFrame(scaled_features, index=train_df.index, columns=predictors)
'''    


In [26]:
#build Models...
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

# Registry of candidate estimators. The key is the first '__'-separated field
# of a training id and is looked up by model_train.
model = {
    "LightGBM": LGBMClassifier(n_estimators=1000, max_depth=50),
    "XGBOOST": XGBClassifier(n_estimators=10, max_depth=5, nthread=80),
}

# Candidates that are currently switched off:
#model["RandomForest"] = RandomForestClassifier(n_estimators=1000, max_depth=50, n_jobs=-1)
#model["ExtraTree"] =ExtraTreesClassifier(n_estimators=1000, max_depth=50, n_jobs=-1)
#model["GBDT"] =GradientBoostingClassifier(n_estimators=1000, max_depth=50)


def model_train(df, predictors, model_name):
    """Return a fitted classifier for ``model_name``, training it only when no cache file exists.

    ``model_name`` has the form ``'<model key>__<target column>[__<suffix>]'``,
    for example ``'XGBOOST__Normal__0'``. The first field selects the estimator
    from the notebook-level ``model`` dict, the second names the label column
    in ``df``. The full name is used for the cache file ``./<model_name>.model``.

    Args:
        df: Frame holding both the predictor columns and the target column.
        predictors: Names of the feature columns.
        model_name: Training id as described above.

    Returns:
        The estimator loaded from the cache file, or a newly fitted one (which
        is also written to the cache file).

    Raises:
        ValueError: If ``model_name`` has fewer than two '__'-separated fields.
        KeyError: If the model key is not present in ``model``.
    """
    from sklearn.base import clone

    model_cache_name = './' + model_name + '.model'
    if os.path.exists(model_cache_name):
        # NOTE: joblib.load unpickles the file, so only load cache files that
        # were produced by this notebook.
        return joblib.load(model_cache_name)

    params = model_name.split('__')
    if len(params) < 2:
        raise ValueError(
            "model_name must look like '<model key>__<target>[__<suffix>]', got {!r}".format(model_name))
    model_key, target = params[0], params[1]

    # Fit an unfitted copy: several training ids share one model key, and
    # refitting the shared instance would overwrite every estimator that an
    # earlier call already returned.
    clf = clone(model[model_key])

    with atecml.data.timer('> {} <: OverSample for imbalance data'.format(model_key)):
        sampler = SMOTE()
        # imbalanced-learn renamed fit_sample to fit_resample; use whichever
        # the installed version provides.
        resample = getattr(sampler, 'fit_resample', None) or sampler.fit_sample
        X_resampled, y_resampled = resample(df[predictors], df[target])
    with atecml.data.timer('> {} <: Training...'.format(model_key)):
        clf.fit(X_resampled, y_resampled)
    joblib.dump(clf, model_cache_name)
    return clf
    

In [27]:
# Training ids have the form '<model key>__<target column>__<round index>'.
# A comprehension keeps the loop variables local, so the notebook-level
# ``target`` label name is not rebound by this cell.
train_model = [
    '{}__{}__{}'.format(model_key, label, round_idx)
    for round_idx in range(0, 2)
    for model_key in model.keys()
    for label in ['Normal', 'Fraud']
]

In [28]:
# Show the generated training ids.
train_model

['XGBOOST__Normal__0',
 'XGBOOST__Fraud__0',
 'LightGBM__Normal__0',
 'LightGBM__Fraud__0',
 'XGBOOST__Normal__1',
 'XGBOOST__Fraud__1',
 'LightGBM__Normal__1',
 'LightGBM__Fraud__1']

In [31]:
# Smoke test: train (or load from cache) a single model by its training id.
a = model_train(train_df, predictors, 'XGBOOST__Normal__0')

[2018-06-22 12:40:18][[XGBOOST]: OverSample for imbalance data] Begin ...
[2018-06-22 12:40:47][[XGBOOST]: OverSample for imbalance data] End   ...[Elapsed: 29.39s]
[2018-06-22 12:40:47][[XGBOOST]: Training...] Begin ...
[2018-06-22 12:41:08][[XGBOOST]: Training...] End   ...[Elapsed: 21.00s]


In [32]:
# Repeat call with the same id: model_train returns the cached estimator when
# './XGBOOST__Normal__0.model' exists.
b = model_train(train_df, predictors, 'XGBOOST__Normal__0')

In [None]:
# Fit (or load from cache) one estimator per training id.
# model_train needs the id string itself (it is split on '__' and used to build
# the cache file name), so iterate over the ids, not over their positions.
trained_model_list = []
with timer('Classification: Model Training'):
    for train_id in tqdm(train_model):
        trained_model_list.append(model_train(train_df, predictors, train_id))
        
 

In [None]:
# Reload the training data and apply the verification window
# (2017-10-16 .. 2018-10-15) with filter_date.
verify_df = atecml.data.load_train()
verify_data = atecml.data.filter_date(
    verify_df,
    start_date='2017-10-16',
    end_date='2018-10-15',
)

In [None]:
# Fill missing feature values in the verification frame with each column's
# minimum. The filled column is assigned back to the frame: calling
# ``fillna(..., inplace=True)`` on ``verify_data[item]`` mutates a temporary
# Series, which pandas deprecates and which does not update the frame when
# copy-on-write is active.
with timer('Validation: verify_data fillna'):
    for item in tqdm(predictors):
        column = verify_data[item]
        verify_data[item] = column.fillna(column.min())

In [None]:
# Score the verification rows with every trained model. Column ``idx`` of
# verify_df holds the second predict_proba column of trained_model_list[idx].
verify_df = pd.DataFrame()
with timer('Validation: Modelfit'):
    # The feature matrix is the same for every model, so build it once instead
    # of converting the frame on each iteration.
    verify_features = np.array(verify_data[predictors])
    for idx, clf in enumerate(tqdm(trained_model_list)):
        y_predict = clf.predict_proba(verify_features)
        verify_df[idx] = pd.DataFrame(y_predict)[1]

In [None]:
# Ensemble score: plain average of prediction columns 0..3.
# NOTE(review): train_model lists eight ids, so columns 4..7 would be ignored
# here, and the ids alternate between the 'Normal' and 'Fraud' targets --
# confirm that averaging these four columns is what is intended.
verify_df['mean'] = sum(verify_df[col] for col in range(4)) / 4

In [None]:
from sklearn.metrics import roc_curve, auc
def performance(y_test,y_predict_proba):
    """Measure ROC-based model performance and print the score defined by the
    Ant Financial scoring standard.

    The score is ``0.4*TPR1 + 0.3*TPR2 + 0.3*TPR3``, where each TPR is taken
    from the last ROC point whose FPR is <= 0.001002, 0.005002 and 0.010002
    respectively. The ROC curve is also plotted.

    Args:
        y_test: True binary labels, passed to ``roc_curve``.
        y_predict_proba: Predicted scores, passed to ``roc_curve``.

    Returns:
        Tuple ``(FINAL_SCORE, roc_result, roc_auc)``: the weighted score, a
        DataFrame with columns ``fpr`` / ``tpr`` / ``thresholds``, and the
        area under the ROC curve.
    """
    fpr, tpr, thresholds = roc_curve(y_test, y_predict_proba)
    roc_auc = auc(fpr, tpr)
    roc_result = pd.DataFrame(
        {'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds},
        columns=['fpr', 'tpr', 'thresholds'],
    )

    def _tpr_at(max_fpr):
        # TPR of the last ROC point whose FPR does not exceed max_fpr. The
        # scalar is picked with .iloc because float() on a one-element Series
        # is deprecated in newer pandas.
        return float(roc_result.loc[roc_result['fpr'] <= max_fpr, 'tpr'].iloc[-1])

    TPR1 = _tpr_at(0.001002)
    TPR2 = _tpr_at(0.005002)
    TPR3 = _tpr_at(0.010002)
    FINAL_SCORE = 0.4*TPR1 + 0.3*TPR2 + 0.3 * TPR3
    print(FINAL_SCORE)

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b',label='AUC = %0.2f'% roc_auc)
    plt.legend(loc='lower right')
    plt.xlim([-0.1,1.2])
    plt.ylim([-0.1,1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
    return (FINAL_SCORE,roc_result,roc_auc)

In [None]:
# Evaluate the averaged ensemble score; performance returns
# (final score, ROC table, AUC).
a, b, c = performance(verify_data[target], verify_df['mean'])

In [None]:
# No cell above creates a 'max' column in verify_df, so build it before
# scoring: the row-wise maximum over the same prediction columns (0..3) that
# feed the 'mean' column.
# NOTE(review): confirm that columns 0..3 are the intended model subset.
if 'max' not in verify_df.columns:
    verify_df['max'] = verify_df[[0, 1, 2, 3]].max(axis=1)
a, b, c = performance(verify_data[target], verify_df['max'])

In [None]:
def model_fit(clf_params,clf,X,y,n_split=5):
    """Cross-validate ``clf`` over shuffled K-fold splits, with a validation split per fold.

    For every fold the training part is split again (85% / 15%) so the 15% can
    be passed to ``fit`` as ``eval_set``. The estimator is then scored on the
    fold's training and test parts.

    Args:
        clf_params: Extra keyword arguments forwarded to ``clf.fit``.
            NOTE(review): the original body referenced an undefined name
            ``clf_fit_params``; it is assumed this parameter was meant.
        clf: Estimator exposing ``fit`` / ``predict`` / ``predict_proba``.
        X: Feature container indexed with integer position arrays
            (presumably a NumPy array -- TODO confirm).
        y: Labels, indexed the same way as ``X``.
        n_split: Number of K-fold splits.

    NOTE(review): ``eval_train``, ``eval_test``, ``evaluate`` and ``models``
    are not defined in this function; they must exist at notebook level before
    this is called.
    """
    # Local imports: `gc` and `KFold` are not imported by the notebook's import cells.
    import copy
    import gc
    from sklearn.model_selection import KFold, train_test_split

    #consider use data.filter_date function replace Kflod to avoid future info.
    kf = KFold(n_splits=n_split, random_state=33, shuffle=True)
    # enumerate supplies the fold counter `i`, which was previously incremented
    # without ever being initialised.
    for i, (train_index, test_index) in enumerate(tqdm(kf.split(X), total=n_split)):
        X_, X_test = X[train_index], X[test_index]
        y_, y_test = y[train_index], y[test_index]

        # Divide into train and validation set for early-stop
        X_train, X_valid, y_train, y_valid = train_test_split(X_, y_, test_size=0.15, random_state=42)

        del X_, y_
        gc.collect()

        # Model Training
        clf.fit(X=X_train, y=y_train, eval_set=[(X_valid, y_valid)], eval_metric='auc',
                verbose=False, **clf_params)

        ## Model Testing
        # On training set
        y_prob_train = clf.predict_proba(X_train)[:,1]
        y_pred_train = clf.predict(X_train)
        eval_train.iloc[i,:] = evaluate(y_train, y_pred_train, y_prob_train)

        # On testing set
        y_prob_test = clf.predict_proba(X_test)[:,1]
        y_pred_test = clf.predict(X_test)
        eval_test.iloc[i,:] = evaluate(y_test, y_pred_test, y_prob_test)

        # Saving model: store a snapshot, because `clf` is refitted in place on
        # the next fold and appending it directly would leave every entry
        # pointing at the same (last) fit.
        models.append(copy.deepcopy(clf))
    
    
    

In [None]:
# Iterate over 5 shuffled K-fold splits of train_df and slice X / y per fold;
# nothing is done with the slices yet.
# NOTE(review): KFold, X and y are not defined in the cells visible here --
# confirm they exist (and that X / y are row-aligned with train_df) before running.
kf = KFold(n_splits=5, random_state=33, shuffle=True)
for train_index, test_index in tqdm(kf.split(train_df)):
    X_ , X_test = X[train_index], X[test_index]
    y_ , y_test = y[train_index], y[test_index]
    
    

In [None]:
# LightGBM hyper-parameters, passed as keyword arguments to LGBMClassifier.
lgb_params = {
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'max_depth': 50,
    'learning_rate': 0.10,
    'n_estimators': 100000,
    'reg_alpha': 0.1,
    'seed': 42,
    'nthread': -1,
}

# Built at top-level indentation with the LGBMClassifier name that the notebook
# imports from lightgbm; no `lgb` module alias is imported anywhere above.
clf = LGBMClassifier(**lgb_params)