In [1]:
# Evaluation function
def get_model_score(y_true, X_test, pipeline, score_type):
    """Score a fitted pipeline's predictions on a hold-out set.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels corresponding to ``X_test``.
    X_test : array-like
        Features handed to ``pipeline.predict``.
    pipeline : object
        Fitted estimator or pipeline exposing ``predict``.
    score_type : str
        One of 'roc_auc', 'f1', 'precision', 'accuracy', 'recall'.

    Returns
    -------
    The value returned by the selected scikit-learn metric function.

    Raises
    ------
    ValueError
        If ``score_type`` is not one of the supported names.
    """
    supported = ('roc_auc', 'f1', 'precision', 'accuracy', 'recall')
    # Fail fast with a clear message; previously an unknown name fell through
    # every branch and crashed on an unbound local at the return statement.
    if score_type not in supported:
        raise ValueError(
            'unsupported score_type %r; expected one of %s'
            % (score_type, ', '.join(supported))
        )

    y_pred = pipeline.predict(X_test)

    # NOTE: every metric, roc_auc included, is computed from the hard
    # predictions returned by predict(), not from probabilities/scores.
    scorers = {
        'roc_auc': roc_auc_score,
        'f1': f1_score,
        'precision': precision_score,
        'accuracy': accuracy_score,
        'recall': recall_score,
    }
    return scorers[score_type](y_true, y_pred)

In [2]:
# import basic apis
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.externals import joblib


# import data
df = pd.read_csv('final_hr_analysis_train.csv', header=0)
df_s = pd.read_csv('final_hr_analysis_test.csv', header=0)
# For modeling: features are the columns from the third one onward,
# the target is the second column (kept as a one-column DataFrame)
X = df.iloc[:,2:]
y = df.iloc[:,[1]]
# For scoring: first column is kept as the ID, features from the third column onward
ID = df_s.iloc[:,[0]]
X_s = df_s.iloc[:,2:]

# One-hot encoding (dummy_na=True also adds an indicator column for missing values)
ohe_columns = ['sales', 'salary']
X_new = pd.get_dummies(X, dummy_na=True, columns=ohe_columns)
X_new_s = pd.get_dummies(X_s, dummy_na=True, columns=ohe_columns)
# The two frames are encoded independently, so a category present in only one
# file would yield different dummy columns. Align the scoring frame to the
# training layout: absent columns are filled with 0, extra columns are dropped,
# and the column order matches what the models are fitted on.
# NOTE(review): this also zero-fills any non-dummy feature missing from the
# scoring file -- confirm both files share the same raw feature columns.
X_new_s = X_new_s.reindex(columns=X_new.columns, fill_value=0)

# Hold-out: 20% test, then 20% of the remainder as validation
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.20, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=1)

# Candidate classifiers; each one is wrapped in a pipeline that standardizes
# the features ('scl') before the estimator step ('est').
def _scaled(estimator):
    """Return a two-step pipeline: StandardScaler followed by ``estimator``."""
    return Pipeline([('scl', StandardScaler()), ('est', estimator)])


pipe_knn = _scaled(KNeighborsClassifier())
pipe_logistic = _scaled(LogisticRegression(random_state=1))
pipe_rf = _scaled(RandomForestClassifier(random_state=1))
pipe_gbc = _scaled(GradientBoostingClassifier(random_state=1))
pipe_mlpc = _scaled(MLPClassifier(max_iter=500, random_state=1))
pipe_svc = _scaled(LinearSVC(random_state=1))

# Display name -> pipeline, in the order the models are fitted and compared
pipe_dict = {
    'KNN': pipe_knn,
    'Logistic': pipe_logistic,
    'RandomForest': pipe_rf,
    'GradientBoosting': pipe_gbc,
    'MLP': pipe_mlpc,
    'SVC': pipe_svc,
}
# Metric names accepted from the user
score_list = ['roc_auc', 'f1', 'precision', 'accuracy', 'recall']
# Filled later with display name -> score
model_score = {}
    
# Ask the user which metric the candidate models should be ranked by
print("評価指標を入力してください : " , end = '')

# Strip surrounding whitespace so e.g. "accuracy " is still accepted
input_score = input().strip()

if input_score not in score_list:
    print("評価指標は以下から指定してください。")
    print("roc_auc、f1、precision、accuracy、recall")
else:
    # .values is available on every pandas version, whereas .as_matrix()
    # was deprecated and then removed in pandas 1.0.
    y_train_arr = y_train.values.ravel()
    y_test_arr = y_test.values.ravel()

    # Fit every candidate on the training split and score it on X_test.
    # NOTE(review): X_test is used here for model selection and is scored
    # again after tuning, so that later score is not an unbiased estimate.
    for (key, pipe) in pipe_dict.items():
        pipe.fit(X_train, y_train_arr)
        score = get_model_score(y_test_arr, X_test, pipe, input_score)
        model_score[key] = score

    # Ranking, best score first
    for i, (name, score) in enumerate(sorted(model_score.items(), key=lambda x: -x[1])):
        print(i, ':', '%s --> %.3f' % (name, score))

    # Pick the best-scoring model and cross-validate it on the training split
    # (nothing is saved to disk in this cell)
    key_at_max_score = max(model_score, key=model_score.get)
    print('-----------------------------------------------------')
    print('best model : ', key_at_max_score)
    print(pipe_dict[key_at_max_score])
    cv_results = cross_val_score(pipe_dict[key_at_max_score], X_train, y_train_arr, cv=10, scoring=input_score)
    print('-----------------------------------------------------')
    print(pd.DataFrame(cv_results).describe())
    print('-----------------------------------------------------')

評価指標を入力してください : accuracy
0 : RandomForest --> 0.979
1 : GradientBoosting --> 0.970
2 : MLP --> 0.958
3 : KNN --> 0.935
4 : Logistic --> 0.789
5 : SVC --> 0.773
-----------------------------------------------------
best model :  RandomForest
Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False))])
-----------------------------------------------------
               0
count  10.000000
mean    0.980804
std     0.006391
min     0.968796
25%     0.976183
50%     0.983607
75%     0.984736
max     0.988113
----------------------------------

In [3]:
# Parameter grid settings (written for tree-ensemble estimators)
param_grid = {'est__n_estimators':[100], 'est__max_depth':[2,4,6,8,10,12,14,16,18,20], 'est__n_jobs':[-1]}
# NOTE(review): `model` is not referenced by any visible cell -- candidate for removal
model = RandomForestClassifier()
# Training
pipe = pipe_dict[key_at_max_score]
# The pipeline is chosen dynamically, so keep only the grid entries it actually
# accepts; an unknown parameter name would make the search fail. For the
# random-forest pipeline the grid is unchanged.
supported_params = pipe.get_params()
param_grid = {name: values for name, values in param_grid.items() if name in supported_params}
best_estimator = []
print('----------------------------------------------------------------------------------------------')
print('探索空間:%s' % param_grid)
gs = GridSearchCV(estimator=pipe, param_grid=param_grid, scoring=input_score, cv=10)
# NOTE(review): the search is fitted on the validation split only (X_val / y_val).
# .values replaces .as_matrix(), which was removed in pandas 1.0.
gs = gs.fit(X_val, y_val.values.ravel())
best_estimator.append(gs.best_estimator_)
print('Best Score %.6f\n' % gs.best_score_)
print('Best Model: %s' % gs.best_estimator_)
# Persist the tuned pipeline under the winning model's name
joblib.dump(gs.best_estimator_, key_at_max_score + '.pkl')

----------------------------------------------------------------------------------------------
探索空間:{'est__n_estimators': [100], 'est__max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], 'est__n_jobs': [-1]}
Best Score 0.978571

Best Model: Pipeline(memory=None,
     steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('est', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=12, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False))])


['RandomForest.pkl']

In [4]:
# Reload the tuned pipeline under the same name it was dumped with
# (key_at_max_score + '.pkl') instead of a hard-coded file name, so this cell
# cannot pick up a stale file when a different model wins the comparison.
est = joblib.load(key_at_max_score + '.pkl')
# Accuracy on the held-out test split; .values replaces .as_matrix(),
# which was removed in pandas 1.0.
score = get_model_score(y_test.values.ravel(), X_test, est, 'accuracy')
print(score)

0.9723809523809523


In [5]:
# Load the tuned pipeline under the name it was saved with, rather than a
# hard-coded file name that is only right when the random forest wins.
est = joblib.load(key_at_max_score + '.pkl')
# Second predict_proba column, stored as 'prob'. Reusing ID's index makes the
# index-based join below line up row by row regardless of df_s's index.
y_pred = pd.DataFrame(est.predict_proba(X_new_s)[:,[1]], columns=['prob'], index=ID.index)
df_ans = ID.join(y_pred)

In [8]:
# Write the ID/probability table to the submission file without the index column
submission_path = 'aijc1182.csv'
df_ans.to_csv(submission_path, index=False)