In [13]:
import pandas as pd
import pandas_profiling
import os
import pickle
import gc
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import lightgbm as lgb

#データ読み込み
train = pd.read_csv("data_EDA/train.csv")
test = pd.read_csv("data_EDA/test.csv")

In [9]:
train

Unnamed: 0,index,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,200,9,125,74.0,27.141005,135.636364,28.536910,0.444902,45,1.0
1,3832,4,109,80.0,27.141005,135.636364,28.047673,0.238243,22,0.0
2,4927,4,88,78.0,39.000000,135.636364,52.371341,0.279471,26,0.0
3,4088,9,125,74.0,27.141005,135.636364,40.062688,0.203922,45,0.0
4,3644,5,107,78.0,44.000000,284.000000,52.935068,0.284959,45,1.0
...,...,...,...,...,...,...,...,...,...,...
2995,4931,4,88,74.0,17.000000,135.636364,33.848723,0.171073,23,0.0
2996,3264,0,144,88.0,27.141005,135.636364,26.846832,0.259957,21,1.0
2997,1653,6,117,96.0,36.000000,135.636364,28.101646,0.716126,22,1.0
2998,2607,2,113,74.0,27.141005,135.636364,33.079021,0.266179,38,1.0


In [28]:
# まずは少ない特徴量から検討
X_train = train[['DiabetesPedigreeFunction','BMI']]
id_train = train[['index']]
y_train = train[['Outcome']]

In [29]:
y_train

Unnamed: 0,Outcome
0,1.0
1,0.0
2,0.0
3,0.0
4,1.0
...,...
2995,0.0
2996,1.0
2997,1.0
2998,1.0


In [30]:
len(X_train)
len(y_train)

3000

ベースライン

In [58]:
params = {
    'boosting_type' :'gbdt',
    'objective' : 'binary',
    'metrics' : 'auc',
    'learning_rate' : 16,
    'random_state' : 123,
    'importance_type' : 'gain',
}

metrics = []
imp = pd.DataFrame()
random_state = 123
n_splits = 4
cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state).split(X_train, y_train))

In [59]:
for nfold in np.arange(n_splits):
    print('-'*20, nfold, '-'*20)
    idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
    x_tr, y_tr = X_train.loc[idx_tr, :], y_train.loc[idx_tr, :]
    x_va, y_va = X_train.loc[idx_va, :], y_train.loc[idx_va, :]
    print(x_tr.shape, y_tr.shape)
    print(x_va.shape, y_va.shape)
    print('y_train:{:.3f}, y_tr:{:.3f}, y_va{:.3f}'.
          format(y_train['Outcome'].mean(), y_tr['Outcome'].mean(), y_va['Outcome'].mean(),))
    
    model = lgb.LGBMClassifier(**params)
    model.fit(x_tr,
              y_tr,
              eval_set=[(x_tr,y_tr),(x_va,y_va)],
              early_stopping_rounds=100,
              verbose=100,
             )
    
    y_tr_pred = model.predict(x_tr)
    y_va_pred = model.predict(x_va)
    metric_tr = accuracy_score(y_tr, y_tr_pred)
    metric_va = accuracy_score(y_va, y_va_pred)
    print('[accuracy] tr: {:.2f}, va: {:2f}'.
         format(metric_tr, metric_va))
    metrics.append([nfold, metric_tr, metric_va])
    
    _imp = pd.DataFrame({'col':X_train.columns, 'imp':model.feature_importances_,'nfold':nfold})
    imp = pd.concat([imp, _imp], axis=0, ignore_index=True)
    
print('-'*20, 'result', '-'*20)
metrics = np.array(metrics)
print(metrics)

print('[cv] tr: {:.2f}+-{:.2f}, va: {:.2f}'.
     format(metrics[:,1].mean(), metrics[:,1].std(),
            metrics[:,2].mean(), metrics[:,2].std()))

imp = imp.groupby('col')['imp'].agg(['mean', 'std'])
imp.columns = ['imp', 'imp_std']
imp = imp.reset_index(drop=False)
imp.sort_values('imp', ascending=False, ignore_index=True)
print('-'*20, 'imp', '-'*20)
print(imp)

print('Done')

-------------------- 0 --------------------
(2250, 2) (2250, 1)
(750, 2) (750, 1)
y_train:0.239, y_tr:0.239, y_va0.240
[100]	training's auc: 0.600018	valid_1's auc: 0.594352
[accuracy] tr: 0.74, va: 0.694667
-------------------- 1 --------------------
(2250, 2) (2250, 1)
(750, 2) (750, 1)
y_train:0.239, y_tr:0.239, y_va0.239
[100]	training's auc: 0.551289	valid_1's auc: 0.549448
[accuracy] tr: 0.71, va: 0.658667
-------------------- 2 --------------------
(2250, 2) (2250, 1)
(750, 2) (750, 1)
y_train:0.239, y_tr:0.239, y_va0.239
[100]	training's auc: 0.57966	valid_1's auc: 0.551649
[accuracy] tr: 0.74, va: 0.694667
-------------------- 3 --------------------
(2250, 2) (2250, 1)
(750, 2) (750, 1)
y_train:0.239, y_tr:0.239, y_va0.239
[100]	training's auc: 0.465095	valid_1's auc: 0.424689
[accuracy] tr: 0.72, va: 0.654667
-------------------- result --------------------
[[0.         0.73777778 0.69466667]
 [1.         0.70755556 0.65866667]
 [2.         0.74444444 0.69466667]
 [3.        

## XGboost
https://qiita.com/c60evaporator/items/a9a049c3469f6b4872c6

In [66]:
from sklearn.model_selection import cross_val_score

パラメタ調整の場合

In [76]:
from xgboost import XGBClassifier
model = XGBClassifier(random_state=seed, n_estimators=10000)

fit_params = {'verbose': 0,  # 学習中のコマンドライン出力
              'early_stopping_rounds': 10,  # 学習時、評価指標がこの回数連続で改善しなくなった時点でストップ
              'eval_metric': 'logloss',  # early_stopping_roundsの評価指標
              'eval_set': [(X_train, y_train)]  # early_stopping_roundsの評価指標算出用データ
              }
scoring = 'neg_log_loss'
scores = cross_val_score(model, X_train, y_train, cv=kf,
                         scoring=scoring, n_jobs=-1, fit_params=fit_params)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [77]:
print(f'scores={scores}')
print(f'average_score={np.mean(scores)}')

scores=[-0.58192722 -0.55068123 -0.55545767 -0.56509035]
average_score=-0.5632891189373359
