In [104]:
import pandas as pd
import pandas_profiling
import os
import pickle
import gc
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
import lightgbm as lgb

#データ読み込み
train = pd.read_csv("data_EDA/train.csv")
test = pd.read_csv("data_EDA/test.csv")

from pytorch_tabnet.tab_model import TabNetClassifier
#https://www.kaggle.com/code/masaonda/titanic-how-to-use-tabnet/notebook

In [105]:
X_train = train[['DiabetesPedigreeFunction',
                 'BMI',
                 'Glucose',
                 'Age',
                 'Pregnancies',
                 'Pregnancies_bin',
                 'BloodPressure_na' ,
                 'BloodPressure', 
                 'SkinThickness',
                 'Insulin',
                 'Insulin_na',
                 ]]
id_train = train[['index']]
y_train = train[['Outcome']]

In [106]:
X_train['BloodPressure_na'] = X_train['BloodPressure_na'].astype(str)
X_train['Insulin_na'] = X_train['Insulin_na'].astype(str)
X_train['Pregnancies_bin'] = X_train['Pregnancies_bin'].astype(str)

In [120]:
BloodPressure_oh = pd.get_dummies(X_train['BloodPressure_na'],prefix='BP_oh')
Insulin_oh = pd.get_dummies(X_train['Insulin_na'],prefix='In_oh')
Pregnancies_bin_oh =pd.get_dummies(X_train['Pregnancies_bin'],prefix='Pre_oh')
BloodPressure_oh

Unnamed: 0,BP_oh_0,BP_oh_1
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0
...,...,...
2995,1,0
2996,1,0
2997,1,0
2998,1,0


In [108]:
scaling_columns = ['DiabetesPedigreeFunction',
                    'BMI',
                    'Glucose',
                    'Age',
                    'Pregnancies',
                    'BloodPressure', 
                    'SkinThickness',
                    'Insulin',
                 ]
std = StandardScaler()
std.fit(X_train[scaling_columns])
# 標準化したカラムのみ元のDataFrameに戻す
scaled_X_train = pd.DataFrame(std.transform(X_train[scaling_columns]), columns=scaling_columns, index=X_train.index)
X_train.update(scaled_X_train)

In [58]:
random_state = 123
params = {
    
    'random_state' : random_state,
    
    }


In [59]:
x_tr, x_va, y_tr, y_va = train_test_split(X_train,
                                          y_train,
                                          test_size=0.2,
                                          shuffle=True,
                                          stratify=y_train,
                                          random_state=random_state)


In [60]:
y_tr=np.squeeze(y_tr.values)
y_va=np.squeeze(y_va.values)

x_tr=x_tr.values
x_va=x_va.values

In [61]:
model = TabNetClassifier()
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr,y_tr),(x_va,y_va)],
          
         )

epoch 0  | loss: 0.71333 | val_0_auc: 0.42985 | val_1_auc: 0.48085 |  0:00:00s
epoch 1  | loss: 0.59412 | val_0_auc: 0.46189 | val_1_auc: 0.46627 |  0:00:00s
epoch 2  | loss: 0.56575 | val_0_auc: 0.48694 | val_1_auc: 0.47225 |  0:00:00s
epoch 3  | loss: 0.53255 | val_0_auc: 0.44495 | val_1_auc: 0.40373 |  0:00:00s
epoch 4  | loss: 0.52056 | val_0_auc: 0.46161 | val_1_auc: 0.39683 |  0:00:00s
epoch 5  | loss: 0.50049 | val_0_auc: 0.46271 | val_1_auc: 0.43251 |  0:00:00s
epoch 6  | loss: 0.50311 | val_0_auc: 0.45889 | val_1_auc: 0.44559 |  0:00:00s
epoch 7  | loss: 0.48966 | val_0_auc: 0.48419 | val_1_auc: 0.48539 |  0:00:00s
epoch 8  | loss: 0.48051 | val_0_auc: 0.50468 | val_1_auc: 0.53073 |  0:00:00s
epoch 9  | loss: 0.48362 | val_0_auc: 0.50451 | val_1_auc: 0.50539 |  0:00:00s
epoch 10 | loss: 0.47942 | val_0_auc: 0.49359 | val_1_auc: 0.48407 |  0:00:00s
epoch 11 | loss: 0.48047 | val_0_auc: 0.50979 | val_1_auc: 0.48814 |  0:00:00s
epoch 12 | loss: 0.47631 | val_0_auc: 0.51291 | val_

In [13]:
# cvでの評価用
def train_cv(input_x,
             input_y,
             input_id,
             params,
             random_state=123,
             n_splits=5
            ):
    
    metrics = []
    imp = pd.DataFrame()
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state).
              split(input_x, input_y))
    
    
    for nfold in np.arange(n_splits):
        print('-'*20, nfold, '-'*20)
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = input_x.loc[idx_tr, :], input_y.loc[idx_tr, :]
        x_va, y_va = input_x.loc[idx_va, :], input_y.loc[idx_va, :]
        print(x_tr.shape, y_tr.shape)
        print(x_va.shape, y_va.shape)
        print('y_train:{:.3f}, y_tr:{:.3f}, y_va{:.3f}'.
              format(y_train['Outcome'].mean(), y_tr['Outcome'].mean(), y_va['Outcome'].mean(),))

        
        model = TabNetClassifier()
        model.fit(x_tr.values,
                  y_tr.values,
                  eval_set=[(x_tr.values,y_tr.values),(x_va.values,y_va.values)],
                  
                 )

        y_tr_pred = model.predict(x_tr)
        y_va_pred = model.predict(x_va)
        metric_tr = accuracy_score(y_tr, y_tr_pred)
        metric_va = accuracy_score(y_va, y_va_pred)
        print('[accuracy] tr: {:.2f}, va: {:2f}'.
             format(metric_tr, metric_va))
        metrics.append([nfold, metric_tr, metric_va])

        _imp = pd.DataFrame({'col':input_x.columns, 'imp':model.feature_importances_,'nfold':nfold})
        imp = pd.concat([imp, _imp], axis=0, ignore_index=True)

    print('-'*20, 'result', '-'*20)
    metrics = np.array(metrics)
    print(metrics)

    print('[cv] tr: {:.2f}+-{:.2f}, va: {:.2f}'.format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std()
    ))

    imp = imp.groupby('col')['imp'].agg(['mean', 'std'])
    imp.columns = ['imp', 'imp_std']
    imp = imp.reset_index(drop=False)
    imp=imp.sort_values('imp', ascending=False, ignore_index=True)
    print('-'*20, 'imp', '-'*20)
    print(imp)

    print('Done')
    
    return imp, metrics

In [14]:
imp, metrics = train_cv(X_train, y_train, id_train, params, random_state=random_state, n_splits=4)

-------------------- 0 --------------------
(2250, 11) (2250, 1)
(750, 11) (750, 1)
y_train:0.239, y_tr:0.239, y_va0.240


ValueError: Input contains NaN.