In [1]:
import copy
import os
import random
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')
%matplotlib inline

from pycaret.classification import *
from sklearn.metrics import log_loss

In [3]:
# Single source of truth for every random seed used in this notebook
# (also passed to PyCaret as `session_id`).
seed = 42

# Seed both global generators: NumPy's legacy global RNG and the stdlib
# `random` module (imported above but previously left unseeded).
np.random.seed(seed)
random.seed(seed)

In [4]:
# All three CSVs carry an `index` column that is used as the row index.
read_options = dict(index_col=['index'])

train = pd.read_csv('input/train.csv', **read_options)
test = pd.read_csv('input/test.csv', **read_options)
submission = pd.read_csv('input/submission.csv', **read_options)

# Show the (rows, columns) of each frame as a quick sanity check.
train.shape, test.shape, submission.shape

((26457, 19), (10000, 18), (10000, 3))

## null 처리 

In [5]:
# For each split, print the percentage of rows with a missing `occyp_type`
# and then show the per-column null counts.
# The counts are computed from the data rather than hard-coded, so the
# percentages stay correct if the input files change.
for split in (train, test):
    null_counts = split.isna().sum()
    # `.get(..., 0)` keeps the cell runnable even after `occyp_type` has been dropped.
    print(null_counts.get('occyp_type', 0) / split.shape[0] * 100)
    display(null_counts)

30.884076047926822


gender              0
car                 0
reality             0
child_num           0
income_total        0
income_type         0
edu_type            0
family_type         0
house_type          0
DAYS_BIRTH          0
DAYS_EMPLOYED       0
FLAG_MOBIL          0
work_phone          0
phone               0
email               0
occyp_type       8171
family_size         0
begin_month         0
credit              0
dtype: int64

31.52


gender              0
car                 0
reality             0
child_num           0
income_total        0
income_type         0
edu_type            0
family_type         0
house_type          0
DAYS_BIRTH          0
DAYS_EMPLOYED       0
FLAG_MOBIL          0
work_phone          0
phone               0
email               0
occyp_type       3152
family_size         0
begin_month         0
dtype: int64

In [6]:
# Remove `occyp_type` from both splits instead of imputing it.
# `errors='ignore'` plus rebinding (rather than `inplace=True`) makes the cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
train = train.drop(columns=['occyp_type'], errors='ignore')
test = test.drop(columns=['occyp_type'], errors='ignore')
train.shape, test.shape

((26457, 18), (10000, 17))

## 범주형 자료 

In [7]:
# Columns to be treated as categorical features; this list is handed to
# PyCaret's `setup(categorical_features=...)` further down.
col = [
    'gender',
    'car',
    'reality',
    'phone',
    'email',
    'work_phone',
    'income_type',
    'edu_type',
    'family_type',
    'house_type',
    'begin_month',
]
# Manual one-hot encoding is intentionally disabled; kept for reference only:
# train = pd.get_dummies(train, columns=col)
# test = pd.get_dummies(test, columns=col)
# train.shape, test.shape

In [8]:
# Sanity check: list the columns left in `train` after `occyp_type` was dropped.
train.columns

Index(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'FLAG_MOBIL', 'work_phone', 'phone', 'email', 'family_size',
       'begin_month', 'credit'],
      dtype='object')

In [10]:
%%time
clf = setup(session_id=seed, 
            data = train, target = 'credit'
          , categorical_features=col
          , numeric_features = ['child_num','family_size']#,,'tp17','tp4','Q_A15','Q_A3','Q_A15tp17','Q_A3tp4']#['Q_A1','Q_A3','Q_A5','tp17','tp4','Q_A15tp17','Q_A3tp4']
          #, ignore_features =[]
           )

Unnamed: 0,Description,Value
0,session_id,42
1,Target,credit
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(26457, 18)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,12
8,Ordinal Features,False
9,High Cardinality Features,False


Wall time: 4.73 s


In [11]:
# Register scikit-learn's log_loss as a custom PyCaret metric so models can be
# ranked by it; it is computed on predicted probabilities (`pred_proba`), which
# is also what the submission requires. Lower is better.
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    target="pred_proba",
    greater_is_better=False,
)

Name                                                           LogLoss
Display Name                                                   LogLoss
Score Function               <function log_loss at 0x000002437EAA8558>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [13]:
# Model IDs left out of the comparison.
excluded_models = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge', 'qda']

# Rank the remaining candidates by the custom LogLoss metric and keep the
# three best for blending.
best_3 = compare_models(
    exclude=excluded_models,
    sort='LogLoss',
    n_select=3,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
catboost,CatBoost Classifier,0.7024,0.6985,0.436,0.6953,0.6321,0.2547,0.3284,0.7702,12.688
lightgbm,Light Gradient Boosting Machine,0.6974,0.6917,0.4169,0.695,0.6158,0.224,0.3141,0.7727,0.371
rf,Random Forest Classifier,0.7095,0.7362,0.4875,0.6832,0.6677,0.3259,0.3598,0.799,1.179
gbc,Gradient Boosting Classifier,0.6922,0.6443,0.4065,0.6443,0.6053,0.2033,0.2969,0.7998,4.555
lda,Linear Discriminant Analysis,0.6925,0.6156,0.4099,0.6113,0.6086,0.2109,0.2959,0.8174,0.171
ada,Ada Boost Classifier,0.6924,0.6255,0.4069,0.6227,0.6058,0.2044,0.2974,1.0815,0.482
et,Extra Trees Classifier,0.6789,0.6909,0.4686,0.6402,0.6432,0.2784,0.2964,2.0511,1.886


In [16]:
# Combine the three selected models into one blended estimator, evaluated with
# 3-fold cross-validation and optimised for the custom `logloss` metric.
blended = blend_models(
    estimator_list=best_3,
    fold=3,
    optimize='logloss',
)

# Score the blended model with no explicit data argument (the second results
# table shown beneath this cell).
pred_holdout = predict_model(blended)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,0.7016,0.7106,0.4346,0.6822,0.6315,0.2544,0.3255,0.7584
1,0.7035,0.7148,0.4383,0.695,0.6342,0.2572,0.3329,0.7568
2,0.7027,0.7184,0.4347,0.6888,0.6317,0.2563,0.3296,0.7524
Mean,0.7026,0.7146,0.4359,0.6887,0.6325,0.256,0.3293,0.7559
SD,0.0008,0.0032,0.0017,0.0052,0.0012,0.0012,0.003,0.0025


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,Voting Classifier,0.7016,0.7364,0.4437,0.6949,0.6326,0.2689,0.3433,0.7468


In [17]:
# Pass the blended ensemble through PyCaret's finalize step; `final_model` is
# the estimator used for all test-set predictions in the following cells.
# NOTE(review): per the PyCaret docs, finalize_model refits on the full data
# set including the hold-out split — confirm for the installed version.
final_model = finalize_model(blended)

In [18]:
# Score the test set through PyCaret's own prediction helper.
# NOTE(review): `predictions` is not referenced again in the visible cells; the
# submission is built from the `predict_proba` output (`prections`) instead.
predictions = predict_model(final_model, data = test)

In [29]:
# Build a standalone scoring pipeline: PyCaret's fitted preprocessing steps
# followed by the finalized model, so raw test rows can be turned into class
# probabilities.
# The pipeline is deep-copied first. Appending to the object returned by
# get_config() would mutate PyCaret's global preprocessing pipeline, and
# re-running this cell would keep stacking extra 'trained_model' steps onto it.
prep_pipe = copy.deepcopy(get_config("prep_pipe"))
prep_pipe.steps.append(['trained_model', final_model])

# One row per test sample, one column of probabilities per class.
prections = prep_pipe.predict_proba(test)
prections

array([[0.05234324, 0.15701009, 0.79064667],
       [0.15888352, 0.1723765 , 0.66873999],
       [0.09759569, 0.14824225, 0.75416207],
       ...,
       [0.03087814, 0.13726961, 0.83185225],
       [0.11777228, 0.18800716, 0.69422056],
       [0.07832889, 0.27659719, 0.64507392]])

In [36]:
# Copy each probability column into the matching submission column
# ('0', '1', '2' map to columns 0, 1, 2 of the predicted-probability array).
for class_idx, class_label in enumerate(('0', '1', '2')):
    submission[class_label] = prections[:, class_idx]
submission

Unnamed: 0_level_0,0,1,2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26457,0.052343,0.157010,0.790647
26458,0.158884,0.172376,0.668740
26459,0.097596,0.148242,0.754162
26460,0.134136,0.141764,0.724100
26461,0.127133,0.236350,0.636517
...,...,...,...
36452,0.146825,0.246160,0.607015
36453,0.171459,0.239252,0.589289
36454,0.030878,0.137270,0.831852
36455,0.117772,0.188007,0.694221


In [38]:
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing the submission file.
os.makedirs('output', exist_ok=True)
submission.to_csv('output/20210405-1.csv')