In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import random
import seaborn as sns

from pycaret.classification import *
from sklearn.metrics import log_loss

In [2]:
seed = 42
np.random.seed(seed)

In [3]:
train = pd.read_csv('input/train.csv', index_col=['index'])
test = pd.read_csv('input/test.csv', index_col=['index'])
submission = pd.read_csv('input/submission.csv', index_col=['index'])

train.shape, test.shape, submission.shape

((26457, 19), (10000, 18), (10000, 3))

## null 처리 

In [None]:
print(8171/train.shape[0]*100)
#display(train.isna().sum())

print(3152/test.shape[0]*100)
#display(test.isna().sum())

In [4]:
train.drop(['occyp_type'], axis=1, inplace=True)
test.drop(['occyp_type'], axis=1, inplace=True)
train.shape, test.shape

((26457, 18), (10000, 17))

In [None]:
train.FLAG_MOBIL.value_counts(), test.FLAG_MOBIL.value_counts()

In [5]:
train.drop(['FLAG_MOBIL'], axis=1, inplace=True)
test.drop(['FLAG_MOBIL'], axis=1, inplace=True)
train.shape, test.shape

((26457, 17), (10000, 16))

In [None]:
train.head()

In [None]:
train['gender'] = train['gender'].replace(['F','M'],[0,1])
train['car'] = train['car'].replace(['N','Y'],[0,1])
train['reality'] = train['reality'].replace(['N','Y'],[0,1])

test['gender'] = test['gender'].replace(['F','M'],[0,1])
test['car'] = test['car'].replace(['N','Y'],[0,1])
test['reality'] = test['reality'].replace(['N','Y'],[0,1])

In [None]:
from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder()

In [None]:
train['income_type']=label_encoder.fit_transform(train['income_type'])
train['edu_type']=label_encoder.fit_transform(train['edu_type'])
train['family_type']=label_encoder.fit_transform(train['family_type'])
train['house_type']=label_encoder.fit_transform(train['house_type'])

test['income_type']=label_encoder.fit_transform(test['income_type'])
test['edu_type']=label_encoder.fit_transform(test['edu_type'])
test['family_type']=label_encoder.fit_transform(test['family_type'])
test['house_type']=label_encoder.fit_transform(test['house_type'])

In [None]:
train.loc[train.child_num > 0, 'child_num'] = 1
test.loc[test.child_num > 0, 'child_num'] = 1

In [None]:
train['DAYS_BIRTH'] = list(map(int, -train.DAYS_BIRTH/365))
test['DAYS_BIRTH'] = list(map(int, -test.DAYS_BIRTH/365))

In [None]:
train['EMPLOYED_YN'] = '1'
test['EMPLOYED_YN'] = '1'
train.loc[train.DAYS_EMPLOYED == 365243, 'EMPLOYED_YN'] = 0
test.loc[test.DAYS_EMPLOYED == 365243, 'EMPLOYED_YN'] = 0

train.loc[train.DAYS_EMPLOYED == 365243, 'DAYS_EMPLOYED'] = 0
test.loc[test.DAYS_EMPLOYED == 365243, 'DAYS_EMPLOYED'] = 0

In [None]:
train['DAYS_EMPLOYED'] = list(map(int, -train.DAYS_EMPLOYED/365))
test['DAYS_EMPLOYED'] = list(map(int, -test.DAYS_EMPLOYED/365))

In [None]:
train['begin_month'] = - train.begin_month/12
test['begin_month'] = - train.begin_month/12

In [None]:
train.head()

In [None]:
train.corr()

In [None]:
sns.heatmap(train.corr())

In [20]:
from sklearn.decomposition import PCA

pca = PCA(n_components=1)

train['family'] = pca.fit_transform(train[['child_num','family_size']])
test['family'] = pca.fit_transform(test[['child_num','family_size']])

## 범주형 자료 

In [6]:
col = ['gender','car','reality','phone','email','work_phone'\
      ,'income_type','edu_type','family_type','house_type','begin_month'\
      #,'child_num','EMPLOYED_YN'
      ]
#train = pd.get_dummies(train, columns=col)
#test = pd.get_dummies(test, columns=col)
#train.shape, test.shape

In [None]:
train.columns

In [25]:
%%time
clf = setup(session_id=seed, 
            data = train, target = 'credit'
          , categorical_features=col
          , numeric_features = ['family']#,,'tp17','tp4','Q_A15','Q_A3','Q_A15tp17','Q_A3tp4']#['Q_A1','Q_A3','Q_A5','tp17','tp4','Q_A15tp17','Q_A3tp4']
          , ignore_features =['child_num','family_size']
           )

Unnamed: 0,Description,Value
0,session_id,42
1,Target,credit
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(26457, 18)"
5,Missing Values,False
6,Numeric Features,4
7,Categorical Features,11
8,Ordinal Features,False
9,High Cardinality Features,False


Wall time: 5.31 s


In [26]:
#logloss 적용 및 predict_proba로 제출하기 위해 metric 추가
add_metric('logloss', 'LogLoss', log_loss, greater_is_better=False, target="pred_proba")

Name                                                           LogLoss
Display Name                                                   LogLoss
Score Function               <function log_loss at 0x000001E0147750D8>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [27]:
#best5 = compare_models(fold = 5, sort = 'logloss', n_select = 5, exclude=['svm','ridge'])
best_4 = compare_models(sort = 'LogLoss', n_select = 4
                       ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda'\
                                 ,'et','ada','lda']
                       #,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', 'xgboost']
                       )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
catboost,CatBoost Classifier,0.703,0.7004,0.4357,0.6903,0.6326,0.2566,0.3302,0.7682,12.382
lightgbm,Light Gradient Boosting Machine,0.697,0.6883,0.417,0.6965,0.6156,0.2229,0.3127,0.7747,0.37
gbc,Gradient Boosting Classifier,0.6923,0.6446,0.4063,0.6443,0.6052,0.203,0.2975,0.7998,4.768
rf,Random Forest Classifier,0.7095,0.7349,0.4867,0.6838,0.6669,0.3237,0.3591,0.8143,1.54


In [28]:
blended = blend_models(estimator_list = best_4, fold = 5, optimize = 'logloss')
pred_holdout = predict_model(blended)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,0.7025,0.7171,0.4269,0.7173,0.6256,0.244,0.3295,0.7538
1,0.6987,0.7086,0.4223,0.6838,0.6209,0.2325,0.3174,0.7613
2,0.6974,0.728,0.4207,0.7003,0.6184,0.2276,0.3137,0.7549
3,0.7014,0.7213,0.423,0.6922,0.6221,0.2382,0.3282,0.7539
4,0.7011,0.7241,0.4222,0.7038,0.6217,0.2368,0.3248,0.7521
Mean,0.7002,0.7198,0.423,0.6995,0.6218,0.2358,0.3227,0.7552
SD,0.0019,0.0066,0.0021,0.0113,0.0023,0.0055,0.0062,0.0032


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,Voting Classifier,0.6946,0.7328,0.4251,0.6896,0.6158,0.2375,0.323,0.7549


In [11]:
final_model = finalize_model(blended)

In [12]:
predictions = predict_model(final_model, data = test)

In [13]:
prep_pipe = get_config("prep_pipe")
prep_pipe.steps.append(['trained_model', final_model])
prections = prep_pipe.predict_proba(test)
prections

array([[0.06957352, 0.15107483, 0.77935165],
       [0.1554555 , 0.14799151, 0.69655299],
       [0.10375302, 0.17195439, 0.72429259],
       ...,
       [0.04002814, 0.1048193 , 0.85515256],
       [0.10982675, 0.20106746, 0.68910579],
       [0.09365319, 0.25267882, 0.65366799]])

In [14]:
submission['0'] = prections[:,0]
submission['1'] = prections[:,1]
submission['2'] = prections[:,2]
submission

Unnamed: 0_level_0,0,1,2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26457,0.069574,0.151075,0.779352
26458,0.155456,0.147992,0.696553
26459,0.103753,0.171954,0.724293
26460,0.136337,0.144247,0.719415
26461,0.113320,0.229886,0.656794
...,...,...,...
36452,0.154237,0.191424,0.654339
36453,0.148129,0.261754,0.590116
36454,0.040028,0.104819,0.855153
36455,0.109827,0.201067,0.689106


In [15]:
submission.to_csv('output/20210406-2-1.csv')