In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import random
import seaborn as sns

from pycaret.classification import *
from sklearn.metrics import log_loss

In [2]:
# Single seed shared by every source of randomness in this notebook so that
# a Restart & Run All reproduces the same results.
seed = 42
random.seed(seed)     # Python's built-in RNG (`random` is imported in the first cell)
np.random.seed(seed)  # NumPy's legacy global RNG

In [3]:
# Read the three input files; in each of them the `index` column becomes the
# DataFrame index.
csv_options = {'index_col': ['index']}
train = pd.read_csv('input/train.csv', **csv_options)
test = pd.read_csv('input/test.csv', **csv_options)
submission = pd.read_csv('input/submission.csv', **csv_options)

# Last expression of the cell: (rows, columns) of each frame.
train.shape, test.shape, submission.shape

((26457, 19), (10000, 18), (10000, 3))

## null 처리 

In [None]:
# Percentage of rows whose `occyp_type` is missing, for train and then test.
# The null counts are computed from the data instead of being typed in by
# hand (the original literals were 8171 and 3152, presumably copied from the
# `isna().sum()` output below), so the figures cannot go stale.
# NOTE: this cell has to run before the cell that drops `occyp_type`.
print(train['occyp_type'].isna().sum() / train.shape[0] * 100)
#display(train.isna().sum())

print(test['occyp_type'].isna().sum() / test.shape[0] * 100)
#display(test.isna().sum())

In [5]:
# Remove the `occyp_type` column from both frames (in place), then report
# the resulting shapes as the cell output.
for frame in (train, test):
    frame.drop(columns=['occyp_type'], inplace=True)

train.shape, test.shape

((26457, 17), (10000, 16))

In [4]:
# Show how FLAG_MOBIL is distributed in each frame before dropping it.
# A bare expression is only rendered when it is the LAST statement of a cell,
# so the counts are passed to `display` (provided by the IPython kernel)
# explicitly; previously they were computed and silently discarded.
display(train.FLAG_MOBIL.value_counts(), test.FLAG_MOBIL.value_counts())

# Drop the column from both frames in place and report the new shapes.
train.drop(['FLAG_MOBIL'], axis=1, inplace=True)
test.drop(['FLAG_MOBIL'], axis=1, inplace=True)
train.shape, test.shape

((26457, 18), (10000, 17))

In [7]:
# Overwrite every DAYS_EMPLOYED entry equal to 365243 with 0, in both frames.
# NOTE(review): 365243 looks like a placeholder rather than a real day
# count -- confirm against the dataset description.
for frame in (train, test):
    placeholder_rows = frame['DAYS_EMPLOYED'].eq(365243)
    frame.loc[placeholder_rows, 'DAYS_EMPLOYED'] = 0

In [15]:
# Flip the sign of DAYS_EMPLOYED in both frames (the raw values shown in the
# saved `train.head()` output are negative).
train['DAYS_EMPLOYED'] = -train['DAYS_EMPLOYED']
test['DAYS_EMPLOYED'] = -test['DAYS_EMPLOYED']

In [20]:
# Replace DAYS_EMPLOYED with log(1 + DAYS_EMPLOYED) in both frames.
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].pipe(np.log1p)
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].pipe(np.log1p)

In [6]:
# Preview the first five training rows.
# NOTE(review): the saved output under this cell still shows raw negative
# DAYS_EMPLOYED values, i.e. it was produced before the DAYS_EMPLOYED cells
# above were executed; re-run the notebook top to bottom to refresh it.
train.head(n=5)

Unnamed: 0_level_0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,family_size,begin_month,credit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,0,0,0,2.0,-6.0,1.0
1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,0,0,1,3.0,-5.0,1.0
2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,0,1,0,2.0,-22.0,2.0
3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,0,1,0,2.0,-37.0,0.0
4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,0,0,0,2.0,-26.0,2.0


In [None]:
# Two-valued string columns and the integer code assigned to each value.
binary_codes = {
    'gender': {'F': 0, 'M': 1},
    'car': {'N': 0, 'Y': 1},
    'reality': {'N': 0, 'Y': 1},
}

# Apply the same substitution to train and test; values that are not listed
# in the mapping are left untouched by `replace`.
for frame in (train, test):
    for column, codes in binary_codes.items():
        frame[column] = frame[column].replace(codes)

In [None]:
from sklearn import preprocessing

# One LabelEncoder instance, reused (re-fitted) for every column that the
# following cell encodes.
label_encoder = preprocessing.LabelEncoder()

In [None]:
# Integer-encode the multi-valued categorical columns.
#
# The encoder is fitted ONCE per column, on the values of train and test
# together, and that single fit is then used to transform both frames.
# Fitting it separately on each frame (as was done before) assigns codes
# from each frame's own sorted set of labels, so the same category can get
# a different integer in train and in test whenever the two label sets
# differ -- which silently corrupts predictions on test.
for column in ['income_type', 'edu_type', 'family_type', 'house_type']:
    label_encoder.fit(pd.concat([train[column], test[column]]))
    train[column] = label_encoder.transform(train[column])
    test[column] = label_encoder.transform(test[column])

In [None]:
# Collapse child_num to a 0/1 flag: any positive count becomes 1, other
# values are left as they are.
for frame in (train, test):
    has_children = frame['child_num'] > 0
    frame.loc[has_children, 'child_num'] = 1

In [None]:
# Convert DAYS_BIRTH to whole years: negate, divide by 365 and truncate
# toward zero with int().
train['DAYS_BIRTH'] = [int(years) for years in -train['DAYS_BIRTH'] / 365]
test['DAYS_BIRTH'] = [int(years) for years in -test['DAYS_BIRTH'] / 365]

In [None]:
# Negate DAYS_EMPLOYED, divide by 365 and truncate toward zero with int().
# NOTE(review): earlier cells already flip the sign of DAYS_EMPLOYED and then
# apply np.log1p to it. When the notebook is run top to bottom this cell
# therefore operates on the log-transformed values, not on raw day counts --
# confirm which of the two DAYS_EMPLOYED treatments is intended and remove
# the other.
train['DAYS_EMPLOYED'] = [int(years) for years in -train['DAYS_EMPLOYED'] / 365]
test['DAYS_EMPLOYED'] = [int(years) for years in -test['DAYS_EMPLOYED'] / 365]

In [None]:
# Rescale begin_month: negate and divide by 12, independently per frame.
# The test column must be derived from test's own values. It was previously
# computed from `train.begin_month`, which at that point had already been
# rescaled and is indexed by train's row labels, so the assignment could not
# line up with test's rows.
train['begin_month'] = -train['begin_month'] / 12
test['begin_month'] = -test['begin_month'] / 12

In [None]:
# Preview the first five training rows after the transformations above.
train.head(n=5)

In [None]:
# Pairwise Pearson correlation (the pandas default) between training columns.
train.corr(method='pearson')

In [None]:
# Draw the training-set correlation matrix as a heatmap.
corr_matrix = train.corr()
sns.heatmap(corr_matrix)

## 범주형 자료 

In [21]:
# Columns that the PyCaret `setup` cell receives as `categorical_features`.
col = [
    'gender', 'car', 'reality', 'phone', 'email', 'work_phone',
    'income_type', 'edu_type', 'family_type', 'house_type', 'begin_month',
    # 'child_num', 'EMPLOYED_YN',
]

# Manual one-hot encoding alternative, currently disabled:
#train = pd.get_dummies(train, columns=col)
#test = pd.get_dummies(test, columns=col)
#train.shape, test.shape

In [22]:
# List the column labels currently present in the training frame (the
# features plus the `credit` target) as a check before modelling.
train.columns

Index(['gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
       'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'work_phone', 'phone', 'email', 'family_size', 'begin_month', 'credit'],
      dtype='object')

In [23]:
%%time
clf = setup(session_id=seed, 
            data = train, target = 'credit'
          , categorical_features=col
          , numeric_features = ['child_num','family_size']#,,'tp17','tp4','Q_A15','Q_A3','Q_A15tp17','Q_A3tp4']#['Q_A1','Q_A3','Q_A5','tp17','tp4','Q_A15tp17','Q_A3tp4']
          #, ignore_features =[]
           )

Unnamed: 0,Description,Value
0,session_id,42
1,Target,credit
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(26457, 17)"
5,Missing Values,False
6,Numeric Features,5
7,Categorical Features,11
8,Ordinal Features,False
9,High Cardinality Features,False


Wall time: 3.23 s


In [24]:
# Register log loss as a custom metric so that models can be ranked by it
# and so that it matches a submission made with predict_proba output.
# It is scored on predicted probabilities, and lower values are better.
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    target="pred_proba",
    greater_is_better=False,
)

Name                                                           LogLoss
Display Name                                                   LogLoss
Score Function               <function log_loss at 0x0000022624EE50D8>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [25]:
# Model ids left out of the comparison.
excluded_models = [
    'lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge', 'qda',
    'et', 'ada', 'lda',
]

# Rank the remaining candidates by the custom LogLoss metric and keep the
# four best as a list.
best_4 = compare_models(sort='LogLoss', n_select=4, exclude=excluded_models)

# Earlier variants that were tried:
#best5 = compare_models(fold = 5, sort = 'logloss', n_select = 5, exclude=['svm','ridge'])
#exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'gpc', 'mlp', 'ridge', 'rf', 'qda', 'ada', 'xgboost']

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
catboost,CatBoost Classifier,0.7025,0.6983,0.4357,0.6936,0.6321,0.255,0.3287,0.7698,12.278
lightgbm,Light Gradient Boosting Machine,0.6987,0.6904,0.419,0.7035,0.6179,0.2283,0.3182,0.7734,0.378
gbc,Gradient Boosting Classifier,0.6924,0.6449,0.4066,0.6468,0.6054,0.2033,0.2979,0.7998,4.183
rf,Random Forest Classifier,0.7093,0.7375,0.4892,0.6835,0.6682,0.3265,0.3597,0.8069,1.273


In [26]:
# Blend the selected models, evaluated with 5-fold cross-validation and
# optimised for the custom log-loss metric.
blended = blend_models(
    estimator_list=best_4,
    fold=5,
    optimize='logloss',
)

# Score the blend on the hold-out split that `setup` set aside.
pred_holdout = predict_model(blended)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,0.7028,0.7171,0.4273,0.7065,0.6261,0.2452,0.3304,0.7532
1,0.6971,0.7084,0.4206,0.6798,0.619,0.2289,0.3115,0.7628
2,0.6976,0.7321,0.4203,0.6967,0.6183,0.2275,0.315,0.7523
3,0.7011,0.722,0.424,0.6903,0.6228,0.2391,0.3268,0.7536
4,0.7013,0.7286,0.4221,0.7043,0.6217,0.2371,0.3259,0.7491
Mean,0.7,0.7216,0.4229,0.6955,0.6216,0.2356,0.3219,0.7542
SD,0.0022,0.0084,0.0026,0.0097,0.0028,0.0066,0.0073,0.0046


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,Voting Classifier,0.6954,0.7339,0.4264,0.698,0.617,0.24,0.3252,0.7545


In [27]:
# Produce the final version of the blended model for use on the test set.
final_model = finalize_model(estimator=blended)

In [28]:
# Run the finalized model on the test frame through PyCaret's own
# prediction helper.
predictions = predict_model(estimator=final_model, data=test)

In [29]:
import copy

# Preprocessing pipeline that `setup` built for this experiment.
prep_pipe = get_config("prep_pipe")

# Build a scoring pipeline = preprocessing steps + the finalized model.
# The model is attached to a shallow COPY with its own step list rather than
# appended to `prep_pipe.steps` directly: appending in place modified the
# object returned by `get_config` and added one more 'trained_model' step
# every time this cell was re-run.
scoring_pipe = copy.copy(prep_pipe)
scoring_pipe.steps = list(prep_pipe.steps) + [('trained_model', final_model)]

# Class probabilities for every test row, one column per class.
# (The misspelled name `prections` is kept because the next cell reads it.)
prections = scoring_pipe.predict_proba(test)
prections

array([[0.07050165, 0.15956567, 0.76993267],
       [0.16365959, 0.14809718, 0.68824324],
       [0.08065095, 0.17186751, 0.74748154],
       ...,
       [0.040429  , 0.12597437, 0.83359663],
       [0.11764448, 0.19276428, 0.68959123],
       [0.10452126, 0.24076152, 0.65471722]])

In [30]:
# Copy each probability column into the submission column with the matching
# label ('0', '1', '2'), then show the frame.
for class_idx, class_label in enumerate(('0', '1', '2')):
    submission[class_label] = prections[:, class_idx]

submission

Unnamed: 0_level_0,0,1,2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26457,0.070502,0.159566,0.769933
26458,0.163660,0.148097,0.688243
26459,0.080651,0.171868,0.747482
26460,0.130709,0.132299,0.736992
26461,0.114025,0.216574,0.669401
...,...,...,...
36452,0.160151,0.223049,0.616800
36453,0.169215,0.248977,0.581808
36454,0.040429,0.125974,0.833597
36455,0.117644,0.192764,0.689591


In [31]:
# Write the submission file (the index is included in the CSV).
submission.to_csv('output/20210406-3.csv')

# Generate a short spoken "finished" notification and save it as an MP3 for
# the following cell to play.
from gtts import gTTS

text = "Finish Finish Finish Finish Finish Finish Finish Finish"
tts = gTTS(text=text, lang='en')
tts.save("helloEN.mp3")

In [12]:
from IPython.display import Audio

# Play the notification clip written by the previous cell as soon as this
# cell's output is rendered.
sound_file = "helloEN.mp3"
Audio(sound_file, autoplay=True)