In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import random
import seaborn as sns

from pycaret.classification import *
from sklearn.metrics import log_loss

In [2]:
# Single seed for the whole notebook; it is also passed to KMeans (random_state)
# and to the PyCaret session (session_id) further down.
seed = 42
np.random.seed(seed)
# The stdlib RNG is imported alongside NumPy, so seed it too for reproducible runs.
random.seed(seed)

In [3]:
# Read the three competition CSVs; each one uses its 'index' column as the row index.
csv_paths = ('input/train.csv', 'input/test.csv', 'input/submission.csv')
train, test, submission = (pd.read_csv(path, index_col=['index']) for path in csv_paths)

# Display the (rows, columns) shape of every frame.
tuple(frame.shape for frame in (train, test, submission))

((26457, 19), (10000, 18), (10000, 3))

In [4]:
# Stack train on top of test so both receive the same preprocessing;
# the two are separated again later via the missing `credit` values.
data = pd.concat((train, test))
data.shape

(36457, 19)

## null 처리 

In [5]:
# Remove the 'occyp_type' and 'FLAG_MOBIL' columns from the combined frame.
data = data.drop(columns=['occyp_type','FLAG_MOBIL'])
data.shape

(36457, 17)

In [6]:
# The value 365243 is replaced by 0 days; every value is then sign-flipped
# and compressed with log(1 + x).
# NOTE(review): 365243 looks like a "not employed" sentinel - confirm against the data description.
employed_days = data['DAYS_EMPLOYED'].where(data['DAYS_EMPLOYED'] != 365243, 0)
data['DAYS_EMPLOYED'] = np.log1p(-employed_days)

In [7]:
# Income: cap (not drop) only the upper outliers at the Tukey fence Q3 + 1.5 * IQR.
qua_25, qua_75 = np.percentile(data.income_total.values, [25, 75])
iqrW = 1.5 * (qua_75 - qua_25)

# The lower fence is computed and printed for reference only; it is never applied.
lowest_val, highest_val = qua_25 - iqrW, qua_75 + iqrW
print(lowest_val, highest_val)

# Row labels whose income exceeds the upper fence are overwritten with the fence value.
high_idx = data.index[data.income_total.values > highest_val]
data.loc[high_idx, 'income_total'] = highest_val

-33750.0 380250.0


In [8]:
from sklearn import preprocessing

# Two-level string flags become 0/1 integers.
data['gender'] = data['gender'].replace(['F','M'],[0,1])
for yes_no_col in ('car', 'reality'):
    data[yes_no_col] = data[yes_no_col].replace(['N','Y'],[0,1])

# Multi-level categoricals are turned into integer codes. One encoder object is
# re-fitted for each column, so afterwards it only remembers the classes of the
# last column it saw ('house_type').
label_encoder = preprocessing.LabelEncoder()
for multi_level_col in ('income_type', 'edu_type', 'family_type', 'house_type'):
    data[multi_level_col] = label_encoder.fit_transform(data[multi_level_col])

In [9]:
from sklearn.cluster import KMeans
#from sklearn.metrics import silhouette_samples, silhouette_score

In [10]:
# Columns fed to the clustering step; no feature scaling is applied beforehand.
# NOTE(review): income_total is orders of magnitude larger than the 0/1 flags, so it
# probably dominates the Euclidean distances - consider scaling; confirm intent.
cluster_features = [
    'gender', 'car', 'reality', 'child_num', 'income_total', 'income_type',
    'edu_type', 'family_type', 'house_type', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
    'work_phone', 'phone', 'email', 'family_size',
]

# Assign every row to one of 16 k-means clusters and store the label as a new column.
kmeans = KMeans(n_clusters=16, init='k-means++', max_iter=300, random_state=seed)
data['cluster'] = kmeans.fit_predict(data[cluster_features])

In [31]:
# Replace family_size with family_size minus child_num.
# Not idempotent: running this cell twice subtracts child_num twice.
data['family_size'] = data['family_size'].sub(data['child_num'])

In [34]:
# Negative family_size values (possible after the subtraction) are reset to 0.
data['family_size'] = data['family_size'].mask(data['family_size'] < 0, 0)

In [None]:
# Pairwise Pearson correlation between all columns of the combined frame.
data.corr(method='pearson')

In [None]:
# Draw the correlation matrix of the combined frame as a heatmap.
corr_matrix = data.corr()
sns.heatmap(corr_matrix)

## 범주형 자료 

In [11]:
# Split the combined frame back apart: rows that carry a `credit` label form the
# training set, rows where `credit` is missing form the test set.
has_label = data['credit'].notna()

# Explicit copy so later steps never operate on a filtered slice of `data`.
train = data.loc[has_label].copy()

# Drop the empty target column by building a new frame instead of mutating a
# slice in place (the in-place variant raises SettingWithCopyWarning).
test = data.loc[~has_label].drop(columns='credit')

print(train.shape, test.shape)
test.head(2)

(26457, 18) (10000, 17)


Unnamed: 0_level_0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,family_size,begin_month,cluster
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
26457,1,1,0,0,112500.0,1,4,0,1,-21990,0.0,0,1,0,2.0,-60.0,8
26458,0,0,1,0,135000.0,2,1,1,1,-18964,9.067855,0,1,0,2.0,-36.0,5


In [12]:
# Columns handed to setup() as categorical features.
# 'child_num' and 'EMPLOYED_YN' are not part of the list, and manual one-hot
# encoding with pd.get_dummies is currently disabled.
col = [
    'gender', 'car', 'reality', 'phone', 'email', 'work_phone',
    'income_type', 'edu_type', 'family_type', 'house_type',
]

In [36]:
%%time
clf = setup(session_id=seed, 
            data = train, target = 'credit'
          , categorical_features=col
          #, numeric_features = ['child_num','family_size']#,,'tp17','tp4','Q_A15','Q_A3','Q_A15tp17','Q_A3tp4']#['Q_A1','Q_A3','Q_A5','tp17','tp4','Q_A15tp17','Q_A3tp4']
          #, ignore_features =['child_num','family_size']
           )

Unnamed: 0,Description,Value
0,session_id,42
1,Target,credit
2,Target Type,Multiclass
3,Label Encoded,
4,Original Data,"(26457, 18)"
5,Missing Values,False
6,Numeric Features,6
7,Categorical Features,11
8,Ordinal Features,False
9,High Cardinality Features,False


Wall time: 2.99 s


In [37]:
# Register log loss as a custom metric evaluated on predicted probabilities,
# so models can be ranked by it and the submission can be built from predict_proba.
add_metric(
    'logloss',
    'LogLoss',
    log_loss,
    greater_is_better=False,
    target="pred_proba",
)

Name                                                           LogLoss
Display Name                                                   LogLoss
Score Function               <function log_loss at 0x00000204E5D050D8>
Scorer               make_scorer(log_loss, greater_is_better=False,...
Target                                                      pred_proba
Args                                                                {}
Greater is Better                                                False
Multiclass                                                        True
Custom                                                            True
Name: logloss, dtype: object

In [38]:
# Model ids that are left out of the comparison.
excluded_models = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda', 'et','ada','lda']

# Rank the remaining candidates by the custom LogLoss metric and keep the top four.
best_4 = compare_models(sort='LogLoss', n_select=4, exclude=excluded_models)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss,TT (Sec)
catboost,CatBoost Classifier,0.705,0.7078,0.4492,0.6892,0.6425,0.2739,0.3368,0.7627,11.047
lightgbm,Light Gradient Boosting Machine,0.6982,0.7033,0.42,0.6938,0.6185,0.2291,0.316,0.7673,0.347
gbc,Gradient Boosting Classifier,0.6937,0.6596,0.4094,0.6457,0.6085,0.2099,0.3014,0.7924,3.133
rf,Random Forest Classifier,0.6988,0.7446,0.5357,0.6769,0.6819,0.3651,0.3716,0.9666,1.389


In [39]:
# Blend the four selected models using 5-fold cross-validation, optimising log loss.
blended = blend_models(best_4, fold=5, optimize='logloss')

# Called without `data`, so predict_model evaluates the blend on PyCaret's own hold-out split.
pred_holdout = predict_model(blended)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,0.7084,0.732,0.4448,0.7177,0.6407,0.2714,0.3473,0.737
1,0.7044,0.7251,0.4371,0.6987,0.6341,0.2583,0.3347,0.7471
2,0.7055,0.748,0.4391,0.7002,0.6357,0.263,0.3387,0.736
3,0.706,0.7434,0.4389,0.6926,0.6361,0.2642,0.3408,0.7369
4,0.7078,0.7374,0.4411,0.7109,0.6382,0.269,0.3448,0.7349
Mean,0.7064,0.7372,0.4402,0.704,0.637,0.2652,0.3413,0.7384
SD,0.0015,0.0081,0.0026,0.009,0.0023,0.0046,0.0044,0.0044


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,LogLoss
0,Voting Classifier,0.7014,0.7543,0.4413,0.6961,0.6308,0.266,0.3431,0.7339


In [41]:
# Finalise the blended model (per the PyCaret docs this refits it on all setup data, hold-out included).
final_model = finalize_model(estimator=blended)

In [42]:
# Run the finalised model on the unlabelled test rows through PyCaret's own predict path.
predictions = predict_model(estimator=final_model, data=test)

In [43]:
import copy

# get_config hands back the session's preprocessing pipeline object itself, so work
# on a deep copy: appending the model to the original would alter PyCaret's own
# pipeline and stack one more model step every time this cell is re-run.
prep_pipe = copy.deepcopy(get_config("prep_pipe"))
prep_pipe.steps.append(['trained_model', final_model])

# Class probabilities for the test rows (preprocessing + model in one pass).
prections = prep_pipe.predict_proba(test)
prections

array([[0.0968769 , 0.15850111, 0.74462199],
       [0.23543283, 0.14291192, 0.62165525],
       [0.06002495, 0.11060109, 0.82937396],
       ...,
       [0.04030925, 0.09651105, 0.86317969],
       [0.22193265, 0.21429488, 0.56377247],
       [0.09403671, 0.21108366, 0.69487963]])

In [44]:
# Copy each probability column into the matching submission column ('0', '1', '2').
# Assignment is positional, so it relies on the test rows being in the same order as `submission`.
for class_idx, class_col in enumerate(('0', '1', '2')):
    submission[class_col] = prections[:, class_idx]
submission

Unnamed: 0_level_0,0,1,2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
26457,0.096877,0.158501,0.744622
26458,0.235433,0.142912,0.621655
26459,0.060025,0.110601,0.829374
26460,0.098779,0.109632,0.791589
26461,0.093639,0.201148,0.705213
...,...,...,...
36452,0.105625,0.248304,0.646071
36453,0.142349,0.379219,0.478432
36454,0.040309,0.096511,0.863180
36455,0.221933,0.214295,0.563772


In [45]:
from pathlib import Path

from gtts import gTTS

# Make sure the target folder exists first; to_csv does not create missing directories.
output_path = Path('output/20210410-3.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
submission.to_csv(output_path)

# Synthesise a short spoken "finished" notification and store it as an MP3 file.
text ="Finish Finish Finish Finish Finish Finish Finish Finish"

tts = gTTS(text=text, lang='en')
tts.save("helloEN.mp3")

In [46]:
from IPython.display import Audio

# Auto-play the generated MP3 as an audible "run finished" signal.
sound_file = 'helloEN.mp3'
Audio(data=sound_file, autoplay=True)