# ML Models

In [None]:
import os
# Move to the project root so the relative paths used below ('models/...', 'original_data/...')
# resolve. os.path.basename handles the platform's path separator, unlike splitting on '/'.
# NOTE(review): assumes the notebook sits two levels below a directory named 'credit-now'.
root_dir = '../../' if os.path.basename(os.getcwd()) != 'credit-now' else './'
os.chdir(root_dir)

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import credit_data
from tqdm import tqdm
from vecstack import StackingTransformer
from itertools import combinations
from joblib import dump, load

In [2]:
# Load the four arrays returned by the project loader and show their shapes as a sanity check.
x_train, x_test, y_train, y_test = credit_data.load_data()
print(*(split.shape for split in (x_train, x_test, y_train, y_test)))

(15906, 67) (6818, 67) (15906, 1) (6818, 1)


In [3]:
# Baseline classifiers as (display name, estimator) pairs; later cells iterate over this list.
models = [
    ('Logistic', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=36)),
    ('SVM', SVC(C=1000, gamma=0.01)),
    ('DecisionTree', DecisionTreeClassifier(max_depth=2, random_state=0)),
    ('RandomForest', RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0)),
    ('NaiveBayes', BernoulliNB(alpha=10)),
    ('XGBoost', XGBClassifier(n_estimators=400, max_depth=4, seed=0, verbosity=0)),
    ('LightGBM', LGBMClassifier(n_estimators=400, max_depth=10, random_state=0)),
]

In [4]:
# Fit every baseline classifier on the training split and print its accuracy on the test split.
for name, model in models:
    model.fit(x_train, y_train)
    # accuracy_score takes (y_true, y_pred); the value is the same either way, but this
    # matches the argument order used by every other cell in the notebook.
    print(f'{name}: ', accuracy_score(y_test, model.predict(x_test)))

Logistic:  0.6421237899677325
KNN:  0.6472572601936052
SVM:  0.6806981519507187
DecisionTree:  0.6914051041361103
RandomForest:  0.6915517747139923
NaiveBayes:  0.6344969199178645
XGBoost:  0.7068055148137283
LightGBM:  0.7062188325022001


In [5]:
# Linear Discriminant Analysis baseline, fitted on a dense copy of the training features.
clf = LinearDiscriminantAnalysis()
# .toarray() yields a plain ndarray; .todense() yields np.matrix, which recent scikit-learn
# releases refuse as estimator input.
# NOTE(review): assumes x_train is a scipy sparse matrix (the original called .todense()).
clf.fit(x_train.toarray(), y_train)
# accuracy_score takes (y_true, y_pred).
print('Linear Discriminat Analysis:', accuracy_score(y_test, clf.predict(x_test)))

Linear Discriminat Analysis: 0.64124376650044


## Model Stacking

### 5-Fold

In [6]:
# The stacking line-up is the combination of models that gave the highest accuracy
# when looping over all combinations.
estimator = [
    ('SVM', SVC(C=1000, gamma=0.01, kernel='rbf')),
    ('RandomForest', RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0)),
    ('NaiveBayes', BernoulliNB(alpha=10)),
    ('XGBoost', XGBClassifier(n_estimators=400, max_depth=4, seed=0, verbosity=0)),
]

In [7]:
# Classification-mode stacking transformer: 5 stratified, shuffled folds scored with accuracy.
stack = StackingTransformer(
    estimators=estimator,
    regression=False,
    metric=accuracy_score,
    n_folds=5,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

In [8]:
# Fit the base estimators, then turn both splits into stacked features.
stack = stack.fit(x_train, y_train)
S_train, S_test = stack.transform(x_train), stack.transform(x_test)

# Second-level learner trained on the stacked features; the cell displays its test accuracy.
model = XGBClassifier(n_estimators=400, max_depth=4, seed=0, verbosity=0).fit(S_train, y_train)
y_pred = model.predict(S_test)
accuracy_score(y_test, y_pred)

0.7100322675271341

In [None]:
# Persist the fitted 5-fold stack and its second-level model.
# Create the target directory first so dump() does not fail on a fresh checkout.
os.makedirs('models', exist_ok=True)
dump(stack, 'models/stack_5fold.pkl')
dump(model, 'models/model_5fold.pkl')

In [18]:
# Reload the persisted 5-fold stack and second-level model, then score both splits.
stack = load('models/stack_5fold.pkl')
model = load('models/model_5fold.pkl')

S_train, S_test = stack.transform(x_train), stack.transform(x_test)

pred_train, pred_test = model.predict(S_train), model.predict(S_test)
print('train accuracy:', accuracy_score(y_train, pred_train))
print('test accuracy:', accuracy_score(y_test, pred_test))

train accuracy: 0.703005155287313
test accuracy: 0.7100322675271341


### 10-Fold

In [12]:
# The stacking line-up is the combination of models that gave the highest accuracy
# when looping over all combinations.
estimator = [
    ('KNN', KNeighborsClassifier(n_neighbors=36)),
    ('SVM', SVC(C=1000, gamma=0.01, kernel='rbf')),
    ('RandomForest', RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0)),
    ('NaiveBayes', BernoulliNB(alpha=10)),
    ('XGBoost', XGBClassifier(n_estimators=400, max_depth=4, seed=0, verbosity=0)),
    ('LightGBM', LGBMClassifier(n_estimators=400, max_depth=10, random_state=0)),
]

In [13]:
# Classification-mode stacking transformer: 10 stratified, shuffled folds scored with accuracy.
stack = StackingTransformer(
    estimators=estimator,
    regression=False,
    metric=accuracy_score,
    n_folds=10,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

In [14]:
# Fit the base estimators, then turn both splits into stacked features.
stack = stack.fit(x_train, y_train)
S_train, S_test = stack.transform(x_train), stack.transform(x_test)

# Second-level learner trained on the stacked features; the cell displays its test accuracy.
model = RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0).fit(S_train, y_train)
y_pred = model.predict(S_test)
accuracy_score(y_test, y_pred)

0.7119389850396011

In [None]:
# Persist the fitted 10-fold stack and its second-level model.
# Create the target directory first so dump() does not fail on a fresh checkout.
os.makedirs('models', exist_ok=True)
dump(stack, 'models/stack_10fold.pkl')
dump(model, 'models/model_10fold.pkl')

In [20]:
# Reload the persisted 10-fold stack and second-level model, then score both splits.
stack = load('models/stack_10fold.pkl')
model = load('models/model_10fold.pkl')

S_train, S_test = stack.transform(x_train), stack.transform(x_test)

pred_train, pred_test = model.predict(S_train), model.predict(S_test)
print('train accuracy:', accuracy_score(y_train, pred_train))
print('test accuracy:', accuracy_score(y_test, pred_test))

train accuracy: 0.7100465233245317
test accuracy: 0.7119389850396011


### Combination

In [15]:
# Full pool of candidate base learners for the combination search.
estimator = [
    ('Logistic', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=36)),
    ('SVM', SVC(C=1000, gamma=0.01, kernel='rbf')),
    ('DecisionTree', DecisionTreeClassifier(max_depth=2, random_state=0)),
    ('RandomForest', RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0)),
    ('NaiveBayes', BernoulliNB(alpha=10)),
    ('XGBoost', XGBClassifier(n_estimators=400, max_depth=4, seed=0, verbosity=0)),
    ('LightGBM', LGBMClassifier(n_estimators=400, max_depth=10, random_state=0)),
]

# Every subset of the pool with at least two members, smallest subsets first.
estimators = list()
subset_size = 2
while subset_size <= len(estimator):
    estimators.extend(combinations(estimator, subset_size))
    subset_size += 1

In [16]:
# One unfitted 10-fold stacking transformer per candidate combination of base learners.
stacks = [
    StackingTransformer(
        estimators=candidate,
        regression=False,
        metric=accuracy_score,
        n_folds=10,
        stratified=True,
        shuffle=True,
        random_state=0,
        verbose=0,
    )
    for candidate in estimators
]

In [17]:
# Local import: the notebook's import cell does not bring `clone` into scope.
from sklearn.base import clone

# Collected as (test accuracy, (fitted stack, fitted second-level model)).
# NOTE(review): candidates are ranked on the test split, so the best score is optimistic.
acc_scores = list()

for stack in tqdm(stacks, desc='Stack'):
    stack = stack.fit(x_train, y_train)

    S_train = stack.transform(x_train)
    S_test = stack.transform(x_test)

    for name, model in models:
        # Fit a fresh copy for every (stack, model) pair. Refitting the shared object from
        # `models` would make every stored entry alias the same estimator (holding only its
        # last fit) and would overwrite the baseline models fitted earlier.
        meta_model = clone(model).fit(S_train, y_train)
        y_pred = meta_model.predict(S_test)
        acc_scores.append((accuracy_score(y_test, y_pred), (stack, meta_model)))

Stack: 100%|██████████| 247/247 [27:36:04<00:00, 402.29s/it]    


In [None]:
# Rank all recorded results by accuracy (first tuple element), best first.
sorted(acc_scores, key=lambda entry: entry[0], reverse=True)

## Submission

In [None]:
# Build the submission file from the persisted 10-fold stack and its second-level model.
x_train, y_train = credit_data.load_train_data(test_size=0)
x_test = credit_data.load_test_data()  # for old data, preprocessing is done separately
sub = pd.read_csv('original_data/sample_submission.csv')

# NOTE(review): joblib.load unpickles arbitrary objects; only load files produced by this project.
stack = load('models/stack_10fold.pkl')

S_train = stack.transform(x_train)
S_test = stack.transform(x_test)

model = load('models/model_10fold.pkl')
pred_train = model.predict(S_train)  # not written to the submission

# The submission stores probabilities, so only predict_proba is needed for the test set;
# the hard-label predict() on S_test was overwritten immediately and has been removed.
pred_test = model.predict_proba(S_test)

# Fail loudly if the probabilities do not line up with the template's value columns.
expected_shape = sub.iloc[:, 1:].shape
if pred_test.shape != expected_shape:
    raise ValueError(
        f'prediction shape {pred_test.shape} does not match submission template {expected_shape}'
    )

sub.iloc[:,1:] = pred_test
sub.to_csv('credit_data/submission_stacking.csv',index=False)