# ML Models

In [1]:
import os
# Run the notebook from the project root ('credit-now'): stay put when the
# kernel already starts there, otherwise step two directories up.
# os.path.basename handles the platform's path separator, whereas splitting
# on '/' never matches on Windows, where getcwd() returns backslashes.
root_dir = './' if os.path.basename(os.getcwd()) == 'credit-now' else '../../'
os.chdir(root_dir)

import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import clone
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier

import credit_data
from tqdm import tqdm
from vecstack import StackingTransformer
from itertools import combinations
from joblib import dump, load

In [2]:
# Fetch the four pieces returned by credit_data.load_train_data() and print
# the shape of each one on a single line.
x_train, x_test, y_train, y_test = credit_data.load_train_data()
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(18519, 65) (7938, 65) (18519, 1) (7938, 1)


In [3]:
# Baseline classifiers as (display name, unfitted estimator) pairs.
# The same list is later reused as the pool of meta-models in the
# combination search.
models = [
    ('Logistic', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=19)),
    ('SVM', SVC(C=1, gamma=0.1)),
    ('DecisionTree', DecisionTreeClassifier(max_depth=2, random_state=0)),
    ('RandomForest', RandomForestClassifier(n_estimators=300, max_depth=10, random_state=0)),
    ('NaiveBayes', BernoulliNB(alpha=1)),
    ('XGBoost', XGBClassifier(n_estimators=200, max_depth=6, seed=0, verbosity=0)),
    ('LightGBM', LGBMClassifier(n_estimators=400, max_depth=10, random_state=0)),
]

In [4]:
# Fit every baseline classifier on the training split and print its accuracy
# on the held-out split.
for name, model in models:
    # The labels are loaded as an (n, 1) column (see the printed shapes);
    # flatten them to the 1-D vector that estimator.fit() expects.
    model.fit(x_train, np.ravel(y_train))
    # accuracy_score's signature is (y_true, y_pred); the value is the same
    # either way, but this matches the order used in the stacking cells.
    print(f'{name}: ', accuracy_score(y_test, model.predict(x_test)))

Logistic:  0.6432350718065004
KNN:  0.6519274376417233
SVM:  0.6810279667422524
DecisionTree:  0.6926177878558831
RandomForest:  0.6953892668178382
NaiveBayes:  0.6423532375913328
XGBoost:  0.7161753590325018
LightGBM:  0.7163013353489543


In [5]:
# Linear Discriminant Analysis baseline, evaluated like the models above.
clf = LinearDiscriminantAnalysis()
# Flatten the (n, 1) label column to the 1-D vector that fit() expects.
clf.fit(x_train, np.ravel(y_train))
# Arguments follow accuracy_score's (y_true, y_pred) order; the printed label
# also fixes the "Discriminat" typo.
print('Linear Discriminant Analysis:', accuracy_score(y_test, clf.predict(x_test)))

Linear Discriminat Analysis: 0.6424792139077853


## Model Stacking

### 5-Fold

In [6]:
# Base estimators for the 5-fold stack: the combination that gave the highest
# accuracy when every combination was tried in a for loop.
estimator = [
    ('SVM', SVC(C=1, gamma=0.1)),
    ('DecisionTree', DecisionTreeClassifier(max_depth=2, random_state=0)),
    ('RandomForest', RandomForestClassifier(n_estimators=300, max_depth=10, random_state=0)),
    ('XGBoost', XGBClassifier(n_estimators=200, max_depth=6, seed=0, verbosity=0)),
]

In [7]:
# Classification stack (regression=False) over the selected estimators,
# using 5 stratified, shuffled folds with a fixed seed and accuracy as the
# reported metric.
stack = StackingTransformer(
    estimator,
    regression=False,
    metric=accuracy_score,
    n_folds=5,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

In [8]:
# Fit the stack on the training split, then transform both splits into the
# features the meta-model is trained and evaluated on.
stack = stack.fit(x_train, y_train)
S_train, S_test = (stack.transform(split) for split in (x_train, x_test))

# Meta-model: a random forest fitted on the stacked training features.
model = RandomForestClassifier(n_estimators=300, max_depth=10, random_state=0)
model = model.fit(S_train, y_train)

pred_train, pred_test = model.predict(S_train), model.predict(S_test)
for label, truth, guess in (('train', y_train, pred_train), ('test', y_test, pred_test)):
    print(f'{label} accuracy:', accuracy_score(truth, guess))

train accuracy: 0.7065716291376424
test accuracy: 0.718946837994457


In [9]:
# Persist the fitted 5-fold stack (compressed) and its meta-model.
# Create the target directory first so the cell also works on a fresh
# checkout where 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
dump(stack, 'models/new_stack_5fold.pkl', compress=3)
dump(model, 'models/new_model_5fold.pkl')

['models/new_model_5fold.pkl']

In [10]:
# Reload the persisted 5-fold stack and meta-model and repeat the evaluation
# to confirm the saved artefacts reproduce the scores printed above.
stack = load('models/new_stack_5fold.pkl')
model = load('models/new_model_5fold.pkl')

S_train, S_test = (stack.transform(split) for split in (x_train, x_test))

pred_train, pred_test = model.predict(S_train), model.predict(S_test)
for label, truth, guess in (('train', y_train, pred_train), ('test', y_test, pred_test)):
    print(f'{label} accuracy:', accuracy_score(truth, guess))

train accuracy: 0.7065716291376424
test accuracy: 0.718946837994457


### 10-Fold

In [11]:
# Base estimators for the 10-fold stack: the combination that gave the highest
# accuracy when every combination was tried in a for loop.
estimator = [
    ('KNN', KNeighborsClassifier(n_neighbors=19)),
    ('RandomForest', RandomForestClassifier(n_estimators=300, max_depth=10, random_state=0)),
    ('NaiveBayes', BernoulliNB(alpha=1)),
    ('XGBoost', XGBClassifier(n_estimators=200, max_depth=6, seed=0, verbosity=0)),
    ('LightGBM', LGBMClassifier(n_estimators=400, max_depth=10, random_state=0)),
]

In [12]:
# Classification stack (regression=False) over the selected estimators,
# using 10 stratified, shuffled folds with a fixed seed and accuracy as the
# reported metric.
stack = StackingTransformer(
    estimator,
    regression=False,
    metric=accuracy_score,
    n_folds=10,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)

In [13]:
# Fit the stack on the training split, then transform both splits into the
# features the meta-model is trained and evaluated on.
stack = stack.fit(x_train, y_train)
S_train, S_test = (stack.transform(split) for split in (x_train, x_test))

# Meta-model: an XGBoost classifier fitted on the stacked training features.
model = XGBClassifier(n_estimators=200, max_depth=6, seed=0, verbosity=0)
model = model.fit(S_train, y_train)

pred_train, pred_test = model.predict(S_train), model.predict(S_test)
for label, truth, guess in (('train', y_train, pred_train), ('test', y_test, pred_test)):
    print(f'{label} accuracy:', accuracy_score(truth, guess))

train accuracy: 0.7146174199470814
test accuracy: 0.719954648526077


In [14]:
# Persist the fitted 10-fold stack (compressed) and its meta-model.
# Create the target directory first so the cell also works on a fresh
# checkout where 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
dump(stack, 'models/new_stack_10fold.pkl', compress=3)
dump(model, 'models/new_model_10fold.pkl')

['models/new_model_10fold.pkl']

In [15]:
# Reload the persisted 10-fold stack and meta-model and repeat the evaluation
# to confirm the saved artefacts reproduce the scores printed above.
stack = load('models/new_stack_10fold.pkl')
model = load('models/new_model_10fold.pkl')

S_train, S_test = (stack.transform(split) for split in (x_train, x_test))

pred_train, pred_test = model.predict(S_train), model.predict(S_test)
for label, truth, guess in (('train', y_train, pred_train), ('test', y_test, pred_test)):
    print(f'{label} accuracy:', accuracy_score(truth, guess))

train accuracy: 0.7146174199470814
test accuracy: 0.719954648526077


### Combination

In [4]:
# Pool of candidate base learners for the exhaustive stacking search.
estimator = [
    ('Logistic', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=19)),
    ('SVM', SVC(C=1, gamma=0.1)),
    ('DecisionTree', DecisionTreeClassifier(max_depth=2, random_state=0)),
    ('RandomForest', RandomForestClassifier(n_estimators=300, max_depth=10, random_state=0)),
    ('NaiveBayes', BernoulliNB(alpha=1)),
    ('XGBoost', XGBClassifier(n_estimators=200, max_depth=6, seed=0, verbosity=0)),
    ('LightGBM', LGBMClassifier(n_estimators=400, max_depth=10, random_state=0)),
]

# Every combination of four or more candidates, ordered by combination size
# (size 4 first, the full pool last).
estimators = [
    combo
    for size in range(4, len(estimator) + 1)
    for combo in combinations(estimator, size)
]

In [5]:
# One unfitted 5-fold stack per candidate combination; all of them share the
# same cross-validation settings.
fold_config = dict(
    regression=False,
    metric=accuracy_score,
    n_folds=5,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,
)
stacks = [StackingTransformer(combo, **fold_config) for combo in estimators]

In [6]:
# Exhaustive search: fit every candidate stack, then try every entry of
# `models` as the meta-model on the stacked features. Each result is recorded
# as (test accuracy, (fitted stack, fitted meta-model)).
# NOTE(review): candidates are ranked on the same x_test/y_test split that is
# reported as test accuracy, so the winning score is optimistically biased.
acc_scores = list()

for stack in tqdm(stacks, desc='Stack'):
    stack = stack.fit(x_train, y_train)

    S_train = stack.transform(x_train)
    S_test = stack.transform(x_test)

    for name, model in models:
        # Fit a fresh clone for each pairing. Refitting the shared instance
        # in place would leave every stored record pointing at the same
        # object, fitted on the features of whichever stack ran last.
        meta_model = clone(model).fit(S_train, y_train)
        y_pred = meta_model.predict(S_test)
        acc_scores.append((accuracy_score(y_test, y_pred), (stack, meta_model)))

Stack: 100%|██████████| 163/163 [9:16:12<00:00, 204.74s/it]  


In [None]:
# Rank the (accuracy, (stack, model)) records from best to worst accuracy and
# display the ranked list as the cell output.
ranked_scores = sorted(acc_scores, key=lambda record: record[0], reverse=True)
ranked_scores

## Submission

In [None]:
# Submission inputs: the submission template, the competition test features,
# and the training data loaded with test_size=0.
sample = pd.read_csv('original_data/sample_submission.csv')
x_test = credit_data.load_test_data()
x_train, y_train = credit_data.load_train_data(test_size=0)
# Keep only the 'credit' column, as a 2-D (n, 1) NumPy array.
y_train = np.array(y_train[['credit']])
tuple(table.shape for table in (x_train, x_test, y_train, sample))

((26457, 65), (10000, 65), (26457, 1), (10000, 4))

In [None]:
# Build the submission from the persisted 10-fold stack and its meta-model.
# NOTE(review): both artefacts are loaded as saved; nothing in this cell
# refits them on the x_train/y_train loaded for the submission. Confirm that
# this is intended.
stack = load('models/new_stack_10fold.pkl')

# Only the competition test features are transformed. The stacked training
# features were computed here before but never used, so that call is dropped.
S_test = stack.transform(x_test)

model = load('models/new_model_10fold.pkl')
pred_test = model.predict_proba(S_test)
# Fill every column after the first with the class probabilities.
# NOTE(review): assumes the probability columns are in the same class order
# as the submission columns; verify against model.classes_.
sample.iloc[:,1:] = pred_test
sample.to_csv('credit_data/submission_new_stacking.csv',index=False)