## Best Models in Group Assignment #2

In [1]:
# Data Wrangling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# Preprocessing
from sklearn.model_selection import train_test_split

# Hyperparameter Optimization
import optuna

# Modeling
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score

# Evaluation
from sklearn.metrics import roc_auc_score

# Etc
import warnings
warnings.filterwarnings('ignore')

In [2]:
loan = pd.read_csv('loan_data.csv')

#'ID'와 'ZIP Code' feature 제거
del loan['ID'], loan['ZIP Code']

y = loan[['Personal Loan']]
X = loan.drop('Personal Loan', axis=1)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### DecisionTree

####  max_depth
- 트리의 최대 깊이 지정
- Default = 10
- 지정된 값까지 tree깊이가 늘어나거나 노드가 가지는 데이터 수가 min_samples_split 보다 작아질때까지 계속 분할

#### min_samples_split
- 노드를 분할하기 위한 최소한의 샘플 데이터수 → 과적합을 제어하는데 사용
- Default = 2 → 작게 설정할 수록 분할 노드가 많아져 과적합 가능성 증가  

#### min_samples_leaf
- 리프노드가 되기 위해 필요한 최소한의 샘플 데이터수
- min_samples_split과 함께 과적합 제어 용도
- 불균형 데이터의 경우 특정 클래스의 데이터가 극도로 작을 수 있으므로 작게 설정 필요  

#### max_features
- 최적의 분할을 위해 고려할 최대 feature 개수
- Default = 'auto' (결정트리에서는 default가 none이었음)
- int형으로 지정 →피처 갯수 / float형으로 지정 →비중
- sqrt 또는 auto : 전체 피처 중 √(피처개수) 만큼 선정
- log : 전체 피처 중 log2(전체 피처 개수) 만큼 선정  

#### max_leaf_nodes 
- 리프노드의 최대 개수  

In [4]:
def tree_objective(trial):  
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 15, step=1),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10, step=1),
        'max_leaf_nodes':  trial.suggest_int('max_leaf_nodes', 2, 20, step=1),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20, step=1),
        'max_features': trial.suggest_categorical('max_features',[2,3,4,5,6,7,8,9]),
    }
    classifier_obj = DecisionTreeClassifier(**params, random_state=0)

    score = cross_val_score(classifier_obj, X_train, y_train, cv=5, scoring='roc_auc')
    roc_auc = score.mean()
    return roc_auc

tree_study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
tree_study.optimize(tree_objective, n_trials=500) 

[32m[I 2022-10-14 13:47:29,506][0m A new study created in memory with name: no-name-d1220933-ed60-4961-8fd5-54abe1aad034[0m
[32m[I 2022-10-14 13:47:29,553][0m Trial 0 finished with value: 0.7428184332463859 and parameters: {'max_depth': 9, 'min_samples_leaf': 3, 'max_leaf_nodes': 10, 'min_samples_split': 18, 'max_features': 8}. Best is trial 0 with value: 0.7428184332463859.[0m
[32m[I 2022-10-14 13:47:29,590][0m Trial 1 finished with value: 0.7301857114282857 and parameters: {'max_depth': 3, 'min_samples_leaf': 2, 'max_leaf_nodes': 6, 'min_samples_split': 20, 'max_features': 7}. Best is trial 0 with value: 0.7428184332463859.[0m
[32m[I 2022-10-14 13:47:29,629][0m Trial 2 finished with value: 0.6231070607060705 and parameters: {'max_depth': 3, 'min_samples_leaf': 4, 'max_leaf_nodes': 2, 'min_samples_split': 6, 'max_features': 9}. Best is trial 0 with value: 0.7428184332463859.[0m
[32m[I 2022-10-14 13:47:29,672][0m Trial 3 finished with value: 0.7618750002551276 and paramet

[32m[I 2022-10-14 13:47:31,082][0m Trial 31 finished with value: 0.7674859639025127 and parameters: {'max_depth': 9, 'min_samples_leaf': 6, 'max_leaf_nodes': 19, 'min_samples_split': 17, 'max_features': 9}. Best is trial 28 with value: 0.7685239478029435.[0m
[32m[I 2022-10-14 13:47:31,146][0m Trial 32 finished with value: 0.7666594848260336 and parameters: {'max_depth': 6, 'min_samples_leaf': 3, 'max_leaf_nodes': 19, 'min_samples_split': 20, 'max_features': 9}. Best is trial 28 with value: 0.7685239478029435.[0m
[32m[I 2022-10-14 13:47:31,203][0m Trial 33 finished with value: 0.767483828612453 and parameters: {'max_depth': 7, 'min_samples_leaf': 4, 'max_leaf_nodes': 17, 'min_samples_split': 19, 'max_features': 9}. Best is trial 28 with value: 0.7685239478029435.[0m
[32m[I 2022-10-14 13:47:31,261][0m Trial 34 finished with value: 0.7623569159211839 and parameters: {'max_depth': 10, 'min_samples_leaf': 5, 'max_leaf_nodes': 17, 'min_samples_split': 17, 'max_features': 9}. Best 

[32m[I 2022-10-14 13:47:32,851][0m Trial 63 finished with value: 0.7428292683860223 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 13, 'min_samples_split': 7, 'max_features': 6}. Best is trial 48 with value: 0.7860381720570015.[0m
[32m[I 2022-10-14 13:47:32,901][0m Trial 64 finished with value: 0.7389812152133579 and parameters: {'max_depth': 4, 'min_samples_leaf': 9, 'max_leaf_nodes': 15, 'min_samples_split': 20, 'max_features': 6}. Best is trial 48 with value: 0.7860381720570015.[0m
[32m[I 2022-10-14 13:47:32,950][0m Trial 65 finished with value: 0.7407667202944783 and parameters: {'max_depth': 6, 'min_samples_leaf': 10, 'max_leaf_nodes': 12, 'min_samples_split': 19, 'max_features': 6}. Best is trial 48 with value: 0.7860381720570015.[0m
[32m[I 2022-10-14 13:47:33,006][0m Trial 66 finished with value: 0.7535318957916199 and parameters: {'max_depth': 5, 'min_samples_leaf': 8, 'max_leaf_nodes': 14, 'min_samples_split': 19, 'max_features': 6}. Best

[32m[I 2022-10-14 13:47:34,726][0m Trial 95 finished with value: 0.7881758775367332 and parameters: {'max_depth': 12, 'min_samples_leaf': 9, 'max_leaf_nodes': 19, 'min_samples_split': 6, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:34,782][0m Trial 96 finished with value: 0.7917818024149355 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 5, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:34,832][0m Trial 97 finished with value: 0.7305468307034785 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 5, 'max_features': 4}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:34,883][0m Trial 98 finished with value: 0.732654892147378 and parameters: {'max_depth': 10, 'min_samples_leaf': 9, 'max_leaf_nodes': 19, 'min_samples_split': 6, 'max_features': 2}. Best is

[32m[I 2022-10-14 13:47:36,471][0m Trial 127 finished with value: 0.7911309341393322 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 17, 'min_samples_split': 4, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:36,528][0m Trial 128 finished with value: 0.7911309341393322 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 17, 'min_samples_split': 4, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:36,578][0m Trial 129 finished with value: 0.7200813728056479 and parameters: {'max_depth': 10, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 6, 'max_features': 3}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:36,634][0m Trial 130 finished with value: 0.7869049504440241 and parameters: {'max_depth': 9, 'min_samples_leaf': 9, 'max_leaf_nodes': 19, 'min_samples_split': 11, 'max_features': 8}. B

[32m[I 2022-10-14 13:47:38,317][0m Trial 159 finished with value: 0.7331540017522161 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 17, 'min_samples_split': 4, 'max_features': 3}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:38,377][0m Trial 160 finished with value: 0.7917818024149355 and parameters: {'max_depth': 13, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 9, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:38,437][0m Trial 161 finished with value: 0.7917818024149355 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 5, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:38,498][0m Trial 162 finished with value: 0.7917818024149355 and parameters: {'max_depth': 15, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 9, 'max_features': 8}. 

[32m[I 2022-10-14 13:47:40,252][0m Trial 191 finished with value: 0.7917818024149355 and parameters: {'max_depth': 12, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 4, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:40,316][0m Trial 192 finished with value: 0.7917818024149355 and parameters: {'max_depth': 12, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 4, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:40,367][0m Trial 193 finished with value: 0.6231070607060705 and parameters: {'max_depth': 1, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 4, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:40,427][0m Trial 194 finished with value: 0.7917818024149355 and parameters: {'max_depth': 14, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 3, 'max_features': 8}. B

[32m[I 2022-10-14 13:47:42,173][0m Trial 223 finished with value: 0.7911309341393322 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 17, 'min_samples_split': 5, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:42,235][0m Trial 224 finished with value: 0.7917818024149355 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 4, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:42,312][0m Trial 225 finished with value: 0.7917818024149355 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 5, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:42,372][0m Trial 226 finished with value: 0.7735174190888476 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 19, 'min_samples_split': 5, 'max_features': 5}. 

[32m[I 2022-10-14 13:47:44,151][0m Trial 255 finished with value: 0.7911309341393322 and parameters: {'max_depth': 12, 'min_samples_leaf': 10, 'max_leaf_nodes': 17, 'min_samples_split': 3, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:44,215][0m Trial 256 finished with value: 0.7917818024149355 and parameters: {'max_depth': 12, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 3, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:44,279][0m Trial 257 finished with value: 0.7917818024149355 and parameters: {'max_depth': 12, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 9, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:44,344][0m Trial 258 finished with value: 0.7888059405430339 and parameters: {'max_depth': 12, 'min_samples_leaf': 10, 'max_leaf_nodes': 19, 'min_samples_split': 9, 'max_features': 8}. 

[32m[I 2022-10-14 13:47:46,358][0m Trial 287 finished with value: 0.7917818024149355 and parameters: {'max_depth': 12, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 11, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:46,425][0m Trial 288 finished with value: 0.7917818024149355 and parameters: {'max_depth': 12, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 11, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:46,501][0m Trial 289 finished with value: 0.7895620161505946 and parameters: {'max_depth': 10, 'min_samples_leaf': 10, 'max_leaf_nodes': 19, 'min_samples_split': 6, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:46,572][0m Trial 290 finished with value: 0.7862740484507633 and parameters: {'max_depth': 10, 'min_samples_leaf': 9, 'max_leaf_nodes': 17, 'min_samples_split': 6, 'max_features': 8}.

[32m[I 2022-10-14 13:47:48,927][0m Trial 319 finished with value: 0.7917818024149355 and parameters: {'max_depth': 15, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 9, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:49,011][0m Trial 320 finished with value: 0.7796423138487318 and parameters: {'max_depth': 15, 'min_samples_leaf': 10, 'max_leaf_nodes': 16, 'min_samples_split': 9, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:49,082][0m Trial 321 finished with value: 0.7339991750450555 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 19, 'min_samples_split': 8, 'max_features': 4}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:49,162][0m Trial 322 finished with value: 0.7917818024149355 and parameters: {'max_depth': 15, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 4, 'max_features': 8}. 

[32m[I 2022-10-14 13:47:51,404][0m Trial 351 finished with value: 0.7305468307034785 and parameters: {'max_depth': 10, 'min_samples_leaf': 10, 'max_leaf_nodes': 18, 'min_samples_split': 7, 'max_features': 4}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:51,483][0m Trial 352 finished with value: 0.7856575899936932 and parameters: {'max_depth': 14, 'min_samples_leaf': 9, 'max_leaf_nodes': 18, 'min_samples_split': 3, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:51,556][0m Trial 353 finished with value: 0.7911309341393322 and parameters: {'max_depth': 14, 'min_samples_leaf': 10, 'max_leaf_nodes': 17, 'min_samples_split': 9, 'max_features': 8}. Best is trial 75 with value: 0.791937414787397.[0m
[32m[I 2022-10-14 13:47:51,631][0m Trial 354 finished with value: 0.7904180628522035 and parameters: {'max_depth': 6, 'min_samples_leaf': 10, 'max_leaf_nodes': 17, 'min_samples_split': 5, 'max_features': 8}. Be

[32m[I 2022-10-14 13:47:54,040][0m Trial 383 finished with value: 0.7305641451900293 and parameters: {'max_depth': 11, 'min_samples_leaf': 10, 'max_leaf_nodes': 17, 'min_samples_split': 6, 'max_features': 4}. Best is trial 374 with value: 0.7926098852232162.[0m
[32m[I 2022-10-14 13:47:54,139][0m Trial 384 finished with value: 0.7985784160048659 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 9, 'max_features': 8}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:47:54,258][0m Trial 385 finished with value: 0.7985784160048659 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 10, 'max_features': 8}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:47:54,355][0m Trial 386 finished with value: 0.7807636948643843 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 15, 'min_samples_split': 10, 'max_features':

[32m[I 2022-10-14 13:47:56,957][0m Trial 414 finished with value: 0.7967413441599261 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 11, 'max_features': 6}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:47:57,036][0m Trial 415 finished with value: 0.7967413441599261 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 11, 'max_features': 6}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:47:57,107][0m Trial 416 finished with value: 0.7967413441599261 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 11, 'max_features': 6}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:47:57,181][0m Trial 417 finished with value: 0.7967413441599261 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 11, 'max_features'

[32m[I 2022-10-14 13:47:59,643][0m Trial 445 finished with value: 0.7967413441599261 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 15, 'max_features': 6}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:47:59,743][0m Trial 446 finished with value: 0.7845696643643956 and parameters: {'max_depth': 6, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 15, 'max_features': 6}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:47:59,845][0m Trial 447 finished with value: 0.7967413441599261 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 11, 'max_features': 6}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:47:59,948][0m Trial 448 finished with value: 0.7812095104918655 and parameters: {'max_depth': 8, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 10, 'max_features'

[32m[I 2022-10-14 13:48:02,776][0m Trial 476 finished with value: 0.7967413441599261 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 15, 'max_features': 6}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:48:02,866][0m Trial 477 finished with value: 0.7845696643643956 and parameters: {'max_depth': 6, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 11, 'max_features': 6}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:48:02,952][0m Trial 478 finished with value: 0.7967413441599261 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 12, 'max_features': 6}. Best is trial 384 with value: 0.7985784160048659.[0m
[32m[I 2022-10-14 13:48:03,055][0m Trial 479 finished with value: 0.7967413441599261 and parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 11, 'max_features'

In [5]:
print("Best score:", tree_study.best_value)
print("Best parameters:", tree_study.best_params)

Best score: 0.7985784160048659
Best parameters: {'max_depth': 7, 'min_samples_leaf': 10, 'max_leaf_nodes': 20, 'min_samples_split': 9, 'max_features': 8}


In [6]:
model = DecisionTreeClassifier(**tree_study.best_params, random_state=0)
model.fit(X_train, y_train)
ROC_AUC = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
print('ROC_AUC_score :', ROC_AUC)

ROC_AUC_score : 0.8229336250169582


### LogisticRegression

#### penalty : 규제(regularization)
- default = l2
input 옵션 : 'l1', 'l2', 'elasticnet', 'none'

#### C : 규제의 강도를 조절
값이 클수록 규제가 약해지고, 값이 작을수혹 규제가 강해짐
- default = 1.0
input 옵션 : 실수

#### class_weight : 학습 시 클래스에 따라 가중치 설정
dict 형태로 {클래스 : 가중치값} 적용 또는 blancerd 옵션 사용
- defualt = None
input 옵션 : 딕셔너리(dict) or 'blanced'

#### solver : 최적화 문제를 해결하기 위한 알고리즘 선택
최적화 문제에서 비용(cost) 또는 손실(loss)함수를 최소화하는 최상의 가중치/모형모수를 찾는 파라미터
- defualt = 'lbfgs'
input 옵션 : 'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'


In [7]:
def lr_objective(trial):
    params = {
        'penalty': trial.suggest_categorical('penalty', ['l1', 'l2', 'elasticnet', 'none']),
        'C': trial.suggest_float('C', 0.01, 105),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', 'None']),
        'solver': trial.suggest_categorical('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),        
    }
    classifier_obj = LogisticRegression(**params)
    
    score = cross_val_score(classifier_obj, X_train, y_train, cv = 5, scoring = 'roc_auc')
    roc_auc = score.mean()
    return roc_auc

lr_study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
lr_study.optimize(lr_objective, n_trials=12)

[32m[I 2022-10-14 13:48:05,104][0m A new study created in memory with name: no-name-5d98e67a-d79d-456f-8875-362ae873e596[0m
[32m[I 2022-10-14 13:48:05,338][0m Trial 0 finished with value: 0.5898840415929347 and parameters: {'penalty': 'none', 'C': 0.5054327114902095, 'class_weight': 'None', 'solver': 'sag'}. Best is trial 0 with value: 0.5898840415929347.[0m
[33m[W 2022-10-14 13:48:05,356][0m Trial 1 failed because of the following error: The value nan is not acceptable.[0m
[32m[I 2022-10-14 13:48:05,560][0m Trial 2 finished with value: 0.5599955507030294 and parameters: {'penalty': 'l2', 'C': 83.54660676461044, 'class_weight': 'None', 'solver': 'saga'}. Best is trial 0 with value: 0.5898840415929347.[0m
[33m[W 2022-10-14 13:48:05,570][0m Trial 3 failed because of the following error: The value nan is not acceptable.[0m
[33m[W 2022-10-14 13:48:05,581][0m Trial 4 failed because of the following error: The value nan is not acceptable.[0m
[33m[W 2022-10-14 13:48:05,590]

In [8]:
print("Best score:", lr_study.best_value)
print("Best parameters:", lr_study.best_params)

Best score: 0.765185216353268
Best parameters: {'penalty': 'l2', 'C': 41.44958309956582, 'class_weight': 'balanced', 'solver': 'lbfgs'}


In [9]:
model = LogisticRegression(**lr_study.best_params)
model.fit(X_train, y_train)

ROC_AUC = roc_auc_score(y_test, model.predict_proba(X_test)[:,1])
print('ROC_AUC_score :', ROC_AUC)

ROC_AUC_score : 0.7808853954687288


### K-NN

#### n_neighbors : 검색할 이웃수, k
- default = 5


#### weights : 예측에 사용되는 가중치 함수
- 'unjiform' : 균일한 가중치
- 'distance' : 거리의 역수로 가중치 부여


#### algorithm : 가장 가까운 이웃을 계산하는 데 사용하는 알고리즘
- 'atuo', 'ball_tree', 'kd_tree', 'brute'


#### p : Minkowski 메트릭에 대한 검정력 매개변수
- default = 2
- p=1 : 맨하튼거리
- p=2 : 유클리드 거리


#### metric : 거리 계산에 사용할 미터법
- default = 'minkowski'

In [10]:
def knn_objective(trial): 
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 40, step=1),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'algorithm': trial.suggest_categorical('algorithm', ['ball_tree', 'brute']),
        'p': trial.suggest_int('p', 1, 2),
        'metric': trial.suggest_categorical(
            'metric', ['minkowski', 'canberra', 'braycurtis', 'sokalmichener', 'matching', 'jaccard', 'russellrao'])   
    }  
    classifier_obj = KNeighborsClassifier(**params)

    score = cross_val_score(classifier_obj, X_train, y_train, cv = 5, scoring = 'roc_auc')
    roc_auc = score.mean()
    return roc_auc

knn_study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=100), direction="maximize")
knn_study.optimize(knn_objective, n_trials=100)

[32m[I 2022-10-14 13:48:06,654][0m A new study created in memory with name: no-name-0729c5b0-b78f-49ee-a4ae-cea3448c1fac[0m
[32m[I 2022-10-14 13:48:06,948][0m Trial 0 finished with value: 0.7010843940261374 and parameters: {'n_neighbors': 22, 'weights': 'distance', 'algorithm': 'ball_tree', 'p': 1, 'metric': 'matching'}. Best is trial 0 with value: 0.7010843940261374.[0m
[32m[I 2022-10-14 13:48:07,017][0m Trial 1 finished with value: 0.6010787094015524 and parameters: {'n_neighbors': 5, 'weights': 'distance', 'algorithm': 'ball_tree', 'p': 2, 'metric': 'braycurtis'}. Best is trial 0 with value: 0.7010843940261374.[0m
[32m[I 2022-10-14 13:48:07,150][0m Trial 2 finished with value: 0.5497629405797724 and parameters: {'n_neighbors': 1, 'weights': 'distance', 'algorithm': 'brute', 'p': 2, 'metric': 'matching'}. Best is trial 0 with value: 0.7010843940261374.[0m
[32m[I 2022-10-14 13:48:07,441][0m Trial 3 finished with value: 0.6919272857132652 and parameters: {'n_neighbors': 2

[32m[I 2022-10-14 13:48:14,421][0m Trial 32 finished with value: 0.7630358887929609 and parameters: {'n_neighbors': 33, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'canberra'}. Best is trial 31 with value: 0.7686972343918065.[0m
[32m[I 2022-10-14 13:48:14,595][0m Trial 33 finished with value: 0.6868544709062743 and parameters: {'n_neighbors': 32, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'matching'}. Best is trial 31 with value: 0.7686972343918065.[0m
[32m[I 2022-10-14 13:48:14,929][0m Trial 34 finished with value: 0.7643112753877429 and parameters: {'n_neighbors': 34, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'canberra'}. Best is trial 31 with value: 0.7686972343918065.[0m
[32m[I 2022-10-14 13:48:15,141][0m Trial 35 finished with value: 0.6355694180387426 and parameters: {'n_neighbors': 30, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'minkowski'}. Best is trial 31 with value: 0.7686972343918065.[0m

[32m[I 2022-10-14 13:48:24,202][0m Trial 65 finished with value: 0.7707775069598797 and parameters: {'n_neighbors': 40, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'canberra'}. Best is trial 55 with value: 0.7710742336988801.[0m
[32m[I 2022-10-14 13:48:24,542][0m Trial 66 finished with value: 0.7707775069598797 and parameters: {'n_neighbors': 40, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'canberra'}. Best is trial 55 with value: 0.7710742336988801.[0m
[32m[I 2022-10-14 13:48:24,709][0m Trial 67 finished with value: 0.616235791181159 and parameters: {'n_neighbors': 40, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'russellrao'}. Best is trial 55 with value: 0.7710742336988801.[0m
[32m[I 2022-10-14 13:48:24,882][0m Trial 68 finished with value: 0.6854880352831201 and parameters: {'n_neighbors': 40, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'sokalmichener'}. Best is trial 55 with value: 0.7710742336988801

[32m[I 2022-10-14 13:48:33,636][0m Trial 98 finished with value: 0.6271090221267024 and parameters: {'n_neighbors': 39, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'minkowski'}. Best is trial 55 with value: 0.7710742336988801.[0m
[32m[I 2022-10-14 13:48:34,000][0m Trial 99 finished with value: 0.7686972343918065 and parameters: {'n_neighbors': 38, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'canberra'}. Best is trial 55 with value: 0.7710742336988801.[0m


In [11]:
print("Best score:", knn_study.best_value)
print("Best parameters:", knn_study.best_params)

Best score: 0.7710742336988801
Best parameters: {'n_neighbors': 39, 'weights': 'uniform', 'algorithm': 'brute', 'p': 2, 'metric': 'canberra'}


In [12]:
model = KNeighborsClassifier(**knn_study.best_params)
model.fit(X_train, y_train)

ROC_AUC = roc_auc_score(y_test, model.predict_proba(X_test)[:,1]) # AUC 
print('ROC_AUC_score :', ROC_AUC)

ROC_AUC_score : 0.8120887600054266


## End