In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from time import time

# cross validation score 함수
def get_cross_val(clf, X, y, model_name,
                  cv_num=5, # 5-fold by default
                  metric='f1'): # scored by F1 by default
    """Summarize the cross-validation scores of one estimator.

    Parameters
    ----------
    clf : estimator
        Forwarded unchanged to ``cross_val_score``.
    X, y : array-like
        Features and labels forwarded to ``cross_val_score``.
    model_name : str
        Used as the ``name`` of the returned Series.
    cv_num : int, default 5
        Forwarded as the ``cv`` argument.
    metric : str, default 'f1'
        Forwarded as the ``scoring`` argument.

    Returns
    -------
    pandas.Series
        Named ``model_name``; holds the mean, the standard deviation and
        the 2.5% / 97.5% quantiles of the per-fold scores.
    """
    scores = cross_val_score(clf, X, y, cv=cv_num, scoring=metric)
    mean = scores.mean()
    std = scores.std() # spread of the per-fold scores
    p025 = np.quantile(scores, 0.025)
    p975 = np.quantile(scores, 0.975)
    # NOTE: the 'stadard deviation' label is misspelled, but it is kept
    # byte-for-byte so any existing lookup by that label keeps working.
    metrics = ['mean', 'stadard deviation', 'p025', 'p975']
    s = pd.Series([mean, std, p025, p975],
                  index=metrics)
    # Fixed: this line previously read the undefined name `mo`, which raised
    # NameError on every call.
    s.name = model_name
    return s

# precision 등의 점수 확인 함수
def calculate_metrics(y_true, y_pred, duration, model_name, *args):
    """Collect classification scores for one model into a named Series.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, passed to every metric function.
    duration : float
        Stored under 'time_to_fit (seconds)' without modification.
    model_name : str
        Used as the ``name`` of the returned Series.
    *args :
        Accepted but unused.

    Returns
    -------
    pandas.Series
        Six scalar scores, the four outputs of
        ``precision_recall_fscore_support`` (stored as-is under the
        ``*_both`` labels) and the fit duration.
    """
    # Accuracy is kept mainly as a baseline figure.
    report = [
        ('accuracy', accuracy_score(y_true, y_pred)),
        ('precision', precision_score(y_true, y_pred)),
        ('recall', recall_score(y_true, y_pred)),
        ('roc_auc', roc_auc_score(y_true, y_pred)),
        ('f1_score', f1_score(y_true, y_pred)),
        ('cohen_kappa', cohen_kappa_score(y_true, y_pred)),
    ]

    # Unpack explicitly so a result that is not a 4-tuple still fails loudly.
    prec_all, rec_all, fbeta_all, support_all = precision_recall_fscore_support(y_true, y_pred)
    report += [
        ('precision_both', prec_all),
        ('recall_both', rec_all),
        ('fbeta_both', fbeta_all),
        ('support_both', support_all),
        ('time_to_fit (seconds)', duration),
    ]

    labels = [label for label, _ in report]
    values = [value for _, value in report]
    return pd.Series(values, index=labels, name=model_name)

In [None]:
# Logistic Regression with LogisticRegressionCV
from sklearn.linear_model import LogisticRegressionCV

# 1. model training
# Elastic-net logistic regression; the inner 5-fold CV searches over the
# regularization strengths `Cs` and the L1/L2 mixing ratios `l1_ratios`.
lr_clf = LogisticRegressionCV(cv=5,
                              penalty='elasticnet',
                              solver='saga',
                              Cs=np.power(10, np.arange(-3, 1, dtype=float)),
                              l1_ratios=np.linspace(0, 1, num=6, dtype=float),
                              max_iter=100,
                              random_state=0)

# Measure the fit time: knowing how long a model takes to train is useful
# later, when training cost has to be weighed in production work.
start_time = time()
lr_clf.fit(X_train_std, y_train)
# Fixed: previously subtracted the undefined name `start` (NameError).
time_elapsed = time() - start_time

# print results
# NOTE(review): n_iter_ is indexed here as (classes, folds, Cs, l1_ratios);
# confirm this layout against the installed scikit-learn version.
print('took {:.2f} seconds for {} cv iterations with {} parameter settings'.format(time_elapsed,
                                                                                   lr_clf.n_iter_.shape[1],
                                                                                   lr_clf.n_iter_.shape[2]*lr_clf.n_iter_.shape[3]))
print('Optimal regularization strength : {}, Optimal L1 ration : {}'.format(lr_clf.C_[0],
                                                                            lr_clf.l1_ratio_[0]))
print('accuracy (train)', lr_clf.score(X_train_std, y_train))
print('accuracy (test)', lr_clf.score(X_test_std, y_test))

# 2. cross validation
lr_cv = get_cross_val(lr_clf, X_test_std, y_test, 'logistic regression')
print(lr_cv.round(2))

# 3. confusion matrix
y_pred = lr_clf.predict(X_test_std)
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                    display_labels=lr_clf.classes_)
cm_display.plot()

# 4. classification report (reuses y_pred from step 3)
print(classification_report(y_test, y_pred,
                            target_names=list(np.unique(df['diagnosis']))))

# Fixed: previously passed the misspelled, undefined name `time_elasped`.
lr_metrics = calculate_metrics(y_test, y_pred, time_elapsed, 'logistic regression')
print('\n', lr_metrics)

In [None]:
# CV with GridSearchCV
# K-Nearest Neighbors / LDA / SVC / Random Forest

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# 1. model training

# Pick exactly ONE (model_name, clf, param_grid) triple. KNN is active by
# default so the cell runs as-is; to try another model, comment it out and
# uncomment one of the alternatives below.

# -- KNN --
model_name = 'k-nearest neighbors'
clf = KNeighborsClassifier()
param_grid = {'weights' : ['uniform', 'distance'],
              'n_neighbors' : np.arange(1,16)}
# -- LDA --
# NOTE(review): n_components is limited by the number of classes and
# features; this range may need trimming for a two-class target.
# model_name = 'linear discriminant analysis'
# clf = LinearDiscriminantAnalysis()
# param_grid = {'solver' : ['lsqr', 'eigen'],
#               'shrinkage' : [None, 'auto'],
#               'n_components' : np.arange(1,5)}
# -- SVC --
# model_name = 'support vector classifier'
# clf = SVC()
# param_grid = {'C' : np.power(10, np.arange(0, 3, dtype=float)),
#               'kernel' : ['linear', 'sigmoid', 'rbf'],
#               'gamma' : ['auto', 'scale']}
# -- RF --
# model_name = 'random forest'
# clf = RandomForestClassifier()
# param_grid = {'n_estimators' : np.arange(100, 1000, 200, dtype=int),
#               'max_features' : [None, 'sqrt', 'log2'],
#               'criterion' : ['gini', 'entropy'],
#               'max_depth' : [None, 3, 5, 7]} # None means unlimited depth, so always set it explicitly

gs_clf = GridSearchCV(clf, param_grid=param_grid)

# measuring time
start_time = time()
gs_clf.fit(X_train_std, y_train)
# Fixed: previously subtracted the undefined name `start` (NameError).
time_elapsed = time() - start_time

# print results
print('took {:.2f} seconds for {} candidate parameter settings'.format(time_elapsed,
                                                                       len(gs_clf.cv_results_['params'])))
# Fixed: the template looked up best_params_[''] (KeyError); print the whole
# dict so the line works for whichever grid is selected.
print('Optimal parameters : {}'.format(gs_clf.best_params_))
print('accuracy (train)', gs_clf.score(X_train_std, y_train))
print('accuracy (test)', gs_clf.score(X_test_std, y_test))
print(gs_clf.best_estimator_.get_params())

# 2. cross validation
# The whole grid search is re-run inside every fold, so this step can be slow
# for the larger grids.
clf_cv = get_cross_val(gs_clf, X_test_std, y_test, model_name)
print(clf_cv.round(2))

# 3. confusion matrix
y_pred = gs_clf.predict(X_test_std)
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix=cm,
                                    display_labels=gs_clf.classes_)
# Fixed: the display object was built but never drawn.
cm_display.plot()

# 4. classification report
# Uses the same label column as the logistic-regression cell; the template's
# 'target_name' placeholder is assumed not to be a real column in df.
print(classification_report(y_test, y_pred,
                            target_names=list(np.unique(df['diagnosis']))))

# Fixed: previously passed the misspelled, undefined name `time_elasped`.
clf_metrics = calculate_metrics(y_test, y_pred, time_elapsed, model_name)
print('\n', clf_metrics)