In [39]:
import os
import sys
import numpy as np
from scipy import sparse
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
from mpl_toolkits.mplot3d import Axes3D, axes3d
from scipy.cluster import hierarchy
import seaborn as sns
import spacy
import nltk
from konlpy.tag import Okt
import graphviz
from sklearn.utils.fixes import loguniform
import scipy.stats as ss

In [43]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GroupKFold, StratifiedGroupKFold
from sklearn.model_selection import RepeatedKFold, RepeatedStratifiedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR, SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay
from sklearn.metrics import roc_curve, roc_auc_score, RocCurveDisplay
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score
from sklearn.utils.fixes import loguniform

In [3]:
from sklearn.datasets import make_blobs
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits

In [4]:
# Shared fixtures for the cross-validation cells below:
#   logreg - logistic-regression classifier with the solver iteration cap set to 1000
#   iris   - the iris dataset bundle (features under 'data', labels under 'target')
logreg = LogisticRegression(max_iter=1000)
iris = load_iris()

In [7]:
# Score the classifier on iris with 10 cross-validation folds; the cell
# displays one score per fold.
scores = cross_val_score(
    estimator=logreg,
    X=iris['data'],
    y=iris['target'],
    cv=10,
)
scores

array([1.        , 0.93333333, 1.        , 1.        , 0.93333333,
       0.93333333, 0.93333333, 1.        , 1.        , 1.        ])

In [10]:
# 5-fold cross-validation that also records scores on the training folds.
# The result is a dict of per-fold arrays (see the keys in the displayed output).
res = cross_validate(
    estimator=logreg,
    X=iris['data'],
    y=iris['target'],
    cv=5,
    return_train_score=True,
)
res

{'fit_time': array([0.03471446, 0.02602267, 0.0160141 , 0.01701546, 0.01701498]),
 'score_time': array([0., 0., 0., 0., 0.]),
 'test_score': array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ]),
 'train_score': array([0.96666667, 0.96666667, 0.98333333, 0.98333333, 0.975     ])}

In [15]:
# Plain 3-fold splitter; rows are shuffled before splitting, with a fixed
# seed so the folds are the same on every run.
kfold = KFold(
    n_splits=3,
    shuffle=True,
    random_state=0,
)
cross_val_score(
    logreg,
    iris['data'],
    iris['target'],
    cv=kfold,
)

array([0.98, 0.96, 0.96])

In [23]:
# Explicit stratified 5-fold splitter passed as the cv strategy.
skfold = StratifiedKFold(
    n_splits=5,
)
cross_val_score(
    logreg,
    iris['data'],
    iris['target'],
    cv=skfold,
)

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [25]:
# Leave-one-out cross-validation; only the mean of the per-split scores
# is displayed.
loo = LeaveOneOut()
(
    cross_val_score(
        logreg,
        iris['data'],
        iris['target'],
        cv=loo,
    )
    .mean()
)

0.9666666666666667

In [33]:
# Ten stratified random splits, each using 60% of the rows for training and
# 20% for testing, seeded for repeatability. Only the mean score is shown.
shuffle_split = StratifiedShuffleSplit(
    n_splits=10,
    train_size=0.6,
    test_size=0.2,
    random_state=0,
)
(
    cross_val_score(
        logreg,
        iris['data'],
        iris['target'],
        cv=shuffle_split,
    )
    .mean()
)

0.9566666666666667

In [34]:
# Tiny synthetic dataset (12 samples) used to demonstrate group-aware splitting.
X, y = make_blobs(
    n_samples=12,
    random_state=0,
)
# One group label per sample, in sample order: four groups of sizes 3, 4, 2 and 3.
groups = [
    0, 0, 0,
    1, 1, 1, 1,
    2, 2,
    3, 3, 3,
]
cross_val_score(
    logreg,
    X,
    y,
    cv=StratifiedGroupKFold(n_splits=3),
    groups=groups,
)

array([0.75      , 0.66666667, 0.8       ])

In [36]:
# Stratified 3-fold cross-validation repeated 10 times with a fixed seed;
# the displayed value is the mean over all resulting scores.
rskfold = RepeatedStratifiedKFold(
    n_splits=3,
    n_repeats=10,
    random_state=0,
)
(
    cross_val_score(
        logreg,
        iris['data'],
        iris['target'],
        cv=rskfold,
    )
    .mean()
)

0.9626666666666669

In [50]:
# Five values evenly spaced on a log10 scale, from 10**-3 up to 10**1.
np.logspace(start=-3, stop=1, num=5)

array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01])

In [52]:
# Hold out a test set from iris (default split proportions), seeded so the
# partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris['data'],
    iris['target'],
    random_state=0,
)

In [53]:
# Candidate hyper-parameters: C and gamma each take five log-spaced values
# from 1e-3 to 1e1, giving a 5 x 5 grid.
param_grid = {
    'C': np.logspace(-3, 1, 5),
    'gamma': np.logspace(-3, 1, 5),
}

# Exhaustive 5-fold grid search over an SVC, fitted on the training split only.
grid = GridSearchCV(
    SVC(),
    param_grid=param_grid,
    cv=5,
    return_train_score=True,
)
grid.fit(X_train, y_train)

# Displayed tuple: (best parameter combination, its mean cross-validation score,
# score of the fitted search on the held-out test split).
(
    grid.best_params_,
    grid.best_score_,
    grid.score(X_test, y_test),
)

({'C': 10.0, 'gamma': 0.1}, 0.9731225296442687, 0.9736842105263158)

In [54]:
# Show the estimator selected by the grid search; its displayed parameters
# match the best_params_ reported by the previous cell.
# NOTE(review): presumably refit on the full training split (GridSearchCV's
# default refit behaviour) - confirm against the installed scikit-learn version.
grid.best_estimator_

SVC(C=10.0, gamma=0.1)