### 교차 검증 실습

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [3]:
fish_df = pd.read_csv('./data/fish.csv')
fish_input = fish_df.drop('Species', axis=1)
fish_target = fish_df['Species']

X_train, X_test, y_train, y_test = train_test_split(fish_input, fish_target, random_state=42, stratify=fish_target)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [8]:
fish_input

Unnamed: 0,Weight,Length,Diagonal,Height,Width
0,242.0,25.4,30.0,11.5200,4.0200
1,290.0,26.3,31.2,12.4800,4.3056
2,340.0,26.5,31.1,12.3778,4.6961
3,363.0,29.0,33.5,12.7300,4.4555
4,430.0,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...
154,12.2,12.2,13.4,2.0904,1.3936
155,13.4,12.4,13.5,2.4300,1.2690
156,12.2,13.0,13.8,2.2770,1.2558
157,19.7,14.3,15.2,2.8728,2.0672


##### 생선 다중 분류 with cross_var_score

In [None]:
# 생선 다중 분류 with cross_var_score
# 교차 검증 검수
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression(max_iter=1000, solver='newton-cg')
# 최적화 알고리즘 중 newton-cg 선택

scores = cross_val_score(lr_clf, X_train_scaled, y_train, cv=5)
print(scores, np.mean(scores))

# 학습/예측/평가
lr_clf.fit(X_train_scaled, y_train)

y_pred = lr_clf.predict(X_test_scaled)

print(accuracy_score(y_test, y_pred))
# accuracy 값이 더 높게 나옴

[0.875      0.83333333 0.83333333 0.79166667 0.73913043] 0.8144927536231885
0.825


##### 생선 다중 분류 with GridSearchCV

In [None]:
# 생선 다중 분류 with GridSearchCV
# 교차 검증 검수(그리드서치 사용해 최적의 파라미터, 평가 점수, 모델 확인)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
lr_clf = LogisticRegression(max_iter=1000)

# 하이퍼 파라미터
params = {
    'C': range(200, 1000, 200),
    'solver': ['liblinear', 'newton-cg', 'lbfgs']
}

# 예측 결과 평가(score)
stratifiedkfold = StratifiedKFold(n_splits=5, shuffle=True)

grid = GridSearchCV(lr_clf, params, cv=stratifiedkfold)
grid.fit(X_train_scaled, y_train)

print('최적의 파라미터:', grid.best_params_)
print('최적의 모델 객체:', grid.best_estimator_)
print('최적화된 점수:', grid.best_score_)

# 최적화된 모델로 예측, 평가
best_lr = grid.best_estimator_
print('최적화 모델 score:', best_lr.score(X_train_scaled, y_train), best_lr.score(X_test_scaled, y_test))



최적의 파라미터: {'C': 800, 'solver': 'liblinear'}
최적의 모델 객체: LogisticRegression(C=800, max_iter=1000, solver='liblinear')
최적화된 점수: 0.9409420289855073
최적화 모델 score: 0.957983193277311 0.975


