In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold, LeaveOneOut, cross_val_score
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("titanic_preprocessed.csv", index_col=0)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,alone
0,0,3,0,22,1,0,7.25,0,0
1,1,1,1,38,1,0,71.2833,1,0
2,1,3,1,26,0,0,7.925,0,1
3,1,1,1,35,1,0,53.1,0,0
4,0,3,0,35,0,0,8.05,0,1


In [3]:
data = df.drop('survived', axis=1)
label = df['survived']
model = DecisionTreeClassifier(random_state=111)

In [22]:
kf = KFold(n_splits=10, shuffle=True, random_state=111)
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=111)
loo = LeaveOneOut()

In [24]:
print("K-Fold 교차검증 평균 정확도: ", round(np.mean(cross_val_score(model, data, label, scoring='accuracy', cv=kf)), 4))
print("SK-Fold 교차검증 평균 정확도: ", round(np.mean(cross_val_score(model, data, label, scoring='accuracy', cv=skf)), 4))
print("LeaveOneOut 교차검증 평균 정확도: ", round(np.mean(cross_val_score(model, data, label, scoring='accuracy', cv=loo)), 4))

K-Fold 교차검증 평균 정확도:  0.7449
SK-Fold 교차검증 평균 정확도:  0.7372
LeaveOneOut 교차검증 평균 정확도:  0.7295


## Bootstrap 교차 검증 
중복을 허용한 무작위 샘플링을 통해 각 Fold가 서로 다른 데이터를 가질 수 있다 --> 이를 통해 작은 데이터나 클래스 불균형에 의한 편향을 방지할 수 있다 

In [25]:
from sklearn.utils import resample

n_iterations = 10
bootstrap_scores = []

for _ in range(n_iterations):
    # 데이터를 무작위로 샘플링 (부트스트랩 샘플링)
    X_sampled, y_sampled = resample(data, label, replace=True, random_state=111)
    
    # 모델을 훈련하고 평가
    scores = cross_val_score(model, X_sampled, y_sampled, scoring='accuracy', cv=5)  # 5-Fold 교차 검증
    bootstrap_scores.append(scores.mean())

# Bootstrap 샘플링 결과의 평균을 계산하여 최종 성능을 얻음
final_score = sum(bootstrap_scores) / n_iterations
print("Bootstrap 교차 검증 평균 정확도:", final_score)

Bootstrap 교차 검증 평균 정확도: 0.8564102564102567
