# 07. Moon 데이터셋에 결정트리 훈련, 튜닝

In [92]:
from sklearn.datasets import make_moons
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
import numpy as np

In [93]:
# Build a noisy two-moons toy dataset (1000 samples, noise=0.4).
X, y = make_moons(n_samples=1000, random_state=42, noise=0.4)

# Hold out 20% of the samples as the test set; the rest is used for training
# and cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [109]:
# Hand-picked max_leaf_nodes candidates for the exhaustive grid search.
grid_space = dict(max_leaf_nodes=[None, 2, 5, 10, 20, 50, 100])

# Pool of max_leaf_nodes values for the randomized search: every integer
# from 2 through 200 inclusive.
rand_space = dict(max_leaf_nodes=np.arange(2, 201))

In [110]:
# Base decision tree; the seed is fixed so repeated runs give the same tree.
dt_clf = DecisionTreeClassifier(random_state=42)

# Exhaustive search over grid_space, scored with 5-fold cross-validation.
grid_search = GridSearchCV(dt_clf, grid_space, cv=5)

In [111]:
# Run the grid search on the training split.
grid_search.fit(X_train, y_train)

# Report the selected hyperparameters and their cross-validation score.
print(
    f'Best max_leaf_nodes: {grid_search.best_params_}',
    f'Best cross-validation score: {grid_search.best_score_}',
    sep='\n',
)

Best max_leaf_nodes: {'max_leaf_nodes': 5}
Best cross-validation score: 0.8487499999999999


In [97]:
# Score the best estimator found by the grid search on the held-out test set.
best_dt_clf = grid_search.best_estimator_
test_score = best_dt_clf.score(X_test, y_test)
print(f'Test set score: {test_score}')

Test set score: 0.855


In [98]:
# Randomized search over rand_space using the same base tree and 5-fold CV.
rand_search = RandomizedSearchCV(
    estimator=dt_clf,
    param_distributions=rand_space,
    cv=5,
    n_iter=100,        # sample 100 random candidates (NOTE(review): the old comment said 50 - confirm the intended budget)
    random_state=42,
)

In [99]:
# Run the randomized search on the training split.
rand_search.fit(X_train, y_train)

# Report the selected hyperparameters and their cross-validation score.
print(
    f'Best max_leaf_nodes: {rand_search.best_params_}',
    f'Best cross-validation score: {rand_search.best_score_}',
    sep='\n',
)

Best max_leaf_nodes: {'max_leaf_nodes': np.int64(6)}
Best cross-validation score: 0.85


In [100]:
# Score the best tree found by the randomized search on the held-out test set.
# Scoring goes through best_dt_clf (as in the grid-search evaluation cell) so
# the estimator bound here is the one actually evaluated, rather than being
# assigned and then ignored in favour of rand_search.score().
best_dt_clf = rand_search.best_estimator_
test_score = best_dt_clf.score(X_test, y_test)
print(f'Test set score: {test_score}')

Test set score: 0.855


# 08. 단계별 랜덤 포레스트 만들기

In [103]:
from sklearn.model_selection import ShuffleSplit

n_subsets = 1000     # number of random subsets to draw
subset_size = 100    # training instances in each subset

# Each ShuffleSplit iteration yields a (train, test) pair of index arrays;
# n_splits controls how many pairs, train_size how many indices land on the
# train side of each pair.
ss = ShuffleSplit(train_size=subset_size, n_splits=n_subsets, random_state=42)

# Only the train-side indices are needed; the complementary side is dropped.
subset_indices = [train_idx for train_idx, _ in ss.split(X_train)]

# Materialise the feature and label arrays for every subset.
X_subsets = [X_train[idx] for idx in subset_indices]
y_subsets = [y_train[idx] for idx in subset_indices]

In [114]:
from sklearn.metrics import accuracy_score

# Reuse the max_leaf_nodes value selected by grid_search.
best_max_leaf_nodes = grid_search.best_params_['max_leaf_nodes']

# Train one small tree per subset and record its accuracy on the test set.
test_scores = []
for X_sub, y_sub in zip(X_subsets, y_subsets):
    dt_clf = DecisionTreeClassifier(
        random_state=42,
        max_leaf_nodes=best_max_leaf_nodes,
    ).fit(X_sub, y_sub)
    test_scores.append(accuracy_score(y_test, dt_clf.predict(X_test)))

# Average accuracy of the individual trees.
test_scores = np.asarray(test_scores)
print(f'Mean test accuracy: {test_scores.mean():.4f}')

Mean test accuracy: 0.8165


In [119]:
from scipy.stats import mode

# Reuse the max_leaf_nodes value selected by grid_search.
best_max_leaf_nodes = grid_search.best_params_['max_leaf_nodes']

# Fit one tree per subset and stack their test-set predictions into a single
# array with one row per tree and one column per test sample.
all_predictions = np.array([
    DecisionTreeClassifier(max_leaf_nodes=best_max_leaf_nodes, random_state=42)
    .fit(X_sub, y_sub)
    .predict(X_test)
    for X_sub, y_sub in zip(X_subsets, y_subsets)
])

# Majority vote down each column: the most frequent predicted class for every
# test sample, together with how many trees voted for it.
y_pred_majority, n_votes = mode(all_predictions, axis=0, keepdims=False)

In [118]:
# Display the stacked per-tree predictions (one row per tree).
all_predictions

array([[1, 1, 1, ..., 1, 1, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 1, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 1, 0]], shape=(1000, 200))

In [123]:
# Sanity check: there should be one majority-vote prediction per test sample.
len(y_pred_majority)

200

In [122]:
# Sanity check: there should be one winning-vote count per test sample.
len(n_votes)

200