In [None]:
from sklearn.datasets import load_breast_cancer
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import accuracy_score

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Notebook-wide display defaults: let pandas show up to 100 columns and apply
# seaborn's "whitegrid" theme at "notebook" context to subsequent figures.
pd.options.display.max_columns = 100
sns.set_theme(context="notebook", style="whitegrid")

# 데이터 이해 및 탐색

In [None]:
# Data: load the breast cancer dataset as pandas objects (as_frame=True).
data = load_breast_cancer(as_frame=True)
X = data.data
y = data.target
feature_names = data.feature_names
# Glossary -- malignant: cancerous, harmful; benign: non-cancerous, mostly harmless.
class_names = data.target_names

In [None]:
# Display the feature names taken from the loaded dataset bunch.
feature_names

In [None]:
# Display the target class names taken from the loaded dataset bunch.
class_names

In [None]:
# Put the features and the target side by side in one frame for exploration.
# X and y come from the same loader call and share an index, so an index join
# lines the rows up one-to-one.
df = X.join(y)
df.head()

In [None]:
# Size / dtypes / missing values of the combined frame.
frame_shape = df.shape
dtype_counts = df.dtypes.value_counts()
missing_per_column = df.isna().sum()

print("[+] Shape:", frame_shape)
print("\n[+] Dtypes:\n", dtype_counts)
print("\n[+] Missing values (sum):\n", missing_per_column)

# 타깃 분포(클래스 불균형 확인)

In [None]:
# Per-class sample counts and shares, ordered by class label.
counts = y.value_counts().sort_index()
ratios = y.value_counts(normalize=True).sort_index()

# Shares formatted as percentage strings with two decimals.
ratio_labels = (ratios * 100).round(2).astype(str) + "%"

print("Counts:\n", counts.map(int))
print("\nRatios:\n", ratio_labels)

In [None]:
# Bar chart of the per-class sample counts, labelled with the class names.
fig, ax = plt.subplots(figsize=(4, 3))
class_labels = [class_names[label] for label in counts.index]
sns.barplot(x=class_labels, y=counts.values, ax=ax)
ax.set_title("Target distribution")
ax.set_ylabel("count")
ax.set_xlabel("")
fig.tight_layout()
plt.show()

# 기술통계(전체/그룹별)

In [None]:
# Summary statistics with one row per feature, rounded to two decimals.
X.describe().transpose().round(2)

In [None]:
# Mean / min / max of every feature per target class, transposed so that each
# (feature, statistic) pair is a row and each class is a column.
summary_stats = ['mean', 'min', 'max']
group_desc = df.groupby("target").agg(summary_stats).transpose()

# Replace the numeric class labels in the column header with class names.
group_desc.columns = list(map(lambda label: class_names[label], group_desc.columns))

# Temporarily lift the row limit so the whole table is rendered.
with pd.option_context('display.max_rows', None):
    display(group_desc)

# 학습용 데이터 생성

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Shapes of the full feature frame and target before splitting.
X.shape, y.shape

In [None]:
# Hold out 25% of the rows as a test split, stratified on y, with a fixed
# seed so the split is reproducible.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [None]:
# Shapes of the training split.
X_tr.shape, y_tr.shape

In [None]:
# Shapes of the held-out test split.
X_te.shape, y_te.shape

# 모델 생성

In [None]:
# Random forest of 300 trees with no depth limit and a fixed seed.
# oob_score=True is requested so that rf.oob_score_ can be read after fitting.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    oob_score=True,
    n_jobs=-1,
    random_state=42,
)

# 학습

In [None]:
# Fit the forest on the training split only; the test split stays unseen.
rf.fit(X_tr, y_tr)

# 검증

In [None]:
# Predict on the held-out test split, then report test accuracy together
# with the forest's out-of-bag score.
y_pr = rf.predict(X_te)
test_accuracy = accuracy_score(y_te, y_pr)
print("Acc:", test_accuracy)
print("OOB score:", rf.oob_score_)

## 성능평가

In [None]:
# Per-class precision / recall / F1 on the test split.
# target_names labels the report rows with the class names instead of the raw
# 0/1 labels, so the report reads the same way as the confusion-matrix plot,
# which also uses class_names for its tick labels.
print(classification_report(y_te, y_pr, target_names=class_names))

## 혼동 행렬

In [None]:
# Confusion matrix normalised over the true (actual) labels, drawn as an
# annotated heatmap with class names on both axes.
conf_mat = confusion_matrix(y_te, y_pr, normalize='true')

fig, ax = plt.subplots()
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()

## Gini(불순도) 기반 중요도
- 학습 중 각 특징(feature)을 분할에 사용해 불순도를 얼마나 줄였는지를 누적해 측정

In [None]:
# Impurity-based importances of the fitted forest, sorted ascending so the
# largest values are the last entries (drawn at the top of the bar chart).
imp = pd.Series(rf.feature_importances_, index=feature_names).sort_values()

fig, ax = plt.subplots(figsize=(6, 8))
imp.tail(20).plot(kind="barh", ax=ax)  # keep only the 20 largest
ax.set_title("Feature importance (impurity-based)")
fig.tight_layout()
plt.show()

## Permutation Importance (권장)
- 학습된 모델을 고정하고, 특정 특징 값을 셔플해 성능이 얼마나 떨어지는지로 측정
- 셔플했을 때 성능이 많이 떨어지는 특징일수록 중요도가 높다고 해석

In [None]:
from sklearn.inspection import permutation_importance

In [None]:
# Permutation importance of the fitted forest on the test split, using
# 10 repeats per feature and a fixed seed.
perm = permutation_importance(
    rf,
    X_te,
    y_te,
    n_repeats=10,
    n_jobs=-1,
    random_state=42,
)

# Mean importance per feature, ascending so the largest are the last entries.
pi = pd.Series(perm.importances_mean, index=feature_names).sort_values()

fig, ax = plt.subplots(figsize=(6, 8))
pi.tail(20).plot(kind="barh", ax=ax)  # keep only the 20 largest
ax.set_title("Permutation importance (test)")
fig.tight_layout()
plt.show()


## 개별 트리 시각화

In [None]:
from sklearn import tree
import graphviz

In [None]:
est = rf.estimators_[0]  # the first tree of the fitted forest

In [None]:
# Export the selected tree as DOT source (out_file=None returns a string),
# limited to the top 3 levels, with filled and rounded node boxes.
export_options = dict(
    out_file=None,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    max_depth=3,
)
dot = tree.export_graphviz(est, **export_options)

# Last expression of the cell: render the graph inline in the notebook.
graphviz.Source(dot)