In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# -----------------------------
# 1) Data preparation
# -----------------------------

# The 15 main feature columns to use, listed in selection order.
feature_cols = [
    "worst area",
    "worst concave points",
    "mean concave points",
    "worst radius",
    "mean concavity",
    "worst perimeter",
    "mean perimeter",
    "mean radius",
    "mean area",
    "worst concavity",
    "area error",
    "worst texture",
    "worst compactness",
    "radius error",
    "mean compactness",
]

# Full column selection: the feature columns followed by the target column.
cols = feature_cols + ["label"]

# Read the full CSV from the Drive path under /content/drive.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/breast_cancer.csv")

# Keep only the selected columns, then discard every row with a missing value.
df = df.loc[:, cols].dropna()

# Inputs: all selected columns except the target. Output: the target column.
X = df.drop(labels="label", axis="columns")
y = df["label"]

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# label proportions, and fix the seed so the split is reproducible.
split_kwargs = {"test_size": 0.2, "stratify": y, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# -----------------------------
# 2) Model setup
# -----------------------------

# Seed passed to the two tree-based estimators so their results are repeatable.
SEED = 42

# Three classifiers to compare: a single tree, a 200-tree forest, and a
# logistic regression allowed up to 2000 solver iterations.
dt, rf, lr = (
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(n_estimators=200, random_state=SEED),
    LogisticRegression(max_iter=2000),
)

# -----------------------------
# 3) Model training
# -----------------------------

# Fit each classifier on the training split, in the same order as before:
# decision tree, random forest, logistic regression.
for model in (dt, rf, lr):
    model.fit(X_train, y_train)

# -----------------------------
# 4) Model evaluation
# -----------------------------

def _test_accuracy(model):
    """Return the accuracy of ``model``'s predictions on the test split."""
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)


# Score every fitted classifier on the held-out test split.
dt_acc = _test_accuracy(dt)
rf_acc = _test_accuracy(rf)
lr_acc = _test_accuracy(lr)

# Report the accuracies rounded to four decimal places.
print("=== Test Accuracy ===")
print(f"Decision Tree : {dt_acc:.4f}")
print(f"Random Forest : {rf_acc:.4f}")
print(f"Logistic Reg. : {lr_acc:.4f}")

=== Test Accuracy ===
Decision Tree : 0.8947
Random Forest : 0.9561
Logistic Reg. : 0.9561


In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the data-loading cell earlier in this notebook reads a CSV from
# /content/drive, so on a fresh runtime this cell presumably has to run before
# it — confirm, and consider moving this cell to the top of the notebook.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)