In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# -----------------------------
# 1) Data preparation
# -----------------------------

# Column names applied to the CSV in place of its own first row, which is
# skipped on load (skiprows=1 together with header=None).
cols = [
    "pregnancies",
    "glucose",
    "bloodPressure",
    "skinThickness",
    "insulin",
    "bmi",
    "diabetesPedigreeFunction",
    "Age",
    "outcome",
]

csv_path = "/content/drive/MyDrive/Colab Notebooks/diabetes.csv"
raw_frame = pd.read_csv(csv_path, header=None, names=cols, skiprows=1)
# Discard every row that contains at least one NaN.
df = raw_frame.dropna()

# "bmi" is the regression target; "outcome" is excluded from the features too.
target_column = "bmi"
X = df.drop(columns=[target_column, "outcome"])
y = df[target_column]

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# 2) Model construction
# -----------------------------

# Three regressors compared on the same split; the tree-based ones are seeded.
dt = DecisionTreeRegressor(random_state=42)
rf = RandomForestRegressor(n_estimators=200, random_state=42)
lr = LinearRegression()

# -----------------------------
# 3) Model training
# -----------------------------

# Fit in the fixed order: decision tree, random forest, linear regression.
for estimator in (dt, rf, lr):
    estimator.fit(X_train, y_train)

# -----------------------------
# 4) Model evaluation
# -----------------------------
# Compute predictions on the held-out test features, in the same model order.
dt_pred, rf_pred, lr_pred = [fitted.predict(X_test) for fitted in (dt, rf, lr)]

# Compute evaluation metrics (MSE and R²)
def evaluate(y_true, y_pred, model_name: str, name_width: int = 17) -> None:
    """Print one report line with the MSE and R² of a set of predictions.

    Args:
        y_true: Ground-truth target values, forwarded to the sklearn metrics.
        y_pred: Predicted values, forwarded alongside ``y_true``.
        model_name: Label shown in the first column of the report line.
        name_width: Minimum width of the label column. Labels shorter than
            this are padded with spaces; longer ones are printed in full.

    Returns:
        None. The metrics are only printed, so calling this as the last
        statement of a notebook cell does not echo a value.
    """
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    # The label column used to be fixed at 15 characters, which is narrower
    # than "Linear Regression" (17) and pushed that row's columns out of line.
    print(f"{model_name:{name_width}s} | MSE: {mse:.4f} | R²: {r2:.4f}")

# Print the report header, then one metrics line per model in a fixed order.
print("=== Test Performance ===")
for label, predictions in (
    ("Decision Tree", dt_pred),
    ("Random Forest", rf_pred),
    ("Linear Regression", lr_pred),
):
    evaluate(y_test, predictions, label)

=== Test Performance ===
Decision Tree   | MSE: 104.1275 | R²: -0.4648
Random Forest   | MSE: 46.7562 | R²: 0.3423
Linear Regression | MSE: 52.2406 | R²: 0.2651
