In [4]:
from sklearn.datasets import load_diabetes

# Fetch the diabetes data as a (features, target) pair of arrays.
X, y = load_diabetes(return_X_y=True)

# Report the dimensions of each array, one line per array.
for caption, arr in (("X shape : ", X), ("y shape : ", y)):
    print(caption, arr.shape)

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the shape of every piece produced by the split.
for caption, part in (
    ("X_tr : ", X_tr),
    ("X_te : ", X_te),
    ("y_tr : ", y_tr),
    ("y_te : ", y_te),
):
    print(caption, part.shape)

X_tr :  (353, 10)
X_te :  (89, 10)
y_tr :  (353,)
y_te :  (89,)


In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Fit a linear regression on the training split, then predict the held-out split.
lr = LinearRegression()
lr.fit(X_tr, y_tr)
pred = lr.predict(X_te)

# Error metrics on the test split; RMSE is simply the square root of the MSE.
mse = mean_squared_error(y_te, pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_te, pred)
r2 = r2_score(y_te, pred)

# Print each metric on its own line, at full precision.
for caption, value in (
    ("mse: ", mse),
    ("rmse: ", rmse),
    ("mae: ", mae),
    ("r2: ", r2),
):
    print(caption, value)

mse:  2900.1936284934814
rmse:  53.85344583676593
mae:  42.79409467959994
r2:  0.4526027629719195


# 1. 선형회귀분석(SKlearn)

In [19]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load the data and hold out 20% of it for testing (seeded, so the split is repeatable).
X, y = load_diabetes(return_X_y=True)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit on the training split and predict the test split.
lr = LinearRegression()
lr.fit(X_tr, y_tr)
pred = lr.predict(X_te)

# Test-split metrics; RMSE is derived from the MSE rather than computed separately.
mse = mean_squared_error(y_te, pred)
rmse, mae, r2 = mse ** 0.5, mean_absolute_error(y_te, pred), r2_score(y_te, pred)

# One-line summary rounded to three decimals.
print(f"RMSE={rmse:.3f}  MAE={mae:.3f}  R2={r2:.3f}")


RMSE=53.853  MAE=42.794  R2=0.453


# 1-1. 선형회귀분석 성능 최적화(Advanced)

In [20]:
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 1️⃣ Load the data
X, y = load_diabetes(return_X_y=True)

# 2️⃣ Train/test split (80/20, fixed seed so the split is repeatable)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42)

# 3️⃣ Standardization + Ridge as a single pipeline.
#    The scaler is a pipeline step so that GridSearchCV re-fits it on the training
#    part of every fold. Scaling X_tr once up front would let each validation fold's
#    mean/variance leak into the features it is later scored on, biasing the CV
#    scores used to pick alpha.
model = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", Ridge()),
])

# 4️⃣ Hyperparameter tuning
#    "ridge__alpha" addresses the alpha parameter of the pipeline step named "ridge".
param_grid = {"ridge__alpha": np.logspace(-3, 3, 20)}  # log scale from 0.001 to 1000
cv = KFold(n_splits=5, shuffle=True, random_state=42)

grid = GridSearchCV(model, param_grid, scoring="r2", cv=cv, n_jobs=-1)
grid.fit(X_tr, y_tr)

# Whole pipeline (scaler + ridge) refit on all of X_tr with the best alpha.
best_model = grid.best_estimator_

# 5️⃣ Predict and evaluate.
#    X_te is passed unscaled: the pipeline applies its own fitted scaler first.
pred = best_model.predict(X_te)
mse = mean_squared_error(y_te, pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_te, pred)
r2 = r2_score(y_te, pred)

# 6️⃣ Print the results
print(f"✅ 최적 alpha: {grid.best_params_['ridge__alpha']:.4f}")
print(f"RMSE={rmse:.3f}  MAE={mae:.3f}  R2={r2:.3f}")


✅ 최적 alpha: 0.6952
RMSE=53.795  MAE=42.808  R2=0.454


# 2. 선형회귀분석(numpy scratch)

In [None]:
import numpy as np
from sklearn.datasets import load_diabetes

# 1. Load the data
data = load_diabetes()
X = data.data
y = data.target

# 2. Add the constant (intercept / bias) term: a column of ones becomes the first
#    column, so the first entry of w is the intercept.
X = np.c_[np.ones(X.shape[0]), X]

# 3. Compute the weights (regression coefficients) by least squares.
#    This minimizes ||X w - y||^2, which is what the normal equation
#        w = (X^T X)^(-1) X^T y
#    solves, but without explicitly inverting X^T X. The explicit inverse is less
#    accurate numerically and raises LinAlgError when X^T X is singular.
w = np.linalg.lstsq(X, y, rcond=None)[0]

# 4. Compute the predictions
y_pred = X @ w

# 5. Evaluate (MSE, R^2).
#    NOTE: these are in-sample metrics, computed on the same rows the weights
#    were fit on; this cell makes no train/test split.
residuals = y - y_pred
mse = np.mean(residuals ** 2)
r2 = 1 - np.sum(residuals ** 2) / np.sum((y - np.mean(y)) ** 2)

# 6. Print the results
print("회귀계수 (w):", w)
print("MSE:", mse)
print("R^2 Score:", r2)
