In [1]:
import pandas as pd

# Read the house-price table from the CSV file (path is relative to the working directory).
df = pd.read_csv('../data/house_prices.csv')
# Preview the first five rows; as the cell's last expression it is rendered as output.
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [2]:
# Target vector: the MEDV column.
y = df['MEDV']
# Feature matrix: every column except the last one.
X = df.iloc[:, :-1]

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, so no test-split statistics are used.
std_scale = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)

In [5]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline (no regularization), trained on the scaled training split.
clf_lr = LinearRegression()
clf_lr.fit(X=X_train_std, y=y_train)

In [6]:
# Learned weights (w) of the linear model.
print(clf_lr.coef_)
# Learned bias / intercept (b) of the linear model.
print(clf_lr.intercept_)

[-0.97082019  1.05714873  0.03831099  0.59450642 -1.8551476   2.57321942
 -0.08761547 -2.88094259  2.11224542 -1.87533131 -2.29276735  0.71817947
 -3.59245482]
22.611881188118804


In [7]:
from sklearn.linear_model import Ridge

# Ridge regression with regularization strength alpha=1.
clf_ridge = Ridge(alpha=1)
clf_ridge.fit(X=X_train_std, y=y_train)

In [8]:
from sklearn.linear_model import Lasso

# Lasso regression with regularization strength alpha=0.01.
clf_lasso = Lasso(alpha=0.01)
clf_lasso.fit(X=X_train_std, y=y_train)

In [9]:
from sklearn.linear_model import ElasticNet

# Elastic net with alpha=0.01 and l1_ratio=0.01 (the L1/L2 mixing parameter).
clf_elastic = ElasticNet(alpha=0.01, l1_ratio=0.01)
clf_elastic.fit(X=X_train_std, y=y_train)

In [13]:
# Predict on the scaled test split with each fitted model, in the order
# linear -> ridge -> lasso -> elastic net.
pred_lr, pred_ridge, pred_lasso, pred_elastic = (
    fitted.predict(X_test_std)
    for fitted in (clf_lr, clf_ridge, clf_lasso, clf_elastic)
)

In [14]:
from sklearn.metrics import r2_score

# R^2 on the test split, one line per model: linear, ridge, lasso, elastic net.
print(
    *(r2_score(y_test, predicted) for predicted in (pred_lr, pred_ridge, pred_lasso, pred_elastic)),
    sep='\n',
)

0.5892223849182512
0.5881400471345535
0.5874763161420908
0.5849789631400006


In [15]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test split, one line per model: linear, ridge, lasso, elastic net.
print(
    *(mean_squared_error(y_test, predicted) for predicted in (pred_lr, pred_ridge, pred_lasso, pred_elastic)),
    sep='\n',
)

33.448979997676496
33.53711307394071
33.59115965261396
33.7945152093607


In [44]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import numpy as np

# Tune Lasso's alpha with cross-validation, then evaluate the tuned model on the test split.
# Inputs come from earlier cells: X_train_std / X_test_std (scaled features) and
# y_train / y_test (targets).

# Base Lasso estimator; the grid search overrides its alpha for each candidate.
lasso_model = Lasso()

# Candidate alpha values to search over.
alphas = [0.01, 0.05, 0.1, 0.5, 1]

# 5-fold grid search. The scorer is the negated MSE ('neg_mean_squared_error'),
# so a higher score means a lower error.
grid_search = GridSearchCV(lasso_model, param_grid={'alpha': alphas}, scoring='neg_mean_squared_error', cv=5)

# Run the search on the training split only.
grid_search.fit(X_train_std, y_train)

# Alpha that achieved the best cross-validated score.
best_alpha = grid_search.best_params_['alpha']

# Retrain on the whole training split with the selected alpha. Using best_alpha
# (not a literal) keeps this model in sync with the search if the grid or data change.
best_lasso_model = Lasso(alpha=best_alpha)
best_lasso_model.fit(X_train_std, y_train)

# Predict on the held-out test split.
y_pred = best_lasso_model.predict(X_test_std)

# Final test-set MSE of the tuned model.
mse_test = mean_squared_error(y_test, y_pred)

print("Best alpha:", best_alpha)
# best_score_ is the negated MSE from cross-validation on the training split
# (not the error of a fit on the full training split); flip the sign to report a plain MSE.
print("Best MSE (Training):", -grid_search.best_score_)
print("Test MSE:", mse_test)


Best alpha: 0.01
Best MSE (Training): 21.142893189527037
Test MSE: 33.59115965261396
