In [1]:
# 필요한 라이브러리 import
import pandas as pd
import numpy as np
import pickle
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')


In [2]:
# df_selected_02.pkl 파일을 불러와서 df로 저장
with open('../../data/processed/df_selected_02.pkl', 'rb') as f:
    df = pickle.load(f)

print("데이터프레임 정보:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\n데이터프레임 미리보기:")
df.head()


데이터프레임 정보:
Shape: (1460, 12)
Columns: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'TotRmsAbvGrd', 'SalePrice']

데이터프레임 미리보기:


Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,YearBuilt,YearRemodAdd,GarageYrBlt,TotRmsAbvGrd,SalePrice
0,0.651479,0.370333,0.311725,0.351,-0.459303,-0.793434,0.789741,1.050994,0.878668,1.021157,0.91221,12.247699
1,-0.071836,-0.482512,0.311725,-0.060731,0.466465,0.25714,0.789741,0.156734,-0.429577,-0.104483,-0.318683,12.109016
2,0.651479,0.515013,0.311725,0.631726,-0.313369,-0.627826,0.789741,0.984752,0.830215,0.937776,-0.318683,12.317171
3,0.651479,0.383659,1.650307,0.790804,-0.687324,-0.521734,-1.026041,-1.863632,-0.720298,0.812705,0.296763,11.849405
4,1.374795,1.299326,1.650307,1.698485,0.19968,-0.045611,0.789741,0.951632,0.733308,0.896086,1.527656,12.42922


In [3]:
# SalePrice를 y로, 나머지 컬럼을 X로 지정
y = df['SalePrice']
X = df.drop('SalePrice', axis=1)

print("Target variable (y) 정보:")
print(f"y shape: {y.shape}")
print(f"y 통계:\n{y.describe()}")

print("\nFeature variables (X) 정보:")
print(f"X shape: {X.shape}")
print(f"X columns: {list(X.columns)}")


Target variable (y) 정보:
y shape: (1460,)
y 통계:
count    1460.000000
mean       12.024057
std         0.399449
min        10.460271
25%        11.775105
50%        12.001512
75%        12.273736
max        13.534474
Name: SalePrice, dtype: float64

Feature variables (X) 정보:
X shape: (1460, 11)
X columns: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'TotRmsAbvGrd']


In [4]:
# Ridge Regression 교차검증 (cv=5)
from sklearn.model_selection import cross_validate

# Ridge 모델 생성
ridge_model = Ridge()

# 교차검증을 위한 scoring 지정
scoring = ['r2', 'neg_mean_squared_error']

# 교차검증 수행
cv_results = cross_validate(ridge_model, X, y, cv=5, scoring=scoring, return_train_score=False)

# R² 점수 계산
r2_scores = cv_results['test_r2']
r2_mean = r2_scores.mean()
r2_std = r2_scores.std()

# RMSE 점수 계산 (neg_mean_squared_error의 음수를 취하고 제곱근)
mse_scores = -cv_results['test_neg_mean_squared_error']
rmse_scores = np.sqrt(mse_scores)
rmse_mean = rmse_scores.mean()
rmse_std = rmse_scores.std()

#결과 출력
print("\n=== 5-fold 교차 검증 결과 ===")
print(f"R² 평균: {r2_mean:.4f} (±{r2_std:.4f})")
print(f"RMSE 평균: {rmse_mean:.4f} (±{rmse_std:.4f})")
print()





=== 5-fold 교차 검증 결과 ===
R² 평균: 0.8089 (±0.0567)
RMSE 평균: 0.1725 (±0.0235)



In [5]:
# df_selected_03.pkl 파일을 불러와서 df_improved_01로 저장
with open('../../data/processed/df_selected_03.pkl', 'rb') as f:
    df_improved_01 = pickle.load(f)

print("개선된 데이터프레임 정보:")
print(f"Shape: {df_improved_01.shape}")
print(f"Columns: {list(df_improved_01.columns)}")
print("\n데이터프레임 미리보기:")
df_improved_01.head()


개선된 데이터프레임 정보:
Shape: (1460, 15)
Columns: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'EffectiveAge', 'TotalBathrooms', 'Quality_x_Area', 'AvgRoomSize', 'IsNewBuild', 'TotalSF', 'GarageRatio', 'SalePrice']

데이터프레임 미리보기:


Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,TotRmsAbvGrd,EffectiveAge,TotalBathrooms,Quality_x_Area,AvgRoomSize,IsNewBuild,TotalSF,GarageRatio,SalePrice
0,7,1710,2,548,856,856,8,5,3.5,11970,213.75,False,2566,0.320468,12.247699
1,6,1262,2,460,1262,1262,6,31,2.5,7572,210.333333,False,2524,0.364501,12.109016
2,7,1786,2,608,920,920,6,6,3.5,12502,297.666667,False,2706,0.340426,12.317171
3,7,1717,3,642,756,961,7,36,2.0,12019,245.285714,False,2473,0.373908,11.849405
4,8,2198,3,836,1145,1145,9,8,3.5,17584,244.222222,False,3343,0.380346,12.42922


In [6]:
# SalePrice를 y로, 나머지 컬럼을 X로 지정 (개선된 데이터)
y = df_improved_01['SalePrice']
X = df_improved_01.drop('SalePrice', axis=1)

# Ridge Regression 교차검증 (cv=5) - 개선된 데이터
from sklearn.model_selection import cross_validate

# Ridge 모델 생성
ridge_model = Ridge()

# 교차검증을 위한 scoring 지정
scoring = ['r2', 'neg_mean_squared_error']

# 교차검증 수행
cv_results = cross_validate(ridge_model, X, y, cv=5, scoring=scoring, return_train_score=False)

# R² 점수 계산
r2_scores = cv_results['test_r2']
r2_mean = r2_scores.mean()
r2_std = r2_scores.std()

# RMSE 점수 계산 (neg_mean_squared_error의 음수를 취하고 제곱근)
mse_scores = -cv_results['test_neg_mean_squared_error']
rmse_scores = np.sqrt(mse_scores)
rmse_mean = rmse_scores.mean()
rmse_std = rmse_scores.std()

# 지정된 형식으로 결과 출력
print("\n=== 5-fold 교차 검증 결과 ===")
print(f"R² 평균: {r2_mean:.4f} (±{r2_std:.4f})")
print(f"RMSE 평균: {rmse_mean:.4f} (±{rmse_std:.4f})")
print()



=== 5-fold 교차 검증 결과 ===
R² 평균: 0.8152 (±0.0429)
RMSE 평균: 0.1701 (±0.0178)



In [8]:
# df_selected_04.pkl 파일을 불러와서 df_improved_02로 저장
with open('../../data/processed/df_selected_04.pkl', 'rb') as f:
    df_improved_02 = pickle.load(f)

print("개선된 데이터프레임 정보:")
print(f"Shape: {df_improved_02.shape}")
print(f"Columns: {list(df_improved_02.columns)}")
print("\n데이터프레임 미리보기:")
df_improved_02.head()


개선된 데이터프레임 정보:
Shape: (1460, 15)
Columns: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'EffectiveAge', 'Quality_x_Area', 'AvgRoomSize', 'IsNewBuild', 'TotalSF', 'GarageRatio', 'SalePrice', 'TotalBathrooms_weighted']

데이터프레임 미리보기:


Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,TotRmsAbvGrd,EffectiveAge,Quality_x_Area,AvgRoomSize,IsNewBuild,TotalSF,GarageRatio,SalePrice,TotalBathrooms_weighted
0,7,1710,2,548,856,856,8,5,11970,213.75,False,2566,0.320468,12.247699,3.2
1,6,1262,2,460,1262,1262,6,31,7572,210.333333,False,2524,0.364501,12.109016,2.35
2,7,1786,2,608,920,920,6,6,12502,297.666667,False,2706,0.340426,12.317171,3.2
3,7,1717,3,642,756,961,7,36,12019,245.285714,False,2473,0.373908,11.849405,1.7
4,8,2198,3,836,1145,1145,9,8,17584,244.222222,False,3343,0.380346,12.42922,3.2


In [9]:
# SalePrice를 y로, 나머지 컬럼을 X로 지정 (개선된 데이터 02)
y = df_improved_02['SalePrice']
X = df_improved_02.drop('SalePrice', axis=1)

# Ridge Regression 교차검증 (cv=5) - 개선된 데이터 02
from sklearn.model_selection import cross_validate

# Ridge 모델 생성
ridge_model = Ridge()

# 교차검증을 위한 scoring 지정
scoring = ['r2', 'neg_mean_squared_error']

# 교차검증 수행
cv_results = cross_validate(ridge_model, X, y, cv=5, scoring=scoring, return_train_score=False)

# R² 점수 계산
r2_scores = cv_results['test_r2']
r2_mean = r2_scores.mean()
r2_std = r2_scores.std()

# RMSE 점수 계산 (neg_mean_squared_error의 음수를 취하고 제곱근)
mse_scores = -cv_results['test_neg_mean_squared_error']
rmse_scores = np.sqrt(mse_scores)
rmse_mean = rmse_scores.mean()
rmse_std = rmse_scores.std()

# 지정된 형식으로 결과 출력
print("\n=== 5-fold 교차 검증 결과 ===")
print(f"R² 평균: {r2_mean:.4f} (±{r2_std:.4f})")
print(f"RMSE 평균: {rmse_mean:.4f} (±{rmse_std:.4f})")
print()



=== 5-fold 교차 검증 결과 ===
R² 평균: 0.8134 (±0.0437)
RMSE 평균: 0.1710 (±0.0182)



In [11]:
# df_selected_05.pkl 파일을 불러와서 df_improved_03으로 저장
with open('../../data/processed/df_selected_05.pkl', 'rb') as f:
    df_improved_03 = pickle.load(f)

print("개선된 데이터프레임 정보:")
print(f"Shape: {df_improved_03.shape}")
print(f"Columns: {list(df_improved_03.columns)}")
print("\n데이터프레임 미리보기:")
df_improved_03.head()


개선된 데이터프레임 정보:
Shape: (1460, 16)
Columns: ['OverallQual', 'GrLivArea', 'GarageCars', 'GarageArea', 'TotalBsmtSF', '1stFlrSF', 'TotRmsAbvGrd', 'EffectiveAge', 'Quality_x_Area', 'AvgRoomSize', 'IsNewBuild', 'TotalSF', 'GarageRatio', 'SalePrice', 'AboveGradeBath', 'BasementBath']

데이터프레임 미리보기:


Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,TotRmsAbvGrd,EffectiveAge,Quality_x_Area,AvgRoomSize,IsNewBuild,TotalSF,GarageRatio,SalePrice,AboveGradeBath,BasementBath
0,7,1710,2,548,856,856,8,5,11970,213.75,False,2566,0.320468,12.247699,2.5,1.0
1,6,1262,2,460,1262,1262,6,31,7572,210.333333,False,2524,0.364501,12.109016,2.0,0.5
2,7,1786,2,608,920,920,6,6,12502,297.666667,False,2706,0.340426,12.317171,2.5,1.0
3,7,1717,3,642,756,961,7,36,12019,245.285714,False,2473,0.373908,11.849405,1.0,1.0
4,8,2198,3,836,1145,1145,9,8,17584,244.222222,False,3343,0.380346,12.42922,2.5,1.0


In [12]:
# SalePrice를 y로, 나머지 컬럼을 X로 지정 (개선된 데이터 03)
y = df_improved_03['SalePrice']
X = df_improved_03.drop('SalePrice', axis=1)

# Ridge Regression 교차검증 (cv=5) - 개선된 데이터 03
from sklearn.model_selection import cross_validate

# Ridge 모델 생성
ridge_model = Ridge()

# 교차검증을 위한 scoring 지정
scoring = ['r2', 'neg_mean_squared_error']

# 교차검증 수행
cv_results = cross_validate(ridge_model, X, y, cv=5, scoring=scoring, return_train_score=False)

# R² 점수 계산
r2_scores = cv_results['test_r2']
r2_mean = r2_scores.mean()
r2_std = r2_scores.std()

# RMSE 점수 계산 (neg_mean_squared_error의 음수를 취하고 제곱근)
mse_scores = -cv_results['test_neg_mean_squared_error']
rmse_scores = np.sqrt(mse_scores)
rmse_mean = rmse_scores.mean()
rmse_std = rmse_scores.std()

# 지정된 형식으로 결과 출력
print("\n=== 5-fold 교차 검증 결과 ===")
print(f"R² 평균: {r2_mean:.4f} (±{r2_std:.4f})")
print(f"RMSE 평균: {rmse_mean:.4f} (±{rmse_std:.4f})")
print()



=== 5-fold 교차 검증 결과 ===
R² 평균: 0.8163 (±0.0418)
RMSE 평균: 0.1697 (±0.0173)

