In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 0. import & data load

In [None]:
# 공통 모듈 임포트
import numpy as np
import os

# For clean-looking plot output
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [None]:
import seaborn as sns
import warnings
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    pass
# Rebind warnings.warn for the whole session so every call made through it is silent.
# NOTE(review): this also hides deprecation notices from the libraries used below;
# warnings.filterwarnings("ignore") is the conventional, reversible alternative —
# confirm that the blanket silence is really wanted.
warnings.warn = ignore_warn

from scipy import stats
from scipy.stats import norm, skew 

In [None]:
train=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/test.csv")
train.head()

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
train.hist(bins=50, figsize=(20,20))
plt.show()

## 1. EDA

데이터 복사본으로 작업

In [None]:
house_price=train.copy()

### Outliers

In [None]:
house_price.plot(kind='scatter', x = 'GrLivArea', y = 'SalePrice', alpha=0.3)

`(house_price['GrLivArea']>4000) & (house_price['SalePrice']<300000)` 조건에 해당하는 두 점(그래프 오른쪽 아래)은 이상치이므로 제거가 필요하다.

### Target Variable

In [None]:
sns.distplot(house_price['SalePrice'], fit=norm)

plt.ylabel('Frequency')
plt.title('SalePrice distribution')

fig = plt.figure()
res= stats.probplot(house_price['SalePrice'], plot=plt)
plt.show()

종속 변수(`SalePrice`)가 양의 왜도를 가지고 있어서 `로그 변환`이 필요하다.

### correlation

In [None]:
corr_matrix = house_price.corr(numeric_only=True) # 버전에 따라 `numeric_only=True` 필요

In [None]:
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

attributes=["SalePrice", "OverallQual","GrLivArea","GarageCars","GarageArea", "TotalBsmtSF"]
scatter_matrix(house_price[attributes],figsize=(20,20))

In [None]:
house_price.plot(kind="scatter", x="GrLivArea", y="SalePrice", alpha=0.3)

`GrLivArea` 특성이 `SalePrice`와 관련이 깊어보임 (OverallQual이 상관계수는 더 높지만 수직선의 분포를 가짐)

### feature combination (test)

In [None]:
# Try out candidate combined features on the exploratory copy
house_price['TotalBathrooms'] = house_price['FullBath'] + (0.5 * house_price['HalfBath']) + house_price['BsmtFullBath'] + (0.5 * house_price['BsmtHalfBath']) # total bathrooms (above grade + basement); each half bath counts as 0.5
house_price['TotalPorchArea'] = house_price['OpenPorchSF'] + house_price['EnclosedPorch'] + house_price['3SsnPorch'] + house_price['ScreenPorch'] # total porch area (open + enclosed + three-season + screen)
house_price['TotalSF'] = house_price['TotalBsmtSF'] + house_price['1stFlrSF'] + house_price['2ndFlrSF'] # total area (basement + first floor + second floor)
house_price['AgeAtSale'] = house_price['YrSold'] - house_price['YearBuilt'] # age at sale (year sold minus year built)
house_price['HasPool'] = (house_price['PoolArea'] > 0).astype(int) # 1 when PoolArea > 0, else 0
house_price['HasFireplace'] = (house_price['Fireplaces'] > 0).astype(int) # 1 when Fireplaces > 0, else 0
house_price['HighSeason'] = house_price['MoSold'].isin([5, 6, 7]).astype(int) # 1 when sold in May, June or July (MoSold in 5-7), else 0
house_price['HighOverallCond'] = (house_price['OverallCond'] >= 8).astype(int) # 1 when OverallCond >= 8, else 0

In [None]:
corr_matrix = house_price.corr(numeric_only=True)
corr_matrix["SalePrice"].sort_values(ascending=False)

In [None]:
house_price.plot(kind="scatter",x="TotalSF",y="SalePrice",alpha=0.3)
plt.show()

In [None]:
house_price.describe()

새로 만든 `TotalSF` (총 면적), `TotalBathrooms` (총 욕실 수) 특성은 기존 특성들보다 `SalePrice`와의 상관관계가 높다.

## 2. Data preprocessing

In [None]:
train.shape

### OutlierRemover

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierRemover(BaseEstimator, TransformerMixin):
    """Drop rows that combine a very large living area with a low sale price.

    A row is removed when ``GrLivArea > grlivarea_threshold`` and
    ``SalePrice < saleprice_threshold``.

    Parameters
    ----------
    grlivarea_threshold : number, default=4000
        Rows must exceed this ``GrLivArea`` to be considered outliers.
    saleprice_threshold : number, default=300000
        Rows must be below this ``SalePrice`` to be considered outliers.
    """

    def __init__(self, grlivarea_threshold=4000, saleprice_threshold=300000):
        self.grlivarea_threshold = grlivarea_threshold
        self.saleprice_threshold = saleprice_threshold

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new DataFrame without the outlier rows; ``X`` is not modified.

        Frames that have no ``SalePrice`` column (unlabelled data) cannot be
        checked against the rule and are returned as an unchanged copy instead
        of raising ``KeyError``.
        """
        if 'SalePrice' not in X.columns:
            return X.copy()

        is_outlier = (
            (X['GrLivArea'] > self.grlivarea_threshold)
            & (X['SalePrice'] < self.saleprice_threshold)
        )
        # Boolean mask instead of drop-by-label: only the matching rows go,
        # even if the index happens to contain duplicate labels.
        return X.loc[~is_outlier].copy()


outlier_remover = OutlierRemover()
train_no_outliers  = outlier_remover.fit_transform(train)

In [None]:
#Check the graphic again
fig, ax = plt.subplots()
ax.scatter(train_no_outliers['GrLivArea'], train_no_outliers['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()

In [None]:
train_no_outliers.shape

### Log Transform

In [None]:
class LogTransformer(BaseEstimator, TransformerMixin):
    """Replace the ``SalePrice`` column with ``log1p(SalePrice)``."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``SalePrice`` column is log1p-transformed.

        Frames without a ``SalePrice`` column (unlabelled data) are returned as
        an unchanged copy instead of raising ``KeyError``.
        """
        X_transformed = X.copy()
        if "SalePrice" in X_transformed.columns:
            # Not idempotent: calling transform twice applies the log twice.
            X_transformed["SalePrice"] = np.log1p(X_transformed["SalePrice"])
        return X_transformed

log_transformer = LogTransformer()
train_no_outliers_transformed = log_transformer.transform(train_no_outliers)

In [None]:
sns.distplot(train_no_outliers_transformed['SalePrice'], fit=norm)

plt.ylabel('Frequency')
plt.title('SalePrice distribution')

fig = plt.figure()
res= stats.probplot(train_no_outliers_transformed['SalePrice'], plot=plt)
plt.show()

### Transform numerical to categorical

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TypeConverter(BaseEstimator, TransformerMixin):
    """Cast the listed columns to ``target_type``.

    Parameters
    ----------
    columns : list of str
        Names of the columns to cast.
    target_type : type or dtype, default=str
        Type handed to ``astype`` for each listed column.
    """

    def __init__(self, columns, target_type=str):
        self.columns = columns
        self.target_type = target_type
    
    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self
    
    def transform(self, X):
        """Return a new DataFrame with the listed columns cast; ``X`` is untouched.

        Raises ``KeyError`` when a listed column is not present in ``X``.
        """
        # DataFrame.astype with a mapping returns a new frame, so the caller's
        # data keeps its original dtypes (the previous version assigned into X).
        return X.astype({column: self.target_type for column in self.columns})


type_converter = TypeConverter(columns=['MSSubClass', 'OverallCond', 'YrSold', 'MoSold'], target_type=str)
train_tr = type_converter.transform(train_no_outliers_transformed)

### Missing data 

In [None]:
house_price=train_tr.drop("SalePrice",axis=1) # 훈련 세트를 위해 레이블 제거
house_price_labels=train_tr["SalePrice"].copy()

In [None]:
print("house_price size is : {}".format(house_price.shape))

In [None]:
house_price_na=(house_price.isnull().sum()/len(house_price)) *100
house_price_na=house_price_na.drop(house_price_na[house_price_na == 0].index).sort_values(ascending=False)[:30]
missing_data=pd.DataFrame({'Missing ratio': house_price_na})
missing_data.head(20)

In [None]:
f, ax = plt.subplots(figsize=(15, 12))
plt.xticks(rotation='vertical')
sns.barplot(x=house_price_na.index, y=house_price_na)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)

In [None]:
class MissingValuesImputer(BaseEstimator, TransformerMixin):
    """Fill missing values column by column and drop ``Utilities`` and ``Id``.

    Every rule is applied only when its column is present in the frame.
    NOTE(review): ``fit`` learns nothing, so the medians and modes used below
    are computed from whichever frame is passed to ``transform`` (train and
    test are therefore imputed with their own statistics) — confirm that this
    is intended.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self
    
    def transform(self, X):
        """Return an imputed copy of ``X``; the caller's frame is not modified."""
        # Work on a copy: the fills below assign columns, and the previous
        # version wrote them straight into the caller's DataFrame.
        X = X.copy()

        # Missing -> the literal category 'None'
        None_cols=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                   'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MSSubClass']
        for col in None_cols:
            if col in X.columns:
                X[col] = X[col].fillna('None')

        # Missing -> median LotFrontage of the same Neighborhood
        if 'LotFrontage' in X.columns:
            X["LotFrontage"] = X.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

        # Missing -> 0
        zero_cols=['GarageYrBlt', 'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath',
                   'MasVnrArea']
        for col in zero_cols:
            if col in X.columns:
                X[col] = X[col].fillna(0)

        # Missing -> most frequent value of the column
        mode_cols=['MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType']
        for col in mode_cols:
            if col in X.columns:
                modes = X[col].mode()
                # An all-missing column has no mode; leave it as is rather than fail.
                if not modes.empty:
                    X[col] = X[col].fillna(modes.iloc[0])

        # Missing -> "Typ"
        if 'Functional' in X.columns:
            X["Functional"] = X["Functional"].fillna("Typ")

        # Drop Utilities (all records are "AllPub", except for one "NoSeWa" and 2 NA) and Id
        X = X.drop(columns=['Utilities', 'Id'], errors='ignore')

        return X


missing_values_imputer = MissingValuesImputer()
house_price_transformed = missing_values_imputer.transform(house_price)

In [None]:
#Check remaining missing values if any 
house_price_na = (house_price_transformed.isnull().sum() / len(house_price_transformed)) * 100
house_price_na = house_price_na.drop(house_price_na[house_price_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :house_price_na})
missing_data.head()

### Categorical features encoding

In [None]:
house_price_transformed_cat = house_price_transformed.select_dtypes(include=['object'])

In [None]:
from sklearn.preprocessing import OneHotEncoder

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and later releases dropped the old name; try the new keyword first and fall
# back to the old one so the cell runs on either side of the rename.
try:
    cat_encoder=OneHotEncoder(sparse_output=False, handle_unknown='error')
except TypeError:
    cat_encoder=OneHotEncoder(sparse=False, handle_unknown='error')
house_price_transformed_cat_la=cat_encoder.fit_transform(house_price_transformed_cat)
house_price_transformed_cat_la

### Feature combination

In [None]:
class CustomFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Append the combined features ``TotalBathrooms`` and ``TotalSF``."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the two extra columns; ``X`` is untouched.

        ``TotalBathrooms`` counts full baths as 1 and half baths as 0.5, above
        grade and in the basement. ``TotalSF`` is basement + first-floor +
        second-floor area.
        """
        # assign() builds a new frame; the previous version wrote the columns
        # into the caller's DataFrame.
        return X.assign(
            TotalBathrooms=X['FullBath'] + (0.5 * X['HalfBath']) + X['BsmtFullBath'] + (0.5 * X['BsmtHalfBath']),
            TotalSF=X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF'],
        )

features_transformer = CustomFeaturesTransformer()
house_price_transformed = features_transformer.transform(house_price_transformed)

In [None]:
house_price_transformed.head()

### Pipeline

In [None]:
train.shape

#### 1. 이상치, 로그 변환, 타입 변환


In [None]:
from sklearn.pipeline import Pipeline

# Row-level preparation of the labelled training frame, applied in this order.
row_level_steps = [
    ('outlier', OutlierRemover()),
    ('log transform', LogTransformer()),
    ('type convert', TypeConverter(columns=['MSSubClass', 'OverallCond', 'YrSold', 'MoSold'], target_type=str)),
    ('imputer', MissingValuesImputer()),
]
first_pipeline = Pipeline(steps=row_level_steps)

train_tr = first_pipeline.fit_transform(train)

In [None]:
train_tr.head()

In [None]:
house_price=train_tr.drop("SalePrice",axis=1)# 훈련 세트를 위해 레이블 제거
house_price_labels=train_tr["SalePrice"].copy()

In [None]:
# 수치형 특성
house_price_num = house_price.select_dtypes(include=['int64', 'float64'])

# 범주형 특성
house_price_cat = house_price.select_dtypes(include=['object'])

#### 2. 수치형 파이프라인

In [None]:
from sklearn.preprocessing import StandardScaler

# Numeric branch: add the combined features, then standardise every column.
numeric_steps = [
    ('attribs_adder', CustomFeaturesTransformer()),
    ('std_scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

house_price_num_tr = num_pipeline.fit_transform(house_price_num)

In [None]:
house_price_num_tr

In [None]:
house_price_num_tr.shape

#### 3. 전체 파이프라인

In [None]:
house_price

In [None]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch: int64/float64 columns go to the numeric
# pipeline, object-dtype columns to the one-hot encoder.
num_attribs = list(house_price.select_dtypes(include=['int64', 'float64']).columns)
cat_attribs = list(house_price.select_dtypes(include=['object']).columns)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and later releases dropped the old name; try the new keyword first and fall
# back to the old one so the cell runs on either side of the rename.
try:
    cat_one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    cat_one_hot = OneHotEncoder(sparse=False, handle_unknown='ignore')

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_one_hot, cat_attribs),
])

house_price_prepared = full_pipeline.fit_transform(house_price)

In [None]:
house_price_prepared

## 3. Model training & validation

### RandomizedSearchCV

In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
from sklearn.model_selection import RandomizedSearchCV

def perform_random_search(model, param_dist, X, y, scoring, cv=5, n_iter=10, random_state=None):
    """Run a ``RandomizedSearchCV`` for ``model``, print a summary and return the search.

    Parameters
    ----------
    model : estimator
        Estimator to tune.
    param_dist : dict
        Parameter candidates, passed through to ``RandomizedSearchCV``.
    X, y : training data and targets used for the search.
    scoring : str
        Scoring name passed to ``RandomizedSearchCV``. The printed "Best RMSE"
        is ``sqrt(-best_score_)``, so it is only meaningful for a negated
        squared-error score such as ``'neg_mean_squared_error'``.
    cv : int or splitter, default=5
    n_iter : int, default=10
        Number of sampled parameter settings.
    random_state : int or None, default=None
        Seed for the parameter sampling.

    Returns
    -------
    RandomizedSearchCV
        The fitted search, so callers can reuse ``best_estimator_`` and
        ``best_params_`` instead of copying them from the printed output.
    """
    random_search = RandomizedSearchCV(model, param_dist, n_iter=n_iter, scoring=scoring, cv=cv, random_state=random_state)
    random_search.fit(X, y)

    # Print the best parameters and corresponding RMSE
    print(f"{model.__class__.__name__} Best Parameters: {random_search.best_params_}")
    print(f"{model.__class__.__name__} Best RMSE: {np.sqrt(-random_search.best_score_)}")

    return random_search

# Define models and their respective parameter distributions.
# Each estimator that is given a seed elsewhere in this notebook gets
# random_state=42 here as well, so a re-run of the search reproduces the same
# scores; SVR is constructed without one, as before.
models = {
    'RandomForest': (
        RandomForestRegressor(random_state=42),
        {'n_estimators': [50, 100, 200, 300], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    ),
    'GradientBoosting': (
        GradientBoostingRegressor(random_state=42),
        {'n_estimators': [50, 100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2, 0.5], 'max_depth': [3, 5, 7, 10], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    ),
    'SVR': (
        SVR(),
        {'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'kernel': ['linear', 'rbf']},
    ),
    'XGBoost': (
        XGBRegressor(random_state=42),
        {'n_estimators': [50, 100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2, 0.5], 'max_depth': [3, 5, 7, 10], 'subsample': [0.8, 0.9, 1.0], 'colsample_bytree': [0.8, 0.9, 1.0]},
    ),
    'LightGBM': (
        LGBMRegressor(random_state=42),
        {'n_estimators': [50, 100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2, 0.5], 'max_depth': [3, 5, 7, 10], 'subsample': [0.8, 0.9, 1.0], 'colsample_bytree': [0.8, 0.9, 1.0]},
    ),
    'CatBoost': (
        CatBoostRegressor(silent=True, random_state=42),
        {'iterations': [50, 100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2, 0.5], 'depth': [3, 5, 7, 10]},
    ),
}

# Perform random search for each model
for model_name, (model, param_dist) in models.items():
    perform_random_search(model, param_dist, house_price_prepared, house_price_labels, scoring='neg_mean_squared_error', n_iter=5, random_state=42)

In [None]:
# Fixed-hyperparameter instances of the four models kept for the ensembles below.
# NOTE(review): per the scikit-learn docs `gamma` is the coefficient for the
# rbf/poly/sigmoid kernels, so it presumably has no effect with kernel='linear' — confirm.
svr=SVR(kernel='linear', gamma='auto', C= 10)
# The three boosted models are seeded with random_state=42.
xgb=XGBRegressor(subsample= 0.8, n_estimators= 100, max_depth= 3, learning_rate= 0.1, colsample_bytree= 1.0, random_state=42)
lgb=LGBMRegressor(subsample= 0.8, n_estimators= 100, max_depth= 3, learning_rate= 0.1, colsample_bytree= 1.0, random_state=42)
catb=CatBoostRegressor(silent=True,learning_rate= 0.1, iterations= 100, depth= 3, random_state=42)

가장 성능이 좋은 모델 4개 선정

### Averaging models

In [None]:
def rmsle_cv(model, X, y, n_folds=5, random_state=42):
    """Per-fold RMSE of ``model`` under shuffled K-fold cross-validation.

    The folds come from ``KFold(n_folds, shuffle=True, random_state=random_state)``.
    Each fold's ``neg_mean_squared_error`` score is negated and square-rooted;
    the result is an array with one value per fold.
    """
    folds = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    neg_mse_per_fold = cross_val_score(
        model, X, y, scoring="neg_mean_squared_error", cv=folds
    )
    return np.sqrt(-neg_mse_per_fold)

In [None]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble whose prediction is the plain mean of its members' predictions.

    Parameters
    ----------
    models : sequence of estimators
        Prototypes; they are cloned at fit time and never fitted themselves.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every prototype, fit each clone on ``(X, y)`` and return ``self``."""
        # Clones are kept in ``models_`` so the prototypes stay unfitted.
        self.models_ = list(map(clone, self.models))
        for estimator in self.models_:
            estimator.fit(X, y)
        return self

    def predict(self, X):
        """Average the fitted clones' predictions for each row of ``X``."""
        # One column per fitted clone, then the mean across columns.
        per_model = [estimator.predict(X) for estimator in self.models_]
        return np.column_stack(per_model).mean(axis=1)

In [None]:
averaged_models = AveragingModels(models = (svr, xgb, lgb, catb))

score = rmsle_cv(averaged_models, house_price_prepared, house_price_labels, n_folds=5, random_state=42)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

각 모델 별 성능보다 좋은 성능 나옴

### Stacking models

In [None]:
from mlxtend.regressor import StackingCVRegressor

# Stack the four selected models; their outputs feed the meta-regressor.
stacked_regressor = StackingCVRegressor(
    regressors=[svr, xgb, lgb, catb],
    meta_regressor=xgb,  # XGBRegressor chosen as the meta model (the same instance is also a base regressor)
    cv=KFold(n_splits=5, shuffle=True, random_state=42)
)

# Cross-validated score of the stack, using rmsle_cv's default folds and seed.
score = rmsle_cv(stacked_regressor,house_price_prepared, house_price_labels)
print("StackingCVRegressor score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

### Test data transform

In [None]:
# Test-set counterpart of the training pipeline, without the two steps that
# read SalePrice (outlier removal and the log transform).
first_test_pipeline=Pipeline([
    ('type convert', TypeConverter(columns=['MSSubClass', 'OverallCond', 'YrSold', 'MoSold'], target_type=str)),
    ('imputer', MissingValuesImputer())
])

# fit_transform instead of transform: both steps have a no-op fit(), so the
# result is the same, but the pipeline is no longer used unfitted.
# NOTE(review): recent scikit-learn releases reportedly warn about, and plan to
# reject, calling transform on an unfitted Pipeline — confirm against the
# installed version.
# The copy keeps the raw `test` frame intact, because the type-conversion step
# assigns the converted columns into the frame it receives.
test_tr=first_test_pipeline.fit_transform(test.copy())

In [None]:
X_test_prepared = full_pipeline.transform(test_tr)

In [None]:
X_test_prepared.shape

### Ensembling StackedRegressor, XGBoost and LightGBM

In [None]:
def rmsle(y, y_pred):
    """Square root of the mean squared error between ``y`` and ``y_pred``.

    No logarithm is taken here; the callers in this notebook pass
    log1p-transformed prices, which is where the "log" in the name comes from.
    """
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

StackedRegressor:

In [None]:
stacked_regressor.fit(house_price_prepared, house_price_labels)
stacked_train_pred = stacked_regressor.predict(house_price_prepared)
stacked_pred = np.expm1(stacked_regressor.predict(X_test_prepared))
print(rmsle(house_price_labels, stacked_train_pred))

XGBoost:

In [None]:
xgb.fit(house_price_prepared, house_price_labels)
xgb_train_pred = xgb.predict(house_price_prepared)
xgb_pred = np.expm1(xgb.predict(X_test_prepared))
print(rmsle(house_price_labels, xgb_train_pred))

LightGBM:

In [None]:
lgb.fit(house_price_prepared, house_price_labels)
lgb_train_pred = lgb.predict(house_price_prepared)
lgb_pred = np.expm1(lgb.predict(X_test_prepared))
print(rmsle(house_price_labels, lgb_train_pred))

In [None]:
print('RMSLE score on train data:')
print(rmsle(house_price_labels,stacked_train_pred*0.70 +
               xgb_train_pred*0.15 + lgb_train_pred*0.15 ))

Ensemble prediction:

In [None]:
ensemble = stacked_pred*0.70 + xgb_pred*0.15 + lgb_pred*0.15

Submission

In [None]:
submission=pd.read_csv("/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv")
submission

In [None]:
submission['SalePrice']=ensemble

In [None]:
submission.to_csv('submission.csv', index=False)