## Khai báo thư viện

In [1]:
# Imports
import warnings
warnings.filterwarnings('ignore')

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Nạp dữ liệu

In [2]:
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')

In [3]:
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
test_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
test_df.shape

(1459, 80)

## Tiền xử lí dữ liệu

### Gộp tập train và test để xử lí đồng bộ

In [6]:
df_all = pd.concat([train_df.drop(['SalePrice', 'Id'], axis=1), test_df], axis=0).reset_index(drop=True)

### Filling mising value

In [7]:
def display_missing_data(df):
    missing = df.isna().sum()
    missing = missing[missing > 0].sort_values(ascending=False)
    percent = (missing / len(df)) * 100
    print(pd.DataFrame({'Missing Values': missing, 'Percent (%)': percent.round(2)}))

In [8]:
display_missing_data(train_df)

              Missing Values  Percent (%)
PoolQC                  1453        99.52
MiscFeature             1406        96.30
Alley                   1369        93.77
Fence                   1179        80.75
MasVnrType               872        59.73
FireplaceQu              690        47.26
LotFrontage              259        17.74
GarageType                81         5.55
GarageYrBlt               81         5.55
GarageFinish              81         5.55
GarageQual                81         5.55
GarageCond                81         5.55
BsmtFinType2              38         2.60
BsmtExposure              38         2.60
BsmtFinType1              37         2.53
BsmtCond                  37         2.53
BsmtQual                  37         2.53
MasVnrArea                 8         0.55
Electrical                 1         0.07


In [9]:
display_missing_data(test_df)

              Missing Values  Percent (%)
PoolQC                  1456        99.79
MiscFeature             1408        96.50
Alley                   1352        92.67
Fence                   1169        80.12
MasVnrType               894        61.27
FireplaceQu              730        50.03
LotFrontage              227        15.56
GarageCond                78         5.35
GarageYrBlt               78         5.35
GarageQual                78         5.35
GarageFinish              78         5.35
GarageType                76         5.21
BsmtCond                  45         3.08
BsmtExposure              44         3.02
BsmtQual                  44         3.02
BsmtFinType1              42         2.88
BsmtFinType2              42         2.88
MasVnrArea                15         1.03
MSZoning                   4         0.27
BsmtFullBath               2         0.14
BsmtHalfBath               2         0.14
Functional                 2         0.14
Utilities                  2      

In [10]:
numerical_features = df_all.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = df_all.select_dtypes(include=['object']).columns.tolist()


In [11]:
numerical_na_to_zero = [
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea',
    'WoodDeckSF',
    'OpenPorchSF',
    'EnclosedPorch',
    '3SsnPorch',
    'ScreenPorch',
    'PoolArea',
    'MiscVal'
]

categorical_na_cols = [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature',
    'MasVnrType'
]

In [12]:
for col in numerical_na_to_zero:
    if col in df_all.columns:
        df_all[col].fillna(0, inplace=True)

for col in categorical_na_cols:
    if col in df_all.columns:
        df_all[col].fillna('None', inplace=True)

for col in categorical_features:
    if col in df_all.columns:
        # df_all[col].fillna(df_all[col].mode()[0], inplace=True)
        df_all[col].fillna('None', inplace=True)

for col in numerical_features:
    if col in df_all.columns:
        df_all[col].fillna(df_all[col].median(), inplace=True)

display_missing_data(df_all)

Empty DataFrame
Index: []


## Feature engineering

In [13]:
# qual_map = {'None':0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}

# df_all['TotalSF'] = df_all['TotalBsmtSF'] + df_all['1stFlrSF'] + df_all['2ndFlrSF'] 
# df_all['TotalPorchSF'] = df_all['OpenPorchSF'] + df_all['EnclosedPorch'] + df_all['3SsnPorch'] + df_all['ScreenPorch']  # tổng diện tích hiên/ban công
# df_all['TotalBath'] = df_all['FullBath'] + 0.5 * df_all['HalfBath'] + df_all['BsmtFullBath'] + 0.5 * df_all['BsmtHalfBath']  # tổng số phòng tắm
# df_all['TotalRooms'] = df_all['TotRmsAbvGrd'] + df_all['FullBath'] + df_all['HalfBath']            # tổng số phòng có thể sử dụng
# df_all['TotalSpace'] = df_all['TotalSF'] + df_all['GarageArea'] + df_all['TotalPorchSF']           # tổng không gian sử dụng (bao gồm garage và porch)

# # df_all['OverallGrade'] = df_all['OverallQual'] * df_all['OverallCond']     # chất lượng tổng thể của ngôi nhà
# # df_all['GarageGrade'] = df_all['GarageQual'].map(qual_map) * df_all['GarageCond']        # chất lượng garage
# # df_all['ExterGrade'] = df_all['ExterQual'].map(qual_map) * df_all['ExterCond'].map(qual_map)           # chất lượng ngoại thất
# # df_all['BsmtGrade'] = df_all['BsmtQual'].map(qual_map) * df_all['BsmtCond'].map(qual_map)              # chất lượng tầng hầm
# # df_all['QualitySum'] = df_all['OverallQual'] + df_all['ExterQual'].map(qual_map) + df_all['BsmtQual'].map(qual_map) + df_all['GarageQual'].map(qual_map)  # tổng chất lượng

# # df_all['OverallQual_GrLivArea'] = df_all['OverallQual'] * df_all['GrLivArea']
# # df_all['OverallQual_TotalSF'] = df_all['OverallQual'] * df_all['TotalSF']
# # df_all['OverallQual_GarageArea'] = df_all['OverallQual'] * df_all['GarageArea']

# df_all['HasPool'] = (df_all['PoolArea'] > 0).astype(int)             # có hồ bơi không
# df_all['HasFireplace'] = (df_all['Fireplaces'] > 0).astype(int)      # có lò sưởi không
# df_all['HasGarage'] = (df_all['GarageArea'] > 0).astype(int)         # có garage không
# df_all['HasBsmt'] = (df_all['TotalBsmtSF'] > 0).astype(int)          # có tầng hầm không
# df_all['HasPorch'] = (df_all['TotalPorchSF'] > 0).astype(int)  

## Mã hóa và chuẩn hóa dữ liệu

In [14]:
# numerical_cols = df_all.select_dtypes(include=[np.number]).columns.tolist()
# categorical_cols = df_all.select_dtypes(include=['object']).columns.tolist()
# len(numerical_cols), len(categorical_cols)

In [15]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
train_len = len(train_df)
std = StandardScaler()
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

numerical_df_encoded = std.fit_transform(df_all[numerical_features])
categorical_df_encoded = pd.get_dummies(df_all[categorical_features], drop_first=True)
df_encoder = pd.concat(
    [pd.DataFrame(numerical_df_encoded, columns=numerical_features),
     categorical_df_encoded.reset_index(drop=True)],
    axis=1
)

X = df_encoder[:train_len].copy()
x_test = df_encoder[train_len:].copy()
y = np.log1p(train_df['SalePrice'])


## Huấn luyện và đánh giá mô hình

In [16]:
base_models = {
    # === Linear Models ===
    'Linear Regression': LinearRegression(),
    
    # === Tree-based Models ===
    'Random Forest': RandomForestRegressor(
        n_estimators=300, max_depth=None, random_state=42
    ),
    'Gradient Boosting': GradientBoostingRegressor(
        n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42
    ),
    
    # === Boosting Models ===
    'XGBoost': XGBRegressor(
        n_estimators=1000, learning_rate=0.05, max_depth=3,
        subsample=0.7, colsample_bytree=0.7, reg_lambda=1,
        random_state=42
    ),
    'LightGBM': LGBMRegressor(
        n_estimators=1000, learning_rate=0.05, num_leaves=30,
        subsample=0.7, colsample_bytree=0.7,
        random_state=42, verbose=-1
    ),
    'CatBoost': CatBoostRegressor(
        iterations=1000, learning_rate=0.05, depth=6,
        random_state=42, verbose=False,
        allow_writing_files=False, train_dir=None
    ),
    
    # === Others ===
    'SVR': SVR(kernel='rbf', C=10, gamma='scale'),
    'K-Neighbors': KNeighborsRegressor(n_neighbors=5)
}

In [17]:
K = 5
kf = KFold(n_splits=K, shuffle=True, random_state=42)

results = []

for name, model in base_models.items():
    rmses = []
    
    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        
        rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
        
        rmses.append(rmse)
    
    mean_rmse = np.mean(rmses)
    std_rmse = np.std(rmses)
    results.append({'Model': name, 'RMSE_mean': mean_rmse, 'RMSE_std': std_rmse})
    print(f"{name}: RMSE = {mean_rmse:.4f} ± {std_rmse:.4f}")

# Convert results to DataFrame và sắp xếp theo RMSE nhỏ nhất
df_results = pd.DataFrame(results).sort_values('RMSE_mean')
df_results

Linear Regression: RMSE = 0.1720 ± 0.0351
Random Forest: RMSE = 0.1451 ± 0.0195
Gradient Boosting: RMSE = 0.1324 ± 0.0211
XGBoost: RMSE = 0.1287 ± 0.0176
LightGBM: RMSE = 0.1335 ± 0.0192
CatBoost: RMSE = 0.1255 ± 0.0174
SVR: RMSE = 0.1478 ± 0.0110
K-Neighbors: RMSE = 0.1785 ± 0.0134


Unnamed: 0,Model,RMSE_mean,RMSE_std
5,CatBoost,0.125452,0.017374
3,XGBoost,0.128737,0.017623
2,Gradient Boosting,0.132391,0.021074
4,LightGBM,0.133515,0.019214
1,Random Forest,0.145145,0.019519
6,SVR,0.147832,0.01102
0,Linear Regression,0.171952,0.035055
7,K-Neighbors,0.178529,0.013403


## Xuất ra file kết quả

In [18]:
# best_model_name = 'CatBoost'  # <-- đổi tên model bạn muốn dùng

try:
    # Lấy model
    for name, model in base_models.items():
        model_result = model.fit(X, y)

        # Dự đoán trên test
        preds_log = model_result.predict(x_test)      # dự đoán log(SalePrice) nếu bạn train trên log1p
        preds = np.expm1(preds_log)                 # chuyển về giá gốc

        # Tạo file submission
        submission = pd.DataFrame({
            'Id': test_df['Id'],
            'SalePrice': preds
        })
        submission.to_csv(f'submission_{name}_baseline.csv', index=False)
        print(f"File 'submission_{name}_baseline.csv' created using {name}.")

except Exception as e:
    print('Không thể tạo submission tự động:', e)

File 'submission_Linear Regression_baseline.csv' created using Linear Regression.
File 'submission_Random Forest_baseline.csv' created using Random Forest.
File 'submission_Gradient Boosting_baseline.csv' created using Gradient Boosting.
File 'submission_XGBoost_baseline.csv' created using XGBoost.
File 'submission_LightGBM_baseline.csv' created using LightGBM.
File 'submission_CatBoost_baseline.csv' created using CatBoost.
File 'submission_SVR_baseline.csv' created using SVR.
File 'submission_K-Neighbors_baseline.csv' created using K-Neighbors.
