<div style="border: 1px solid #b3d7ff; border-radius: 8px; padding: 12px; background: #e6f2ff; color: #000000; font-size: 20px;">
  <p style="margin: 0;"><strong>Tải các thư viện cần thiết</strong></p>
</div>


In [10]:
# === Core Libraries ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import os

# === Sklearn - Preprocessing, Models, Evaluation ===
from sklearn.linear_model import ElasticNet, Lasso, RidgeCV, BayesianRidge, LassoLarsIC
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

# === Advanced Regressors ===
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# === Ensemble Learning ===
from mlxtend.regressor import StackingCVRegressor

# === Hyperparameter Optimization ===
import optuna

# === Misc ===
from tqdm import tqdm

# === Ignore warnings ===
warnings.filterwarnings("ignore")


In [11]:
train = pd.read_csv('/kaggle/input/feature-engineering-dataset/Train_Feature_Engineering_demo.csv')
test = pd.read_csv('/kaggle/input/feature-engineering-dataset/Test_Feature_Engineering_demo.csv')
test_origin = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

In [12]:
Test_ID = test_origin['Id']

In [13]:
Test_ID

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

<div style="border: 1px solid #b3d7ff; border-radius: 8px; padding: 12px; background: #e6f2ff; color: #000000; font-size: 20px;">
  <p style="margin: 0;"><strong>Tiến hành huấn luyện mô hình để demo</strong></p>
</div>


In [14]:
y_train = train['SalePrice']
train = train.drop('SalePrice', axis=1)

In [15]:
train

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,TotalArea
0,2.885846,5.831328,19.212182,0.730463,0.730463,1.820334,1.540963,2.440268,1.820334,14.187527,...,False,False,True,False,False,False,False,True,False,35.071435
1,2.055642,6.221214,19.712205,0.730463,0.730463,1.820334,1.540963,2.259674,2.440268,14.145138,...,False,False,True,False,False,False,False,True,False,25.584552
2,2.885846,5.914940,20.347241,0.730463,0.730463,1.540963,1.540963,2.440268,1.820334,14.184404,...,False,False,True,False,False,False,False,True,False,35.508677
3,3.011340,5.684507,19.691553,0.730463,0.730463,1.540963,1.540963,2.440268,1.820334,14.047529,...,False,False,True,True,False,False,False,False,False,34.721871
4,2.885846,6.314735,21.325160,0.730463,0.730463,1.540963,1.540963,2.602594,1.820334,14.182841,...,False,False,True,False,False,False,False,True,False,37.292541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,2.885846,5.744420,18.960528,0.730463,0.730463,1.820334,1.540963,2.259674,1.820334,14.181278,...,False,False,True,False,False,False,False,True,False,35.105180
1452,2.055642,6.337529,20.994868,0.730463,0.730463,1.820334,1.540963,2.259674,2.055642,14.148295,...,False,False,True,False,False,False,False,True,False,27.680812
1453,3.011340,5.859551,19.476345,0.730463,0.730463,1.820334,1.540963,2.440268,2.602594,14.089451,...,False,False,True,False,False,False,False,True,False,37.673067
1454,2.055642,5.914940,19.760176,0.730463,0.730463,1.820334,1.540963,2.055642,2.055642,14.103852,...,False,False,True,False,False,False,False,True,False,24.676149


In [16]:
y_train

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
          ...    
1451    12.072547
1452    12.254868
1453    12.493133
1454    11.864469
1455    11.901590
Name: SalePrice, Length: 1456, dtype: float64

In [18]:
model = XGBRegressor(n_estimators=100, random_state=42)
model.fit(train, y_train)

# Feature importance
booster = model.get_booster()
importance_raw = booster.get_score(importance_type='gain')  # hoặc 'weight', 'cover'

importance_df = pd.DataFrame({
    "Feature": list(importance_raw.keys()),
    "Importance": list(importance_raw.values())
})

# Extract original feature (gộp theo tên gốc trước dấu "_")
importance_df['OriginalFeature'] = importance_df['Feature'].str.extract(r"(^[^_]+)")

# Tổng hợp theo feature gốc
grouped_importance = (
    importance_df
    .groupby('OriginalFeature')
    .agg({'Importance': 'sum'})
    .sort_values('Importance', ascending=False)
    .reset_index()
)

# Xem top feature quan trọng nhất
print(grouped_importance.head(15))

   OriginalFeature  Importance
0      OverallQual    2.542013
1       CentralAir    1.328424
2      KitchenQual    0.964870
3        ExterQual    0.875908
4       GarageCars    0.732406
5       GarageCond    0.563011
6         MSZoning    0.470185
7      FireplaceQu    0.299640
8         FullBath    0.284200
9     GarageFinish    0.264881
10       GrLivArea    0.259001
11    Neighborhood    0.253112
12     TotalBsmtSF    0.165249
13      GarageType    0.154182
14     Exterior1st    0.131199


In [19]:
top_features = [
    'OverallQual', 'CentralAir', 'KitchenQual', 'ExterQual', 'GarageCars',
    'GarageCond', 'MSZoning', 'FireplaceQu', 'FullBath', 'GarageFinish',
    'GrLivArea', 'Neighborhood', 'TotalBsmtSF', 'GarageType'
]
filtered_columns = [col for col in train.columns if any(col.startswith(feature) for feature in top_features)]

train_filtered = train[filtered_columns]


In [26]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train_filtered.values)
    rmse= np.sqrt(-cross_val_score(model, train_filtered.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [27]:
from xgboost import XGBRegressor
xgboost = XGBRegressor(
                       learning_rate=0.01, n_estimators=3460,
                       max_depth=3, min_child_weight=0,
                       gamma=0, subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear', nthread=-1,
                       scale_pos_weight=1, seed=27,
                       reg_alpha=0.00006)
score = rmsle_cv(xgboost)
print("xgboost: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

xgboost: 0.1369 (0.0096)



In [28]:
xgbMd = xgboost.fit(train.values,y_train)

In [29]:
import pickle

# Ghi mô hình vào file .pkl
with open("xgboost_model.pkl", "wb") as f:
    pickle.dump(xgbMd, f)
xgbMd.get_booster().save_model("xgboost_model.json")