<div style="border: 1px solid #b3d7ff; border-radius: 8px; padding: 12px; background: #e6f2ff; color: #000000; font-size: 20px;">
  <p style="margin: 0;"><strong>Tải các thư viện cần thiết</strong></p>
</div>


In [1]:
# === Core Libraries ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import os

# === Sklearn - Preprocessing, Models, Evaluation ===
from sklearn.linear_model import ElasticNet, Lasso, RidgeCV, BayesianRidge, LassoLarsIC
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

# === Advanced Regressors ===
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# === Ensemble Learning ===
from mlxtend.regressor import StackingCVRegressor

# === Hyperparameter Optimization ===
import optuna

# === Misc ===
from tqdm import tqdm

# === Ignore warnings ===
warnings.filterwarnings("ignore")


In [2]:
train = pd.read_csv('/kaggle/input/feature-engineering-dataset/Train_Feature_Engineering_v7.csv')
test = pd.read_csv('/kaggle/input/feature-engineering-dataset/Test_Feature_Engineering_v7.csv')
test_origin = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')

In [3]:
Test_ID = test_origin['Id']

In [4]:
Test_ID

0       1461
1       1462
2       1463
3       1464
4       1465
        ... 
1454    2915
1455    2916
1456    2917
1457    2918
1458    2919
Name: Id, Length: 1459, dtype: int64

<div style="border: 1px solid #b3d7ff; border-radius: 8px; padding: 12px; background: #e6f2ff; color: #000000; font-size: 20px;">
  <p style="margin: 0;"><strong>Tiến hành huấn luyện mô hình</strong></p>
</div>


In [5]:
y_train = train['SalePrice']
train = train.drop('SalePrice', axis=1)

In [6]:
train

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,TotalArea
0,0.740261,-0.074263,-0.139435,0.064294,0.106659,0.670979,0.210909,0.685992,-0.427932,1.042843,...,False,False,True,False,False,False,False,True,False,1.055124
1,-0.303669,0.574647,0.114727,0.064294,0.106659,0.670979,0.210909,0.026675,1.885731,0.161868,...,False,False,True,False,False,False,False,True,False,-0.619934
2,0.740261,0.064898,0.437517,0.064294,0.106659,-0.967380,0.210909,0.685992,-0.427932,0.977933,...,False,False,True,False,False,False,False,True,False,1.132326
3,0.898061,-0.318626,0.104229,0.064294,0.106659,-0.967380,0.210909,0.685992,-0.427932,-1.866768,...,False,False,True,True,False,False,False,False,False,0.993404
4,0.740261,0.730300,0.934594,0.064294,0.106659,-0.967380,0.210909,1.278615,-0.427932,0.945457,...,False,False,True,False,False,False,False,True,False,1.447296
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1451,0.740261,-0.218908,-0.267351,0.064294,0.106659,0.670979,0.210909,0.026675,-0.427932,0.912967,...,False,False,True,False,False,False,False,True,False,1.061083
1452,-0.303669,0.768239,0.766707,0.064294,0.106659,0.670979,0.210909,0.026675,0.450261,0.227475,...,False,False,True,False,False,False,False,True,False,-0.249806
1453,0.898061,-0.027289,-0.005161,0.064294,0.106659,0.670979,0.210909,0.685992,2.491550,-0.995486,...,False,False,True,False,False,False,False,True,False,1.514484
1454,-0.303669,0.064898,0.139111,0.064294,0.106659,0.670979,0.210909,-0.718210,0.450261,-0.696198,...,False,False,True,False,False,False,False,True,False,-0.780327


In [7]:
y_train

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
          ...    
1451    12.072547
1452    12.254868
1453    12.493133
1454    11.864469
1455    11.901590
Name: SalePrice, Length: 1456, dtype: float64

In [8]:
test

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,TotalArea
0,-0.303669,0.574647,0.504680,0.064294,0.106659,0.670979,0.210909,-0.718210,0.450261,-0.331991,...,False,False,True,False,False,False,False,True,False,-0.971434
1,-0.303669,0.614163,0.935642,0.064294,0.106659,-0.967380,0.210909,0.026675,0.450261,-0.431148,...,False,False,True,False,False,False,False,True,False,-0.566449
2,0.740261,0.328448,0.869402,0.064294,0.106659,-0.967380,0.210909,-0.718210,-0.427932,0.847946,...,False,False,True,False,False,False,False,True,False,1.039618
3,0.740261,0.494356,0.192618,0.064294,0.106659,-0.967380,0.210909,0.026675,0.450261,0.880464,...,False,False,True,False,False,False,False,True,False,1.021815
4,-2.888507,-1.301681,-1.133127,0.064294,0.106659,-0.967380,0.210909,1.278615,-0.427932,0.685152,...,False,False,True,False,False,False,False,True,False,-0.605332
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,-1.386729,-3.234580,-2.747405,0.064294,0.106659,0.670979,0.210909,-1.577274,1.211732,-0.035292,...,False,False,True,False,False,False,False,True,False,0.422875
1455,-1.386729,-3.234580,-2.782027,0.064294,0.106659,0.670979,0.210909,-1.577274,-0.427932,-0.035292,...,False,False,True,True,False,False,False,False,False,0.422875
1456,-0.303669,2.902825,1.675138,0.064294,0.106659,0.670979,0.210909,-0.718210,1.211732,-0.365029,...,False,False,True,True,False,False,False,False,False,-0.651350
1457,1.312286,-0.218908,0.284678,0.064294,0.106659,0.670979,0.210909,-0.718210,-0.427932,0.685152,...,False,False,True,False,False,False,False,True,False,-0.916033


In [9]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [10]:
# import optuna
# from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.model_selection import cross_val_score

# def objective(trial):
#     params = {
#         'n_estimators': trial.suggest_int('n_estimators', 1000, 3500),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
#         'max_depth': trial.suggest_int('max_depth', 3, 6),
#         'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
#         'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 30),
#         'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
#         'loss': trial.suggest_categorical('loss', ['squared_error', 'absolute_error', 'huber']),
#         'random_state': 5
#     }

#     model = GradientBoostingRegressor(**params)
#     score = cross_val_score(model, train, y_train, cv=5, scoring='neg_root_mean_squared_error')
#     return -score.mean()

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=40)

# print("Best parameters found: ", study.best_params)


In [11]:
# import optuna
# from xgboost import XGBRegressor
# from sklearn.model_selection import cross_val_score

# def objective(trial):
#     params = {
#         'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
#         'n_estimators': trial.suggest_int('n_estimators', 1000, 4000),
#         'max_depth': trial.suggest_int('max_depth', 2, 10),
#         'min_child_weight': trial.suggest_int('min_child_weight', 0, 10),
#         'gamma': trial.suggest_float('gamma', 0, 1),
#         'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-5, 1e-1, log=True),
#         'objective': 'reg:squarederror',
#         'nthread': -1,
#         'seed': 27
#     }

#     model = XGBRegressor(**params)
#     score = cross_val_score(model, train, y_train, cv=5, scoring="neg_root_mean_squared_error")
#     return -score.mean()

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=40)

# print("Best params:", study.best_params)


In [12]:
# import optuna
# from lightgbm import LGBMRegressor
# from sklearn.model_selection import cross_val_score

# def objective(trial):
#     model = LGBMRegressor(
#         num_leaves=trial.suggest_int('num_leaves', 4, 128),
#         learning_rate=trial.suggest_float('learning_rate', 0.005, 0.1),
#         n_estimators=trial.suggest_int('n_estimators', 1000, 5000),
#         feature_fraction=trial.suggest_float('feature_fraction', 0.1, 1.0),
#         bagging_fraction=trial.suggest_float('bagging_fraction', 0.1, 1.0),
#     )
#     score = cross_val_score(model, train, y_train, cv=5, scoring='neg_root_mean_squared_error')
#     return -1 * score.mean()

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=40)

# print(study.best_params)


In [13]:
# import optuna
# from sklearn.svm import SVR
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import cross_val_score

# def objective(trial):
#     # Tham số cần tối ưu
#     C = trial.suggest_float('C', 1e-1, 100.0, log=True)
#     epsilon = trial.suggest_float('epsilon', 1e-4, 0.1, log=True)
#     gamma = trial.suggest_float('gamma', 1e-5, 1e-2, log=True)
    
#     # SVR đi kèm chuẩn hóa
#     svr = make_pipeline(StandardScaler(), SVR(C=C, epsilon=epsilon, gamma=gamma))
    
#     # Đánh giá bằng RMSLE (tương tự hàm rmsle_cv bạn dùng)
#     score = cross_val_score(svr, train, y_train, cv=5, scoring="neg_root_mean_squared_error")
#     return -score.mean()

# # Tạo và chạy bài toán tối ưu
# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=40)

# # Kết quả tốt nhất
# print("Best parameters:", study.best_params)
# print("Best RMSE score:", study.best_value)


In [14]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Kernel Ridge score: 0.1120 (0.0046)



In [15]:
lasso = Lasso(alpha =0.0005, random_state=1)
score = rmsle_cv(lasso)
print("Lasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Lasso score: 0.1133 (0.0043)



In [16]:
ENet = ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)
score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.1132 (0.0044)



In [17]:
lightgbm = LGBMRegressor(
    objective='regression',num_leaves=6,
    learning_rate=0.01,
    n_estimators=2000,
    max_bin=300,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
    # min_data_in_leaf=2,
    # min_sum_hessian_in_leaf=11
)
score = rmsle_cv(lightgbm)
print("lightgbm: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

lightgbm: 0.1159 (0.0058)



In [18]:
GBoost = GradientBoostingRegressor(
                                   n_estimators=3000, learning_rate=0.01,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.1121 (0.0064)



In [19]:
from xgboost import XGBRegressor
xgboost = XGBRegressor(
                       learning_rate=0.01, n_estimators=3460,
                       max_depth=3, min_child_weight=0,
                       gamma=0, subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:linear', nthread=-1,
                       scale_pos_weight=1, seed=27,
                       reg_alpha=0.00006)
score = rmsle_cv(xgboost)
print("xgboost: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

xgboost: 0.1118 (0.0072)



In [20]:
svr = SVR(C=10, epsilon=0.0008, gamma=0.0004)

# Giả sử bạn đã định nghĩa hàm rmsle_cv
score = rmsle_cv(svr)
print("SVR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


SVR score: 0.1110 (0.0048)



In [21]:
LassoMd = lasso.fit(train.values,y_train)
ENetMd = ENet.fit(train.values,y_train)
KRRMd = KRR.fit(train.values,y_train)
GBoostMd = GBoost.fit(train.values,y_train)
svrMd = svr.fit(train.values,y_train)
xgbMd = xgboost.fit(train.values,y_train)
lgbMd = lightgbm.fit(train.values,y_train)

In [22]:
lasso_sub = np.expm1(LassoMd.predict(test.values))
ENet_sub = np.expm1(ENetMd.predict(test.values))
KRR_sub = np.expm1(KRRMd.predict(test.values))
GBoost_Sub = np.expm1(GBoostMd.predict(test.values))
svr_Sub = np.expm1(svrMd.predict(test.values))
xgb_Sub = np.expm1(xgbMd.predict(test.values))
lgb_Sub = np.expm1(lgbMd.predict(test.values))

In [23]:
finalMd = (np.expm1(LassoMd.predict(test.values)) + np.expm1(ENetMd.predict(test.values)) + np.expm1(KRRMd.predict(test.values)) + np.expm1(GBoostMd.predict(test.values)) + np.expm1(svrMd.predict(test.values))+ np.expm1(xgbMd.predict(test.values)) + np.expm1(lgbMd.predict(test.values))) / 7
finalMd

array([120234.00897526, 159643.7544069 , 184330.8680059 , ...,
       167157.41393254, 120930.74897811, 219901.36416144])

In [24]:
from mlxtend.regressor import StackingCVRegressor
from sklearn.base import clone
stack_gen = StackingCVRegressor(
    regressors=( 
        clone(LassoMd), 
        clone(ENetMd), 
        clone(KRRMd), 
        clone(GBoostMd), 
        clone(xgbMd),
        clone(svrMd),
        clone(lgbMd)
    ),
    meta_regressor=clone(xgbMd),
    use_features_in_secondary=True
)


print('StackingCVRegressor')
stack_gen_model = stack_gen.fit(train.values, y_train)
Stack_Sub=np.expm1(stack_gen_model.predict(test.values))

StackingCVRegressor


In [27]:
def blend_models_predict():
    return ((0.05 * ENet_sub) +    # 1132
            (0.05 * lasso_sub) +   # 1133
            (0.1 * KRR_sub) +     # 1120
            (0.2 * svr_Sub) +     # 1110
            (0.1 * GBoost_Sub) +  # 1121
            (0.15  * xgb_Sub) +     # 1118
            (0.05 * lgb_Sub) +     # 1159
            (0.3 * Stack_Sub))    # stacking – giả định tốt nhất

Blend_Sub = blend_models_predict()


<div style="border: 1px solid #b3d7ff; border-radius: 8px; padding: 12px; background: #e6f2ff; color: #000000; font-size: 20px;">
  <p style="margin: 0;"><strong>Predict trên tập test</strong></p>
</div>


In [28]:
sub = pd.DataFrame()
sub['Id'] = Test_ID
sub['SalePrice'] = Blend_Sub
sub.to_csv('submission.csv',index=False)