In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1) Đọc dữ liệu đã xử lý outlier bằng median
df = pd.read_csv(r'D:\Đồ Án\TH1\code\combined_fill0.csv')

# 2) Tạo 5 bộ dữ liệu theo yêu cầu
# a) df_le: label-encode các biến categorical
df_le = df.copy()
cat_cols = df_le.select_dtypes(include=['object']).columns
for c in cat_cols:
    df_le[c] = LabelEncoder().fit_transform(df_le[c].astype(str))

# b) df_dum: one-hot encode tất cả biến categorical
df_dum = pd.get_dummies(df, drop_first=True)

# c) df_pca_dum: rút chiều dữ liệu one-hot bằng PCA (20 thành phần)
X = df_dum.drop('SalePrice', axis=1)
pca = PCA(n_components=20, random_state=42)
X_pca = pca.fit_transform(X)
df_pca_dum = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(20)])
df_pca_dum['SalePrice'] = df['SalePrice'].values

# d) df_cor_dum: chọn biến theo |corr|>0.1 với target, sau đó giữ nguyên dạng one-hot
corr = df_dum.corr()['SalePrice'].abs()
keep = corr[corr > 0.1].index.drop('SalePrice')
df_cor_dum = df_dum[keep].copy()
df_cor_dum['SalePrice'] = df_dum['SalePrice']

# e) df_fs_dum: chọn 20 biến tốt nhất theo f_regression, rồi giữ dạng one-hot
X = df_dum.drop('SalePrice', axis=1)
y = df_dum['SalePrice']
selector = SelectKBest(score_func=f_regression, k=20).fit(X, y)
df_fs_dum = pd.DataFrame(X.iloc[:, selector.get_support()],
                         columns=X.columns[selector.get_support()])
df_fs_dum['SalePrice'] = y.values

# Gom các bộ dữ liệu vào dict để loop
datasets = {
    'df_le': df_le,
    'df_dum': df_dum,
    'df_pca_dum': df_pca_dum,
    'df_cor_dum': df_cor_dum,
    'df_fs_dum': df_fs_dum
}

# 3) Định nghĩa 3 mô hình
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

# 4) Hàm đánh giá metrics
def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    mae  = mean_absolute_error(y_test, y_pred)
    mse  = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2   = r2_score(y_test, y_pred)
    return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'R2': r2}

# 5) Vòng lặp train & evaluate
results = []
for ds_name, df_v in datasets.items():
    # tách features / target
    X = df_v.drop('SalePrice', axis=1)
    y = df_v['SalePrice']
    # chia train/test 80/20
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)
    
    for model_name, model in models.items():
        # train
        model.fit(X_train, y_train)
        # đánh giá
        metrics = evaluate(model, X_test, y_test)
        results.append({
            'Dataset': ds_name,
            'Model': model_name,
            **metrics
        })

# 6) In bảng kết quả
results_df = pd.DataFrame(results)
print(results_df.pivot(index='Dataset', columns='Model'))


                     MAE                                           MSE  \
Model      Decision Tree Linear Regression Random Forest Decision Tree   
Dataset                                                                  
df_cor_dum  13779.946918      47545.763250   9432.632808  7.646364e+08   
df_dum      14585.027397      45751.513733   9344.933493  1.178046e+09   
df_fs_dum   12942.813356      48055.209113   9769.222380  7.723425e+08   
df_le       13641.477740      44717.431414   9494.409623  7.838907e+08   
df_pca_dum  21005.931507      48378.962837  14567.442808  2.192826e+09   

                                                    RMSE                    \
Model      Linear Regression Random Forest Decision Tree Linear Regression   
Dataset                                                                      
df_cor_dum      3.799057e+09  4.097456e+08  27652.059758      61636.494672   
df_dum          3.821552e+09  4.136300e+08  34322.667520      61818.702162   
df_fs_dum       4