In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.4f' % x)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

In [2]:
df = pd.read_csv(r'D:\AI\ai2-project\data\1553768847-housing.csv')

In [3]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [4]:
df.shape

(20640, 10)

In [5]:
df['median_house_value'].describe()

count    20640.0000
mean    206855.8169
std     115395.6159
min      14999.0000
25%     119600.0000
50%     179700.0000
75%     264725.0000
max     500001.0000
Name: median_house_value, dtype: float64

In [6]:
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64

In [7]:
df_model = df.copy()

# Ratio Features
df_model['rooms_per_household'] = df_model['total_rooms'] / df_model['households']
df_model['bedrooms_per_room'] = df_model['total_bedrooms'] / df_model['total_rooms']
df_model['population_per_household'] = df_model['population'] / df_model['households']
df_model['bedrooms_per_household'] = df_model['total_bedrooms'] / df_model['households']

# Income-based features
df_model['income_squared'] = df_model['median_income'] ** 2
df_model['income_cubed'] = df_model['median_income'] ** 3
df_model['income_rooms_interaction'] = df_model['median_income'] * df_model['rooms_per_household']
df_model['income_age_interaction'] = df_model['median_income'] * df_model['housing_median_age']

# Age-based features
df_model['is_new'] = (df_model['housing_median_age'] < 10).astype(int)
df_model['is_old'] = (df_model['housing_median_age'] > 40).astype(int)
df_model['age_squared'] = df_model['housing_median_age'] ** 2

# Density features
df_model['people_per_room'] = df_model['population'] / df_model['total_rooms']
df_model['rooms_density'] = df_model['total_rooms'] / df_model['households']

# Geographical features
df_model['lat_lon_interaction'] = df_model['latitude'] * df_model['longitude']
df_model['distance_to_center'] = np.sqrt((df_model['latitude'] - 34.0)**2 + (df_model['longitude'] + 118.0)**2)
df_model['distance_to_coast'] = np.abs(df_model['longitude'] + 120.0)

# Log transformations for skewed features
df_model['log_population'] = np.log1p(df_model['population'])
df_model['log_total_rooms'] = np.log1p(df_model['total_rooms'])
df_model['log_total_bedrooms'] = np.log1p(df_model['total_bedrooms'])
df_model['log_households'] = np.log1p(df_model['households'])

# Replace infinite values with NaN
df_model.replace([np.inf, -np.inf], np.nan, inplace=True)

print(f"shape: {df_model.shape}")
print(f"\nNew features created:")
new_features = ['rooms_per_household', 'bedrooms_per_room', 'population_per_household', 
                'bedrooms_per_household', 'income_squared', 'income_cubed', 'income_rooms_interaction',
                'income_age_interaction', 'is_new', 'is_old', 'age_squared', 'people_per_room', 
                'rooms_density', 'lat_lon_interaction', 'distance_to_center', 'distance_to_coast',
                'log_population', 'log_total_rooms', 'log_total_bedrooms', 'log_households']
print(new_features)

shape: (20640, 30)

New features created:
['rooms_per_household', 'bedrooms_per_room', 'population_per_household', 'bedrooms_per_household', 'income_squared', 'income_cubed', 'income_rooms_interaction', 'income_age_interaction', 'is_new', 'is_old', 'age_squared', 'people_per_room', 'rooms_density', 'lat_lon_interaction', 'distance_to_center', 'distance_to_coast', 'log_population', 'log_total_rooms', 'log_total_bedrooms', 'log_households']


In [8]:
df_model.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value,rooms_per_household,bedrooms_per_room,population_per_household,bedrooms_per_household,income_squared,income_cubed,income_rooms_interaction,income_age_interaction,is_new,is_old,age_squared,people_per_room,rooms_density,lat_lon_interaction,distance_to_center,distance_to_coast,log_population,log_total_rooms,log_total_bedrooms,log_households
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600,6.9841,0.1466,2.5556,1.0238,69.309,577.0109,58.1443,341.3332,0,1,1681,0.3659,6.9841,-4630.0724,5.74,2.23,5.7777,6.7811,4.8675,4.8442
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500,6.2381,0.1558,2.1098,0.9719,68.9132,572.0764,51.7853,174.3294,0,0,441,0.3382,6.2381,-4627.2492,5.7191,2.22,7.7841,8.8679,7.0094,7.0379
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100,8.2881,0.1295,2.8023,1.0734,52.6699,382.2462,60.1503,377.3848,0,1,2704,0.3381,8.2881,-4626.784,5.7271,2.24,6.2086,7.2917,5.2523,5.1818
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300,5.8174,0.1845,2.5479,1.0731,31.8446,179.7021,32.8279,293.4412,0,1,2704,0.438,5.8174,-4627.1625,5.7345,2.25,6.3261,7.1507,5.4638,5.3936
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200,6.2819,0.1721,2.1815,1.0811,14.7933,56.8978,24.1613,200.0024,0,1,2704,0.3473,6.2819,-4627.1625,5.7345,2.25,6.3386,7.3951,5.6384,5.5607


In [None]:
# categorical variable - One-Hot Encoding
df_model = pd.get_dummies(df_model, columns=['ocean_proximity'], drop_first=False)

X = df_model.drop('median_house_value', axis=1)
y = df_model['median_house_value']

Q1 = y.quantile(0.25)
Q3 = y.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 3 * IQR
upper_bound = Q3 + 3 * IQR

outlier_mask = (y >= lower_bound) & (y <= upper_bound)
X = X[outlier_mask]
y = y[outlier_mask]

print(f"Removed {(~outlier_mask).sum()} outliers ({(~outlier_mask).sum() / len(outlier_mask) * 100:.2f}%)")
print(f"New dataset shape: {X.shape}")

# missing
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

Removed 0 outliers (0.00%)
New dataset shape: (20640, 33)


((16512, 33), (4128, 33))

In [None]:
# Feature Selection
from sklearn.feature_selection import SelectKBest, mutual_info_regression

# top 25
selector = SelectKBest(score_func=mutual_info_regression, k=min(25, X_train.shape[1]))
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

selected_features = X_train.columns[selector.get_support()].tolist()
print(f"Selected {len(selected_features)} features:")
print(selected_features)

Selected 25 features:
['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'population', 'households', 'median_income', 'rooms_per_household', 'bedrooms_per_room', 'population_per_household', 'bedrooms_per_household', 'income_squared', 'income_cubed', 'income_rooms_interaction', 'income_age_interaction', 'age_squared', 'people_per_room', 'rooms_density', 'lat_lon_interaction', 'distance_to_center', 'distance_to_coast', 'log_total_rooms', 'log_households', 'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND']


In [None]:
# polynomial features
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train_selected)
X_test_poly = poly.transform(X_test_selected)

print(f"Polynomial features shape: {X_train_poly.shape}")
print(f"Original features: {X_train_selected.shape[1]}, With polynomial: {X_train_poly.shape[1]}")

Polynomial features shape: (16512, 350)
Original features: 25, With polynomial: 350


In [12]:
scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
    cv_rmse = -cv_scores.mean()
    
    return {
        'Model': model_name,
        'Train RMSE': train_rmse,
        'Test RMSE': test_rmse,
        'Train MAE': train_mae,
        'Test MAE': test_mae,
        'Train R²': train_r2,
        'Test R²': test_r2,
        'CV RMSE': cv_rmse,
        'Model Object': model
    }

In [None]:
models = {}
results = []

# Linear
models['Linear Regression'] = LinearRegression()
models['Ridge'] = Ridge(alpha=10.0, random_state=42)
models['Lasso'] = Lasso(alpha=10.0, random_state=42)
models['ElasticNet'] = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)

# Tree-based
models['Decision Tree'] = DecisionTreeRegressor(max_depth=15, random_state=42)
models['Random Forest'] = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1)

# Gradient Boosting
models['Gradient Boosting'] = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
models['XGBoost'] = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, n_jobs=-1)
models['LightGBM'] = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42, n_jobs=-1, verbose=-1)
models['CatBoost'] = CatBoostRegressor(n_estimators=100, learning_rate=0.1, depth=5, random_state=42, verbose=0)

models['AdaBoost'] = AdaBoostRegressor(n_estimators=100, learning_rate=0.5, random_state=42)

print(f"\n{len(models)} models initialized!")


11 models initialized!


In [None]:
for name, model in models.items():
    print(f"Training {name}...")
    result = evaluate_model(model, X_train_scaled, X_test_scaled, y_train, y_test, name)
    results.append(result)
    print(f"\tTest RMSE: ${result['Test RMSE']:,.2f} | Test R²: {result['Test R²']:.4f}")

results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Test MSE')
display(results_df[['Model', 'Train RMSE', 'Test RMSE', 'Train R²', 'Test R²', 'CV RMSE']].round(2))

Training Linear Regression...
	Test RMSE: $3,574,715.01 | Test R²: -974.1602
Training Ridge...
	Test RMSE: $103,035.62 | Test R²: 0.1898
Training Lasso...
	Test RMSE: $65,118.51 | Test R²: 0.6764
Training ElasticNet...
	Test RMSE: $67,968.18 | Test R²: 0.6475
Training Decision Tree...
	Test RMSE: $67,120.55 | Test R²: 0.6562
Training Random Forest...
	Test RMSE: $49,370.21 | Test R²: 0.8140
Training Gradient Boosting...
	Test RMSE: $46,901.67 | Test R²: 0.8321
Training XGBoost...
	Test RMSE: $46,877.63 | Test R²: 0.8323
Training LightGBM...
	Test RMSE: $46,849.84 | Test R²: 0.8325
Training CatBoost...
	Test RMSE: $51,788.70 | Test R²: 0.7953
Training AdaBoost...
	Test RMSE: $79,873.44 | Test R²: 0.5131


Unnamed: 0,Model,Train RMSE,Test RMSE,Train R²,Test R²,CV RMSE
8,LightGBM,39148.76,46849.84,0.89,0.83,47454.93
7,XGBoost,37825.08,46877.63,0.89,0.83,47611.96
6,Gradient Boosting,36942.0,46901.67,0.9,0.83,47603.9
5,Random Forest,19234.01,49370.21,0.97,0.81,49931.48
9,CatBoost,48863.43,51788.7,0.82,0.8,50914.46
2,Lasso,58340.07,65118.51,0.75,0.68,215805.19
4,Decision Tree,24003.21,67120.55,0.96,0.66,68521.93
3,ElasticNet,63514.07,67968.18,0.7,0.65,118391.55
10,AdaBoost,79513.17,79873.44,0.53,0.51,78175.57
1,Ridge,57210.93,103035.62,0.76,0.19,3786619.86


In [16]:
tuned_models = {}
tuned_results = []


In [17]:
# Tune Random Forest
rf_params = {
    'n_estimators': [200, 300],
    'max_depth': [25, 30, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1],
    'max_features': ['sqrt', 0.3]
}

rf_grid = RandomizedSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    rf_params,
    n_iter=10,
    cv=2,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=0
)

rf_grid.fit(X_train_scaled, y_train)
best_rf = rf_grid.best_estimator_
print(f"Best RF params: {rf_grid.best_params_}")
print(f"Best RF CV score: ${-rf_grid.best_score_:,.2f}")

result_rf = evaluate_model(best_rf, X_train_scaled, X_test_scaled, y_train, y_test, 'Random Forest (Tuned)')

tuned_results.append(result_rf)
tuned_models['Random Forest (Tuned)'] = best_rf

Best RF params: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 0.3, 'max_depth': None}
Best RF CV score: $50,926.65


In [18]:
# Tune XGBoost
from sklearn.model_selection import train_test_split as tts

X_train_xgb, X_val_xgb, y_train_xgb, y_val_xgb = tts(X_train_scaled, y_train, test_size=0.2, random_state=42)

xgb_params = {
    'n_estimators': [300, 500],
    'learning_rate': [0.05, 0.1],
    'max_depth': [5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'min_child_weight': [1, 3]
}

xgb_grid = RandomizedSearchCV(
    xgb.XGBRegressor(random_state=42, n_jobs=-1, early_stopping_rounds=20),
    xgb_params,
    n_iter=10,
    cv=2,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=0
)

xgb_grid.fit(X_train_scaled, y_train, 
             eval_set=[(X_val_xgb, y_val_xgb)],
             verbose=False)
best_xgb = xgb_grid.best_estimator_

best_xgb.set_params(early_stopping_rounds=None)

print(f"Best XGB params: {xgb_grid.best_params_}")
print(f"Best XGB CV score: ${-xgb_grid.best_score_:,.2f}")

result_xgb = evaluate_model(best_xgb, X_train_scaled, X_test_scaled, y_train, y_test, 'XGBoost (Tuned)')

tuned_results.append(result_xgb)
tuned_models['XGBoost (Tuned)'] = best_xgb

Best XGB params: {'subsample': 0.8, 'n_estimators': 300, 'min_child_weight': 3, 'max_depth': 7, 'learning_rate': 0.05, 'colsample_bytree': 0.8}
Best XGB CV score: $46,382.07


In [19]:
# Tune LightGBM with early stopping
lgb_params = {
    'n_estimators': [300, 500],
    'learning_rate': [0.05, 0.1],
    'max_depth': [7, -1],
    'num_leaves': [31, 50],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

lgb_grid = RandomizedSearchCV(
    lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1),
    lgb_params,
    n_iter=10,
    cv=2,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=-1,
    verbose=0
)

lgb_grid.fit(X_train_scaled, y_train,
             eval_set=[(X_val_xgb, y_val_xgb)],
             callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])
best_lgb = lgb_grid.best_estimator_
print(f"Best LGB params: {lgb_grid.best_params_}")
print(f"Best LGB CV score: ${-lgb_grid.best_score_:,.2f}")

result_lgb = evaluate_model(best_lgb, X_train_scaled, X_test_scaled, y_train, y_test, 'LightGBM (Tuned)')

tuned_results.append(result_lgb)
tuned_models['LightGBM (Tuned)'] = best_lgb

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's l2: 6.5456e+08
Best LGB params: {'subsample': 0.8, 'num_leaves': 31, 'n_estimators': 500, 'max_depth': -1, 'learning_rate': 0.05, 'colsample_bytree': 1.0}
Best LGB CV score: $46,064.89


In [None]:
# Tune CatBoost with early stopping
cat_params = {
    'iterations': [300, 500],
    'learning_rate': [0.05, 0.1],
    'depth': [6, 8],
    'l2_leaf_reg': [1, 3]
}

cat_grid = RandomizedSearchCV(
    CatBoostRegressor(random_state=42, verbose=0, thread_count=2, early_stopping_rounds=20),
    cat_params,
    n_iter=10,
    cv=2,
    scoring='neg_root_mean_squared_error',
    random_state=42,
    n_jobs=2,
    verbose=0
)

cat_grid.fit(X_train_scaled, y_train,
             eval_set=(X_val_xgb, y_val_xgb),
             verbose=False)

best_params = cat_grid.best_params_.copy()
best_cat = CatBoostRegressor(random_state=42, verbose=0, thread_count=2, **best_params)

print(f"Best CatBoost params: {cat_grid.best_params_}")
print(f"Best CatBoost CV score: ${-cat_grid.best_score_:,.2f}")

result_cat = evaluate_model(best_cat, X_train_scaled, X_test_scaled, y_train, y_test, 'CatBoost (Tuned)')

tuned_results.append(result_cat)
tuned_models['CatBoost (Tuned)'] = best_cat

In [None]:
tuned_df = pd.DataFrame(tuned_results)
tuned_df = tuned_df.sort_values('Test RMSE')

display(tuned_df[['Model', 'Train RMSE', 'Test RMSE', 'Train R²', 'Test R²', 'CV RMSE']].round(2))

Unnamed: 0,Model,Train RMSE,Test RMSE,Train R²,Test R²,CV RMSE
2,LightGBM (Tuned),25593.87,43675.36,0.95,0.85,44703.12
3,CatBoost (Tuned),26322.7,43825.64,0.95,0.85,44591.92
1,XGBoost (Tuned),22454.03,44426.84,0.96,0.85,45035.6
0,Random Forest (Tuned),17871.86,48587.1,0.98,0.82,49280.76


In [None]:
# Simple Voting Ensemble (top 4 models)
base_estimators = [
    ('rf', best_rf),
    ('xgb', best_xgb),
    ('lgb', best_lgb),
    ('cat', best_cat)
]

voting_reg = VotingRegressor(estimators=base_estimators, n_jobs=-1)
voting_reg.fit(X_train_scaled, y_train)

result_voting = evaluate_model(voting_reg, X_train_scaled, X_test_scaled, y_train, y_test, 'Voting Ensemble')
print(f"Voting Ensemble - Test RMSE: ${result_voting['Test RMSE']:,.2f} | Test R²: {result_voting['Test R²']:.4f}")

Voting Ensemble - Test RMSE: $43,971.73 | Test R²: 0.8524


In [None]:
top_4_models = tuned_df.head(4)['Model'].tolist()
base_estimators = []
for model_name in top_4_models:
    model_obj = tuned_df[tuned_df['Model'] == model_name]['Model Object'].values[0]
    base_estimators.append((model_name.replace(' ', '_').replace('(', '').replace(')', ''), model_obj))

print(f"Using top {len(base_estimators)} models for stacking")

stacking_reg = StackingRegressor(
    estimators=base_estimators,
    final_estimator=Ridge(alpha=5.0),
    cv=2,
    n_jobs=-1
)

stacking_reg.fit(X_train_scaled, y_train)
result_stacking = evaluate_model(stacking_reg, X_train_scaled, X_test_scaled, y_train, y_test, 'Stacking Ensemble')
print(f"Stacking - Test RMSE: ${result_stacking['Test RMSE']:,.2f} | Test R²: {result_stacking['Test R²']:.4f}")

Using top 4 models for stacking


KeyboardInterrupt: 

In [None]:
# Weighted Ensemble
weights = []
for model_name in top_4_models:
    rmse = tuned_df[tuned_df['Model'] == model_name]['Test RMSE'].values[0]
    weights.append(1 / (rmse ** 2))

weights = np.array(weights)
weights = weights / weights.sum()

print("Model Weights:")
for name, weight in zip(top_4_models, weights):
    print(f"  {name}: {weight:.4f}")

weighted_voting = VotingRegressor(estimators=base_estimators, weights=weights, n_jobs=-1)
weighted_voting.fit(X_train_scaled, y_train)

result_weighted = evaluate_model(weighted_voting, X_train_scaled, X_test_scaled, y_train, y_test, 'Weighted Voting')
print(f"Weighted Voting - Test RMSE: ${result_weighted['Test RMSE']:,.2f} | Test R²: {result_weighted['Test R²']:.4f}")

In [None]:
ensemble_results = [result_voting, result_stacking, result_weighted]
all_results = results + tuned_results + ensemble_results

final_df = pd.DataFrame(all_results)
final_df = final_df.sort_values('Test RMSE')

print("\n" + "="*80)
print("FINAL MODEL PERFORMANCE RANKING")
print("="*80)
display(final_df[['Model', 'Train RMSE', 'Test RMSE', 'Train R²', 'Test R²', 'CV RMSE']].round(2).head(10))

print(f"\nBEST MODEL: {final_df.iloc[0]['Model']}")
print(f"   Test RMSE: ${final_df.iloc[0]['Test RMSE']:,.2f}")
print(f"   Test R²: {final_df.iloc[0]['Test R²']:.6f}")

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 6))

top_10 = final_df.head(10)
ax.barh(top_10['Model'], top_10['Test RMSE'], color='steelblue', alpha=0.8)
ax.set_xlabel('Test RMSE ($)')
ax.set_title('Top 10 Models - Test RMSE Comparison', fontweight='bold', fontsize=14)
ax.invert_yaxis()
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
import pickle
import os

os.makedirs(r'D:\AI\ai2-project\models', exist_ok=True)

best_model_name = final_df.iloc[0]['Model']
best_model_obj = final_df.iloc[0]['Model Object']

model_path = r'D:\AI\ai2-project\models\best_model.pkl'
with open(model_path, 'wb') as f:
    pickle.dump(best_model_obj, f)

scaler_path = r'D:\AI\ai2-project\models\scaler.pkl'
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)

selector_path = r'D:\AI\ai2-project\models\selector.pkl'
with open(selector_path, 'wb') as f:
    pickle.dump(selector, f)

poly_path = r'D:\AI\ai2-project\models\poly_features.pkl'
with open(poly_path, 'wb') as f:
    pickle.dump(poly, f)

metadata = {
    'model_name': best_model_name,
    'test_rmse': final_df.iloc[0]['Test RMSE'],
    'test_r2': final_df.iloc[0]['Test R²'],
    'selected_features': selected_features
}

metadata_path = r'D:\AI\ai2-project\models\model_metadata.pkl'
with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)