In [8]:
import joblib
import sys
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# Add 'src' to path to import utils
sys.path.append('..')
from src.utils import plot_regression_results, plot_feature_importance

# --- Load the persisted model and the test data ---
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code. Only point these paths at artifacts produced by a trusted source.
MODEL_PATH = "../models/best_model.pkl"
TEST_DATA_PATH = "../models/test_data.pkl"

# Load order is model first, then the test bundle.
model = joblib.load(MODEL_PATH)

# The test artifact unpacks into exactly two objects: the feature table and
# the target, where the target is still log-transformed at this point.
test_bundle = joblib.load(TEST_DATA_PATH)
X_test, y_test_log = test_bundle

# Confirm the load and echo the model's repr for a quick visual check.
print("Model and test data loaded.")
print(f"Model object: {model}")

Model and test data loaded.
Model object: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['MSSubClass', 'LotFrontage',
                                                   'LotArea', 'OverallQual',
                                                   'OverallCond', 'YearBuilt',
                                                   'YearRemodAdd', 'MasVnrArea',
                                                   'BsmtFinSF1', 'BsmtFinSF2',
                                                   'BsmtUnfSF', 'To

In [10]:
print("--- Sample of Test Data (First 5 Rows) ---")

# Build a single preview frame: the test features plus the actual sale price
# converted back from the log scale. Both pieces get a fresh 0..n-1 index so
# pandas lines the rows up by position rather than by the original labels.
df_display = X_test.reset_index(drop=True).assign(
    SalePrice_Actual=np.expm1(y_test_log).reset_index(drop=True)
)

# Show the first five rows transposed (features as rows, samples as columns);
# with roughly 80 columns this is much easier to read than a wide table.
preview = df_display.head().T
display(preview)

--- Sample of Test Data (First 5 Rows) ---


Unnamed: 0,0,1,2,3,4
MSSubClass,20,60,30,50,20
MSZoning,RL,RL,RM,RM,RL
LotFrontage,70.0,98.0,56.0,50.0,89.0
LotArea,8414,12256,8960,5000,12898
Street,Pave,Pave,Pave,Pave,Pave
...,...,...,...,...,...
MoSold,2,4,3,10,9
YrSold,2006,2010,2010,2006,2009
SaleType,WD,WD,WD,WD,WD
SaleCondition,Normal,Normal,Normal,Normal,Normal


In [2]:
# --- Model Evaluation ---
print("Generating predictions on test set...")
y_pred_log = model.predict(X_test)

# --- IMPORTANT: undo the log transform before scoring ---
# The model was trained on log(price), so both the predictions and the true
# values are mapped back to the original dollar scale with expm1.
y_pred, y_test = np.expm1(y_pred_log), np.expm1(y_test_log)

# Metrics on the dollar scale: RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(
    "\n--- Regression Metrics (on original scale) ---",
    f"R² (R-squared): {r2:.4f}",
    f"RMSE (Root Mean Squared Error): ${rmse:,.2f}",
    sep="\n",
)

# Visual comparison of actual vs. predicted prices (project helper).
print("\nDisplaying regression results plot...")
plot_regression_results(y_test, y_pred, title="XGBoost Model Performance")

Generating predictions on test set...

--- Regression Metrics (on original scale) ---
R² (R-squared): 0.9110
RMSE (Root Mean Squared Error): $26,125.42

Displaying regression results plot...


In [3]:
# --- Feature Importance Analysis ---
# This cell shows WHICH features the model uses to decide.
# It is deliberately best-effort: any failure is reported as a message
# instead of aborting the rest of the notebook.

try:
    # 1. Get the final estimator and the fitted preprocessing step
    xgb_model = model.named_steps['model']
    preprocessor = model.named_steps['preprocessor']

    # 2. Map each transformer NAME to the columns it was fitted on.
    # Looking columns up by name (not by position in transformers_) keeps the
    # column list and the encoder below tied to the same 'cat' transformer
    # even if the transformers are ever reordered.
    columns_by_transformer = {
        name: columns for name, _, columns in preprocessor.transformers_
    }

    # 3. Get feature names from the preprocessor
    # Numerical features keep their original names. list() makes the later
    # concatenation safe if the columns were given as an Index or ndarray.
    num_features = list(columns_by_transformer['num'])

    # Categorical features are one-hot encoded, so ask the encoder for the
    # expanded names
    cat_features_raw = list(columns_by_transformer['cat'])
    cat_features_ohe = list(
        preprocessor.named_transformers_['cat']
        .named_steps['onehot']
        .get_feature_names_out(cat_features_raw)
    )

    # Combine all feature names (numerical first, then one-hot columns)
    all_feature_names = num_features + cat_features_ohe

    print(f"Total features after preprocessing: {len(all_feature_names)}")

    # 4. Sanity check: the preprocessor uses remainder='passthrough', so any
    # columns outside 'num'/'cat' reach the model without a name here. Warn
    # rather than silently plotting labels that may not line up.
    n_model_features = getattr(xgb_model, 'n_features_in_', None)
    if n_model_features is not None and n_model_features != len(all_feature_names):
        print(
            f"Warning: model expects {n_model_features} features but "
            f"{len(all_feature_names)} names were collected; "
            "plot labels may be incomplete or misaligned."
        )

    # 5. Plot importance
    print("Plotting feature importance...")
    plot_feature_importance(xgb_model, all_feature_names)

except Exception as e:
    print(f"Could not plot feature importance: {e}")

Total features after preprocessing: 301
Plotting feature importance...


In [None]:
'''
Feature Importance Analysis
The chart displays the top 20 features that the XGBoost model found most critical when estimating a home's price.

As expected, the most important feature is OverallQual (Overall Quality), which confirms that the quality of materials and finish has the largest impact on price.

Following closely are features related to size: GrLivArea (Above-Ground Living Area) and TotalBsmtSF (Total Basement Square Feet).

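The model also learned that YearBuilt and GarageCars (Garage Size) are key price indicators. Interestingly, features related to specific, expensive neighborhoods (e.g., Neighborhood_NridgHt) also rank among the most important. Note that feature importance measures how heavily the model relies on a feature, not the direction of its effect on price.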

'''

