In [13]:
import os
import sys
import json
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

# Make the project importable: resolve the directory two levels above the
# current working directory and put it on the module search path.
project_root = os.path.abspath("../..")
# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project constants
from utils.constants import ML_READY_DATA_FILE, MODELS_DIR

# Step 1: Read the ML-ready CSV and split it into target (y) and features (X).
# The listed columns are removed from X; any that are absent are skipped
# silently because of errors="ignore".
df = pd.read_csv(ML_READY_DATA_FILE)
non_feature_columns = ["id", "url", "price"]
y = df["price"]
X = df.drop(columns=non_feature_columns, errors="ignore")

# Step 2: Rank the features with a RandomForest and keep at most the 30 most
# important ones.
rf = RandomForestRegressor(random_state=42).fit(X, y)

# threshold=-inf disables the importance cut-off, so max_features alone
# decides how many columns are kept. prefit=True reuses the forest fitted above.
selector = SelectFromModel(
    rf,
    prefit=True,
    max_features=30,
    threshold=float("-inf"),
)
support_mask = selector.get_support()
# Selected names keep the column order of X (they are not sorted by importance).
top30_features = X.columns[support_mask].tolist()

# Step 3: Persist the full and the reduced feature lists as JSON files inside
# the "features" sub-directory of MODELS_DIR (created if it does not exist).
features_dir = os.path.join(MODELS_DIR, "features")
os.makedirs(features_dir, exist_ok=True)

# Target file name -> list of column names to store in it.
feature_lists = {
    "all_features.json": list(X.columns),
    "top30_features.json": top30_features,
}
for file_name, feature_names in feature_lists.items():
    with open(os.path.join(features_dir, file_name), "w") as f:
        json.dump(feature_names, f, indent=2)

print("Feature JSON files saved successfully.")


Feature JSON files saved successfully.


In [14]:
import sys, os
import json
from glob import glob
import pandas as pd
from joblib import load
from datetime import datetime

# Make the project importable: resolve the directory two levels above the
# current working directory and put it on the module search path.
project_root = os.path.abspath("../..")
# Guard the append: this setup may already have run in this kernel (an earlier
# cell or a re-run), and an unconditional append would add a duplicate entry.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import project constants
from utils.constants import (
    ML_READY_DATA_FILE,
    MODELS_DIR,
    PREDICTIONS_DIR
)

# Step 1: Load the machine learning-ready dataset
if not os.path.exists(ML_READY_DATA_FILE):
    raise FileNotFoundError(f"ML-ready dataset not found at: {ML_READY_DATA_FILE}")

print(f"Using ML-ready dataset: {os.path.basename(ML_READY_DATA_FILE)}")
df = pd.read_csv(ML_READY_DATA_FILE)

# Step 2: Randomly select up to 10 properties for inference
# Fail with an explicit message on an empty dataset instead of the opaque
# error DataFrame.sample would raise.
if df.empty:
    raise ValueError(f"ML-ready dataset contains no rows: {ML_READY_DATA_FILE}")

# DataFrame.sample (without replacement) raises when n exceeds the number of
# rows, so clamp the sample size for datasets smaller than 10 rows.
sample_size = min(10, len(df))
# Fixed random_state keeps the selected rows identical between runs.
df_sample = df.sample(n=sample_size, random_state=42).reset_index(drop=True)
print(f"{len(df_sample)} random properties selected for prediction.")

# Remove non-feature columns (columns that are absent are ignored)
base_features = df_sample.drop(columns=["id", "url"], errors="ignore")

# Step 3: Read the two JSON feature-name lists (reduced "top30" list and the
# full list) from the "features" sub-directory of MODELS_DIR.
features_dir = os.path.join(MODELS_DIR, "features")
top30_path, all_path = (
    os.path.join(features_dir, file_name)
    for file_name in ("top30_features.json", "all_features.json")
)

# The files are read one after the other, top30 first.
with open(top30_path) as f:
    top30_features = json.load(f)
with open(all_path) as f:
    all_features = json.load(f)

# Step 4: Load all .pkl models and perform prediction
models_pkl_dir = os.path.join(MODELS_DIR, "pkl")
# Sort the matches: glob returns files in a filesystem-dependent order, which
# would make the order of the prediction columns vary between runs/machines.
pkl_files = sorted(glob(os.path.join(models_pkl_dir, "*.pkl")))

if not pkl_files:
    raise ValueError(f"No .pkl models found in: {models_pkl_dir}")

# Start from the sampled rows; one prediction column is added per model.
predictions = df_sample.copy()
failed_models = []  # names of models that could not be loaded or applied

for pkl_path in pkl_files:
    # Strip only the trailing extension; str.replace would also remove a
    # ".pkl" occurring in the middle of the file name.
    model_name = os.path.splitext(os.path.basename(pkl_path))[0]
    try:
        # NOTE(review): joblib.load unpickles the file and can therefore run
        # arbitrary code; only load model files from a trusted location.
        model = load(pkl_path)

        # Choose the correct feature set based on model name
        if "top30" in model_name.lower():
            features = top30_features
        else:
            features = all_features

        # Prepare input data (raises KeyError if a listed column is missing)
        X_input = base_features[features]

        # Make predictions
        preds = model.predict(X_input)
        predictions[model_name] = preds
        print(f"Prediction completed for model: {model_name}")

    except Exception as e:
        # Best effort: a single broken model must not stop the remaining ones,
        # but remember it so the outcome can be summarised below.
        failed_models.append(model_name)
        print(f"Failed prediction for model '{model_name}': {e}")

if failed_models:
    print(f"{len(failed_models)} of {len(pkl_files)} models failed: {failed_models}")

# Without this check a run in which every model failed would continue and
# report success although no prediction column was produced.
if len(failed_models) == len(pkl_files):
    raise RuntimeError(
        f"All {len(pkl_files)} models in {models_pkl_dir} failed; no predictions were produced."
    )

# Step 5: Write the prediction table to a timestamped CSV in PREDICTIONS_DIR
# (the directory is created if it does not exist).
os.makedirs(PREDICTIONS_DIR, exist_ok=True)

# The timestamp has minute resolution, so two runs started within the same
# minute produce the same file name and the later one overwrites the earlier.
run_time = datetime.now()
timestamp = run_time.strftime("%Y%m%d_%H%M")
output_file = f"inference_predictions_{timestamp}.csv"
output_path = os.path.join(PREDICTIONS_DIR, output_file)

# index=False: the positional row index is not written to the file.
predictions.to_csv(output_path, index=False)
print(f"Inference predictions saved to: {output_path}")


Using ML-ready dataset: immoweb_real_estate_ml_ready.csv
10 random properties selected for prediction.
Prediction completed for model: catboost_optuna_all_20250629_0814_TEST
Prediction completed for model: catboost_optuna_all_20250629_1033_TEST
Prediction completed for model: catboost_optuna_all_20250629_1102_TEST
Prediction completed for model: catboost_optuna_all_20250629_1111_TEST
Prediction completed for model: catboost_optuna_all_20250629_1234_TEST
Prediction completed for model: catboost_optuna_all_20250629_1257_TEST
Prediction completed for model: catboost_optuna_all_20250629_1303_TEST
Prediction completed for model: catboost_optuna_all_20250629_1311_TEST
Prediction completed for model: catboost_optuna_all_20250629_1320_TEST
Prediction completed for model: catboost_optuna_all_20250629_1329_TEST
Prediction completed for model: catboost_optuna_all_20250629_1408_TEST
Prediction completed for model: catboost_optuna_all_20250629_1423_TEST
Prediction completed for model: catboost_optu