# Model Prediction

In [17]:
import requests
import os
from joblib import load
import joblib

import pandas as pd

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, root_mean_squared_error


## Download Best Model

In [18]:
def download_github_file(raw_url: str, save_path: str, timeout: float = 30.0):
    """
    Downloads a file from a GitHub raw content URL and saves it to a specified path.

    Failures are reported with a printed message instead of being raised, so a
    notebook cell calling this function keeps running. Nothing is returned.

    Args:
        raw_url (str): The raw URL of the file on GitHub (e.g.,
                       'https://raw.githubusercontent.com/user/repo/branch/path/to/file').
        save_path (str): The local path where the downloaded file will be saved.
                         A bare filename (no directory part) is saved in the
                         current working directory.
        timeout (float): Seconds to wait for the server before giving up.
                         Without a timeout a stalled connection would block
                         the cell indefinitely. Defaults to 30.
    """
    print(f"Attempting to download file from: {raw_url}")
    try:
        # Using the response as a context manager releases the streamed
        # connection even when writing to disk fails part-way through.
        with requests.get(raw_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

            # Ensure the directory exists. dirname() is '' for a bare filename
            # and os.makedirs('') raises, so only create a directory when the
            # path actually names one.
            target_dir = os.path.dirname(save_path)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)

            # Stream to disk in chunks so the whole file is never held in memory.
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"File successfully downloaded to: {save_path}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the file: {e}")
    except IOError as e:
        print(f"Error saving the file to disk: {e}")

# The model artifact is hosted in the mlnotes2718/hdb-resale-price-e2e-ml repository.
# Its regular GitHub address,
#   https://github.com/mlnotes2718/hdb-resale-price-e2e-ml/blob/main/hdb_resale_best_model.joblib
# points at the GitHub web page for the file. To fetch the file content itself,
# the 'raw.githubusercontent.com' domain must be used instead:
#   https://raw.githubusercontent.com/mlnotes2718/hdb-resale-price-e2e-ml/main/hdb_resale_best_model.joblib

# Remote location of the serialized model (raw-content URL)
github_raw_url = "https://raw.githubusercontent.com/mlnotes2718/hdb-resale-price-e2e-ml/main/hdb_resale_best_model.joblib"

# Local destination for the downloaded file
local_save_path = "model/hdb_resale_best_model.joblib"

# Fetch the model file and write it to the local destination
download_github_file(raw_url=github_raw_url, save_path=local_save_path)


Attempting to download file from: https://raw.githubusercontent.com/mlnotes2718/hdb-resale-price-e2e-ml/main/hdb_resale_best_model.joblib
File successfully downloaded to: model/hdb_resale_best_model.joblib


## Load Model

In [19]:
def load_model(model_path: str):
    """
    Loads a machine learning model from a .joblib file.

    SECURITY: joblib files are pickle-based, so loading one can execute
    arbitrary code embedded in the file. Only load files that come from a
    source you trust.

    Args:
        model_path (str): The local path to the .joblib model file.

    Returns:
        object: The loaded machine learning model or pipeline, or None when
        the file is missing or cannot be loaded (the reason is printed).
    """
    print(f"\nAttempting to load model from: {model_path}")
    try:
        # Report a missing file explicitly (e.g. when an earlier download step
        # failed) rather than surfacing a low-level file-open error.
        if not os.path.isfile(model_path):
            print(f"Error loading the model: file not found: {model_path}")
            return None

        # NOTE(security): this unpickles the file; see the docstring warning.
        model = joblib.load(model_path)
        print("Model loaded successfully.")
        return model
    except Exception as e:
        # Deliberately broad: loading can fail in many ways (corrupt file,
        # missing library, invalid path type) and callers only check for None.
        print(f"Error loading the model: {e}")
        return None

In [20]:
# Load the downloaded model file; load_model returns None if loading fails.
loaded_model = load_model(model_path=local_save_path)


Attempting to load model from: model/hdb_resale_best_model.joblib
Model loaded successfully.


In [21]:
def make_prediction(model, data: dict):
    """
    Makes a prediction using the loaded model, checking feature names.

    The input dictionary is turned into a one-row DataFrame. When the model
    exposes 'feature_names_in_', the frame is aligned to it: missing features
    are added with the value 0 (with a warning), keys the model does not know
    are ignored (with a warning), and columns are put in the expected order.

    Args:
        model: The loaded machine learning model or pipeline. May be None, in
               which case no prediction is attempted.
        data (dict): A dictionary containing the input features for prediction.
                     Keys should match the expected feature names of the model.

    Returns:
        The value returned by model.predict() for the one-row input, or None
        when the model is not loaded or prediction raises an error.
    """
    if model is None:
        print("Model is not loaded. Cannot make prediction.")
        return None

    print("\nPreparing data for prediction...")
    # Wrap the dict in a list so it becomes a single-row DataFrame whose
    # columns are the dictionary keys.
    input_df = pd.DataFrame([data])

    # If it's a scikit-learn Pipeline, list its steps for the reader.
    pipeline_steps = getattr(model, 'steps', None)
    if isinstance(pipeline_steps, list):
        print("\nModel is a scikit-learn Pipeline. Steps:")
        for i, (name, step) in enumerate(pipeline_steps, start=1):
            print(f"  Step {i}: {name} ({type(step).__name__})")

    # getattr with a default also covers an attribute that exists but is None.
    expected_features = getattr(model, 'feature_names_in_', None)
    if expected_features is not None:
        expected_features = list(expected_features)
        print(f"\nModel expects the following features: {expected_features}")

        # Check if all expected features are present in the input data
        missing_features = [f for f in expected_features if f not in input_df.columns]
        if missing_features:
            print(f"WARNING: Missing features in input data: {missing_features}")
            # Best-effort fill so prediction can proceed. 0 is a placeholder
            # and may be meaningless for the model (e.g. for a text feature).
            for feature in missing_features:
                input_df[feature] = 0

        # Keys the model does not know would otherwise vanish silently in the
        # column selection below (typically a typo in a feature name).
        unexpected_features = [c for c in input_df.columns if c not in expected_features]
        if unexpected_features:
            print(f"WARNING: Ignoring unexpected features in input data: {unexpected_features}")

        # Keep only the expected columns, in the expected order.
        input_df = input_df[expected_features]
    else:
        print("Warning: Model does not expose 'feature_names_in_'. Ensure input data column order matches training data.")

    print(f"Input data for prediction:\n{input_df}")

    try:
        prediction = model.predict(input_df)
        print(f"Prediction successful. Result: {prediction}")
        return prediction
    except Exception as e:
        # Report and return None so the calling cell can decide how to proceed.
        print(f"Error during prediction: {e}")
        return None


In [24]:
def examine_model_features(model):
    """
    Prints the input feature names the given model reports, if any.

    The model's own 'feature_names_in_' is used when present. Otherwise, if
    the model has a list-valued 'steps' attribute, the steps are scanned in
    order and the first one exposing 'feature_names_in_' is reported. When
    neither source is available, a hint is printed instead.

    Args:
        model: The loaded machine learning model or pipeline (may be None).
    """
    if model is None:
        print("Model is not loaded. Cannot examine features.")
        return

    print("\n--- Examining Model Features ---")

    if hasattr(model, 'feature_names_in_'):
        print(f"Model expects the following input features: {list(model.feature_names_in_)}")
    elif isinstance(getattr(model, 'steps', None), list):
        print("Model is a scikit-learn Pipeline. Attempting to find features from the first estimator.")
        # Pick the earliest step that knows its input feature names, if any.
        first_match = next(
            (pair for pair in model.steps if hasattr(pair[1], 'feature_names_in_')),
            None,
        )
        if first_match is None:
            print("Could not directly determine expected features from the pipeline steps using 'feature_names_in_'.")
            print("You might need to refer to the model's training script or documentation.")
        else:
            name, step = first_match
            print(f"First step '{name}' expects the following input features: {list(step.feature_names_in_)}")
    else:
        print("Could not directly determine expected features. Model does not expose 'feature_names_in_'.")
        print("You might need to refer to the model's training script or documentation.")

    print("--- End Feature Examination ---\n")

In [25]:
# Print the input feature names reported by the loaded model.
examine_model_features(model=loaded_model)


--- Examining Model Features ---
Model expects the following input features: ['town', 'flat_type', 'storey_range', 'floor_area_sqm', 'flat_model', 'transac_year', 'transac_month', 'remaining_lease_by_months']
--- End Feature Examination ---



In [None]:
# Illustrative input for a single HDB resale flat.
# The keys are the eight feature names the loaded model printed in the
# feature examination output; the values are made-up sample values.
# NOTE(review): valid category values and value encodings are not visible in
# this notebook — check them against the repository's data preparation steps.
example_hdb_data = {
    'town': 'ANG MO KIO',          # one of the towns in Singapore
    'flat_type': '4 ROOM',         # one of the HDB flat types
    'storey_range': 10,
    'floor_area_sqm': 90.0,
    'flat_model': 'New Generation',  # one of the HDB flat models
    'transac_year': 2025,
    'transac_month': 7,
    # 50 years of remaining lease, expressed in months.
    # NOTE(review): the original note suggested asking the user for a lease
    # length somewhere between about 10 and 95 years — confirm the intended range.
    'remaining_lease_by_months': 50 * 12,
}

In [33]:
# 4. Make a prediction using the loaded model and the example data.
# load_model() signals failure by returning None, so compare against None
# explicitly. A plain truthiness test would instead consult the model's
# __len__ when it defines one (a scikit-learn Pipeline does), which says
# nothing about whether loading succeeded.
if loaded_model is not None:
    # Use the more realistic example data
    prediction_result = make_prediction(loaded_model, example_hdb_data)
    if prediction_result is not None:
        # One input row was passed, so the first element is its prediction.
        print(f"\nFinal Predicted Resale Price: {prediction_result[0]:,.2f}")
    else:
        print("\nPrediction could not be made.")
else:
    # Make the skipped case visible instead of producing no output at all.
    print("\nModel is not loaded; skipping prediction.")


Preparing data for prediction...

Model is a scikit-learn Pipeline. Steps:
  Step 1: preprocessor (ColumnTransformer)
  Step 2: regressor (LinearRegression)

Model expects the following features: ['town', 'flat_type', 'storey_range', 'floor_area_sqm', 'flat_model', 'transac_year', 'transac_month', 'remaining_lease_by_months']
Input data for prediction:
         town flat_type  storey_range  floor_area_sqm      flat_model  \
0  ANG MO KIO    4 ROOM            10            90.0  New Generation   

   transac_year  transac_month  remaining_lease_by_months  
0          2025              7                        600  
Prediction successful. Result: [550418.88179136]

Final Predicted Resale Price: 550,418.88
