# Prediction Pipeline

This notebook is a production-ready prediction pipeline: it loads the final selected model and uses it to predict house prices for new data.


In [1]:
import pandas as pd
import numpy as np
import pickle
import os
import sys
sys.path.insert(0, os.path.abspath('../../scripts/'))
import warnings
warnings.filterwarnings('ignore')

from pipelines import load_and_prepare_data

# Load final model and selection summary
def load_final_model(model_path='../../models/final/final_model.pkl',
                     summary_path='../../models/final/model_selection_summary.pkl'):
    """Load the final selected model and, when available, its selection summary.

    Args:
        model_path: Path to the pickled final model. Defaults to the
            notebook-relative ``final_model.pkl`` location.
        summary_path: Path to the pickled model-selection summary. The
            summary is optional: a missing file (or ``None``) is not an error.

    Returns:
        tuple: ``(model, summary)``; ``summary`` is ``None`` when no summary
        file was found at ``summary_path``.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Final model not found at {model_path}. Please ensure the final model is saved.")

    # SECURITY: unpickling can execute arbitrary code. Only point these paths
    # at artifacts produced by this project, never at untrusted files.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    # The summary only carries metadata, so its absence is tolerated.
    summary = None
    if summary_path is not None and os.path.exists(summary_path):
        with open(summary_path, 'rb') as f:
            summary = pickle.load(f)

    return model, summary

# Load the model (and optional metadata) once so later cells can reuse them
final_model, selection_summary = load_final_model()

print("Final Model Loaded Successfully")
print("=" * 40)
if not selection_summary:
    # No metadata available: fall back to reporting the model's class name
    print(f"Model Type: {type(final_model).__name__}")
else:
    print(f"Model: {selection_summary['selected_model']}")
    perf = selection_summary['performance_metrics']
    print(f"Expected RMSE: ${perf['rmse']:,.0f}")
    print(f"Expected R²: {perf['r2']:.4f}")
    print(f"Selection Date: {selection_summary['selection_date']}")

print("Model ready for predictions")


Final Model Loaded Successfully
Model: Ridge Regression Basic (Tuned)
Expected RMSE: $21,622
Expected R²: 0.9212
Selection Date: 2025-09-03 16:42:11
Model ready for predictions


In [9]:
# Create prediction function
def predict_house_price(model, house_data):
    """
    Run the model on a batch of houses and return its predictions.

    Args:
        model: Object exposing a ``predict`` method (the trained model)
        house_data: House features, handed to ``model.predict`` unchanged;
            expected to be in the same format as the training data

    Returns:
        Whatever ``model.predict`` returns for ``house_data``
    """
    return model.predict(house_data)

def predict_single_house(model, **kwargs):
    """
    Predict price for a single house using individual feature values

    Args:
        model: Trained ML model exposing ``predict``
        **kwargs: Feature values as keyword arguments (one per feature column)

    Returns:
        predicted_price: Single predicted price (first element of the
        model's output for the one-row frame)

    Raises:
        ValueError: If no feature values are supplied
    """
    if not kwargs:
        # Fail early with a clear message instead of sending a column-less
        # frame into model.predict.
        raise ValueError(
            "predict_single_house requires at least one feature value "
            "passed as a keyword argument"
        )

    # One-row frame whose columns are the supplied feature names
    house_df = pd.DataFrame([kwargs])

    # Keyword order is arbitrary at the call site. If the model records the
    # column order it was fitted with (e.g. scikit-learn's `feature_names_in_`)
    # and exactly those features were supplied, realign to that order.
    expected = getattr(model, 'feature_names_in_', None)
    if expected is not None and set(expected) == set(house_df.columns):
        house_df = house_df[list(expected)]

    prediction = model.predict(house_df)
    return prediction[0]

# Test prediction functionality with sample data
print("Testing Prediction Functionality")
print("=" * 40)

# Load sample data for testing
X_sample, y_sample = load_and_prepare_data('../../data/cleaned/domain_cleaned.csv')

# Number of leading rows used for the smoke test
N_TEST_ROWS = 30

# Test with the first N_TEST_ROWS houses
test_sample = X_sample.head(N_TEST_ROWS)
actual_prices = y_sample.head(N_TEST_ROWS)

predictions = predict_house_price(final_model, test_sample)

# Back the success message below with a real check: one prediction per row
assert len(predictions) == len(test_sample), "expected one prediction per input row"

print("Sample Predictions:")
print(f"{'Actual':<12} {'Predicted':<12} {'Error':<12}")
print("-" * 36)
# assumes y_sample is a 1-D pandas Series of prices — TODO confirm against load_and_prepare_data
for actual, pred in zip(actual_prices, predictions):
    error = abs(actual - pred)
    print(f"${actual:<11,.0f} ${pred:<11,.0f} ${error:<11,.0f}")

# Calculate sample metrics (mean absolute error over the sampled rows)
mae_sample = np.mean(np.abs(actual_prices - predictions))
print(f"\nSample MAE: ${mae_sample:,.0f}")
print("Prediction function working correctly")


Testing Prediction Functionality
Sample Predictions:
Actual       Predicted    Error       
------------------------------------
$94,500      $94,217      $283        
$174,500     $184,452     $9,952      
$265,979     $213,996     $51,983     
$253,293     $325,277     $71,984     
$275,000     $265,747     $9,253      
$222,500     $226,669     $4,169      
$230,000     $232,046     $2,046      
$145,000     $153,291     $8,291      
$117,500     $132,221     $14,721     
$139,400     $121,587     $17,813     
$100,000     $100,662     $662        
$386,250     $421,486     $35,236     
$112,000     $119,693     $7,693      
$265,900     $237,843     $28,057     
$197,000     $184,590     $12,410     
$175,500     $169,789     $5,711      
$254,000     $214,604     $39,396     
$383,970     $358,320     $25,650     
$179,200     $178,377     $823        
$267,000     $278,027     $11,027     
$119,200     $109,800     $9,400      
$119,500     $122,436     $2,936      
$345,000     

In [5]:
# Example usage for new house prediction
print("Example: Predicting New House Price")
print("=" * 40)

# Use the first sample row, kept as a one-row frame, as a stand-in for a new house
example_house = X_sample.head(1).copy()

# Show a handful of headline features, skipping any that the data does not contain
print("Original house features (sample):")
key_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']
available_features = [name for name in key_features if name in example_house.columns]
for feature in available_features:
    feature_value = example_house[feature].iloc[0]
    print(f"  {feature}: {feature_value}")

# Make prediction
example_prediction = predict_house_price(final_model, example_house)
print(f"\nPredicted Price: ${example_prediction[0]:,.0f}")

# Show a rough error band of ±1 RMSE around the prediction
# (an indicative range, not a statistical confidence interval)
if selection_summary:
    expected_rmse = selection_summary['performance_metrics']['rmse']
    lower_bound, upper_bound = (
        example_prediction[0] - expected_rmse,
        example_prediction[0] + expected_rmse,
    )

    print("Expected Range (±1 RMSE):")
    print(f"  Lower bound: ${lower_bound:,.0f}")
    print(f"  Upper bound: ${upper_bound:,.0f}")

print("\nPrediction completed successfully")


Example: Predicting New House Price
Original house features (sample):
  OverallQual: 6
  GrLivArea: 987
  GarageCars: 1
  TotalBsmtSF: 483
  YearBuilt: 1972

Predicted Price: $94,223
Expected Range (±1 RMSE):
  Lower bound: $72,601
  Upper bound: $115,844

Prediction completed successfully
