In [1]:
# Placeholder defaults for the run parameters; a later cell assigns the real
# input/output paths to these same names.
input_path = output_path = None

In [2]:
# Parameters
# Concrete values for this run; they overwrite the None defaults assigned to
# the same two names at the top of the notebook.
# NOTE(review): presumably injected by Papermill (a later cell comment mentions
# it) and regenerated per run - confirm before hand-editing.
# NOTE(review): the paths use backslash separators, so they only split into
# directory + file name on Windows - confirm the target platform.
input_path = "uploads\\realtor-data-small-small.csv"
output_path = "predictions\\predicted_20250817122143.csv"


In [3]:

# Cell 1 - Imports
import pandas as pd
import numpy as np
import joblib
import os
# Make sure the directory that will receive the prediction CSV exists.
# os.path.dirname() returns "" when output_path has no directory component
# (a bare file name, or a backslash-separated path evaluated on POSIX), and
# os.makedirs("") raises FileNotFoundError - so only create it when non-empty.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)


In [4]:
# Cell 2 - Read the CSV named by the `input_path` parameter and normalise the
# header: surrounding whitespace removed, names lower-cased.
realtordata = pd.read_csv(input_path)
normalized_columns = realtordata.columns.str.strip()
realtordata.columns = normalized_columns.str.lower()

In [5]:
# Cell 3 - Drop rows with missing target or key features
realtordata = realtordata.dropna(subset=['price', 'house_size', 'zip_code'])

# ✅ Cell 3.6 - Filter prices between $50,000 and $650,000 (both bounds inclusive)
# The bounds are named so the accepted range is visible and tunable in one place.
MIN_PRICE = 50000
MAX_PRICE = 650000
initial_count = len(realtordata)
# .copy() gives an independent frame so the column assignments below write to
# realtordata itself rather than to a slice of the previous frame.
realtordata = realtordata[realtordata['price'].between(MIN_PRICE, MAX_PRICE)].copy()
print(f"🧹 Filtered by price range. Remaining rows: {len(realtordata)}")
print(f"❌ Dropped rows: {initial_count - len(realtordata)}")

# Cell 3.5 - Log-transform skewed features
# (Applied before filling missing values, so the medians used later are
# computed on the log scale.)
# The transform is element-wise and the price filter only looks at 'price',
# so running it after the filter yields the same frame as running it before.
# 'acre_lot' is treated as optional by the fill step further down, so it is
# guarded here as well instead of raising KeyError when the column is absent.
for col in ['house_size', 'acre_lot']:
    if col in realtordata.columns:
        realtordata[col] = np.log1p(realtordata[col])


🧹 Filtered by price range. Remaining rows: 11247
❌ Dropped rows: 4943


In [6]:
# Cell 4 - Replace missing numeric values with the column median.
# Columns that are not present in this file are skipped.
for numeric_col in ('bed', 'bath', 'acre_lot'):
    if numeric_col not in realtordata.columns:
        continue
    column_median = realtordata[numeric_col].median()
    realtordata[numeric_col] = realtordata[numeric_col].fillna(column_median)

In [7]:
# Cell 5 - Fill missing categorical values with the most frequent value.
# Series.mode() returns an EMPTY Series when the column has no non-null
# values (all-NaN column, or no rows left after the price filter); indexing
# it with [0] would raise, so the fill is skipped in that case.
for col in ['city', 'state']:
    if col in realtordata.columns:
        most_common = realtordata[col].mode()
        if not most_common.empty:
            realtordata[col] = realtordata[col].fillna(most_common.iloc[0])

In [8]:
# Cell 6 - Remove columns that are not used downstream.
# errors='ignore' skips any listed column that this file does not contain.
columns_to_drop = ['street', 'status', 'brokered_by', 'prev_sold_date']
realtordata = realtordata.drop(columns=columns_to_drop, errors='ignore')

In [9]:
# Cell 7 - Snapshot the cleaned frame before 'city' is encoded and 'state' is
# one-hot expanded; the predictions are attached to this copy and it is what
# gets written to the output CSV.
# NOTE(review): at this point house_size and acre_lot already hold log1p
# values, so the exported file carries log-scaled sizes - confirm that is
# what the consumer of the CSV expects.
webdata = realtordata.copy(deep=True)

In [10]:
# Load the encoder trained on 'city'
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load artifacts from a trusted location.
city_encoder = joblib.load("ordinal_encoder_city.pkl")

# Apply encoder to the 'city' column
# This replaces the city with an integer encoding
# NOTE(review): presumably a scikit-learn OrdinalEncoder (file name and the
# categories_ attribute suggest so) - how cities unseen at training time are
# handled depends on how it was configured; confirm.
realtordata['city'] = city_encoder.transform(realtordata[['city']])
# Convert to categorical and set the same categories used during training
# The category levels are the positions 0..len(categories)-1 of the encoder's
# first (only) feature; any value outside that range becomes NaN in the
# resulting categorical column.
categories = city_encoder.categories_[0].tolist()
realtordata['city'] = pd.Categorical(realtordata['city'], categories=list(range(len(categories))))

In [11]:
# Cell 8 - One-hot encode 'state' for the models ('price' is the target and
# is excluded from the feature frame).
# drop_first must NOT be used at inference time: it drops whichever state
# sorts first in THIS file, which need not be the baseline state dropped at
# training. The state that is dropped would then have its real column
# zero-filled by the alignment step, silently encoding those rows as the
# training baseline. Emitting every dummy and letting the alignment against
# the trained column list discard the extras gives the correct encoding
# whichever way training was done.
ml_data = pd.get_dummies(realtordata.drop(columns=['price']), columns=['state'], drop_first=False)

In [12]:
# ✅ Load the two fitted models and the feature-column list saved at training
# NOTE(review): joblib.load unpickles these files - only use trusted artifacts.
quantile_model = joblib.load("realtor_quantile_model.pkl")
mean_model = joblib.load("realtor_mean_model.pkl")
trained_cols = joblib.load("trained_columns.pkl")

# ✅ Align columns: any trained feature absent from this file is added as an
# all-zero column, then the frame is cut down to exactly the trained columns,
# in the trained order.
absent_cols = [name for name in trained_cols if name not in ml_data.columns]
for name in absent_cols:
    ml_data[name] = 0

ml_data = ml_data.loc[:, trained_cols]


In [13]:
# ✅ Run both models on the aligned feature frame (quantile model first)
X = ml_data
quantile_pred_log, mean_pred_log = (
    model.predict(X) for model in (quantile_model, mean_model)
)

# ✅ Undo the log1p scaling of the model outputs
quantile_pred, mean_pred = np.expm1(quantile_pred_log), np.expm1(mean_pred_log)

# ✅ Blend: 90% quantile-model prediction, 10% mean-model prediction
final_pred = quantile_pred * 0.90 + mean_pred * 0.10


In [14]:
# ✅ Cell 11 - Attach the blended prediction, rounded to whole integers, to the
# web-facing frame and write it to the requested output file (no index column).
rounded_pred = final_pred.round(0)
webdata["predicted_price"] = rounded_pred.astype(int)
webdata.to_csv(output_path, index=False)

In [15]:
# ✅ Cell 12 - Done
# Report the path the CSV was actually written to; the previous message named
# a fixed file that this notebook never writes.
print(f"✅ Cleaned and predicted. Output saved to {output_path}")


✅ Cleaned and predicted. Output saved to uploads/predicted_output.csv
