<a href="https://colab.research.google.com/github/magnetbrains-bit/Real-Estate-Valuation-Engine/blob/main/Real_Estate_Valuation_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Run this cell to upload your Kaggle API key
from google.colab import files
files.upload() # Choose the kaggle.json file you downloaded

# Next, run this cell to move the key to the correct location
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [2]:
# Download the dataset from Kaggle
!kaggle datasets download -d ruchi798/housing-prices-in-metropolitan-areas-of-india

# Unzip the downloaded file
!unzip housing-prices-in-metropolitan-areas-of-india.zip

Dataset URL: https://www.kaggle.com/datasets/ruchi798/housing-prices-in-metropolitan-areas-of-india
License(s): CC0-1.0
Downloading housing-prices-in-metropolitan-areas-of-india.zip to /content
  0% 0.00/269k [00:00<?, ?B/s]
100% 269k/269k [00:00<00:00, 590MB/s]
Archive:  housing-prices-in-metropolitan-areas-of-india.zip
  inflating: Bangalore.csv           
  inflating: Chennai.csv             
  inflating: Delhi.csv               
  inflating: Hyderabad.csv           
  inflating: Kolkata.csv             
  inflating: Mumbai.csv              


In [3]:
import pandas as pd
import numpy as np # We'll need numpy for some cleaning steps

# Load the Mumbai-specific dataset into a DataFrame
df = pd.read_csv('Mumbai.csv')

# Let's take a first look at our data
print("Shape of the dataset:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nInfo about columns and null values:")
df.info()

Shape of the dataset: (7719, 40)

First 5 rows:
     Price  Area  Location  No. of Bedrooms  Resale  MaintenanceStaff  \
0  4850000   720  Kharghar                1       1                 1   
1  4500000   600  Kharghar                1       1                 1   
2  6700000   650  Kharghar                1       1                 1   
3  4500000   650  Kharghar                1       1                 1   
4  5000000   665  Kharghar                1       1                 1   

   Gymnasium  SwimmingPool  LandscapedGardens  JoggingTrack  ...  \
0          0             0                  0             0  ...   
1          1             1                  0             1  ...   
2          1             1                  0             1  ...   
3          0             0                  1             0  ...   
4          0             0                  1             0  ...   

   LiftAvailable  BED  VaastuCompliant  Microwave  GolfCourse  TV  \
0              1    0              

In [7]:
# --- 2. Handle missing values (if any) ---
# Let's check for missing values again after our initial exploration
print("\nMissing values per column:")
print(df.isnull().sum())
# If there were any, a simple approach is to drop them:
df = df.dropna()
print("\nShape after handling null values:", df.shape)


Missing values per column:
Price                  0
Area                   0
Location               0
No. of Bedrooms        0
Resale                 0
MaintenanceStaff       0
Gymnasium              0
SwimmingPool           0
LandscapedGardens      0
JoggingTrack           0
RainWaterHarvesting    0
IndoorGames            0
ShoppingMall           0
Intercom               0
SportsFacility         0
ATM                    0
ClubHouse              0
School                 0
24X7Security           0
PowerBackup            0
CarParking             0
StaffQuarter           0
Cafeteria              0
MultipurposeRoom       0
Hospital               0
WashingMachine         0
Gasconnection          0
AC                     0
Wifi                   0
Children'splayarea     0
LiftAvailable          0
BED                    0
VaastuCompliant        0
Microwave              0
GolfCourse             0
TV                     0
DiningTable            0
Sofa                   0
Wardrobe              

In [8]:
# --- 3. Clean and standardize column names ---
# Good practice to make them lowercase and replace spaces/dots with underscores
df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('.', '')
print("\nDataFrame with cleaned column names:")
print(df.head())


DataFrame with cleaned column names:
     price  area  location  no_of_bedrooms  resale  maintenancestaff  \
0  4850000   720  Kharghar               1       1                 1   
1  4500000   600  Kharghar               1       1                 1   
2  6700000   650  Kharghar               1       1                 1   
3  4500000   650  Kharghar               1       1                 1   
4  5000000   665  Kharghar               1       1                 1   

   gymnasium  swimmingpool  landscapedgardens  joggingtrack  ...  \
0          0             0                  0             0  ...   
1          1             1                  0             1  ...   
2          1             1                  0             1  ...   
3          0             0                  1             0  ...   
4          0             0                  1             0  ...   

   liftavailable  bed  vaastucompliant  microwave  golfcourse  tv  \
0              1    0                1          0  

In [9]:
# --- 4. Ensure correct data types ---
# The 'price' and 'area' columns look good (int64), but let's verify.
# The 'no_of_bedrooms' column is perfect.
# The amenity columns are already in a useful 0/1 format.
# Let's check for any strange values in 'area' or 'price'.
print("\nLooking for outliers or strange values...")
print(df.describe())
# A common issue is properties with unrealistic sizes or prices.
# Let's filter out properties with an area less than 200 sqft, as they are likely data errors.
df = df[df['area'] >= 200]
print(f"\nShape after removing tiny properties: {df.shape}")



Looking for outliers or strange values...
              price         area  no_of_bedrooms       resale  \
count  7.719000e+03  7719.000000     7719.000000  7719.000000   
mean   1.506165e+07   998.409250        1.913331     0.647105   
std    2.052100e+07   550.967809        0.855376     0.477901   
min    2.000000e+06   200.000000        1.000000     0.000000   
25%    5.300000e+06   650.000000        1.000000     0.000000   
50%    9.500000e+06   900.000000        2.000000     1.000000   
75%    1.700000e+07  1177.000000        2.000000     1.000000   
max    4.200000e+08  8511.000000        7.000000     1.000000   

       maintenancestaff    gymnasium  swimmingpool  landscapedgardens  \
count       7719.000000  7719.000000   7719.000000        7719.000000   
mean           7.498899     7.473896      7.437881           7.441638   
std            3.197923     3.252095      3.328245           3.320401   
min            0.000000     0.000000      0.000000           0.000000   
25%   

In [10]:
# --- 5. Clean the 'location' column ---
# Just like with the Bengaluru data, we'll group rare locations into an 'other' category.
# This prevents the model from overfitting to locations with very few data points.
df['location'] = df['location'].apply(lambda x: x.strip())
location_stats = df['location'].value_counts()


In [11]:
# Let's see how many locations have fewer than 10 listings
print(f"\nNumber of locations with <= 10 listings: {len(location_stats[location_stats <= 10])}")

# Let's set the threshold. Any location with 10 or fewer listings will be marked as 'other'.
locations_less_than_10 = location_stats[location_stats <= 10]
df['location'] = df['location'].apply(lambda x: 'other' if x in locations_less_than_10 else x)

print(f"\nNumber of unique locations after cleaning: {df['location'].nunique()}")
print("\nSample of location counts after grouping:")
print(df['location'].value_counts().head(10))


Number of locations with <= 10 listings: 312

Number of unique locations after cleaning: 102

Sample of location counts after grouping:
location
other             824
Kharghar          681
Thane West        577
Mira Road East    481
Ulwe              391
Nala Sopara       225
Borivali West     202
Kalyan West       197
Andheri West      189
Panvel            180
Name: count, dtype: int64


In [13]:
# Let's assume 'df' is your DataFrame after completing the initial cleaning steps.

print("--- Starting Advanced Logical Cleaning ---")
print(f"Shape before advanced cleaning: {df.shape}")

# --- 1. Feature Engineering: Create 'price_per_sqft' ---
# The price is in Lakhs (100,000). We'll convert it to actual value for calculation.
df['price_per_sqft'] = (df['price'] * 100000) / df['area']
print("\n'price_per_sqft' column created.")
print(df[['price', 'area', 'price_per_sqft']].head())
print("\nStatistics for price_per_sqft:")
print(df['price_per_sqft'].describe())


# --- 2. Removing Extreme Price Outliers by Location ---
# We will write a function to remove data points beyond 1 standard deviation from the mean
# for each location. This is a standard data science technique.

def remove_price_outliers(df_in):
    df_out = pd.DataFrame()
    for key, subdf in df_in.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        reduced_df = subdf[(subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))]
        df_out = pd.concat([df_out, reduced_df], ignore_index=True)
    return df_out

df = remove_price_outliers(df)
print(f"\nShape after removing price outliers: {df.shape}")


# --- 3. Removing Bedroom-vs-Area Outliers ---
# It's a common data error to see, for example, a 6-bedroom house with 500 sqft.
# We'll assume a typical bedroom needs at least 300 sqft on average.
# So, we will remove properties where (area / no_of_bedrooms) is less than 300.

df = df[~(df.area / df.no_of_bedrooms < 300)]
print(f"\nShape after removing bedroom/area outliers: {df.shape}")


# --- Final Cleanup ---
# Now that we've used price_per_sqft to clean, we can drop it before training the model.
df = df.drop('price_per_sqft', axis='columns')
print("\nDropped 'price_per_sqft' column.")

print("\n--- Advanced Cleaning Complete ---")
print(f"Final shape of the cleaned dataset: {df.shape}")
# --- Overwrite the old cleaned CSV with our newly deep-cleaned data ---
df.to_csv('cleaned_mumbai_data.csv', index=False)
print("\nSaved the deep-cleaned data to 'cleaned_mumbai_data.csv'")

--- Starting Advanced Logical Cleaning ---
Shape before advanced cleaning: (6583, 40)

'price_per_sqft' column created.
      price  area  price_per_sqft
0   6200000   400    1.550000e+09
1   9500000  1000    9.500000e+08
2  14900000  1245    1.196787e+09
3  14000000  1183    1.183432e+09
4   3600000  1245    2.891566e+08

Statistics for price_per_sqft:
count    6.583000e+03
mean     1.211271e+09
std      8.315487e+08
min      8.078125e+07
25%      6.215067e+08
50%      9.595960e+08
75%      1.597576e+09
max      8.235294e+09
Name: price_per_sqft, dtype: float64

Shape after removing price outliers: (4782, 41)

Shape after removing bedroom/area outliers: (4782, 41)

Dropped 'price_per_sqft' column.

--- Advanced Cleaning Complete ---
Final shape of the cleaned dataset: (4782, 40)

Saved the deep-cleaned data to 'cleaned_mumbai_data.csv'


In [15]:
# Step 1: Install and import necessary libraries ---
!pip install geopy
import pandas as pd
from geopy.geocoders import Nominatim
import time # To add delays between requests

# Load our clean dataset
df = pd.read_csv('cleaned_mumbai_data.csv')

# --- Step 2: Geocode the unique locations ---
# IMPORTANT: We only geocode the UNIQUE locations, not the entire column.
# This is much faster and avoids thousands of redundant API calls.

# Initialize the geocoder
geolocator = Nominatim(user_agent="mumbai_real_estate_app")

# Get the list of unique locations from our DataFrame
unique_locations = df['location'].unique()

# Create a dictionary to store the coordinates
location_coords = {}

print(f"Geocoding {len(unique_locations)} unique locations. This may take a few minutes...")

for loc in unique_locations:
    # We add "Mumbai, India" to make the search more specific and accurate
    query = f"{loc}, Mumbai, India"
    try:
        location_data = geolocator.geocode(query)
        if location_data:
            location_coords[loc] = (location_data.latitude, location_data.longitude)
        else:
            location_coords[loc] = (None, None) # Location not found
    except Exception as e:
        print(f"Error geocoding {loc}: {e}")
        location_coords[loc] = (None, None)

    time.sleep(1) # IMPORTANT: Be polite to the free API by adding a 1-second delay

print("\nGeocoding complete!")

# --- Step 3: Map the coordinates back to our main DataFrame ---
df['latitude'] = df['location'].map(lambda loc: location_coords.get(loc, (None, None))[0])
df['longitude'] = df['location'].map(lambda loc: location_coords.get(loc, (None, None))[1])

# Check how many locations we successfully geocoded
print(f"\nNumber of properties with missing coordinates: {df['latitude'].isnull().sum()}")

# Drop any rows where we couldn't find coordinates
df = df.dropna(subset=['latitude', 'longitude'])

print(f"Final shape after geocoding: {df.shape}")
print("\nDataFrame with new latitude and longitude columns:")
print(df.head())

# Save our newly enriched data!
df.to_csv('geocoded_mumbai_data.csv', index=False)

Geocoding 102 unique locations. This may take a few minutes...





Geocoding complete!

Number of properties with missing coordinates: 1860
Final shape after geocoding: (2922, 42)

DataFrame with new latitude and longitude columns:
      price  area location  no_of_bedrooms  resale  maintenancestaff  \
0   6200000   400   Airoli               1       1                 1   
1   9500000  1000   Airoli               2       1                 1   
2  14900000  1245   Airoli               2       0                 0   
3  14000000  1183   Airoli               2       0                 0   
4   6400000   495   Airoli               1       1                 9   

   gymnasium  swimmingpool  landscapedgardens  joggingtrack  ...  \
0          0             0                  0             0  ...   
1          0             1                  0             0  ...   
2          0             0                  0             0  ...   
3          1             1                  1             1  ...   
4          9             9                  9             9  

In [18]:
# --- Step 1: Install and import necessary libraries ---
!pip install osmnx geopandas
import osmnx as ox
import geopandas as gpd
from shapely.geometry import Point
from scipy.spatial import cKDTree
import numpy as np

# Load our geocoded dataset
df = pd.read_csv('geocoded_mumbai_data.csv')

# --- Step 2: Download amenity data (e.g., Metro Stations) from OpenStreetMap ---
print("Downloading metro station data from OpenStreetMap...")
tags = {"railway": "station", "station": "subway"}
metro_stations = ox.features.features_from_place('Mumbai, India', tags)
print(f"Found {len(metro_stations)} metro stations.")

# --- DIAGNOSTIC STEP: See the different geometry types we have ---
print("\nTypes of geometries found for stations:")
print(metro_stations.geom_type.value_counts())


# --- Step 3: Calculate the distance to the NEAREST metro station for each property ---
# Convert our property DataFrame to a GeoDataFrame
geometry = [Point(xy) for xy in zip(df['longitude'], df['latitude'])]
gdf_properties = gpd.GeoDataFrame(df, geometry=geometry, crs=metro_stations.crs)

# --- THE FIX: Calculate the centroid for all station geometries ---
# This ensures that every station, whether it's a Point or a Polygon,
# is represented by a single point (its center).
station_centroids = metro_stations.geometry.centroid

# Prepare the coordinates for efficient searching, using the centroids
station_coords = np.array(list(zip(station_centroids.y, station_centroids.x)))
property_coords = np.array(list(zip(gdf_properties.geometry.y, gdf_properties.geometry.x)))

# Use cKDTree for a very fast nearest-neighbor search
kdtree = cKDTree(station_coords)
dist, idx = kdtree.query(property_coords, k=1)

# The result 'dist' is in degrees, so we convert it to kilometers (approx)
df['dist_to_metro_km'] = dist * 111.1

print("\n'dist_to_metro_km' column created.")
print(df[['location', 'dist_to_metro_km']].head())

# Save our final, fully-enriched dataset!
df.to_csv('enriched_mumbai_data.csv', index=False)

print("\n--- Geospatial Feature Engineering Complete! ---")

Downloading metro station data from OpenStreetMap...
Found 88 metro stations.

Types of geometries found for stations:
Point      72
Polygon    16
Name: count, dtype: int64

'dist_to_metro_km' column created.
  location  dist_to_metro_km
0   Airoli          5.029939
1   Airoli          5.029939
2   Airoli          5.029939
3   Airoli          5.029939
4   Airoli          5.029939

--- Geospatial Feature Engineering Complete! ---



  station_centroids = metro_stations.geometry.centroid


In [19]:
# --- Step 1: Import Libraries and Load Data ---
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
import joblib # For saving the model

# Load our final, feature-rich dataset
df = pd.read_csv('enriched_mumbai_data.csv')

print("Dataset loaded. Shape:", df.shape)
print(df.head())


# --- Step 2: Prepare the Data for Machine Learning ---
# Machine learning models need all input to be numeric.
# Our 'location' column is text, so we need to convert it.
# The best way to do this is with One-Hot Encoding.

# 'pd.get_dummies' creates a new column for each location with a 1 or 0.
df_encoded = pd.get_dummies(df, columns=['location'], drop_first=True)

print("\nShape after one-hot encoding:", df_encoded.shape)
print("New columns created for locations.")

# Define our features (X) and our target (y)
X = df_encoded.drop('price', axis='columns')
y = df_encoded['price']


# --- Step 3: Split Data into Training and Testing Sets ---
# We train the model on the training set and then test its performance
# on the unseen testing set to get an honest evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set has {X_train.shape[0]} samples.")
print(f"Testing set has {X_test.shape[0]} samples.")


# --- Step 4: Initialize and Train the XGBoost Model ---
print("\nTraining the XGBoost model...")

# Initialize the XGBoost Regressor model
# A regressor is used because we are predicting a continuous value (price)
model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)

# Train the model on our training data
model.fit(X_train, y_train)

print("Model training complete!")


# --- Step 5: Evaluate the Model's Performance ---
print("\nEvaluating model performance on the test set...")

# Make predictions on the test data
predictions = model.predict(X_test)

# Calculate performance metrics
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

print(f"R-squared (R²): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f} Lakhs")

# Let's interpret the results in plain English
print(f"\nInterpretation:")
print(f"The R-squared score means our model can explain approximately {r2:.0%} of the variance in house prices.")
print(f"The Mean Absolute Error means that, on average, our model's price prediction is off by about {mae:.2f} Lakhs.")


# --- Step 6: Save the Trained Model to a File ---
# This is a CRITICAL step. We save the 'trained brain' so we can
# load it directly into our dashboard later without retraining.
joblib.dump(model, 'mumbai_price_model.joblib')

print("\nModel has been saved to 'mumbai_price_model.joblib'")

Dataset loaded. Shape: (2922, 43)
      price  area location  no_of_bedrooms  resale  maintenancestaff  \
0   6200000   400   Airoli               1       1                 1   
1   9500000  1000   Airoli               2       1                 1   
2  14900000  1245   Airoli               2       0                 0   
3  14000000  1183   Airoli               2       0                 0   
4   6400000   495   Airoli               1       1                 9   

   gymnasium  swimmingpool  landscapedgardens  joggingtrack  ...  microwave  \
0          0             0                  0             0  ...          0   
1          0             1                  0             0  ...          0   
2          0             0                  0             0  ...          0   
3          1             1                  1             1  ...          0   
4          9             9                  9             9  ...          9   

   golfcourse  tv  diningtable  sofa  wardrobe  refrigerat

In [20]:
# --- Step 1: Import Libraries and Load Data ---
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np # For log transform
import joblib

# Load our final, feature-rich dataset
df = pd.read_csv('enriched_mumbai_data.csv')
print("Dataset loaded. Shape:", df.shape)

# --- Step 2: Prepare Data for Machine Learning (with a key improvement) ---
# One-Hot Encode the 'location' column
df_encoded = pd.get_dummies(df, columns=['location'], drop_first=True)
print("\nShape after one-hot encoding:", df_encoded.shape)

# IMPROVEMENT: Use a Log Transform on the Price
# Price data is often "right-skewed" (many cheaper properties, few very expensive ones).
# A log transform makes the distribution more normal and helps the model perform better.
df_encoded['price_log'] = np.log(df_encoded['price'])

# Define features (X) and our NEW log-transformed target (y)
X = df_encoded.drop(['price', 'price_log'], axis='columns') # Drop both price columns from features
y = df_encoded['price_log'] # Our target is now the log of the price


# --- Step 3: Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining set has {X_train.shape[0]} samples.")


# --- Step 4: Initialize and Train the Improved XGBoost Model ---
print("\nTraining the XGBoost model...")
# We can use slightly more powerful parameters
model = xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, subsample=0.8, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)
print("Model training complete!")


# --- Step 5: Evaluate the Model's Performance (with corrected logic) ---
print("\nEvaluating model performance on the test set...")
# The model predicts the LOG of the price
log_predictions = model.predict(X_test)

# We must convert the predictions back to actual prices using the inverse of log (exponent)
predictions = np.exp(log_predictions)
# The original test prices also need to be converted back from log form
y_test_actual = np.exp(y_test)

# Calculate performance metrics
r2 = r2_score(y_test_actual, predictions)
mae = mean_absolute_error(y_test_actual, predictions)
mae_lakhs = mae / 100000 # Convert MAE from Rupees to Lakhs for easier interpretation

print(f"R-squared (R²): {r2:.2f}")
print(f"Mean Absolute Error (MAE): {mae_lakhs:.2f} Lakhs") # THE FIX IS HERE

# Let's interpret the results in plain English
print(f"\nInterpretation:")
print(f"The R-squared score means our model can explain approximately {r2:.0%} of the variance in house prices.")
print(f"The Mean Absolute Error means that, on average, our model's price prediction is off by about {mae_lakhs:.2f} Lakhs.")


# --- Step 6: Save the Trained Model to a File ---
joblib.dump(model, 'mumbai_price_model.joblib')
print("\nModel has been saved to 'mumbai_price_model.joblib'")

Dataset loaded. Shape: (2922, 43)

Shape after one-hot encoding: (2922, 109)

Training set has 2337 samples.

Training the XGBoost model...
Model training complete!

Evaluating model performance on the test set...
R-squared (R²): 0.76
Mean Absolute Error (MAE): 28.32 Lakhs

Interpretation:
The R-squared score means our model can explain approximately 76% of the variance in house prices.
The Mean Absolute Error means that, on average, our model's price prediction is off by about 28.32 Lakhs.

Model has been saved to 'mumbai_price_model.joblib'


In [21]:
# --- Step 1: Import Libraries and Load Data ---
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error
import numpy as np
import joblib

# Load our final, feature-rich dataset
df = pd.read_csv('enriched_mumbai_data.csv')
print("Dataset loaded. Shape:", df.shape)

# --- Step 2: Prepare Data for Machine Learning ---
df_encoded = pd.get_dummies(df, columns=['location'], drop_first=True)
df_encoded['price_log'] = np.log(df_encoded['price'])
X = df_encoded.drop(['price', 'price_log'], axis='columns')
y = df_encoded['price_log']

# --- Step 3: Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"\nTraining set has {X_train.shape[0]} samples.")

# --- Step 4: Hyperparameter Tuning with GridSearchCV ---
print("\nStarting hyperparameter tuning with GridSearchCV... This may take several minutes.")

# Define the grid of parameters to search
param_grid = {
    'n_estimators': [300, 500, 700],
    'max_depth': [5, 6, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8]
}

# Initialize the XGBoost model
model = xgb.XGBRegressor(random_state=42, n_jobs=-1)

# Initialize GridSearchCV
# cv=3 means 3-fold cross-validation.
# scoring='r2' tells it to optimize for the best R-squared score.
# verbose=2 will print progress updates.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='r2', verbose=2, n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

print("\nHyperparameter tuning complete!")

# Get the best model found by the search
best_model = grid_search.best_estimator_
print(f"\nBest Hyperparameters Found: {grid_search.best_params_}")


# --- Step 5: Evaluate the BEST Model's Performance ---
print("\nEvaluating the optimized model on the test set...")
log_predictions = best_model.predict(X_test)
predictions = np.exp(log_predictions)
y_test_actual = np.exp(y_test)

r2 = r2_score(y_test_actual, predictions)
mae = mean_absolute_error(y_test_actual, predictions)
mae_lakhs = mae / 100000

print(f"Optimized R-squared (R²): {r2:.2f}")
print(f"Optimized Mean Absolute Error (MAE): {mae_lakhs:.2f} Lakhs")

# --- Step 6: Save the OPTIMIZED Model to a File ---
joblib.dump(best_model, 'mumbai_price_model_optimized.joblib')
print("\nOptimized model has been saved to 'mumbai_price_model_optimized.joblib'")

Dataset loaded. Shape: (2922, 43)

Training set has 2337 samples.

Starting hyperparameter tuning with GridSearchCV... This may take several minutes.
Fitting 3 folds for each of 72 candidates, totalling 216 fits

Hyperparameter tuning complete!

Best Hyperparameters Found: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.8}

Evaluating the optimized model on the test set...
Optimized R-squared (R²): 0.76
Optimized Mean Absolute Error (MAE): 28.56 Lakhs

Optimized model has been saved to 'mumbai_price_model_optimized.joblib'


In [26]:
import joblib

# Load the trained model itself
model = joblib.load('mumbai_price_model_optimized.joblib')

# Access the model's internal list of feature names that it was trained on
# This is the GUARANTEED correct list.
correct_training_columns = model.feature_names_in_

# Save this foolproof list to our blueprint file
joblib.dump(correct_training_columns, 'training_columns.joblib')

print("A new, 100% correct 'training_columns.joblib' has been created.")
print("Please download this file and replace the old one in your project folder.")
print(f"The model expects exactly {len(correct_training_columns)} features.")

A new, 100% correct 'training_columns.joblib' has been created.
Please download this file and replace the old one in your project folder.
The model expects exactly 108 features.
