In [1]:
import pandas as pd
from geopy.distance import great_circle
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
import os

# Directory that holds the input workbook. The path is machine-specific, so it
# can be overridden with the PROPERTY_TAX_DIR environment variable; when the
# variable is unset the original hard-coded location is used unchanged.
PROJECT_DIR = os.environ.get(
    'PROPERTY_TAX_DIR',
    '/Users/nidamaryam/Desktop/Projects/PropertyTax',
)
# Relative paths used later in the notebook are resolved against this directory.
os.chdir(PROJECT_DIR)
os.getcwd()

'/Users/nidamaryam/Desktop/Projects/PropertyTax'

In [3]:
# Excel workbook loaded with pd.read_excel; the path is relative, so it is
# resolved against the current working directory.
INPUT_FILE = "FinalDataset_Geocoded_New.xlsx"
# Reference point as (latitude, longitude), matching the (Latitude, Longitude)
# order that calculate_distance builds for each row.
# NOTE(review): presumably the centre of Bangalore — confirm the intended location.
CENTER_LAT_LON = (12.9716, 77.5946) 

In [4]:
def calculate_distance(row, center_coords):
    """Return the great-circle distance in kilometres from a row to a reference point.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'Latitude' and 'Longitude' entries.
    center_coords : tuple
        Reference point as (latitude, longitude).

    Returns
    -------
    float
        Distance in km, or NaN when either coordinate of the row is missing.
    """
    latitude, longitude = row['Latitude'], row['Longitude']
    # Rows without coordinates must not abort a whole DataFrame.apply call:
    # hand back NaN so callers can drop them later (e.g. with dropna).
    if pd.isna(latitude) or pd.isna(longitude):
        return float('nan')
    return great_circle((latitude, longitude), center_coords).km

In [9]:
def train_and_evaluate_models(df):
    """Clean the listings, then fit and score three regressors on price.

    The frame is cleaned (rows with missing fields dropped, non-numeric BHK
    values removed, top 1% of Price and Area trimmed), split 90/10 into train
    and test sets, and each model is fitted inside its own pipeline that
    standardises 'Area' and 'Distance_to_Center' and passes 'BHK' through.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Distance_to_Center' and 'BHK', plus price and area
        columns named either 'Cost2'/'Area2' (renamed here) or 'Price'/'Area'.

    Returns
    -------
    dict
        Maps each model name to a dict of formatted strings with the keys
        'RMSE (Rupees)', 'RMSE (Crores)' and 'R-squared'.

    Raises
    ------
    ValueError
        If fewer than two rows survive cleaning, since a train/test split is
        then impossible.

    Notes
    -----
    The 99th-percentile caps are computed on the whole cleaned frame, i.e.
    before the train/test split.
    """
    rupees_per_crore = 10_000_000
    features = ['BHK', 'Area', 'Distance_to_Center']
    target = 'Price'
    numerical_features = ['Area', 'Distance_to_Center']

    df = df.rename(columns={'Cost2': 'Price', 'Area2': 'Area'})

    df_clean = df.dropna(subset=['Area', 'Price', 'Distance_to_Center', 'BHK']).copy()

    # Keep only rows whose BHK is an actual number. The mask is built as an
    # explicit boolean array so that it still selects rows (not columns) when
    # the frame is empty, and numpy scalar types are accepted alongside the
    # built-in int/float.
    numeric_types = (int, float, np.integer, np.floating)
    has_numeric_bhk = np.array(
        [isinstance(value, numeric_types) for value in df_clean['BHK']],
        dtype=bool,
    )
    df_clean = df_clean[has_numeric_bhk].copy()
    # A column that held mixed values is still object-typed after filtering;
    # make it numeric before it reaches the estimators.
    df_clean['BHK'] = df_clean['BHK'].astype(float)

    # Trim the most extreme 1% of prices and areas; both caps are taken from
    # the same, not-yet-trimmed frame.
    price_cap = df_clean['Price'].quantile(0.99)
    area_cap = df_clean['Area'].quantile(0.99)
    df_clean = df_clean[(df_clean['Price'] < price_cap) &
                        (df_clean['Area'] < area_cap)].copy()

    print(f"\nCleaned dataset size: {len(df_clean)} rows.")

    # Fail early with a clear message instead of deep inside train_test_split.
    if len(df_clean) < 2:
        raise ValueError(
            f"Need at least 2 cleaned rows to train and evaluate models; "
            f"got {len(df_clean)}."
        )

    X = df_clean[features]
    y = df_clean[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
        'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=100, random_state=42)
    }

    results = {}
    # Pad every model name to the longest one so the printed table lines up.
    name_width = max(len(name) for name in models)

    print("\n--- Model Training and Evaluation ---")

    for name, model in models.items():
        # Pipeline does not clone its steps, so build a fresh preprocessor for
        # each model rather than refitting one shared instance.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), numerical_features)
            ],
            remainder='passthrough'
        )
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('regressor', model)])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        rmse_cr = rmse / rupees_per_crore

        results[name] = {'RMSE (Rupees)': f'{rmse:,.2f}',
                         'RMSE (Crores)': f'{rmse_cr:.2f}',
                         'R-squared': f'{r2:.4f}'}

        print(f"| {name:<{name_width}} | RMSE: {rmse_cr:.2f} Cr | R-squared: {r2:.4f}")

    print("-------------------------------------")
    return results

In [10]:
# Load the listings workbook named by INPUT_FILE.
df = pd.read_excel(INPUT_FILE)

# Feature engineering: distance in km from each listing to CENTER_LAT_LON,
# computed row by row with calculate_distance.
print("\n--- Calculating Distance to Center ---")
df['Distance_to_Center'] = df.apply(calculate_distance, axis=1, args=(CENTER_LAT_LON,))

# Preview the new column next to the fields it was derived from.
print("Distance calculation complete. Sample rows:")
print(df.loc[:, ['Place', 'Latitude', 'Longitude', 'Distance_to_Center']].head())

# Fit and score the candidate regressors on the enriched frame.
model_results = train_and_evaluate_models(df)

# Tabulate the metrics with one row per model.
print("\nFinal Model Comparison:")
results_df = pd.DataFrame.from_dict(model_results, orient='index')
print(results_df)


--- Calculating Distance to Center ---
Distance calculation complete. Sample rows:
                               Place   Latitude  Longitude  Distance_to_Center
0   3rd Block Koramangala, Bangalore  12.936074  77.619448            4.780728
1   3rd Block Koramangala, Bangalore  12.936074  77.619448            4.780728
2   3rd Block Koramangala, Bangalore  12.936074  77.619448            4.780728
3   3rd Block Koramangala, Bangalore  12.936074  77.619448            4.780728
4   3rd Block Koramangala, Bangalore  12.936074  77.619448            4.780728

Cleaned dataset size: 1957 rows.

--- Model Training and Evaluation ---
| Linear Regression         | RMSE: 0.90 Cr | R-squared: 0.5497
| Random Forest Regressor   | RMSE: 0.85 Cr | R-squared: 0.6004
| Gradient Boosting Regressor | RMSE: 0.76 Cr | R-squared: 0.6755
-------------------------------------

Final Model Comparison:
                            RMSE (Rupees) RMSE (Crores) R-squared
Linear Regression            9,002,819.11     