In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

# --- STEP 1: Data Loading ---
# Load the single dataset file (assuming you named it 'Bengaluru_House_Data.csv')
df = pd.read_csv('data/Bengaluru_House_Data.csv') 

# Display initial info
print(f"Bengaluru data shape: {df.shape}")
df.info()
print("\nFirst 5 rows of Indian Housing Data:")
print(df.head())

# --- STEP 2: Initial Cleaning and Feature Selection (EDA) ---

# 2.1 Drop irrelevant features
# 'area_type', 'availability', 'society', and 'balcony' are often messy or irrelevant for a beginner model.
df.drop(['area_type', 'availability', 'society', 'balcony'], axis='columns', inplace=True)


Bengaluru data shape: (13320, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB

First 5 rows of Indian Housing Data:
              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Utta

In [98]:
# Step 3: Drop NaNs
initial_rows = df.shape[0] # Saving initial count for comparison

# FIX APPLIED HERE: Create a single boolean mask across all features (df) and apply it to Y_target
rows_to_keep_simple_nan = df.notnull().all(axis=1)

df = df[rows_to_keep_simple_nan].reset_index(drop=True)
Y_target = Y_target[rows_to_keep_simple_nan].reset_index(drop=True)

print(f"Total rows after dropping NaNs: {df.shape[0]} (Lost {initial_rows - df.shape[0]} rows)")


Total rows after dropping NaNs: 13246 (Lost 74 rows)


In [99]:
print(f"\nTotal rows after dropping NaNs: {df.shape[0]} (Lost {initial_rows - df.shape[0]} rows)")

# --- Step 4A: Feature Engineering - Cleaning 'size' (BHK) ---

# Create a clean numerical 'bhk' column by extracting the number from the 'size' string
# The function extracts the first part of the string, which is the number of bedrooms/BHK.
df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))

# Drop the messy original 'size' column
df.drop('size', axis='columns', inplace=True)

print("\n'bhk' column created and 'size' column dropped.")


Total rows after dropping NaNs: 13246 (Lost 74 rows)

'bhk' column created and 'size' column dropped.


In [100]:
# Step 4B: Clean total_sqft
def convert_sqft_to_num(x):
    tokens = str(x).split('-')
    if len(tokens) == 2:
        return (float(tokens[0]) + float(tokens[1])) / 2
    try:
        return float(x)
    except:
        return None

df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)
# Drop remaining NaNs (failed conversions)
# --- FIX: Use index-aligned boolean mask for dropping ---
initial_rows_sqft = df.shape[0]
rows_to_keep = df['total_sqft'].notnull()

df = df[rows_to_keep].reset_index(drop=True)
Y_target = Y_target[rows_to_keep].reset_index(drop=True)

print(f"Total rows after cleaning total_sqft errors: {df.shape[0]} (Lost {initial_rows_sqft - df.shape[0]} rows)")

Total rows after cleaning total_sqft errors: 13200 (Lost 46 rows)


In [101]:
# --- Step 5: Feature Engineering - Price Per Square Foot ---

# CRITICAL STEP: Create a new feature for Outlier Analysis
# Price is in Lakhs (â‚¹ 100,000), so we multiply by 100000 to get price in Rupees
df['price_per_sqft'] = (Y_target * 100000) / df['total_sqft']

print("\n'price_per_sqft' feature successfully created.")
print(df.head())


'price_per_sqft' feature successfully created.
                   location  total_sqft  bath   price  bhk  price_per_sqft
0  Electronic City Phase II      1056.0   2.0   39.07    2     6060.606061
1          Chikka Tirupathi      2600.0   5.0  120.00    4     1807.692308
2               Uttarahalli      1440.0   2.0   62.00    3     3750.000000
3        Lingadheeranahalli      1521.0   3.0   95.00    3     2432.610125
4                  Kothanur      1200.0   2.0   51.00    2     4166.666667


In [102]:
# --- Step 5B: rounded up values of 'total_sqft' to nearest integer ---
# ADDED ROUNDING STEP HERE: Rounding to 2 decimal places for neatness
df['price_per_sqft'] = df['price_per_sqft'].round(2)

print("\n'price_per_sqft' feature successfully created and rounded.")
print(df.head())


'price_per_sqft' feature successfully created and rounded.
                   location  total_sqft  bath   price  bhk  price_per_sqft
0  Electronic City Phase II      1056.0   2.0   39.07    2         6060.61
1          Chikka Tirupathi      2600.0   5.0  120.00    4         1807.69
2               Uttarahalli      1440.0   2.0   62.00    3         3750.00
3        Lingadheeranahalli      1521.0   3.0   95.00    3         2432.61
4                  Kothanur      1200.0   2.0   51.00    2         4166.67


In [107]:
# Step 6A: Outlier removal (BHK/Sqft ratio)
initial_rows_outlier = df.shape[0]
rows_to_keep_bhk = ~(df['total_sqft'] / df['bhk'] < 300)

df = df[rows_to_keep_bhk].reset_index(drop=True)
Y_target = Y_target[rows_to_keep_bhk].reset_index(drop=True)

In [104]:
# Step 6B: Outlier removal (Price/Sqft by location)
def remove_price_per_sqft_outliers(df_in, Y_in):
    df_out = pd.DataFrame()
    Y_out = pd.Series(dtype=np.float64) 
    
    for key, subdf in df_in.groupby('location'):
        m = np.mean(subdf['price_per_sqft'])
        st = np.std(subdf['price_per_sqft'])
        
        # Filter the group
        reduced_df = subdf[(subdf['price_per_sqft'] > (m - st)) & (subdf['price_per_sqft'] <= (m + st))]
        
        # Get the corresponding Y values
        reduced_Y = Y_in.loc[reduced_df.index]
        
        # Concatenate results
        df_out = pd.concat([df_out, reduced_df], ignore_index=False)
        Y_out = pd.concat([Y_out, reduced_Y], ignore_index=False)
        
    return df_out.reset_index(drop=True), Y_out.reset_index(drop=True) # Reset index on final output

df, Y_target = remove_price_per_sqft_outliers(df, Y_target) # Pass Y_target into the function

print(f"\nRows after Outlier Removal (sqft/bhk and price/sqft): {df.shape[0]} (Lost {initial_rows_outlier - df.shape[0]} rows)")



Rows after Outlier Removal (sqft/bhk and price/sqft): 113 (Lost 13087 rows)


In [105]:
# Step 7: Feature Encoding (One-Hot Encoding for 'location')
X = df.drop(['price_per_sqft'], axis='columns') 
X = pd.get_dummies(X, drop_first=True) 

print(f"Features (X) shape after encoding: {X.shape}")

Features (X) shape after encoding: (113, 50)


In [106]:
# Step 8: Splitting and Transformation

# Apply Log transformation to the Target (Y)
Y = np.log1p(Y_target) 

# Split the data 80% for training, 20% for testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")

X_train shape: (90, 50)
