In [1]:
# --- Step 1: Import Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# --- CHANGE: Added train_test_split to fix the NameError ---
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
import warnings

warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format

print("--- Step 1: Libraries Imported ---")

--- Step 1: Libraries Imported ---


In [2]:
# --- Step 2: Load Data and Initial Feature Selection ---
df = pd.read_csv('data/Bengaluru_House_Data.csv')
Y_target = df['price'].copy() # Save the target (Y)
# --- CHANGE: Added index reset to fix KeyError in Step 3 ---
Y_target = Y_target.reset_index(drop=True) 

# Drop columns that are irrelevant or have too many missing values (e.g., 'society')
df = df.drop(['area_type', 'availability', 'society', 'balcony', 'price'], axis='columns')

print(f"--- Step 2: Initial Data Loaded. Features: {df.shape} ---")
print(df.head())


--- Step 2: Initial Data Loaded. Features: (13320, 4) ---
                   location       size total_sqft  bath
0  Electronic City Phase II      2 BHK       1056  2.00
1          Chikka Tirupathi  4 Bedroom       2600  5.00
2               Uttarahalli      3 BHK       1440  2.00
3        Lingadheeranahalli      3 BHK       1521  3.00
4                  Kothanur      2 BHK       1200  2.00


In [3]:
# --- Step 3: Handling Simple Missing Values (NaN) ---
initial_rows = df.shape[0]

# --- CHANGE: Added 'if' statement to prevent KeyError on re-run ---
# Check if 'size' column exists (it won't if Step 4A ran first)
if 'size' in df.columns:
    # Create a boolean mask of rows to KEEP (where all essential columns are NOT null)
    rows_to_keep_simple_nan = df['location'].notna() & df['size'].notna() & df['bath'].notna()
else:
    # If 'size' is already gone, just check location and bath
    rows_to_keep_simple_nan = df['location'].notna() & df['bath'].notna()
# --- END OF CHANGE ---

# CRITICAL FIX: Apply the same mask to both df and Y_target AND reset index
df = df[rows_to_keep_simple_nan].reset_index(drop=True)
Y_target = Y_target[rows_to_keep_simple_nan].reset_index(drop=True)

print(f"\n--- Step 3: Simple NaNs Dropped. Rows: {df.shape[0]} (Lost {initial_rows - df.shape[0]} rows) ---")


--- Step 3: Simple NaNs Dropped. Rows: 13246 (Lost 74 rows) ---


In [4]:
# --- Step 4A: Clean 'size' column (Create 'bhk') ---
# Use .loc to avoid SettingWithCopyWarning
df.loc[:, 'bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))
df = df.drop('size', axis='columns') # Drop the original messy 'size' column

print(f"\n--- Step 4A: 'bhk' column created and 'size' column dropped ---")


--- Step 4A: 'bhk' column created and 'size' column dropped ---


In [5]:
# --- Step 4B: Clean 'total_sqft' column (Handle Ranges) ---
def convert_sqft_to_num(x):
    tokens = str(x).split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            return None # Handle malformed ranges
    try:
        return float(x)
    except ValueError:
        return None # Handle "Sq. Meter", "Perch", etc.

df.loc[:, 'total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

# --- CHANGE: New logic to fix index errors after conversion ---
initial_rows_sqft = df.shape[0]
rows_to_keep_sqft = df['total_sqft'].notna()

df = df[rows_to_keep_sqft].reset_index(drop=True)
Y_target = Y_target[rows_to_keep_sqft].reset_index(drop=True)
# --- END OF CHANGE ---

print(f"\n--- Step 4B: 'total_sqft' cleaned. Rows: {df.shape[0]} (Lost {initial_rows_sqft - df.shape[0]} rows) ---")


--- Step 4B: 'total_sqft' cleaned. Rows: 13200 (Lost 46 rows) ---


In [6]:
# --- Step 5: Feature Engineering - Price Per Square Foot ---

# CRITICAL STEP: Create a new feature for Outlier Analysis
# Price is in Lakhs (â‚¹ 100,000), so we multiply by 100000 to get price in Rupees
df['price_per_sqft'] = (Y_target * 100000) / df['total_sqft']

print("\n'price_per_sqft' feature successfully created.")
print(df.head())


'price_per_sqft' feature successfully created.
                   location total_sqft  bath  bhk price_per_sqft
0  Electronic City Phase II   1,056.00  2.00    2       3,699.81
1          Chikka Tirupathi   2,600.00  5.00    4       4,615.38
2               Uttarahalli   1,440.00  2.00    3       4,305.56
3        Lingadheeranahalli   1,521.00  3.00    3       6,245.89
4                  Kothanur   1,200.00  2.00    2       4,250.00


In [7]:
# --- Step 5B: rounded up values of 'total_sqft' to nearest integer ---
# ADDED ROUNDING STEP HERE: Rounding to 2 decimal places for neatness
df['price_per_sqft'] = df['price_per_sqft'].round(2)

print("\n'price_per_sqft' feature successfully created and rounded.")
print(df.head())


'price_per_sqft' feature successfully created and rounded.
                   location total_sqft  bath  bhk price_per_sqft
0  Electronic City Phase II   1,056.00  2.00    2       3,699.81
1          Chikka Tirupathi   2,600.00  5.00    4       4,615.38
2               Uttarahalli   1,440.00  2.00    3       4,305.56
3        Lingadheeranahalli   1,521.00  3.00    3       6,245.89
4                  Kothanur   1,200.00  2.00    2       4,250.00


In [8]:
# Step 6A: Outlier removal (BHK/Sqft ratio)
initial_rows_outlier = df.shape[0]
rows_to_keep_bhk = ~(df['total_sqft'] / df['bhk'] < 300)

df = df[rows_to_keep_bhk].reset_index(drop=True)
Y_target = Y_target[rows_to_keep_bhk].reset_index(drop=True)

In [9]:
# Step 6B: Outlier removal (Price/Sqft by location)
def remove_price_per_sqft_outliers(df_in, Y_in):
    df_out = pd.DataFrame()
    Y_out = pd.Series(dtype=np.float64) 
    
    for key, subdf in df_in.groupby('location'):
        m = np.mean(subdf['price_per_sqft'])
        st = np.std(subdf['price_per_sqft'])
        
        # Filter the group
        reduced_df = subdf[(subdf['price_per_sqft'] > (m - st)) & (subdf['price_per_sqft'] <= (m + st))]
        
        # Get the corresponding Y values
        reduced_Y = Y_in.loc[reduced_df.index]
        
        # Concatenate results
        df_out = pd.concat([df_out, reduced_df], ignore_index=False)
        Y_out = pd.concat([Y_out, reduced_Y], ignore_index=False)
        
    return df_out.reset_index(drop=True), Y_out.reset_index(drop=True) # Reset index on final output

df, Y_target = remove_price_per_sqft_outliers(df, Y_target) # Pass Y_target into the function

print(f"\nRows after Outlier Removal (sqft/bhk and price/sqft): {df.shape[0]} (Lost {initial_rows_outlier - df.shape[0]} rows)")



Rows after Outlier Removal (sqft/bhk and price/sqft): 9259 (Lost 3941 rows)


In [10]:
# Step 7: Feature Encoding (One-Hot Encoding for 'location')
X = df.drop(['price_per_sqft'], axis='columns') 
X = pd.get_dummies(X, drop_first=True) 

print(f"Features (X) shape after encoding: {X.shape}")

Features (X) shape after encoding: (9259, 2412)


In [11]:
# Step 8: Splitting and Transformation

# Apply Log transformation to the Target (Y)
Y = np.log1p(Y_target) 

# Split the data 80% for training, 20% for testing
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")

X_train shape: (7407, 2412)
