In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- STEP 1: Data Loading ---
# Read the Bengaluru housing CSV from the local data/ directory.
df = pd.read_csv('data/Bengaluru_House_Data.csv')

# Quick look at the raw frame: dimensions, per-column dtypes / non-null counts,
# and the first few rows.
print(f"Bengaluru data shape: {df.shape}")
df.info()
print("\nFirst 5 rows of Indian Housing Data:")
print(df.head())

# --- STEP 2: Initial Cleaning and Feature Selection (EDA) ---

# 2.1 Remove columns that will not be used as features.
# 'area_type', 'availability', 'society', and 'balcony' are treated as too messy
# or not useful for a beginner model, so they are discarded up front.
df = df.drop(columns=['area_type', 'availability', 'society', 'balcony'])


Bengaluru data shape: (13320, 9)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB

First 5 rows of Indian Housing Data:
              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Utta

In [12]:
# --- Step 3: Handling Simple Missing Values (NaN) ---

# Rows with missing values are removed outright instead of being imputed.
# Record the row count first so the number of lost rows can be reported below.
initial_rows = len(df)

# Remove, in place, every row of the feature frame that has at least one NaN.
df.dropna(axis='index', how='any', inplace=True)

# --- Keep Y_target Synchronized ---
# Restrict Y_target to the index labels that survived in df so that features
# and target keep referring to the same rows.
Y_target = Y_target[df.index]

print(f"\nTotal rows after dropping NaNs: {df.shape[0]} (Lost {initial_rows - df.shape[0]} rows)")


Total rows after dropping NaNs: 13200 (Lost 74 rows)


In [13]:
# NOTE(review): this print repeats the NaN-drop summary already emitted by the
# previous step; it relies on `initial_rows` still holding that step's value.
print(f"\nTotal rows after dropping NaNs: {df.shape[0]} (Lost {initial_rows - df.shape[0]} rows)")

# --- Step 4A: Feature Engineering - Cleaning 'size' (BHK) ---

# Build an integer 'bhk' column from the leading token of each 'size' string,
# i.e. the text before the first space (e.g. "2 BHK" -> 2).
df['bhk'] = df['size'].map(lambda size_text: int(size_text.split(' ')[0]))

# The raw 'size' text is redundant once 'bhk' exists, so remove it.
df.drop(columns=['size'], inplace=True)

print("\n'bhk' column created and 'size' column dropped.")


Total rows after dropping NaNs: 13200 (Lost 74 rows)

'bhk' column created and 'size' column dropped.


In [14]:
# Assuming previous code is run, 'df' is the feature set and 'Y_target' is the price.

# --- STEP 4B: Cleaning the 'total_sqft' column (FIXED) ---

def convert_sqft_to_num(x):
    """Convert a raw 'total_sqft' entry to a float, or None if it cannot be parsed.

    Three kinds of input are handled:
      * a range such as "1050 - 1100" -> the average of the two bounds
      * a single number such as "1056" -> that number as a float
      * anything else (e.g. "34.46Sq. Meter") -> None

    Parameters
    ----------
    x : object
        Raw cell value. It is stringified for the range check, and passed to
        ``float`` directly for the single-number case.

    Returns
    -------
    float or None
        The parsed value, or None when the entry is not a number or a
        two-sided numeric range.
    """
    # Only genuine conversion failures mean "unparseable". A bare `except:` would
    # also swallow KeyboardInterrupt / SystemExit, making a long `.apply` run
    # impossible to interrupt, so the handled exception types are listed explicitly.
    conversion_errors = (TypeError, ValueError, OverflowError)

    # 1. Case: Range (e.g., "1050 - 1100") -- exactly two pieces around one '-'.
    tokens = str(x).split('-')
    if len(tokens) == 2:
        try:
            # float() tolerates the surrounding whitespace left by the split.
            return (float(tokens[0]) + float(tokens[1])) / 2
        except conversion_errors:
            return None

    # 2. Case: Single number (e.g., "1056")
    try:
        return float(x)
    except conversion_errors:
        # 3. Case: Other units (e.g., "34.46Sq. Meter") - not convertible
        return None

# Convert every 'total_sqft' entry to a number; entries the converter cannot
# parse come back as missing values.
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

# --- Synchronization FIX ---
# Row count before removing the entries whose conversion failed.
initial_rows = len(df)

# 1. Collect the index labels of rows whose 'total_sqft' is now null.
rows_to_drop = df.loc[df['total_sqft'].isna()].index

# 2. Remove those rows from the feature set (df).
df.drop(index=rows_to_drop, inplace=True)

# 3. Remove the very same labels from the target (Y_target); dropping by label
#    rather than by position is what keeps features and target aligned.
Y_target.drop(index=rows_to_drop, inplace=True)

print(f"Total rows after cleaning total_sqft errors: {df.shape[0]} (Lost {initial_rows - df.shape[0]} rows)")
print("\n'total_sqft' column is now fully numerical.")
print(df.head())


Total rows after cleaning total_sqft errors: 13200 (Lost 0 rows)

'total_sqft' column is now fully numerical.
                   location  total_sqft  bath  bhk
0  Electronic City Phase II      1056.0   2.0    2
1          Chikka Tirupathi      2600.0   5.0    4
2               Uttarahalli      1440.0   2.0    3
3        Lingadheeranahalli      1521.0   3.0    3
4                  Kothanur      1200.0   2.0    2
