In [3]:
# Importing the pandas library and giving it a short name 'pd'
# We use pandas to work with tables of data (like Excel spreadsheets)
import pandas as pd

# Importing the numpy library and giving it a short name 'np'
# We use numpy to work with numbers and math (especially with big lists of numbers)
import numpy as np 

# Importing the pyplot part of matplotlib and giving it a short name 'plt'
# This helps us draw graphs and charts
from matplotlib import pyplot as plt 

# This line tells Jupyter Notebook to show the graphs right below the code
# (You only need this if you're using Jupyter Notebook)
%matplotlib inline

# Importing the main matplotlib library so we can change how our charts look
import matplotlib

# Setting the default size of the graphs to be wider and taller
# (This makes the graphs easier to see)
matplotlib.rcParams["figure.figsize"] = (20, 10)


In [4]:
df1 = pd.read_csv("Bengaluru_House_Data.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
df1.shape

(13320, 9)

In [6]:
# This line groups the data in the 'df1' table by the values in the 'area_type' column
# It then counts how many times each 'area_type' appears in the table

# Let's break it down:
# df1 is our data table (like a big Excel sheet)
# 'groupby' helps us group rows that have the same 'area_type' (like "Built-up Area", "Plot Area", etc.)
# ['area_type'] tells Python to group the data by the 'area_type' column
# ['area_type'].agg('count') tells Python to count how many rows are in each group (how many homes for each area type)

df1.groupby('area_type')['area_type'].agg('count')


area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [7]:
# This line removes (drops) the specified columns from the original dataframe (df1)
# The columns being removed are 'area_type', 'society', 'balcony', and 'availability'
# The resulting dataframe is saved as df2, which is a "cleaned-up" version of df1 without these columns
# axis='columns' specifies that we want to drop columns, not rows
df2 = df1.drop(['area_type', 'society', 'balcony', 'availability'], axis='columns')

# This line shows the first 5 rows of the cleaned dataframe (df2) so you can quickly check what it looks like
# This is useful to make sure the data looks correct after removing the specified columns
df2.head()


Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [8]:
# This line checks for missing (null) values in each column of the cleaned dataframe (df2)
# 'isnull()' returns a dataframe of the same shape, but with 'True' where values are missing and 'False' where values are present
# '.sum()' adds up the 'True' values (which are treated as 1), so it gives the total count of missing values in each column

# This is useful to understand which columns have missing data and how many missing values there are
df2.isnull().sum()


location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [9]:
# The following line removes any rows from the dataframe (df2) that have missing values.
# In other words, it deletes the rows where there is any empty cell (NaN) in the data.
# This helps make sure that the data we're working with has no missing information.
df3 = df2.dropna()

# After removing the rows with missing values, we use the following line to check if there are any remaining missing values.
# This will tell us if our data (df3) still has any empty cells.
# 'isnull()' checks for missing values, and '.sum()' adds up how many missing values there are in each column.
df3.isnull().sum()


location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [10]:
df3.shape

(13246, 5)

In [11]:
# df3 is a DataFrame.
# Think of it like a table with rows and columns.
# Example: It may have columns like "name", "size", etc.

# df3['size'] means:
# 👉 Look at only the column called "size" in that table.
# For example, it could have values like: small, medium, small, large, etc.

# .unique() means:
# 👉 Show me all the different values in that column (no repeats).
# So, if "size" has: small, medium, small, large
# This will return: small, medium, large

df3['size'].unique()


array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [12]:
# We are creating a new column called 'bhk'
# This column will only store the number of bedrooms from the 'size' column

# df3['size'] = column like: '2 BHK', '3 Bedroom', '1 RK' etc.

# .apply(lambda x: ...) means:
# 👉 Do something for each value in the 'size' column

# x.split(' ') means:
# 👉 Split the text by space → turns '2 BHK' into a list of tokens: ['2', 'BHK']
# Here, '2' and 'BHK' are tokens (small parts of the string)

# x.split(' ')[0] means:
# 👉 Take the first token from the list → which is the number as a string (e.g., '2')

# int(...) means:
# 👉 Convert that token (which is a string, like '2') into an actual number (like 2)

df3['bhk'] = df3['size'].apply(lambda x : int(x.split(' ')[0]))  # We only take the first token (the number)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x : int(x.split(' ')[0]))  # We only take the first token (the number)


In [13]:
df3.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [14]:
# df3['bhk'] is the column where we stored the number of bedrooms (like 2, 3, 4, etc.)
# .unique() is looking at the 'bhk' column and giving us only the unique values.
# This means it removes any duplicates and only shows the different numbers of bedrooms.

df3['bhk'].unique()  


array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18])

In [15]:
# df3 is our DataFrame (think of it like a table of data)

# df3.bhk is the column in the DataFrame that contains the number of bedrooms for each property.

# df3.bhk > 20 checks which properties have more than 20 bedrooms.
# It will create a condition that only returns True for rows where the 'bhk' value is greater than 20.

# df3[...] means: Select the rows where the condition inside the brackets is True.
# So, this line will return only the rows where the number of bedrooms is greater than 20.

df3[df3.bhk > 20]

# Example output:

# location                size         total_sqft    bath   price   bhk
# 2Electronic City Phase II  27 BHK       8000         27     230.0   27
# Munnekollal              43 Bedroom     2400         40     660.0   43



Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43


In [17]:
# df3 is our DataFrame (like a table made of rows and columns)

# df3.total_sqft means:
# 👉 We are looking at only the column named "total_sqft"
# 👉 This column shows the size of the house or flat in square feet

# .unique() means:
# 👉 Show all the different (unique) values in this column
# 👉 Do not repeat values that appear more than once

df3.total_sqft.unique()


array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [18]:
# ✅ Purpose of the Function is_float(x):
# This function checks if a given value x can be converted into a number (float).

# 🔍 Why do we need this?
# Because in our data (df3.total_sqft), some values are messy or in different formats:
# 👉 Normal numbers like '1056' ✅
# 👉 Ranges like '1133 - 1384' ❌
# 👉 Words like '34.46Sq. Meter' ❌

# So, we want to find out which values are clean numbers (and keep them),
# and which are not (so we can fix or remove them later).

def is_float(x):
    try:
        # Try to convert the value x to a float (number with decimal)
        float(x)
    except:
        # If an error happens during conversion, it means x is NOT a clean number
        return False
    # If it gets converted without error, return True
    return True


In [22]:
# ✅ Purpose of this line:
# We are checking which rows in df3 have a 'total_sqft' value that is NOT a clean number.

# ❓ Why we used it:
# Because some entries in 'total_sqft' are not valid numbers — they are:
# 👉 Ranges like '1133 - 1384'
# 👉 Units like '34.46Sq. Meter'
# 👉 Text or bad data
# We want to **find and look at those bad values** so we can clean or fix them.

# df3 is our table (DataFrame)

# df3['total_sqft'] gives us the column with all the house sizes

# .apply(is_float) means:
# 👉 For every value in 'total_sqft', check if it's a clean number using our is_float function
# 👉 It returns True for valid numbers and False for messy/bad ones

# ~ (tilde) means NOT in Python
# 👉 So ~df3['total_sqft'].apply(is_float) gives us only the rows with BAD values (not float)

# Finally, df3[...] selects all the rows where the condition is True (i.e., bad sqft data)

df3[~df3['total_sqft'].apply(is_float)].head(10)


Unnamed: 0,location,size,total_sqft,bath,price,bhk
30,Yelahanka,4 BHK,2100 - 2850,4.0,186.0,4
122,Hebbal,4 BHK,3067 - 8156,4.0,477.0,4
137,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,54.005,2
165,Sarjapur,2 BHK,1145 - 1340,2.0,43.49,2
188,KR Puram,2 BHK,1015 - 1540,2.0,56.8,2
410,Kengeri,1 BHK,34.46Sq. Meter,1.0,18.5,1
549,Hennur Road,2 BHK,1195 - 1440,2.0,63.77,2
648,Arekere,9 Bedroom,4125Perch,9.0,265.0,9
661,Yelahanka,2 BHK,1120 - 1145,2.0,48.13,2
672,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,445.0,4


In [23]:
# ✅ Purpose of this function:
# This function cleans the 'total_sqft' values.
# It tries to turn the messy data into clean numbers (floats).

# ❓ Why we need it:
# Some values are ranges like '2100 - 2850' → we want the average = (2100 + 2850)/2
# Some are clean numbers like '1056' → we just keep as it is
# Some have units or bad data like '34.46Sq. Meter' → we skip (return None)

def convert_sqft_to_num(x):
    # 👉 First check if the value has a range using ' - ' (note: not underscore!)
    tokens = x.split(' - ')  # We use ' - ' to split ranges like '2100 - 2850'

    if len(tokens) == 2:
        # If we get 2 numbers, convert both to float and take their average
        return (float(tokens[0]) + float(tokens[1])) / 2

    try:
        # If it's not a range, try to convert directly to float (like '1056')
        return float(x)
    except:
        # If it's not a number at all (like '34.46Sq. Meter'), return None
        return None


In [27]:
# Calling the function with a single number as a string: '2166'
# Purpose:
# This is a clean number, so it should be directly converted into a float.

convert_sqft_to_num('2166')


2166.0

In [29]:
# Calling the function with a range: '2100 - 2850'
# Purpose:
# This is a range, so the function will split the string and calculate the average of the two values.

convert_sqft_to_num('2100 - 2850')


2475.0

In [31]:
# Calling the function with a string containing units: '34.46sq. Meter'
# Purpose:
# This string contains a unit ('sq. Meter'), so the function should fail to convert it and return None.

convert_sqft_to_num('34.46sq. Meter')


In [34]:
# Creating a copy of df3 and storing it in a new DataFrame df4
# This creates a completely new DataFrame df4 that is identical to df3, 
# but they are two separate objects in memory. Any changes to df4 will not affect df3.
# Purpose: This helps preserve the original DataFrame (df3) so that we don't accidentally change it
# during our data processing or analysis steps.
df4 = df3.copy()

# Applying the function 'convert_sqft_to_num' to each value in the 'total_sqft' column of df4
# This will clean up the 'total_sqft' column by converting all valid entries into numbers 
# (for ranges, it calculates the average), and invalid values will be converted to None.
# Purpose: The 'total_sqft' column may contain values like ranges or invalid entries,
# and we want to make sure that all the entries in this column are clean numeric values (or None if invalid).
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)

# Displaying the first few rows of the updated df4 DataFrame
# This allows us to see how the 'total_sqft' column has been updated after applying the conversion function.
# Purpose: This helps us confirm that the 'convert_sqft_to_num' function worked correctly
# and transformed the 'total_sqft' values as expected. It's a quick way to check the transformations.
df4.head(2)


Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4


In [37]:
# Accessing the row in df4 at index 30 using the .loc method
# .loc is used to access a specific row by its index label.
# In this case, it's fetching the data of the row at index 30 from df4.
# Purpose: This allows us to view all the details of the property located at index 30
# in the dataset, including its location, size, total_sqft, bath, price, and bhk.

df4.loc[30]


location      Yelahanka
size              4 BHK
total_sqft       2475.0
bath                4.0
price             186.0
bhk                   4
Name: 30, dtype: object

In [38]:
(2100+2850)/2

2475.0

In [41]:
# -------------------- Feature Engineering: Creating a New Column --------------------

# Step 1: Make a new copy of the cleaned DataFrame
# 📌 Why? So we keep the original df4 safe and untouched.
# If something goes wrong later, we can still go back to df4 and fix it.
df5 = df4.copy()

# Step 2: Create a new column called 'price_per_sqft'
# 👉 'price' column is in Lakhs (1 Lakh = 100,000 Rupees)
# 👉 We multiply the price by 100000 to convert it into rupees
# 👉 We divide that rupee price by total_sqft to get price per square foot
# 🧠 This helps us know how much 1 sq. foot costs in each property
# Example: A house costs 50 lakhs and is 1000 sqft
#          price_per_sqft = (50 * 100000) / 1000 = ₹5000 per sq. ft.
df5['price_per_sqft'] = df5['price'] * 100000 / df5['total_sqft']

# Step 3: Show the first few rows of df5 to check the new column
#Helps us confirm that our new column was added correctly
df5.head()


Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0


In [42]:
df5.location.unique() # it will give us how many unique locations we have 

array(['Electronic City Phase II', 'Chikka Tirupathi', 'Uttarahalli', ...,
       '12th cross srinivas nagar banshankari 3rd stage',
       'Havanur extension', 'Abshot Layout'], dtype=object)

In [44]:
# -------------------- Check Unique Locations --------------------

# Step 1: Check how many unique location names we have in the dataset
# 🧠 This helps us understand how many different areas or neighborhoods are present
# df5['location'] gives us the column
# .unique() returns all the unique location names (no duplicates)
# len(...) counts how many unique names are there
len(df5.location.unique())


1304

In [47]:
# -------------------- Clean and Analyze Location Column --------------------

# Step 1: Remove any extra spaces from location names
# 👉 Sometimes location names have spaces before or after the name like:
#    'Yelahanka ' or ' Yelahanka'
# 👉 These are treated as different locations even though they are the same
# 🧽 So we use strip() to clean the text and remove unwanted spaces
df5.location = df5.location.apply(lambda x: x.strip())

# Step 2: Count how many properties are listed in each location
# 👉 We use groupby('location') to group all the rows that have the same location name
# 👉 Then we apply 'count' on the 'location' column to see how many times each location appears
# 🧠 Example: if "Whitefield" appears 50 times, we will get "Whitefield: 50"
location_stats = df5.groupby('location')['location'].agg('count')

# Step 3: Sort the locations based on how many listings they have
# 📊 We want to see the most common locations at the top
# 📉 .sort_values(ascending=False) sorts from highest to lowest
location_stats = location_stats.sort_values(ascending=False)

# Step 4: Show the result
# 👀 This will give us an idea of:
#   - Which locations are popular (appear many times)
#   - Which locations are rare (appear only once or twice)
# 💡 This info helps in the next step where we reduce the number of unique locations
location_stats


location
Whitefield                            535
Sarjapur  Road                        392
Electronic City                       304
Kanakpura Road                        266
Thanisandra                           236
                                     ... 
poornaprajna layout                     1
pavitra paradise                        1
near Ramanashree California resort      1
mvj engineering college                 1
1Kasavanhalli                           1
Name: location, Length: 1293, dtype: int64

In [48]:
# -------------------- Analyze Rare Locations --------------------

# Step: Count how many locations appear 10 times or fewer in the dataset
# 👉 location_stats already contains how many times each location appears
# 👉 location_stats <= 10 creates a filter for rare locations (appearing <= 10 times)
# 👉 location_stats[location_stats <= 10] gives us a list of those rare locations
# 👉 len(...) counts how many such rare locations there are

# 🧠 Why is this important?
#    - These rare locations (only 1–10 listings) may not have enough data for the machine learning model to learn from
#    - Having too many unique locations creates too many dimensions/features for the model
#    - Grouping them into a common category (like 'other') helps reduce noise and improve performance

len(location_stats[location_stats <= 10])  # ✅ Output: 1052 rare locations


1052

In [49]:
# -------------------- Store Rare Locations into a New Variable --------------------

# Step 1: Create a new variable for rare locations
# 👉 location_stats contains all locations and their counts
# 👉 location_stats <= 10 filters only those locations that appear 10 times or fewer
# 👉 We are saving this filtered data in a new variable called location_stats_less_than_10

location_stats_less_than_10 = location_stats[location_stats <= 10]

# Step 2: Show the list of rare locations
# 🧠 This helps us identify which location names appear too few times in the dataset
#    These will later be replaced with a common label like 'other'
location_stats_less_than_10


location
Kalkere                               10
Sadashiva Nagar                       10
BTM 1st Stage                         10
Basapura                              10
Gunjur Palya                          10
                                      ..
poornaprajna layout                    1
pavitra paradise                       1
near Ramanashree California resort     1
mvj engineering college                1
1Kasavanhalli                          1
Name: location, Length: 1052, dtype: int64

In [50]:
len(df5.location.unique())

1293

In [51]:
# -------------------- Replace Rare Locations with 'other' --------------------

# Step 1: Replace all rare locations with a common name 'other'
# 👉 If the location is in the list of rare locations (location_stats_less_than_10), we replace it with 'other'
# 👉 Otherwise, we keep the location name as is
# 📌 Why? Having too many unique location values can make the model overfit or become too complex.
#        Grouping rare locations under 'other' simplifies the data and helps the model generalize better.

df5.location = df5.location.apply(lambda x: 'other' if x in location_stats_less_than_10 else x)

# Step 2: Count how many unique locations are left after replacing
# 🧠 This shows us how many distinct locations we have now after grouping rare ones
len(df5.location.unique())


242

In [52]:
df5.head(10)

Unnamed: 0,location,size,total_sqft,bath,price,bhk,price_per_sqft
0,Electronic City Phase II,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,Chikka Tirupathi,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,Uttarahalli,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,Lingadheeranahalli,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,Kothanur,2 BHK,1200.0,2.0,51.0,2,4250.0
5,Whitefield,2 BHK,1170.0,2.0,38.0,2,3247.863248
6,Old Airport Road,4 BHK,2732.0,4.0,204.0,4,7467.057101
7,Rajaji Nagar,4 BHK,3300.0,4.0,600.0,4,18181.818182
8,Marathahalli,3 BHK,1310.0,3.0,63.25,3,4828.244275
9,other,6 Bedroom,1020.0,6.0,370.0,6,36274.509804
