In [1]:
# Importing the pandas library and giving it a short name 'pd'
# We use pandas to work with tables of data (like Excel spreadsheets)
import pandas as pd

# Importing the numpy library and giving it a short name 'np'
# We use numpy to work with numbers and math (especially with big lists of numbers)
import numpy as np 

# Importing the pyplot part of matplotlib and giving it a short name 'plt'
# This helps us draw graphs and charts
from matplotlib import pyplot as plt 

# This line tells Jupyter Notebook to show the graphs right below the code
# (You only need this if you're using Jupyter Notebook)
%matplotlib inline

# Importing the main matplotlib library so we can change how our charts look
import matplotlib

# Setting the default size of the graphs to be wider and taller
# (This makes the graphs easier to see)
matplotlib.rcParams["figure.figsize"] = (20, 10)


In [2]:
df1 = pd.read_csv("Bengaluru_House_Data.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
df1.shape

(13320, 9)

In [4]:
# This line groups the data in the 'df1' table by the values in the 'area_type' column
# It then counts how many times each 'area_type' appears in the table

# Let's break it down:
# df1 is our data table (like a big Excel sheet)
# 'groupby' helps us group rows that have the same 'area_type' (like "Built-up Area", "Plot Area", etc.)
# ['area_type'] tells Python to group the data by the 'area_type' column
# ['area_type'].agg('count') tells Python to count how many rows are in each group (how many homes for each area type)

df1.groupby('area_type')['area_type'].agg('count')


area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [5]:
# This line removes (drops) the specified columns from the original dataframe (df1)
# The columns being removed are 'area_type', 'society', 'balcony', and 'availability'
# The resulting dataframe is saved as df2, which is a "cleaned-up" version of df1 without these columns
# axis='columns' specifies that we want to drop columns, not rows
df2 = df1.drop(['area_type', 'society', 'balcony', 'availability'], axis='columns')

# This line shows the first 5 rows of the cleaned dataframe (df2) so you can quickly check what it looks like
# This is useful to make sure the data looks correct after removing the specified columns
df2.head()


Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [6]:
# This line checks for missing (null) values in each column of the cleaned dataframe (df2)
# 'isnull()' returns a dataframe of the same shape, but with 'True' where values are missing and 'False' where values are present
# '.sum()' adds up the 'True' values (which are treated as 1), so it gives the total count of missing values in each column

# This is useful to understand which columns have missing data and how many missing values there are
df2.isnull().sum()


location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [7]:
# The following line removes any rows from the dataframe (df2) that have missing values.
# In other words, it deletes the rows where there is any empty cell (NaN) in the data.
# This helps make sure that the data we're working with has no missing information.
df3 = df2.dropna()

# After removing the rows with missing values, we use the following line to check if there are any remaining missing values.
# This will tell us if our data (df3) still has any empty cells.
# 'isnull()' checks for missing values, and '.sum()' adds up how many missing values there are in each column.
df3.isnull().sum()


location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [8]:
df3.shape

(13246, 5)

In [9]:
# df3 is a DataFrame.
# Think of it like a table with rows and columns.
# Example: It may have columns like "name", "size", etc.

# df3['size'] means:
# 👉 Look at only the column called "size" in that table.
# For example, it could have values like: small, medium, small, large, etc.

# .unique() means:
# 👉 Show me all the different values in that column (no repeats).
# So, if "size" has: small, medium, small, large
# This will return: small, medium, large

df3['size'].unique()


array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [12]:
# We are creating a new column called 'bhk'
# This column will only store the number of bedrooms from the 'size' column

# df3['size'] = column like: '2 BHK', '3 Bedroom', '1 RK' etc.

# .apply(lambda x: ...) means:
# 👉 Do something for each value in the 'size' column

# x.split(' ') means:
# 👉 Split the text by space → turns '2 BHK' into a list of tokens: ['2', 'BHK']
# Here, '2' and 'BHK' are tokens (small parts of the string)

# x.split(' ')[0] means:
# 👉 Take the first token from the list → which is the number as a string (e.g., '2')

# int(...) means:
# 👉 Convert that token (which is a string, like '2') into an actual number (like 2)

df3['bhk'] = df3['size'].apply(lambda x : int(x.split(' ')[0]))  # We only take the first token (the number)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x : int(x.split(' ')[0]))  # We only take the first token (the number)


In [13]:
df3.head()

Unnamed: 0,location,size,total_sqft,bath,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [15]:
# df3['bhk'] is the column where we stored the number of bedrooms (like 2, 3, 4, etc.)
# .unique() is looking at the 'bhk' column and giving us only the unique values.
# This means it removes any duplicates and only shows the different numbers of bedrooms.

df3['bhk'].unique()  


array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18])

In [17]:
# df3 is our DataFrame (think of it like a table of data)

# df3.bhk is the column in the DataFrame that contains the number of bedrooms for each property.

# df3.bhk > 20 checks which properties have more than 20 bedrooms.
# It will create a condition that only returns True for rows where the 'bhk' value is greater than 20.

# df3[...] means: Select the rows where the condition inside the brackets is True.
# So, this line will return only the rows where the number of bedrooms is greater than 20.

df3[df3.bhk > 20]

# Example output:

# location                size         total_sqft    bath   price   bhk
# 2Electronic City Phase II  27 BHK       8000         27     230.0   27
# Munnekollal              43 Bedroom     2400         40     660.0   43



Unnamed: 0,location,size,total_sqft,bath,price,bhk
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43
