In [1]:
# Import necessary packages
import numpy as np
import pandas as pd

In [2]:
# Read the Bengaluru housing CSV (path is relative to the notebook's working directory)
csv_path = 'Bengaluru_House_Data.csv'
dataframe = pd.read_csv(csv_path)

# Preview the first five rows
dataframe.head(n=5)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [3]:
# Report each column's dtype and non-null count
dataframe.info()

# Count the missing entries per column (isna is the same check as isnull)
dataframe.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [4]:
# For each categorical column, print its frequency table followed by its number of distinct values
for column_name in ('area_type', 'location'):
    print(dataframe[column_name].value_counts())
    # unique() keeps a missing value as its own entry, so this count can be one
    # higher than the number of rows in the frequency table above
    print(len(dataframe[column_name].unique()))


# 'location' has about 1,300 distinct values (1305 non-null, 1306 counting the missing one),
# so the location column is dropped

area_type
Super built-up  Area    8790
Built-up  Area          2418
Plot  Area              2025
Carpet  Area              87
Name: count, dtype: int64
4
location
Whitefield                        540
Sarjapur  Road                    399
Electronic City                   302
Kanakpura Road                    273
Thanisandra                       234
                                 ... 
Bapuji Layout                       1
1st Stage Radha Krishna Layout      1
BEML Layout 5th stage               1
singapura paradise                  1
Abshot Layout                       1
Name: count, Length: 1305, dtype: int64
1306


In [5]:
# Remove the columns that are not used in the rest of the analysis
dataframe = dataframe.drop(columns=['availability', 'society', 'location'])

In [6]:
# Preview the first five rows of the reduced frame
dataframe.head(n=5)

Unnamed: 0,area_type,size,total_sqft,bath,balcony,price
0,Super built-up Area,2 BHK,1056,2.0,1.0,39.07
1,Plot Area,4 Bedroom,2600,5.0,3.0,120.0
2,Built-up Area,3 BHK,1440,2.0,3.0,62.0
3,Super built-up Area,3 BHK,1521,3.0,1.0,95.0
4,Super built-up Area,2 BHK,1200,2.0,1.0,51.0


In [7]:
# Discard every row whose 'size' value is missing
dataframe = dataframe.dropna(axis=0, how='any', subset=['size'])

In [8]:
# Recount the missing entries per column after the rows without 'size' were removed
dataframe.isna().sum()

area_type       0
size            0
total_sqft      0
bath           57
balcony       593
price           0
dtype: int64

In [10]:
# List the distinct labels found in the 'size' column, in order of first appearance
print(pd.unique(dataframe['size']))

['2 BHK' '4 Bedroom' '3 BHK' '4 BHK' '6 Bedroom' '3 Bedroom' '1 BHK'
 '1 RK' '1 Bedroom' '8 Bedroom' '2 Bedroom' '7 Bedroom' '5 BHK' '7 BHK'
 '6 BHK' '5 Bedroom' '11 BHK' '9 BHK' '9 Bedroom' '27 BHK' '10 Bedroom'
 '11 Bedroom' '10 BHK' '19 BHK' '16 BHK' '43 Bedroom' '14 BHK' '8 BHK'
 '12 Bedroom' '13 BHK' '18 Bedroom']


In [11]:
# Keep only the number in front of the first space ('2 BHK' -> 2, '4 Bedroom' -> 4)
dataframe['size'] = dataframe['size'].apply(lambda label: int(label.partition(' ')[0]))

# The column now holds a count, so rename it from 'size' to 'bhk'
dataframe = dataframe.rename({'size': 'bhk'}, axis='columns')

dataframe.head()

Unnamed: 0,area_type,bhk,total_sqft,bath,balcony,price
0,Super built-up Area,2,1056,2.0,1.0,39.07
1,Plot Area,4,2600,5.0,3.0,120.0
2,Built-up Area,3,1440,2.0,3.0,62.0
3,Super built-up Area,3,1521,3.0,1.0,95.0
4,Super built-up Area,2,1200,2.0,1.0,51.0


In [12]:
# Fill each missing 'bath' value with the 'bhk' value of the same row
dataframe['bath'] = dataframe['bath'].fillna(dataframe['bhk'])

# Fill each missing 'balcony' value with the median of the 'balcony' column
dataframe['balcony'] = dataframe['balcony'].fillna(dataframe['balcony'].median())

# With no nulls left, store 'bath' and 'balcony' as integers
dataframe['bath'] = dataframe['bath'].astype(int)
dataframe['balcony'] = dataframe['balcony'].astype(int)

# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
dataframe.info()

print(dataframe.head())

print(dataframe.isnull().sum())



<class 'pandas.core.frame.DataFrame'>
Index: 13304 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13304 non-null  object 
 1   bhk         13304 non-null  int64  
 2   total_sqft  13304 non-null  object 
 3   bath        13304 non-null  int64  
 4   balcony     13304 non-null  int64  
 5   price       13304 non-null  float64
dtypes: float64(1), int64(3), object(2)
memory usage: 727.6+ KB
None
              area_type  bhk total_sqft  bath  balcony   price
0  Super built-up  Area    2       1056     2        1   39.07
1            Plot  Area    4       2600     5        3  120.00
2        Built-up  Area    3       1440     2        3   62.00
3  Super built-up  Area    3       1521     3        1   95.00
4  Super built-up  Area    2       1200     2        1   51.00
area_type     0
bhk           0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64


In [16]:
# Find the 'total_sqft' values that cannot be converted to a float
def is_float(x):
    """Return True when ``float(x)`` succeeds, False otherwise.

    Args:
        x: Any value, typically a raw string taken from a column.

    Returns:
        bool: True if ``x`` can be converted with ``float()``; False if the
        conversion raises TypeError, ValueError or OverflowError.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type such as None; ValueError: text that is not
        # a number; OverflowError: an int too large for a float. Anything else
        # (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
        return False
    return True

# Display the first ten rows whose 'total_sqft' entry fails the is_float check
dataframe.loc[~dataframe['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,area_type,bhk,total_sqft,bath,balcony,price
30,Super built-up Area,4,2100 - 2850,4,0,186.0
56,Built-up Area,4,3010 - 3410,4,2,192.0
81,Built-up Area,4,2957 - 3450,4,2,224.5
122,Super built-up Area,4,3067 - 8156,4,0,477.0
137,Super built-up Area,2,1042 - 1105,2,0,54.005
165,Super built-up Area,2,1145 - 1340,2,0,43.49
188,Super built-up Area,2,1015 - 1540,2,0,56.8
224,Super built-up Area,3,1520 - 1740,3,2,74.82
410,Super built-up Area,1,34.46Sq. Meter,1,0,18.5
549,Super built-up Area,2,1195 - 1440,2,0,63.77


In [None]:
# Convert the range values to float
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a single float.

    Handles three kinds of input:
      * a plain number or numeric string (``'1056'``) -> that number as a float;
      * a range written as ``'low - high'`` -> the midpoint of the two bounds;
      * anything that cannot be parsed (``'34.46Sq. Meter'``, ``None``) -> ``None``.

    Args:
        x: The raw cell value; usually a string, but already-numeric values
            are accepted so the conversion can be applied more than once.

    Returns:
        float | None: The parsed value, or ``None`` when ``x`` is not parseable.
    """
    # Non-string input has no '-' to split on; convert it directly
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError, OverflowError):
            return None

    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Not a 'low - high' range after all (e.g. '1e-3' or '-5');
            # fall through and try the text as a single number
            pass
    try:
        return float(x)
    except ValueError:
        return None
    
# Replace every raw 'total_sqft' entry with its numeric value; entries that
# convert_sqft_to_num cannot parse come back as None and show up as nulls
dataframe['total_sqft'] = dataframe['total_sqft'].apply(convert_sqft_to_num)



In [20]:
# Discard every row whose 'total_sqft' value is null
dataframe = dataframe.dropna(axis=0, how='any', subset=['total_sqft'])

In [21]:
# Confirm that no column has missing values left
print(dataframe.isnull().sum())

# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output
dataframe.info()

area_type     0
bhk           0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 13258 entries, 0 to 13319
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   area_type   13258 non-null  object 
 1   bhk         13258 non-null  int64  
 2   total_sqft  13258 non-null  float64
 3   bath        13258 non-null  int64  
 4   balcony     13258 non-null  int64  
 5   price       13258 non-null  float64
dtypes: float64(2), int64(3), object(1)
memory usage: 725.0+ KB
None


# Outlier Detection and Removal