# Bengaluru house price prediction model

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams["figure.figsize"] = (20, 10)

In [3]:
df1 = pd.read_csv("Bengaluru_House_Data.csv")
df1.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [4]:
df1.shape

(13320, 9)

In [5]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
df1.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [7]:
n_unique = df1.nunique()
print("Number of unique values:\n{}".format(n_unique))

Number of unique values:
area_type          4
availability      81
location        1305
size              31
society         2688
total_sqft      2117
bath              19
balcony            4
price           1994
dtype: int64


In [8]:
df1.groupby("area_type")["area_type"].agg("count")

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [9]:
df1.groupby("society")["society"].agg("count")

society
3Codeli    2
7 ise P    1
A idse     2
A rtsai    1
ACersd     1
          ..
Zonce E    2
Zostaa     3
i1ncyRe    1
i1odsne    1
i1rtsCo    3
Name: society, Length: 2688, dtype: int64

In [10]:
df1.groupby("availability")["availability"].agg("count")

availability
14-Jul                      1
14-Nov                      1
15-Aug                      1
15-Dec                      1
15-Jun                      1
                        ...  
22-Mar                      3
22-May                     10
22-Nov                      2
Immediate Possession       16
Ready To Move           10581
Name: availability, Length: 81, dtype: int64

In [11]:
one_hot_encoded_data = pd.get_dummies(df1, columns = ["area_type"])
print(one_hot_encoded_data.iloc[:,7:].head())

    price  area_type_Built-up  Area  area_type_Carpet  Area  \
0   39.07                         0                       0   
1  120.00                         0                       0   
2   62.00                         1                       0   
3   95.00                         0                       0   
4   51.00                         0                       0   

   area_type_Plot  Area  area_type_Super built-up  Area  
0                     0                               1  
1                     1                               0  
2                     0                               0  
3                     0                               1  
4                     0                               1  


In [12]:
df2 = df1.drop(["society"],axis = "columns")

### Data Cleaning

In [13]:
df2.isnull().sum()

area_type         0
availability      0
location          1
size             16
total_sqft        0
bath             73
balcony         609
price             0
dtype: int64

In [14]:
df2[['size', 'balcony']] = df2[['size', 'balcony']].fillna(df2[['size', 'balcony']].median(numeric_only =None))

  df2[['size', 'balcony']] = df2[['size', 'balcony']].fillna(df2[['size', 'balcony']].median(numeric_only =None))


In [15]:
df2.isnull().sum()

area_type        0
availability     0
location         1
size            16
total_sqft       0
bath            73
balcony          0
price            0
dtype: int64

In [16]:
df3 = df2.dropna()
df3.isnull().sum()

area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64

In [17]:
unique_bhk_size = df3["size"].unique()
n_unique_bhk_size = df3["size"].nunique()
print("Unique values:\n{}".format(unique_bhk_size))
print("Number of unique values:\n{}".format(n_unique_bhk_size))

Unique values:
['2 BHK' '4 Bedroom' '3 BHK' '4 BHK' '6 Bedroom' '3 Bedroom' '1 BHK'
 '1 RK' '1 Bedroom' '8 Bedroom' '2 Bedroom' '7 Bedroom' '5 BHK' '7 BHK'
 '6 BHK' '5 Bedroom' '11 BHK' '9 BHK' '9 Bedroom' '27 BHK' '10 Bedroom'
 '11 Bedroom' '10 BHK' '19 BHK' '16 BHK' '43 Bedroom' '14 BHK' '8 BHK'
 '12 Bedroom' '13 BHK' '18 Bedroom']
Number of unique values:
31


In [18]:
df3["size"].nunique()

31

In [24]:
#df3["bhk"] = df3.loc("size").apply(lambda x: x.split()[0])
# https://www.kdnuggets.com/2019/06/select-rows-columns-pandas.html
#df3["bhk"] = df3.loc[:,("size")].apply(lambda x: int(x.split()[0]))
df3["bhk"] = df3["size"].apply(lambda x: int(x.split()[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3["bhk"] = df3["size"].apply(lambda x: int(x.split()[0]))


In [25]:
df3.head()

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,1200,2.0,1.0,51.0,2


In [26]:
unique_bhk = df3["bhk"].unique()
n_unique_bhk = df3["bhk"].nunique()
print("Unique values:\n{}".format(unique_bhk))
print("Number of unique values:\n{}".format(n_unique_bhk))

Unique values:
[ 2  4  3  6  1  8  7  5 11  9 27 10 19 16 43 14 12 13 18]
Number of unique values:
19


In [27]:
df3[df3.bhk > 15]

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,bhk
1718,Super built-up Area,Ready To Move,2Electronic City Phase II,27 BHK,8000,27.0,0.0,230.0,27
3379,Super built-up Area,Ready To Move,1Hanuman Nagar,19 BHK,2000,16.0,2.0,490.0,19
3609,Super built-up Area,Ready To Move,Koramangala Industrial Layout,16 BHK,10000,16.0,2.0,550.0,16
4684,Plot Area,Ready To Move,Munnekollal,43 Bedroom,2400,40.0,0.0,660.0,43
11559,Plot Area,18-Apr,1Kasavanhalli,18 Bedroom,1200,18.0,2.0,200.0,18


In [28]:
df3.total_sqft.unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [29]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

### Find wrong format values and mismatched values

In [39]:
df3[~df3["total_sqft"].apply(is_float)].shape

(190, 9)

In [38]:
# df3[df3["total_sqft"].apply(is_float)]
df3[~df3["total_sqft"].apply(is_float)].head(10)

Unnamed: 0,area_type,availability,location,size,total_sqft,bath,balcony,price,bhk
30,Super built-up Area,19-Dec,Yelahanka,4 BHK,2100 - 2850,4.0,0.0,186.0,4
122,Super built-up Area,18-Mar,Hebbal,4 BHK,3067 - 8156,4.0,0.0,477.0,4
137,Super built-up Area,19-Mar,8th Phase JP Nagar,2 BHK,1042 - 1105,2.0,0.0,54.005,2
165,Super built-up Area,18-Dec,Sarjapur,2 BHK,1145 - 1340,2.0,0.0,43.49,2
188,Super built-up Area,Ready To Move,KR Puram,2 BHK,1015 - 1540,2.0,0.0,56.8,2
410,Super built-up Area,Ready To Move,Kengeri,1 BHK,34.46Sq. Meter,1.0,0.0,18.5,1
549,Super built-up Area,18-Sep,Hennur Road,2 BHK,1195 - 1440,2.0,0.0,63.77,2
648,Built-up Area,Ready To Move,Arekere,9 Bedroom,4125Perch,9.0,2.0,265.0,9
661,Super built-up Area,Ready To Move,Yelahanka,2 BHK,1120 - 1145,2.0,0.0,48.13,2
672,Built-up Area,18-Mar,Bettahalsoor,4 Bedroom,3090 - 5002,4.0,0.0,445.0,4
