# BrokerInBlue House Price Predictor

### Importing Libraires

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20,10)

### Loading Dataset

In [2]:
df1 = pd.read_csv('Bengaluru_House_Data.csv')
df1.sample(10)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
4343,Plot Area,Ready To Move,Vishwapriya Nagar,6 Bedroom,,2100,6.0,2.0,71.0
8867,Super built-up Area,Ready To Move,Ambedkar Nagar,2 BHK,SoechHa,1409,2.0,2.0,95.0
8435,Plot Area,Ready To Move,Jalahalli,4 Bedroom,,1000,4.0,1.0,90.0
5858,Plot Area,Ready To Move,Shampura,6 Bedroom,,1150,6.0,2.0,75.0
13279,Plot Area,Ready To Move,Vishwanatha Nagenahalli,6 Bedroom,,1200,5.0,,130.0
4227,Super built-up Area,Ready To Move,Whitefield,2 BHK,,1340,2.0,1.0,77.0
9962,Super built-up Area,Ready To Move,Gottigere,2 BHK,,1100,2.0,1.0,28.0
9267,Super built-up Area,Ready To Move,Vishwapriya Layout,2 BHK,Prodsth,770,2.0,2.0,30.0
8669,Super built-up Area,Ready To Move,Bellandur,2 BHK,,1200,2.0,3.0,45.0
11340,Super built-up Area,Ready To Move,Byrasandra,2 BHK,Srhan P,1100,2.0,2.0,55.0


### Exploring and Cleaning Data

In [3]:
df1.shape

(13320, 9)

In [4]:
df1.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [5]:
df1.groupby('area_type')['area_type'].agg('count')

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [6]:
df2 = df1.drop(['area_type','availability','society'], axis='columns')

In [7]:
df2.sample(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
6314,VGP Layout,3 BHK,1515,3.0,3.0,90.0
2968,Yelahanka New Town,3 BHK,1610,3.0,2.0,92.0
9393,7th Phase JP Nagar,3 BHK,1680,3.0,2.0,122.0
9050,Bannerghatta Road,3 BHK,1365 - 1700,3.0,0.0,79.465
4182,Green Glen Layout,3 BHK,1715,3.0,2.0,115.0
1890,Harlur,2 BHK,1174,2.0,1.0,76.0
8560,7th Phase JP Nagar,2 BHK,1040,2.0,1.0,75.0
6253,Nagarbhavi,3 BHK,1850,2.0,2.0,89.0
12727,Whitefield,5 Bedroom,4144,5.0,1.0,331.0
2244,Kammasandra,2 Bedroom,1200,2.0,1.0,66.0


In [8]:
df2.shape

(13320, 6)

In [9]:
df2['size'].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: size, dtype: int64

In [10]:
df2.isnull().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [11]:
df3 = df2.dropna()

In [12]:
df3.shape

(12710, 6)

In [13]:
df3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [14]:
df3['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

In [15]:
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))


In [16]:
df3

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715,3.0,3.0,112.00,3
13315,Whitefield,5 Bedroom,3453,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00,4


In [17]:
df3[df3.bhk>10]

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
459,1 Giri Nagar,11 BHK,5000,9.0,3.0,360.0,11
1718,2Electronic City Phase II,27 BHK,8000,27.0,0.0,230.0,27
1768,1 Ramamurthy Nagar,11 Bedroom,1200,11.0,0.0,170.0,11
3853,1 Annasandrapalya,11 Bedroom,1200,6.0,3.0,150.0,11
4684,Munnekollal,43 Bedroom,2400,40.0,0.0,660.0,43
4916,1Channasandra,14 BHK,1250,15.0,0.0,125.0,14
6533,Mysore Road,12 Bedroom,2232,6.0,2.0,300.0,12
9935,1Hoysalanagar,13 BHK,5425,13.0,0.0,275.0,13


In [18]:
df3['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [19]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [20]:
df3[~df3['total_sqft'].apply(is_float)].sample(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
7286,Kanakpura Road,2 BHK,1010 - 1300,2.0,0.0,50.125,2
11498,Yelahanka,1 BHK,629 - 1026,1.0,0.0,42.535,1
11178,Hennur Road,3 BHK,1550 - 1590,3.0,0.0,75.99,3
1178,Yelahanka,3 BHK,1445 - 1455,3.0,0.0,65.255,3
11389,Electronic City Phase II,4 BHK,2150 - 2225,4.0,0.0,105.0,4
5453,Kannur,6 Bedroom,3Cents,6.0,3.0,75.0,6
5599,Hormavu,3 BHK,1469 - 1766,3.0,0.0,73.595,3
7520,Doddaballapur,3 BHK,1100Sq. Meter,2.0,1.0,48.0,3
8724,Varthur Road,1 BHK,540 - 565,1.0,0.0,13.26,1
5382,Whitefield,1 BHK,524 - 894,1.0,0.0,34.735,1


In [21]:
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return(float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [22]:
convert_sqft_to_num('2516')

2516.0

In [23]:
convert_sqft_to_num('2516 - 5612')

4064.0

In [24]:
convert_sqft_to_num('1574Sq. Yards')

In [25]:
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)
df4.sample(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
1757,Thubarahalli,2 BHK,1128.0,2.0,2.0,75.0,2
5102,Vimanapura,3 BHK,1600.0,3.0,1.0,64.0,3
9969,R.T. Nagar,2 BHK,1150.0,2.0,2.0,72.0,2
3765,Electronic City,2 BHK,1090.0,2.0,1.0,31.49,2
8848,Vasanthapura,2 BHK,940.0,2.0,1.0,40.0,2
962,sapthagiri Layout,2 BHK,1300.0,1.0,1.0,115.0,2
5309,Marathahalli,2 BHK,1200.0,2.0,1.0,67.0,2
11398,Rajaji Nagar,3 BHK,2500.0,3.0,3.0,340.0,3
135,Kanakpura Road,2 BHK,950.0,2.0,1.0,57.0,2
3727,Basavangudi,4 BHK,2600.0,4.0,3.0,260.0,4


In [26]:
df4.loc[122]

location      Hebbal
size           4 BHK
total_sqft    5611.5
bath             4.0
balcony          0.0
price          477.0
bhk                4
Name: 122, dtype: object

In [27]:
df4.sample(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
6562,Dasarahalli,3 Bedroom,2400.0,2.0,1.0,152.0,3
10332,OMBR Layout,3 BHK,1580.0,3.0,2.0,75.0,3
5175,Bellandur,2 BHK,1060.0,2.0,1.0,65.0,2
5553,Pattanagere,2 BHK,850.0,2.0,0.0,35.0,2
9669,Parappana Agrahara,2 BHK,1194.0,2.0,2.0,45.0,2
668,Shantiniketan Layout,3 BHK,2072.0,3.0,2.0,108.0,3
942,Attibele,1 BHK,400.0,1.0,1.0,11.0,1
8613,BSM Extension,2 BHK,900.0,2.0,3.0,45.0,2
8170,Srinivas Colony,3 BHK,2750.0,3.0,2.0,943.0,3
9926,Sector 2 HSR Layout,3 BHK,1515.0,2.0,3.0,69.0,3


### Feature Engineering

In [35]:
df5 = df4.copy()
df5['price_per_sqft'] = df5['price']*10000/df5['total_sqft']
df5.sample(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
2194,Nehru Nagar,3 BHK,1775.0,3.0,3.0,110.0,3,619.71831
9845,Akshaya Nagar,1 Bedroom,2000.0,1.0,0.0,200.0,1,1000.0
10484,Kanakapura,3 BHK,1290.0,2.0,3.0,51.1,3,396.124031
1517,Ramagondanahalli,3 BHK,1610.0,2.0,2.0,111.0,3,689.440994
7209,Kumbena Agrahara,2 BHK,1180.0,2.0,2.0,50.0,2,423.728814
1956,Gubbalala,3 BHK,1745.0,3.0,2.0,104.0,3,595.988539
5439,Hebbal,2 BHK,1344.0,2.0,1.0,108.0,2,803.571429
7248,Hulimavu,2 BHK,,2.0,3.0,46.0,2,
10812,Electronic City,3 BHK,1644.0,3.0,2.0,92.59,3,563.199513
8587,Whitefield,3 BHK,1564.0,3.0,1.0,103.0,3,658.567775


In [37]:
df5.isnull().sum()

location           0
size               0
total_sqft        42
bath               0
balcony            0
price              0
bhk                0
price_per_sqft    42
dtype: int64

In [39]:
df5.shape

(12710, 8)

In [42]:
len(df5.location.unique())

1265

In [48]:
df5.location = df5.location.apply(lambda x: x.strip())
location_stats = df5.groupby('location')['location'].agg('count').sort_values(ascending=False)
location_stats

location
Whitefield              515
Sarjapur  Road          372
Electronic City         302
Kanakpura Road          261
Thanisandra             234
                       ... 
Kanakapura  Rod           1
Kanakapura Main Road      1
Kanakapura Road           1
Kanakapura Road,          1
whitefiled                1
Name: location, Length: 1254, dtype: int64

In [51]:
len(location_stats[location_stats<=10])

1017

In [53]:
location_less_10 = location_stats[location_stats<=10]
location_less_10

location
1st Block Koramangala    10
Kalkere                  10
Basapura                 10
Kodigehalli              10
Gunjur Palya             10
                         ..
Kanakapura  Rod           1
Kanakapura Main Road      1
Kanakapura Road           1
Kanakapura Road,          1
whitefiled                1
Name: location, Length: 1017, dtype: int64

In [54]:
df5.location = df5.location.apply(lambda x: 'other' if x in location_less_10 else x)

In [55]:
len(df5.location.unique())

238

In [56]:
df5.sample(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk,price_per_sqft
5619,Marathahalli,2 BHK,1102.0,2.0,2.0,53.67,2,487.023593
12555,Rajaji Nagar,3 BHK,2367.0,3.0,2.0,320.0,3,1351.922264
379,Jalahalli,3 BHK,1400.0,3.0,0.0,77.0,3,550.0
8707,other,3 BHK,1800.0,4.0,3.0,115.0,3,638.888889
8340,Koramangala,2 BHK,1084.0,2.0,1.0,179.0,2,1651.291513
1100,other,1 Bedroom,750.0,1.0,0.0,56.25,1,750.0
493,5th Phase JP Nagar,2 BHK,1150.0,3.0,3.0,52.5,2,456.521739
8687,other,3 BHK,1495.0,3.0,2.0,55.0,3,367.892977
12504,Jalahalli East,4 Bedroom,1200.0,4.0,1.0,80.0,4,666.666667
6188,Benson Town,3 BHK,2850.0,4.0,3.0,470.0,3,1649.122807


In [72]:
df5 = df5.drop(['size'], axis='columns')

In [74]:
df5.sample(5)

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
9945,Vasanthapura,978.0,2.0,1.0,34.22,2,349.897751
99,other,1200.0,3.0,1.0,90.0,3,750.0
6602,Arekere,2060.0,3.0,2.0,150.0,3,728.15534
464,Chikkalasandra,1100.0,2.0,2.0,50.0,2,454.545455
8785,Kengeri,883.0,2.0,0.0,49.0,2,554.926387


In [75]:
df5.isnull().sum()

location           0
total_sqft        42
bath               0
balcony            0
price              0
bhk                0
price_per_sqft    42
dtype: int64

In [78]:
df5 = df5.dropna()

In [79]:
df5.isnull().sum()

location          0
total_sqft        0
bath              0
balcony           0
price             0
bhk               0
price_per_sqft    0
dtype: int64

In [80]:
df5.shape

(12668, 7)

### Outlier Removal

In [81]:
df5.sample(10)

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
3099,other,1070.0,2.0,2.0,33.0,2,308.411215
11977,Tumkur Road,992.0,2.0,1.0,70.0,2,705.645161
468,Akshaya Nagar,1070.0,2.0,1.0,54.0,2,504.672897
1199,Uttarahalli,1008.0,2.0,3.0,45.0,2,446.428571
11104,Electronic City Phase II,1160.0,2.0,1.0,33.5,2,288.793103
4505,Chandapura,645.0,1.0,1.0,16.45,1,255.03876
10300,Kanakpura Road,1452.0,3.0,3.0,55.6,3,382.92011
11092,Bannerghatta Road,1656.0,2.0,1.0,62.93,3,380.012077
355,other,2000.0,3.0,2.0,365.0,3,1825.0
1810,Kanakpura Road,1570.0,3.0,3.0,64.5,3,410.828025


In [82]:
df5[(df5.total_sqft/df5.bhk)<200].head()

Unnamed: 0,location,total_sqft,bath,balcony,price,bhk,price_per_sqft
68,Devarachikkanahalli,1350.0,7.0,0.0,85.0,8,629.62963
70,other,500.0,3.0,2.0,100.0,3,2000.0
89,Rajaji Nagar,710.0,6.0,3.0,160.0,6,2253.521127
119,Hennur Road,276.0,3.0,3.0,23.0,2,833.333333
129,Vishwapriya Layout,950.0,7.0,0.0,115.0,7,1210.526316


In [83]:
df5.shape

(12668, 7)

In [85]:
df6 = df5[~(df5.total_sqft/df5.bhk<200)]

In [86]:
df6.shape

(12386, 7)

In [87]:
df6.price_per_sqft.describe()

count    12386.000000
mean       638.116956
std        415.888519
min         26.782981
25%        421.434464
50%        530.769231
75%        699.929577
max      17647.058824
Name: price_per_sqft, dtype: float64

In [None]:
df6.head()