# BrokerInBlue House Price Predictor

### Importing Libraires

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import matplotlib
matplotlib.rcParams['figure.figsize'] = (20,10)

### Loading Dataset

In [2]:
df1 = pd.read_csv('Bengaluru_House_Data.csv')
df1.sample(10)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
3985,Super built-up Area,19-Oct,Hebbal,3 BHK,SNontle,2600,3.0,2.0,195.0
9729,Super built-up Area,Ready To Move,Rajaji Nagar,3 BHK,Brway G,1640,3.0,2.0,251.0
9826,Super built-up Area,Ready To Move,Kasavanhalli,2 BHK,,1069,2.0,2.0,55.0
1936,Super built-up Area,Ready To Move,Shettigere,1 BHK,,650,1.0,1.0,26.0
12323,Super built-up Area,Ready To Move,KR Puram,3 BHK,RaiewLa,1400,2.0,3.0,60.0
13281,Plot Area,Ready To Move,Margondanahalli,5 Bedroom,,1375,5.0,1.0,125.0
10370,Super built-up Area,Ready To Move,Amruthahalli,3 BHK,,2650,4.0,0.0,175.0
11279,Super built-up Area,18-Aug,Kanakpura Road,3 BHK,,1401,3.0,2.0,69.0
5727,Super built-up Area,Ready To Move,Kanakpura Road,3 BHK,PuandHi,1498,3.0,2.0,95.0
4873,Built-up Area,Ready To Move,Saptagiri Layout,2 BHK,,1100,2.0,2.0,44.0


### Exploring and Cleaning Data

In [3]:
df1.shape

(13320, 9)

In [4]:
df1.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


In [5]:
df1.groupby('area_type')['area_type'].agg('count')

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [6]:
df2 = df1.drop(['area_type','availability','society'], axis='columns')

In [7]:
df2.sample(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price
5861,Whitefield,1 BHK,905,1.0,1.0,52.0
9973,Hennur Bande,6 Bedroom,1200,6.0,3.0,180.0
12166,Thyagaraja Nagar,2 BHK,1040,2.0,1.0,68.0
4372,Amarjyothi Colony,2 BHK,1100,2.0,2.0,38.0
11595,Budigere,3 BHK,1820,3.0,2.0,85.0
1933,Kanakpura Road,2 BHK,1135,2.0,2.0,58.0
11642,Belatur,2 BHK,1275,2.0,2.0,59.0
7893,Kasavanhalli,3 BHK,1225,2.0,1.0,69.0
6609,Hennur Gardens,3 BHK,1490,2.0,1.0,115.0
709,Green Garden Layout,2 BHK,1045,2.0,0.0,45.0


In [8]:
df2.shape

(13320, 6)

In [9]:
df2['size'].value_counts()

2 BHK         5199
3 BHK         4310
4 Bedroom      826
4 BHK          591
3 Bedroom      547
1 BHK          538
2 Bedroom      329
5 Bedroom      297
6 Bedroom      191
1 Bedroom      105
8 Bedroom       84
7 Bedroom       83
5 BHK           59
9 Bedroom       46
6 BHK           30
7 BHK           17
1 RK            13
10 Bedroom      12
9 BHK            8
8 BHK            5
11 BHK           2
11 Bedroom       2
10 BHK           2
14 BHK           1
13 BHK           1
12 Bedroom       1
27 BHK           1
43 Bedroom       1
16 BHK           1
19 BHK           1
18 Bedroom       1
Name: size, dtype: int64

In [10]:
df2.isnull().sum()

location        1
size           16
total_sqft      0
bath           73
balcony       609
price           0
dtype: int64

In [11]:
df3 = df2.dropna()

In [12]:
df3.shape

(12710, 6)

In [13]:
df3.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
balcony       0
price         0
dtype: int64

In [14]:
df3['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '3 Bedroom', '1 BHK', '1 RK',
       '4 BHK', '1 Bedroom', '2 Bedroom', '6 Bedroom', '8 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '11 Bedroom', '43 Bedroom',
       '14 BHK', '8 BHK', '12 Bedroom', '10 Bedroom', '13 BHK'],
      dtype=object)

In [15]:
df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['bhk'] = df3['size'].apply(lambda x: int(x.split(' ')[0]))


In [16]:
df3

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
0,Electronic City Phase II,2 BHK,1056,2.0,1.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,3.0,120.00,4
2,Uttarahalli,3 BHK,1440,2.0,3.0,62.00,3
3,Lingadheeranahalli,3 BHK,1521,3.0,1.0,95.00,3
4,Kothanur,2 BHK,1200,2.0,1.0,51.00,2
...,...,...,...,...,...,...,...
13314,Green Glen Layout,3 BHK,1715,3.0,3.0,112.00,3
13315,Whitefield,5 Bedroom,3453,4.0,0.0,231.00,5
13317,Raja Rajeshwari Nagar,2 BHK,1141,2.0,1.0,60.00,2
13318,Padmanabhanagar,4 BHK,4689,4.0,1.0,488.00,4


In [17]:
df3[df3.bhk>10]

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
459,1 Giri Nagar,11 BHK,5000,9.0,3.0,360.0,11
1718,2Electronic City Phase II,27 BHK,8000,27.0,0.0,230.0,27
1768,1 Ramamurthy Nagar,11 Bedroom,1200,11.0,0.0,170.0,11
3853,1 Annasandrapalya,11 Bedroom,1200,6.0,3.0,150.0,11
4684,Munnekollal,43 Bedroom,2400,40.0,0.0,660.0,43
4916,1Channasandra,14 BHK,1250,15.0,0.0,125.0,14
6533,Mysore Road,12 Bedroom,2232,6.0,2.0,300.0,12
9935,1Hoysalanagar,13 BHK,5425,13.0,0.0,275.0,13


In [18]:
df3['total_sqft'].unique()

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)

In [19]:
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [22]:
df3[~df3['total_sqft'].apply(is_float)].sample(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
5940,Thanisandra,3 BHK,1349 - 3324,3.0,0.0,115.89,3
7843,Whitefield,1 BHK,540 - 740,1.0,0.0,19.83,1
12184,Hennur,3 BHK,1783 - 1878,3.0,0.0,84.205,3
122,Hebbal,4 BHK,3067 - 8156,4.0,0.0,477.0,4
1821,Sarjapur,3 Bedroom,1574Sq. Yards,3.0,1.0,76.0,3
9183,Hormavu,2 BHK,943 - 1220,2.0,0.0,38.665,2
6987,Chandapura,2 BHK,598 - 958,2.0,0.0,25.29,2
6268,Chickpet,2 BHK,122Sq. Yards,2.0,0.0,48.0,2
5783,Sarjapur,4 Bedroom,2580 - 2591,4.0,0.0,139.5,4
1484,Hebbal,2 BHK,547.34 - 827.31,2.0,0.0,42.72,2


In [28]:
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return(float(tokens[0])+float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None

In [29]:
convert_sqft_to_num('2516')

2516.0

In [30]:
convert_sqft_to_num('2516 - 5612')

4064.0

In [31]:
convert_sqft_to_num('1574Sq. Yards')

In [32]:
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(convert_sqft_to_num)
df4.sample(10)

Unnamed: 0,location,size,total_sqft,bath,balcony,price,bhk
11057,Mahadevpura,3 BHK,1500.0,2.0,1.0,70.0,3
5298,Mahalakshmi Puram,2 BHK,830.0,2.0,1.0,60.0,2
5118,Subramanyapura,2 BHK,950.0,2.0,1.0,55.0,2
1797,Chowdeshwari Layout,3 BHK,1250.0,2.0,1.0,80.0,3
10450,Panathur,2 BHK,1125.0,2.0,2.0,45.0,2
8225,Iblur Village,4 BHK,3596.0,5.0,1.0,260.0,4
7349,Manonarayanapalya,3 Bedroom,800.0,2.0,1.0,52.0,3
5187,Peenya,2 BHK,966.0,2.0,1.0,49.0,2
642,Chikkalasandra,2 BHK,875.0,2.0,3.0,52.8,2
112,Whitefield,2 BHK,1116.0,2.0,1.0,51.91,2


In [34]:
df4.loc[122]

location      Hebbal
size           4 BHK
total_sqft    5611.5
bath             4.0
balcony          0.0
price          477.0
bhk                4
Name: 122, dtype: object