In [152]:
# Imports needed
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
matplotlib.rcParams["figure.figsize"] = (20,10)

In [153]:
# Create the dataframe & Read the CSV File
df = pd.read_csv('Bengaluru_House_Data.csv')
# Show the Result of the dataframe
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [154]:
# Show the Nof Rows and Columns of our Dataframe
# (ROWs,COLUMNs)
df.shape

(13320, 9)

In [155]:
# We group by area_type as it is the column we will use as a reference
# [] in brackets indicates that we will only work with area_type, specifying the column
# count counts how many times each type of area appears in the DataFrame
df.groupby('area_type')['area_type'].count()

area_type
Built-up  Area          2418
Carpet  Area              87
Plot  Area              2025
Super built-up  Area    8790
Name: area_type, dtype: int64

In [156]:
# Now Delete the columns we don't need
# So we create a new dataframe
# Final Result = Location, Size, total_sqft, bath,price
df2 = df.drop(['area_type','society','balcony','availability'], axis='columns')
df2.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


In [157]:
# NOW WE START THE DATA CLEANING PROCESS
# First of all we must know the Null values
df2.isnull().sum()
# This result means thet there's one location null, 16 with the size null too and so on
# So the next step is to drop all those values

location       1
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [158]:
# So here we create a cleande dataframe without all the null values
df_cleaned = df2.dropna()
df_cleaned.isnull().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [159]:
df_cleaned.shape

(13246, 5)

In [160]:
# SIZE: Result
# 4 Bedroom
# 3 BHK
# 3 BHK
# 2 BHK
# 2 BHK
# As we can se there are different types of Size, and we wanted to know what's going on hereç
# To do that we list all the unique values
df_cleaned['size'].unique()

array(['2 BHK', '4 Bedroom', '3 BHK', '4 BHK', '6 Bedroom', '3 Bedroom',
       '1 BHK', '1 RK', '1 Bedroom', '8 Bedroom', '2 Bedroom',
       '7 Bedroom', '5 BHK', '7 BHK', '6 BHK', '5 Bedroom', '11 BHK',
       '9 BHK', '9 Bedroom', '27 BHK', '10 Bedroom', '11 Bedroom',
       '10 BHK', '19 BHK', '16 BHK', '43 Bedroom', '14 BHK', '8 BHK',
       '12 Bedroom', '13 BHK', '18 Bedroom'], dtype=object)

In [161]:
# I can gess that 4 Bedroom and 4 BHK are essentially the same, so we are going to create a new colum
# called BHk
# here we are going to call a Fuynction in wich we are going to split by the space 
# x is going to be the numbrer the token on the position 0, teh position 1 is for the string
# So finally we have a cleaned dataframe with the column BHK wich contains only the number of bedrooms in a house
df_cleaned['BHK'] = df_cleaned['size'].apply(lambda x: int(x.split(' ')[0]))
df_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['BHK'] = df_cleaned['size'].apply(lambda x: int(x.split(' ')[0]))


Unnamed: 0,location,size,total_sqft,bath,price,BHK
0,Electronic City Phase II,2 BHK,1056,2.0,39.07,2
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0,4
2,Uttarahalli,3 BHK,1440,2.0,62.0,3
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0,3
4,Kothanur,2 BHK,1200,2.0,51.0,2


In [162]:
# to take more control on the BHK column we can show some UNIQUE values
df_cleaned['BHK'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18])

In [163]:
# Here sometimes we can se somethin strange, what type o fhouse has 11 bedrooms,
# We are going to check it so we make sure all is in rule
df_cleaned[df_cleaned.BHK > 10]

Unnamed: 0,location,size,total_sqft,bath,price,BHK
459,1 Giri Nagar,11 BHK,5000,9.0,360.0,11
1718,2Electronic City Phase II,27 BHK,8000,27.0,230.0,27
1768,1 Ramamurthy Nagar,11 Bedroom,1200,11.0,170.0,11
3379,1Hanuman Nagar,19 BHK,2000,16.0,490.0,19
3609,Koramangala Industrial Layout,16 BHK,10000,16.0,550.0,16
3853,1 Annasandrapalya,11 Bedroom,1200,6.0,150.0,11
4684,Munnekollal,43 Bedroom,2400,40.0,660.0,43
4916,1Channasandra,14 BHK,1250,15.0,125.0,14
6533,Mysore Road,12 Bedroom,2232,6.0,300.0,12
7979,1 Immadihalli,11 BHK,6000,12.0,150.0,11


In [164]:
# Here we see there i something strange
# a house with 16 Bedrooms has 10.000 OK, but you cant have 43 bedrooms with 2400
# we have to clean this dataset, so we can trust the data
# we are going to explore [total_sqft]

df_cleaned['total_sqft'].unique()
# some of teh values are a RANGE not a unique value, so to solve that we are going to
# make an average and put a final number 
# We are going to see HOW MANY of those values are in range

def is_foat(x):
    try:
        float(x)
    except:
        return False
    return True

array(['1056', '2600', '1440', ..., '1133 - 1384', '774', '4689'],
      dtype=object)