### Focusing on data cleaning tips and tricks

In [2]:
import numpy as np
import pandas as pd

import warnings
# Suppress every warning for the rest of the session (no category filter is given,
# so this also hides warnings raised by pandas).
warnings.filterwarnings('ignore')

In [3]:
# Read the raw listings from disk and preview the first rows.
csv_path = "real-estate-india.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


Here, at first glance, we can see:
1) We need to extract the BHK count from the "Property Title" column into a separate column,

2) For some rows the "Name" column holds the same value as the "Location" column, which needs to be sorted out,

3) A "Property Type" column should be derived from the data in the "Property Title" column,

4) The "Price" column holds strings, which should be converted to numerical values,

5) City names should be extracted into a separate column.

In [4]:
# Work on an independent copy so the cleaning steps leave the raw frame `df` untouched
# (a plain `df2 = df` would only create a second name for the same object).
df2 = df.copy()

In [6]:
# Size of the working frame as (rows, columns).
df2.shape

(14528, 9)

In [7]:
# Row count only; same value as df2.shape[0].
len(df2)

14528

In [8]:
# The bedroom count is the first two characters of "Property Title"
# (e.g. "4 " or "10"); slice them for every row at once instead of looping,
# which also avoids relying on the index being 0..n-1.
df2["Bedrooms"] = df2["Property Title"].str[:2]

# Same values as a plain list, kept under the original name.
BHK = df2["Bedrooms"].tolist()

In [12]:
# Preview the extracted values next to their summary; the two results are shown as a tuple.
df2["Bedrooms"].head(), df2["Bedrooms"].describe()

(0    4 
 1    10
 2    3 
 3    7 
 4    2 
 Name: Bedrooms, dtype: object,
 count     14528
 unique       20
 top          2 
 freq       5691
 Name: Bedrooms, dtype: object)

In [20]:
# Remove stray symbols and the "St" prefix from the 'Bedrooms' column.
# regex=False makes every pattern a literal: "." and "+" are regex
# metacharacters, and the default of `regex` differs between pandas versions.
df2["Bedrooms"] = (
    df2["Bedrooms"]
    .str.replace(".", "", regex=False)
    .str.replace("+", "", regex=False)
    .str.replace("St", "", regex=False)
)

# Take a real copy for the next stage so later edits to df3 do not touch df2.
df3 = df2.copy()

In [21]:
# Frequency of each remaining 'Bedrooms' value, to spot entries that still need cleaning.
df3["Bedrooms"].value_counts()

Bedrooms
2     5691
1     3270
3     3132
4      990
5      438
6      296
10     200
7      143
8      132
9       78
3       44
1       30
2       30
5       24
        11
4       10
9        4
8        2
7        2
Sh       1
Name: count, dtype: int64

In [22]:
# Drop the few rows whose 'Bedrooms' value is blank (no usable count was extracted).
# The mask is derived from the data rather than from hard-coded row labels, and
# .copy() gives df3 its own data so the assignments below do not write into a slice.
bad_df = df3["Bedrooms"].str.strip().eq("")
df3 = df3[~bad_df].copy()

# Normalise what is left and convert to integers:
#   * "Sh" is mapped to "1"
#   * surrounding spaces are stripped, so "2 " and "2" become the same value
df3["Bedrooms"] = (
    df3["Bedrooms"]
    .str.replace("Sh", "1", regex=False)
    .str.strip()
    .astype(int)
)

# Human-readable label: more than one bedroom -> "<n> BHK", otherwise "<n> RK".
df3['Bedrooms1'] = np.where(
    df3['Bedrooms'] > 1,
    df3['Bedrooms'].astype(str) + ' BHK',
    df3['Bedrooms'].astype(str) + ' RK',
)

In [23]:
# Preview the frame, which now carries the 'Bedrooms' and 'Bedrooms1' columns.
df3.head()

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony,Bedrooms,Bedrooms1
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes,4,4 BHK
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes,10,10 BHK
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No,3,3 BHK
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes,7,7 BHK
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes,2,2 BHK


In [27]:
# Summary statistics for the numeric columns.
df3.describe()

Unnamed: 0,Total_Area,Price_per_SQFT,Baths,Bedrooms
count,14517.0,14517.0,14517.0,14517.0
mean,1297.690019,11659.687263,2.75167,2.561273
std,1243.919931,48511.962603,0.898,1.661337
min,70.0,0.0,1.0,1.0
25%,650.0,4480.0,2.0,2.0
50%,1000.0,6050.0,3.0,2.0
75%,1439.0,9310.0,3.0,3.0
max,35000.0,999000.0,6.0,10.0


In [30]:
# Column dtypes; used here to confirm that 'Bedrooms' was converted to an integer type.
df3.dtypes

Name               object
Property Title     object
Price              object
Location           object
Total_Area          int64
Price_per_SQFT    float64
Description        object
Baths               int64
Balcony            object
Bedrooms            int64
Bedrooms1          object
dtype: object

#### 2) Common Data in "Name" and "Location" columns

In [31]:
# Count the rows whose "Name" value also appears somewhere in the "Location" column.
# Note: isin() tests membership in the whole column, not row-by-row equality.
# Both sides come from df3 so the count agrees with the split performed on df3 afterwards.
df3["Name"].isin(df3["Location"]).value_counts()

Name
True     8883
False    5634
Name: count, dtype: int64

In [32]:
# Split the listings in two: rows whose "Name" is also a "Location" value,
# and all the other rows. The membership mask is computed once and reused.
name_is_location = df3["Name"].isin(df3["Location"])

df_split1 = df3[~name_is_location]
df_split2 = df3[name_is_location]

In [33]:
# The split leaves gaps in the row labels, so renumber from 0.
# reset_index() without drop=True keeps the old labels in an "index" column.
df_split2 = df_split2.reset_index()

# The locality is the second-to-last comma-separated part of "Name",
# e.g. "Ramanathan Nagar, Pozhichalur,Chennai" -> "Pozhichalur".
# Done for all rows at once (no hard-coded row count), and stripped so the
# label below does not contain a double space.
# NOTE(review): a Name without any comma yields NaN here and becomes
# "Property in nan"; confirm no such rows exist.
property_loc = df_split2['Name'].str.split(',').str[-2].str.strip()

# Build the replacement display name from the locality.
df_split2['Name1'] = 'Property in ' + property_loc.astype(str)

# Checking the data
df_split2.head(1)

Unnamed: 0,index,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony,Bedrooms,Bedrooms1,Name1
0,1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes,10,10 BHK,Property in Pozhichalur


In [34]:
# Columns shared by both halves, in the order wanted for the final dataset.
common_cols = ['Bedrooms1', 'Baths', 'Balcony', 'Price', 'Location', 'Total_Area',
               'Price_per_SQFT', 'Property Title', 'Description']

# Rows whose name was rebuilt: take 'Name1' and expose it as 'Name'
# so the layout matches the other half.
df_split_Edit = df_split2[['Name1'] + common_cols].rename(columns={'Name1': 'Name'})

# Rows that kept their original name: same columns, original 'Name'.
df_split1_edit = df_split1[['Name'] + common_cols]
df_split1_edit.head(4)

Unnamed: 0,Name,Bedrooms1,Baths,Balcony,Price,Location,Total_Area,Price_per_SQFT,Property Title,Description
0,Casagrand ECR 14,4 BHK,4,Yes,₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",Best 4 BHK Apartment for modern-day lifestyle ...
2,DAC Prapthi,3 BHK,3,No,₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"3 BHK Flat for sale in West Tambaram, Chennai","Property for sale in Tambaram, Chennai. This 3..."
4,VGN Spring Field Phase 1,2 BHK,3,Yes,₹48.0 L,"Avadi, Chennai",960,5000.0,"2 BHK Flat for sale in Avadi, Chennai","Property for sale in Avadi, Chennai. This 2 BH..."
5,KG Earth Homes,2 BHK,3,No,₹40.0 L,"Siruseri, Chennai",940,4250.0,"2 BHK Flat for sale in Siruseri, Chennai","Price negotiable. Big hall, big balcony, gated..."
