In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings(action="ignore")

In [2]:
df_dirty_train = pd.read_csv('../data/train.csv')
df_dirty_test = pd.read_csv('../data/test.csv')


In [3]:
#a.	Remove duplicates and invalid data
print(df_dirty_train.shape[0])
df_cleaned = df_dirty_train.drop_duplicates()
df_cleaned = df_cleaned[df_cleaned.size_sqft > 0]
df_cleaned.dropna(subset=['num_beds', 'num_baths', 'price', 'size_sqft'], inplace = True)
df_cleaned = df_cleaned[df_cleaned.price > 0]
df_cleaned = df_cleaned.drop(['property_details_url', 'listing_id', 'elevation', 'total_num_units', 'floor_level'], axis = 1)

print(f'Records dropped :{df_dirty_train.shape[0] - df_cleaned.shape[0]}' )
print(df_cleaned.head())

20254
Records dropped :608
                                               title  \
0          hdb flat for sale in 866 yishun street 81   
1  hdb flat for sale in 506b serangoon north aven...   
2                 4 bed condo for sale in meyerhouse   
3               3 bed condo for sale in leedon green   
4                 2 bed condo for sale in one bernam   

                              address         property_name property_type  \
0            sembawang / yishun (d27)  866 yishun street 81   hdb 4 rooms   
1  hougang / punggol / sengkang (d19)  hdb-serangoon estate           hdb   
2                      128 meyer road            meyerhouse         condo   
3                   26 leedon heights          leedon green         Condo   
4                     1 bernam street            one bernam         condo   

              tenure  built_year  num_beds  num_baths  size_sqft   furnishing  \
0                NaN      1988.0       3.0        2.0       1115  unspecified   
1  99-year 

In [70]:
#b.	Remove irrelevant data

#Elevation has only one value -> 0


In [71]:
#c.	Standardize capitalization
#df_cleaned.fillna({'built_year': 0}, inplace=True)
#df_cleaned['built_year'] = df_cleaned['built_year'].astype(str).apply(lambda x: x.replace('.0',''))
df_cleaned['property_type'] = df_cleaned['property_type'].str.lower()
df_cleaned['tenure'] = df_cleaned['tenure'].str.lower()
df_cleaned['furnishing'] = df_cleaned['furnishing'].str.lower()
df_cleaned['subzone'] = df_cleaned['subzone'].str.lower()
df_cleaned['planning_area'] = df_cleaned['planning_area'].str.lower()

In [72]:
mask_999 = ['947-year leasehold', '929-year leasehold', '946-year leasehold',
'956-year leasehold']
mask_99 = ['100-year leasehold', '102-year leasehold', '110-year leasehold', '103-year leasehold']
df_cleaned = df_cleaned.replace(mask_999, '999-year leasehold')
df_cleaned = df_cleaned.replace(mask_99, '99-year leasehold')


df_cleaned['tenure'].value_counts().sort_values()

#Encode property name
#



999-year leasehold      459
freehold               6270
99-year leasehold     11205
Name: tenure, dtype: int64

In [73]:
#e.	Clear formatting

In [74]:
#f.	Fix errors

In [75]:
#g.	Language translation

In [76]:
#h. Handle missing values

#built_year
dfmap = df_cleaned.dropna(subset = ['built_year'])[['built_year', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_cleaned = df_cleaned.drop(columns=['built_year']).merge(dfmap, on=['property_name'], how='left')

dfmap = df_cleaned.dropna(subset = ['tenure'])[['tenure', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_cleaned = df_cleaned.drop(columns=['tenure']).merge(dfmap, on=['property_name'], how='left')

dfmap = df_cleaned.dropna(subset = ['tenure'])[['tenure', 'address']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['address'])
df_cleaned = df_cleaned.drop(columns=['tenure']).merge(dfmap, on=['address'], how='left')

dfmap = df_cleaned.dropna(subset = ['available_unit_types'])[['available_unit_types', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_cleaned = df_cleaned.drop(columns=['available_unit_types']).merge(dfmap, on=['property_name'], how='left')

df_cleaned.isnull().sum()


title                      0
address                    0
property_name              0
property_type              0
num_beds                   0
num_baths                  0
size_sqft                  0
furnishing                 0
lat                        0
lng                        0
subzone                  106
planning_area            106
price                      0
built_year               901
tenure                   198
available_unit_types    1430
dtype: int64

In [67]:
df_cleaned[df_cleaned.tenure.isnull()]

Unnamed: 0,title,address,property_name,property_type,num_beds,num_baths,size_sqft,furnishing,lat,lng,subzone,planning_area,price,built_year,tenure,available_unit_types
21,5 bed house for sale in montrose terrace,mount rosie road,montrose terrace,terraced house,5.0,6.0,7829,unspecified,1.322016,103.836450,malcolm,novena,11025000.0,,,5 br
333,5 bed house for sale in florence gardens,florence road,florence gardens,semi-detached house,5.0,4.0,3500,unspecified,1.365070,103.885066,kovan,hougang,7140000.0,,,"4, 5 br"
502,5 bed house for sale in bukit loyang estate,jalan kelempong,bukit loyang estate,semi-detached house,5.0,3.0,3000,unspecified,1.362513,103.967638,flora drive,pasir ris,7551400.0,,,"2, 4, 5, 6 br"
714,5 bed house for sale in paterson garden,lengkok angsa,paterson garden,semi-detached house,5.0,7.0,3412,unspecified,1.300918,103.829973,paterson,river valley,14175000.0,,,5 br
729,7 bed house for sale in east view garden,meragi terrace,east view garden,semi-detached house,7.0,6.0,7500,unspecified,1.342076,103.960414,simei,tampines,3675000.0,1977.0,,"3, 4, 5, 6, 7, 8 br"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18856,3 bed house for sale in blair plain conservati...,everton road,blair plain conservation area,conservation house,3.0,2.0,4000,unspecified,1.277503,103.838096,everton park,bukit merah,6174000.0,,,"studio, 1, 2, 3, 4, 5, 7 br"
18983,6 bed house for sale in dunman garden,swanage road,dunman garden,corner terrace,6.0,7.0,6924,unspecified,1.308418,103.893388,mountbatten,marine parade,8384200.0,,,6 br
19303,5 bed house for sale in sunny terrace,jalan jamal,sunny terrace,corner terrace,5.0,6.0,2800,unspecified,1.312759,103.925318,frankel,bedok,8400000.0,,,5 br
19435,3 bed condo for sale in spring residences,25 ewe boon road,spring residences,apartment,3.0,2.0,1463,unspecified,1.316002,103.830431,nassim,tanglin,3444000.0,2013.0,,"2, 3 br"


In [77]:
#No need to run this unless all data has been cleaned.
#df_cleaned.to_csv('../data/test_cleaned.csv')  
df_cleaned.to_csv('../data/train_cleaned.csv')


In [20]:
#a.	Remove duplicates and invalid data
print(df_dirty_test.shape[0])
df_test_cleaned = df_dirty_test.drop_duplicates()
df_test_cleaned = df_test_cleaned[df_test_cleaned.size_sqft > 0]
df_test_cleaned.dropna(subset=['num_beds', 'num_baths', 'size_sqft'], inplace = True)
#df_test_cleaned = df_test_cleaned[df_test_cleaned.price > 0]- not applicable to test data

7000


In [21]:
#b.	Remove irrelevant data

#Elevation has only one value -> 0

df_test_cleaned = df_test_cleaned.drop(['property_details_url', 'listing_id', 'elevation', 'total_num_units', 'floor_level'], axis = 1)

In [23]:
print(f'Records dropped :{df_dirty_test.shape[0] - df_test_cleaned.shape[0]}' )
print(df_test_cleaned)

Records dropped :182
                                                 title  \
0                  1 bed condo for sale in the gazania   
1              3 bed condo for sale in vue 8 residence   
2                         1 bed condo for sale in icon   
3     hdb flat for sale in 812b choa chu kang avenue 7   
4             hdb flat for sale in 204 toa payoh north   
...                                                ...   
6995           5 bed house for sale in paradise island   
6996            5 bed house for sale in orchid village   
6997                3 bed condo for sale in the avenir   
6998           hdb flat for sale in 31 marine crescent   
6999                   3 bed condo for sale in riviere   

                                                address  \
0                                      17 how sun drive   
1                                  95 pasir ris heights   
2                                      10 gopeng street   
3     bukit batok / bukit panjang / choa chu k

In [24]:
#c.	Standardize capitalization
#df_cleaned.fillna({'built_year': 0}, inplace=True)
#df_cleaned['built_year'] = df_cleaned['built_year'].astype(str).apply(lambda x: x.replace('.0',''))
df_test_cleaned['property_type'] = df_test_cleaned['property_type'].str.lower()
df_test_cleaned['tenure'] = df_test_cleaned['tenure'].str.lower()
df_test_cleaned['furnishing'] = df_test_cleaned['furnishing'].str.lower()
df_test_cleaned['subzone'] = df_test_cleaned['subzone'].str.lower()
df_test_cleaned['planning_area'] = df_test_cleaned['planning_area'].str.lower()

In [25]:
mask_999 = ['947-year leasehold', '929-year leasehold', '946-year leasehold',
'956-year leasehold']
mask_99 = ['100-year leasehold', '102-year leasehold', '110-year leasehold', '103-year leasehold']
df_test_cleaned = df_test_cleaned.replace(mask_999, '999-year leasehold')
df_test_cleaned = df_test_cleaned.replace(mask_99, '99-year leasehold')


df_test_cleaned['tenure'].value_counts().sort_values()

#Encode property name
#

999-year leasehold     168
freehold              2147
99-year leasehold     3868
Name: tenure, dtype: int64

In [26]:
#e.	Clear formatting

In [27]:
#f. Fix errors

In [28]:
#g. Language Translations

In [31]:
#h. Handle missing values

#built_year
dfmap = df_test_cleaned.dropna(subset = ['built_year'])[['built_year', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_test_cleaned = df_test_cleaned.drop(columns=['built_year']).merge(dfmap, on=['property_name'], how='left')

df_test_cleaned = df_test_cleaned.dropna(subset=['built_year'])

dfmap = df_test_cleaned.dropna(subset = ['tenure'])[['tenure', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_test_cleaned = df_test_cleaned.drop(columns=['tenure']).merge(dfmap, on=['property_name'], how='left')

dfmap = df_test_cleaned.dropna(subset = ['tenure'])[['tenure', 'address']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['address'])
df_test_cleaned = df_test_cleaned.drop(columns=['tenure']).merge(dfmap, on=['address'], how='left')

dfmap = df_test_cleaned.dropna(subset = ['available_unit_types'])[['available_unit_types', 'property_name']].drop_duplicates()
dfmap = dfmap.drop_duplicates(subset = ['property_name'])
df_test_cleaned = df_test_cleaned.drop(columns=['available_unit_types']).merge(dfmap, on=['property_name'], how='left')

df_test_cleaned.isnull().sum()

title                     0
address                   0
property_name             0
property_type             0
num_beds                  0
num_baths                 0
size_sqft                 0
furnishing                0
lat                       0
lng                       0
subzone                  33
planning_area            33
built_year                0
tenure                   38
available_unit_types    512
dtype: int64

In [32]:
df_test_cleaned[df_test_cleaned.tenure.isnull()]

Unnamed: 0,title,address,property_name,property_type,num_beds,num_baths,size_sqft,furnishing,lat,lng,subzone,planning_area,built_year,tenure,available_unit_types
305,5 bed house for sale in pasir ris beach park,pasir ris way,pasir ris beach park,bungalow,5.0,7.0,7535,unspecified,1.386285,103.937608,pasir ris wafer fab park,pasir ris,1977.0,,"3, 4, 5, 6, 7, 9 br"
703,5 bed house for sale in pasir ris beach park,pasir ris way,pasir ris beach park,bungalow,5.0,5.0,7535,unfurnished,1.386285,103.937608,pasir ris wafer fab park,pasir ris,1977.0,,"3, 4, 5, 6, 7, 9 br"
705,5 bed house for sale in pasir ris beach park,pasir ris way,pasir ris beach park,bungalow,5.0,5.0,7544,unspecified,1.386285,103.937608,pasir ris wafer fab park,pasir ris,1977.0,,"3, 4, 5, 6, 7, 9 br"
812,4 bed house for sale in pasir ris beach park,pasir ris way,pasir ris beach park,semi-detached house,4.0,4.0,5990,unspecified,1.386285,103.937608,pasir ris wafer fab park,pasir ris,1977.0,,"3, 4, 5, 6, 7, 9 br"
1110,hdb flat for sale in 538 upper cross street,boat quay / raffles place / marina (d1),538 upper cross street,hdb,2.0,2.0,732,unspecified,1.285244,103.8457,china square,outram,1978.0,,
1140,4 bed house for sale in pasir ris beach park,pasir ris avenue,pasir ris beach park,semi-detached house,4.0,5.0,5989,unspecified,1.386285,103.937608,pasir ris wafer fab park,pasir ris,1977.0,,"3, 4, 5, 6, 7, 9 br"
1287,4 bed house for sale in pasir ris beach park,pasir ris way,pasir ris beach park,semi-detached house,4.0,4.0,5990,unspecified,1.386285,103.937608,pasir ris wafer fab park,pasir ris,1977.0,,"3, 4, 5, 6, 7, 9 br"
1341,4 bed house for sale in pasir ris beach park,pasir ris way,pasir ris beach park,semi-detached house,4.0,4.0,5989,unspecified,1.386285,103.937608,pasir ris wafer fab park,pasir ris,1977.0,,"3, 4, 5, 6, 7, 9 br"
1629,5 bed house for sale in pasir ris beach park,pasir ris way,pasir ris beach park,bungalow,5.0,6.0,7537,unspecified,1.386285,103.937608,pasir ris wafer fab park,pasir ris,1977.0,,"3, 4, 5, 6, 7, 9 br"
1831,4 bed house for sale in pasir ris beach park,pasir ris way,pasir ris beach park,semi-detached house,4.0,4.0,5989,unspecified,1.386285,103.937608,pasir ris wafer fab park,pasir ris,1977.0,,"3, 4, 5, 6, 7, 9 br"


In [33]:
df_test_cleaned.to_csv('../data/test_cleaned.csv')