In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas deprecation/FutureWarning messages that
# later cells may raise — consider narrowing the filter.
warnings.filterwarnings(action="ignore")

In [2]:
# Read the raw train and test listings, then preview the first rows of the train set.
df_dirty_train, df_dirty_test = (
    pd.read_csv('../data/train.csv'),
    pd.read_csv('../data/test.csv'),
)
df_dirty_train.head()

Unnamed: 0,listing_id,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,...,furnishing,available_unit_types,total_num_units,property_details_url,lat,lng,elevation,subzone,planning_area,price
0,122881,hdb flat for sale in 866 yishun street 81,sembawang / yishun (d27),866 yishun street 81,hdb 4 rooms,,1988.0,3.0,2.0,1115,...,unspecified,,116.0,https://www.99.co/singapore/hdb/866-yishun-str...,1.414399,103.837196,0,yishun south,yishun,514500.0
1,259374,hdb flat for sale in 506b serangoon north aven...,hougang / punggol / sengkang (d19),hdb-serangoon estate,hdb,99-year leasehold,1992.0,4.0,2.0,1575,...,unspecified,"1, 2, 3, 4, 5, 6 br",,https://www.99.co/singapore/hdb/hdbserangoon-e...,1.372597,103.875625,0,serangoon north,serangoon,995400.0
2,665422,4 bed condo for sale in meyerhouse,128 meyer road,meyerhouse,condo,freehold,2022.0,4.0,6.0,3070,...,partial,"studio, 3, 4, 5, 6 br",56.0,https://www.99.co/singapore/condos-apartments/...,1.298773,103.895798,0,mountbatten,marine parade,8485000.0
3,857699,3 bed condo for sale in leedon green,26 leedon heights,leedon green,Condo,freehold,2023.0,3.0,2.0,958,...,partial,"studio, 1, 2, 3, 4 br",638.0,https://www.99.co/singapore/condos-apartments/...,1.312364,103.803271,0,farrer court,bukit timah,2626000.0
4,216061,2 bed condo for sale in one bernam,1 bernam street,one bernam,condo,99-year leasehold,2026.0,2.0,1.0,732,...,unspecified,"studio, 1, 2, 3, 4, 5 br",351.0,https://www.99.co/singapore/condos-apartments/...,1.273959,103.843635,0,anson,downtown core,1764000.0


In [3]:
#a.	Remove duplicates and invalid data
# Each filter keeps only rows that pass its comparison; a missing (NaN) value fails
# the comparison, so rows lacking any of these fields are removed as well.
print(len(df_dirty_train))
df_cleaned = (
    df_dirty_train
    .drop_duplicates()
    .loc[lambda d: d.size_sqft > 0]
    .loc[lambda d: d.num_beds >= 0]
    .loc[lambda d: d.num_baths >= 0]
    .loc[lambda d: d.price > 0]
)

print(len(df_cleaned))
print(f'Records dropped :{len(df_dirty_train) - len(df_cleaned)}')


20254
19646
Records dropped :608


In [4]:
#b.	Remove irrelevant data
# The listing URL and listing id columns are dropped, and any row without a
# price is removed.
df_cleaned = (
    df_cleaned
    .drop(columns=['property_details_url', 'listing_id'])
    .dropna(subset=['price'])
)


In [5]:
#c.	Standardize capitalization
# built_year: missing values become 0 and the column is stored as text with the
# ".0" float suffix stripped (it is cast to int in the next step).
df_cleaned['built_year'] = (
    df_cleaned['built_year']
    .fillna(0)
    .astype(str)
    .str.replace('.0', '', regex=False)
)
# Lower-case every categorical text column so values compare consistently.
for text_col in ['property_type', 'tenure', 'furnishing', 'subzone', 'planning_area']:
    df_cleaned[text_col] = df_cleaned[text_col].str.lower()

In [6]:
#d.	Convert data type
# Cast the year/count columns to integers numerically instead of editing their
# string form: str.replace('.0', '') removes every ".0" it finds, so a value such
# as 2.05 would silently turn into 25.
for int_col in ['built_year', 'num_beds', 'num_baths']:
    # pd.to_numeric accepts both the float columns and a text built_year column.
    as_number = pd.to_numeric(df_cleaned[int_col])
    # Fail loudly (missing values included) rather than truncate a non-whole number.
    if not (as_number % 1 == 0).all():
        raise ValueError(f'{int_col} contains missing or non-integer values')
    df_cleaned[int_col] = as_number.astype(int)
# Show the resulting dtypes and non-null counts.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19646 entries, 0 to 20253
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 19646 non-null  object 
 1   address               19646 non-null  object 
 2   property_name         19646 non-null  object 
 3   property_type         19646 non-null  object 
 4   tenure                17934 non-null  object 
 5   built_year            19646 non-null  int64  
 6   num_beds              19646 non-null  int64  
 7   num_baths             19646 non-null  int64  
 8   size_sqft             19646 non-null  int64  
 9   floor_level           3455 non-null   object 
 10  furnishing            19646 non-null  object 
 11  available_unit_types  18216 non-null  object 
 12  total_num_units       14019 non-null  float64
 13  lat                   19646 non-null  float64
 14  lng                   19646 non-null  float64
 15  elevation          

In [7]:
#e.	Clear formatting

In [8]:
#f.	Fix errors
# Summary statistics of the numeric columns, used to spot out-of-range values
# (compare the min/max of lat and lng against Singapore's coordinates).

df_cleaned.describe()

Unnamed: 0,built_year,num_beds,num_baths,size_sqft,total_num_units,lat,lng,elevation,price
count,19646.0,19646.0,19646.0,19646.0,14019.0,19646.0,19646.0,19646.0,19646.0
mean,1916.884455,3.124809,2.649751,1868.839,374.319709,1.432684,103.84946,0.0,5309694.0
std,423.962628,1.284993,1.474461,13748.3,345.027693,1.562986,3.634192,0.0,282241400.0
min,0.0,1.0,1.0,65.0,4.0,1.239621,-77.065364,0.0,249900.0
25%,1997.0,2.0,2.0,807.0,105.0,1.307873,103.80519,0.0,819000.0
50%,2016.0,3.0,2.0,1119.0,296.0,1.329565,103.841474,0.0,1680000.0
75%,2023.0,4.0,3.0,1528.0,559.5,1.372693,103.881514,0.0,3255000.0
max,2028.0,10.0,10.0,1496000.0,2612.0,69.486768,121.023232,0.0,39242430000.0


Singapore's extreme points have the following approximate (latitude, longitude) coordinates:
1. left-most (Tuas) :  1.30871,103.64287
2. right-most (Changi) : 1.34538,104.00270
3. top-most (Sembawang) : 1.46227,103.79487
4. bottom-most (Bukit Merah) : 1.28762,103.82467


#### Min latitude - 1.28762       Max latitude - 1.46227

#### Min longitude - 103.64         Max longitude - 104.00

However, the data contains a minimum longitude of -77.065364, a maximum longitude of 121.023232 and a maximum latitude of 69.486768, all of which lie far outside Singapore's latitude and longitude range, so these coordinates must be wrong.

<img src="images/singapore-lat-long-map.jpeg" width=600 height=600 />



In [9]:
# Collect the listings whose coordinates sit at the out-of-range extremes seen in
# the summary statistics, then stack the three groups into one frame.
# Interesting: in all of these records "planning_area" and "subzone" are missing;
# this can also be verified by checking the count of missing values.
df_max_lng = df_cleaned.loc[df_cleaned['lng'].gt(121)]
df_min_lng = df_cleaned.loc[df_cleaned['lng'].lt(-77)]
df_max_lat = df_cleaned.loc[df_cleaned['lat'].gt(69)]
df_wrong_coordinates = pd.concat([df_max_lng, df_min_lng, df_max_lat])

df_wrong_coordinates

Unnamed: 0,title,address,property_name,property_type,tenure,built_year,num_beds,num_baths,size_sqft,floor_level,furnishing,available_unit_types,total_num_units,lat,lng,elevation,subzone,planning_area,price
30,3 bed condo for sale in 1953,1 tessensohn road,1953,apartment,freehold,2023,3,3,1130,,unspecified,"studio, 1, 2, 3, 4, 5, 6 br",58.0,14.484814,121.023232,0,,,2671200.0
59,4 bed condo for sale in 1953,1 tessensohn road,1953,apartment,freehold,2023,4,4,1399,,unspecified,"studio, 1, 2, 3, 4, 5, 6 br",58.0,14.484814,121.023232,0,,,3272800.0
223,3 bed condo for sale in 1953,1 tessensohn road,1953,condo,freehold,2023,3,3,1152,low,unspecified,"studio, 1, 2, 3, 4, 5, 6 br",58.0,14.484814,121.023232,0,,,2292200.0
499,2 bed condo for sale in 1953,1 tessensohn road,1953,condo,freehold,2023,2,2,1130,,unfurnished,"studio, 1, 2, 3, 4, 5, 6 br",58.0,14.484814,121.023232,0,,,2363600.0
601,3 bed condo for sale in 1953,1 tessensohn road,1953,apartment,freehold,2023,3,3,1119,,unspecified,"studio, 1, 2, 3, 4, 5, 6 br",58.0,14.484814,121.023232,0,,,2163000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86,1 bed condo for sale in pollen & bleu,15 farrer drive,pollen & bleu,condo,99-year leasehold,2017,1,1,549,,unspecified,"1, 2, 3, 4, 5 br",106.0,69.486768,20.184434,0,,,1470000.0
2765,2 bed condo for sale in pollen & bleu,17 farrer drive,pollen & bleu,condo,99-year leasehold,2017,2,2,1163,,fully,"1, 2, 3, 4, 5 br",106.0,69.486768,20.184434,0,,,2362500.0
8806,3 bed condo for sale in pollen & bleu,17 farrer drive,pollen & bleu,condo,99-year leasehold,2017,3,2,1184,low,unfurnished,"1, 2, 3, 4, 5 br",106.0,69.486768,20.184434,0,,,2467500.0
11463,3 bed condo for sale in pollen & bleu,15 farrer drive,pollen & bleu,condo,99-year leasehold,2017,3,2,1163,,unspecified,"1, 2, 3, 4, 5 br",106.0,69.486768,20.184434,0,,,2058000.0


In [10]:
# Size of the wrong-coordinate frame, followed by the number of missing values
# in each of the two location columns of the full frame.
print(df_wrong_coordinates.shape)
for location_col in ("subzone", "planning_area"):
    print(df_cleaned[location_col].isna().sum())

(106, 19)
106
106


In [11]:
# Count the listings with incorrect coordinates per address
# (the output shows that 5 distinct addresses are affected).
df_wrong_coordinates["address"].value_counts()

1 tessensohn road       90
38 lorong 32 geylang     6
5 jalan mutiara          5
17 farrer drive          3
15 farrer drive          2
Name: address, dtype: int64

In [12]:
# Using the 'address' we can manually correct the latitude/longitude coordinates and
# fill in the missing subzone and planning_area. property_type is set to 'condo'
# for every listing at these addresses.
# lat/lng are written as floats, not quoted strings: assigning text into the numeric
# columns would turn them into mixed object columns and break later numeric use.
fix_columns = ['property_type', 'lat', 'lng', 'subzone', 'planning_area']
address_fixes = {
    "1 tessensohn road": ('condo', 1.3164313, 103.8575321, 'balestier', 'novena'),
    "38 lorong 32 geylang": ('condo', 1.31262, 103.88686, 'aljunied', 'geylang'),
    "5 jalan mutiara": ('condo', 1.29565, 103.82887, 'leonie hill', 'river valley'),
    "17 farrer drive": ('condo', 1.313259, 103.806622, 'holland road', 'bukit timah'),
    "15 farrer drive": ('condo', 1.313259, 103.806622, 'holland road', 'bukit timah'),
}

for address, corrected_values in address_fixes.items():
    at_address = df_cleaned['address'] == address
    # Assign one column at a time so each column keeps its own dtype.
    for column, value in zip(fix_columns, corrected_values):
        df_cleaned.loc[at_address, column] = value


In [13]:
print(df_cleaned['property_type'].value_counts())
# Collapse "hdb 3 rooms", "hdb 4 rooms" and the like into "hdb", since the number of
# rooms can be obtained from "num_beds".
# Assign through df_cleaned.loc: calling Series.mask(..., inplace=True) on a column
# selected with df_cleaned[...] mutates an intermediate object and does not update
# df_cleaned when pandas Copy-on-Write is enabled.
is_hdb = df_cleaned['property_type'].str.contains("hdb", regex=False)
df_cleaned.loc[is_hdb, 'property_type'] = "hdb"

condo                  9248
hdb                    2783
hdb 3 rooms            1189
hdb 4 rooms            1118
semi-detached house    1007
executive condo         918
bungalow                740
hdb 5 rooms             677
apartment               482
terraced house          441
hdb executive           302
corner terrace          253
hdb 2 rooms             234
cluster house           183
conservation house       21
landed                   19
walk-up                  14
townhouse                 9
good class bungalow       5
land only                 2
shophouse                 1
Name: property_type, dtype: int64


In [14]:
print(df_cleaned['furnishing'].value_counts())
# Keep only rows whose furnishing is not the literal "na" (7 records in the output).
has_furnishing_info = df_cleaned['furnishing'].ne("na")
df_cleaned = df_cleaned.loc[has_furnishing_info]

unspecified    14339
partial         2969
unfurnished     1812
fully            519
na                 7
Name: furnishing, dtype: int64


In [15]:
# Compare how often each keyword appears in the listing title with the
# property_type distribution.
print("before changes")
for keyword in ("condo", "apartment"):
    print(keyword)
    print(df_cleaned["title"].str.contains(keyword).value_counts())
    print("\n")
print("property_type")
print(df_cleaned["property_type"].value_counts())


# The counts of condos and apartments in title and property_type do not match

before changes
condo
False    9899
True     9740
Name: title, dtype: int64


apartment
False    19633
True         6
Name: title, dtype: int64


property_type
condo                  9246
hdb                    6303
semi-detached house    1007
executive condo         918
bungalow                740
apartment               477
terraced house          441
corner terrace          253
cluster house           183
conservation house       21
landed                   19
walk-up                  14
townhouse                 9
good class bungalow       5
land only                 2
shophouse                 1
Name: property_type, dtype: int64


In [16]:
# Relabel property_type from the listing title: rows whose title mentions the keyword
# but whose property_type differs are set to that keyword.
# "apartment" is processed last, so a title containing both words ends up as "apartment".
for label in ("condo", "apartment"):
    needs_relabel = df_cleaned['property_type'].ne(label) & df_cleaned['title'].str.contains(label)
    df_cleaned.loc[needs_relabel, 'property_type'] = label

In [17]:
# Same report as before the relabelling, to confirm the property_type counts changed.
print("after changes")
for keyword in ("condo", "apartment"):
    print(keyword)
    print(df_cleaned["title"].str.contains(keyword).value_counts())
    print("\n")
print("property_type")
print(df_cleaned["property_type"].value_counts())


after changes
condo
False    9899
True     9740
Name: title, dtype: int64


apartment
False    19633
True         6
Name: title, dtype: int64


property_type
condo                  9734
hdb                    6303
semi-detached house    1006
executive condo         918
bungalow                740
terraced house          440
corner terrace          252
cluster house           183
conservation house       21
landed                   19
townhouse                 9
apartment                 6
good class bungalow       5
land only                 2
shophouse                 1
Name: property_type, dtype: int64


In [18]:
#g.	Language translation

In [19]:
#h.	Handle missing values
# Number of missing values per column; nothing is imputed or dropped in this cell.

df_cleaned.isnull().sum()

title                       0
address                     0
property_name               0
property_type               0
tenure                   1712
built_year                  0
num_beds                    0
num_baths                   0
size_sqft                   0
floor_level             16184
furnishing                  0
available_unit_types     1430
total_num_units          5627
lat                         0
lng                         0
elevation                   0
subzone                     0
planning_area               0
price                       0
dtype: int64

In [20]:
#i. Features

#1. title - DROPPING
#2. address - DROPPING
#3. property_name - KEEP
#4. property_type - ordinal encoding
#5. tenure - one-hot encoding (reduce to 3 columns - freehold, 99-year, 999-year)
#6. built_year/num_beds/num_baths/size_sqft - KEEP AS IT IS
#7. floor_level - ordinal encoding (DROPPING)
#8. furnishing - one-hot encoding
#9. available_unit_types -  (ON HOLD/DROPPING)
#10. total_num_units - DROPPING
#11. lat/lng - KEEP AS IT IS
#12. planning_area - one-hot encoding/adarsh to work on map
#13. subzone - one-hot encoding/Adarsh to work on map
#14. ADD FEATURES - number of schools/MRT stations/malls within a pre-fixed radius

In [21]:
# No need to run this unless all data has been cleaned.
# NOTE(review): no cell visible in this notebook cleans the test set, so the raw test
# frame is what gets written here — apply the same cleaning steps before relying on it.
df_dirty_test.to_csv('../data/test_cleaned.csv')
# Write df_cleaned (not the raw df_dirty_train) so that train_cleaned.csv actually
# contains the cleaned rows produced above.
df_cleaned.to_csv('../data/train_cleaned.csv')