In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings(action="ignore")

In [None]:
# Load the training and test CSVs from the shared data directory.
# NOTE(review): df_dirty_test is not referenced again by any active code in
# the visible cells — confirm whether it is still needed.
df_cleaned, df_dirty_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_cleaned.csv', '../data/test_cleaned.csv')
)
# Preview the first rows of the training frame.
df_cleaned.head()

Singapore has the following latitude and longitude coordinates in its extreme ends:
1. left-most (Tuas) :  1.30871,103.64287
2. right-most (Changi) : 1.34538,104.00270
3. top-most (Sembawang) : 1.46227,103.79487
4. bottom-most (Bukit Merah) : 1.28762,103.82467


#### Min latitude - 1.28762       Max latitude - 1.46227

#### Min longitude - 103.64         Max longitude - 104.00

But we can see that in the data, min longitude is -77.065364 and max latitude is 69.486768 which are out of the range of latitude and longitude values 

<img src="images/singapore-lat-long-map.jpeg" width=600 height=600 />



In [None]:
# Normalise the dtypes of the training frame.
#
# lat/lng are stored as float64 rather than float16: float16 carries only an
# 11-bit significand, so a longitude near 103 degrees would be rounded to the
# nearest 0.0625 degrees (several kilometres) and a latitude near 1.3 degrees
# to roughly 0.001 degrees, which makes the coordinates useless for any
# location-based comparison or distance feature.
#
# NOTE(review): astype(int) raises if built_year / num_beds / num_baths still
# contain NaN — confirm that missing values are filled before this cell runs.
df_cleaned = df_cleaned.astype({
    'built_year': int,
    'num_beds': int,
    'num_baths': int,
    'lng': np.float64,
    'lat': np.float64,
})
df_cleaned.info()

In [None]:
# Flag training rows whose coordinates are far outside Singapore.
# The -77.0 and 69.0 thresholds correspond to the outliers quoted in the
# markdown above (min longitude -77.065364, max latitude 69.486768).
# NOTE(review): the 121.0 longitude threshold is taken from the original cell;
# confirm it against the actual maximum longitude in the data.
is_max_lng = df_cleaned.lng > 121.0
is_min_lng = df_cleaned.lng < -77.0
is_max_lat = df_cleaned.lat > 69.0

# The per-condition frames are kept for inspection.
df_max_lng = df_cleaned[is_max_lng]
df_min_lng = df_cleaned[is_min_lng]
df_max_lat = df_cleaned[is_max_lat]

# Combine the conditions with OR instead of concatenating the three frames:
# a row that violates more than one bound would otherwise appear several
# times and inflate the shape and the per-address counts computed below.
df_wrong_coordinates = df_cleaned[is_max_lng | is_min_lng | is_max_lat]

df_wrong_coordinates

# it is interesting to note that in all the records where latitude and longitude have incorrect coordinates,
# "planning_area" and "subzone" have missing values, this can also be verified by checking for count of missing values

In [None]:
# Compare the number of flagged rows with the number of missing
# subzone / planning_area values in the training frame.
print(df_wrong_coordinates.shape)
for area_column in ("subzone", "planning_area"):
    print(df_cleaned[area_column].isnull().sum())

In [None]:
# Count the flagged rows per address (original author's observation:
# the coordinates are incorrect for 5 distinct addresses).
df_wrong_coordinates.address.value_counts()

In [None]:
# Using the 'address' we can manually correct the latitude / longitude
# coordinates and fill in the missing subzone and planning_area values.
#
# The coordinates are written as floats, not strings: assigning strings into
# the numeric lat/lng columns either turns them into mixed object columns
# (breaking later numeric comparisons) or is rejected outright by newer
# pandas versions.
corrected_columns = ['property_type', 'lat', 'lng', 'subzone', 'planning_area']

# address -> (property_type, lat, lng, subzone, planning_area)
address_corrections = {
    "1 tessensohn road": ('condo', 1.3164313, 103.8575321, 'balestier', 'novena'),
    "38 lorong 32 geylang": ('condo', 1.31262, 103.88686, 'aljunied', 'geylang'),
    "5 jalan mutiara": ('condo', 1.29565, 103.82887, 'leonie hill', 'river valley'),
    "17 farrer drive": ('condo', 1.313259, 103.806622, 'holland road', 'bukit timah'),
    "15 farrer drive": ('condo', 1.313259, 103.806622, 'holland road', 'bukit timah'),
}

# Make sure the coordinate columns can hold the corrected values at full
# precision before writing into them.
df_cleaned = df_cleaned.astype({'lat': 'float64', 'lng': 'float64'})

for bad_address, corrected_values in address_corrections.items():
    # Every row sharing the address receives the same corrected values.
    df_cleaned.loc[df_cleaned.address == bad_address, corrected_columns] = corrected_values


In [None]:
print(df_cleaned['property_type'].value_counts())
# Collapse "hdb 3 rooms", "hdb 4 rooms" and the like to "hdb", since the
# number of rooms can be obtained from "num_beds".
#
# The assignment goes through df_cleaned.loc rather than
# df_cleaned['property_type'].mask(..., inplace=True): the latter mutates a
# temporary Series and leaves df_cleaned unchanged when pandas Copy-on-Write
# is active. na=False keeps rows with a missing property_type out of the mask
# instead of producing a non-boolean indexer.
is_hdb = df_cleaned['property_type'].str.contains("hdb", na=False)
df_cleaned.loc[is_hdb, 'property_type'] = "hdb"
# Remove the "land only" listings from the training frame.
land_only_index = df_cleaned.index[df_cleaned['property_type'] == 'land only']
df_cleaned = df_cleaned.drop(index=land_only_index)
print(df_cleaned['property_type'].value_counts())

In [None]:
# Show the distribution of furnishing values before filtering.
print(df_cleaned.furnishing.value_counts())
# Keep only the rows whose furnishing is not the literal string "na"
# (original author's note: this drops 7 records).
df_cleaned = df_cleaned.loc[df_cleaned["furnishing"].ne("na")]

In [None]:
# Snapshot of how often "condo" / "apartment" occur in the listing title
# versus the property_type column, taken before the columns are reconciled.
print("before changes")
for keyword in ("condo", "apartment"):
    print(keyword)
    print(df_cleaned["title"].str.contains(keyword).value_counts())
    print("\n")
print("property_type")
print(df_cleaned["property_type"].value_counts())


# The counts of condos and apartments in title and property_type do not match.

In [None]:
# When the listing title mentions "condo", overwrite property_type with it.
not_yet_condo = df_cleaned['property_type'] != "condo"
title_has_condo = df_cleaned['title'].str.contains('condo')
df_cleaned.loc[not_yet_condo & title_has_condo, ['property_type']] = "condo"

# Same for "apartment". This runs after the condo pass, so a title that
# mentions both words ends up labelled "apartment".
not_yet_apartment = df_cleaned['property_type'] != "apartment"
title_has_apartment = df_cleaned['title'].str.contains('apartment')
df_cleaned.loc[not_yet_apartment & title_has_apartment, ['property_type']] = "apartment"

In [None]:
# Same report as before the reconciliation, to compare the counts.
print("after changes")
train_titles = df_cleaned["title"]
for keyword in ("condo", "apartment"):
    keyword_counts = train_titles.str.contains(keyword).value_counts()
    print(keyword)
    print(keyword_counts)
    print("\n")
print("property_type")
print(df_cleaned["property_type"].value_counts())


In [None]:
#g.	Language translation

In [None]:
# h. Handle missing values: count the missing entries in every column.

df_cleaned.isna().sum()

In [None]:
#i. Features

#1. title - DROPPING
#2. address - DROPPING
#3. property_name - KEEP
#4. property_type - ordinal encoding
#5. tenure - one-hot encoding (reduce to 3 columns - freehold, 99-year, 999-year)
#6. built_year - handle null values
#6.1 numb_beds/num_baths/size_sqft - KEEP AS IT IS
#7. floor_level - ordinal encoding (DROPPING)
#8. furnishing - one-hot encoding
#9. available_unit_types -  (ON HOLD/DROPPING)
#10. total_num_units - DROPPING
#11. lat/lng - KEEP AS IT IS
#12. planning_area - one-hot encoding/adarsh to work on map
#13. subzone - one-hot encoding/Adarsh to work on map
#14. ADD FEATURES - number os schools/MRT stations/malls within a pre-fixed radius

In [None]:
#No need to run this unless all data has been cleaned.
#df_dirty_test.to_csv('../data/test_cleaned.csv')
# index=False: this path is also the file the notebook loads at the top, and
# it is read back without index_col. Writing the index would add another
# "Unnamed: 0" column every time the notebook is re-run.
df_cleaned.to_csv('../data/train_cleaned.csv', index=False)

In [None]:
# Test data cleaning starts here.
# The training frame is not reloaded at this point; only the test CSV is read.
df_test_cleaned = pd.read_csv(filepath_or_buffer='../data/test_cleaned.csv')
# Preview the first five rows.
df_test_cleaned.head(5)

In [None]:
# Normalise the dtypes of the test frame (each column is cast once; the
# original cell repeated the num_beds / num_baths casts).
#
# lat/lng are stored as float64 rather than float16: float16 carries only an
# 11-bit significand, so a longitude near 103 degrees would be rounded to the
# nearest 0.0625 degrees. Besides losing location accuracy, that rounding can
# push valid values across the 103.64 / 104 bounds checked in the next cell.
#
# NOTE(review): astype(int) raises if built_year / num_beds / num_baths still
# contain NaN — confirm that missing values are filled before this cell runs.
df_test_cleaned = df_test_cleaned.astype({
    'built_year': int,
    'num_beds': int,
    'num_baths': int,
    'lng': np.float64,
    'lat': np.float64,
})
df_test_cleaned.info()

In [None]:
# Flag test rows whose coordinates fall outside the Singapore bounding box
# described in the markdown above (longitude 103.64 .. 104.00, latitude up
# to about 1.46).
# NOTE(review): there is no lower latitude bound in this check — confirm
# whether one is needed.
is_max_lng = df_test_cleaned.lng > 104
is_min_lng = df_test_cleaned.lng < 103.64
is_max_lat = df_test_cleaned.lat > 1.47

# The per-condition frames are kept for inspection.
df_max_lng = df_test_cleaned[is_max_lng]
df_min_lng = df_test_cleaned[is_min_lng]
df_max_lat = df_test_cleaned[is_max_lat]

# Combine the conditions with OR instead of concatenating the three frames:
# a row that violates more than one bound would otherwise appear several
# times and inflate the shape and the per-address counts computed below.
df_wrong_coordinates = df_test_cleaned[is_max_lng | is_min_lng | is_max_lat]

df_wrong_coordinates

# it is interesting to note that in all the records where latitude and longitude have incorrect coordinates,
# "planning_area" and "subzone" have missing values, this can also be verified by checking for count of missing values

In [None]:
# Compare the number of flagged rows with the number of missing
# subzone / planning_area values in the test frame.
print(df_wrong_coordinates.shape)
for area_column in ("subzone", "planning_area"):
    print(df_test_cleaned[area_column].isnull().sum())

In [None]:
# Count the flagged rows per address (original author's observation:
# the coordinates are incorrect for 4 distinct addresses).
df_wrong_coordinates.address.value_counts()

In [None]:
# Using the 'address' we can manually correct the latitude / longitude
# coordinates and fill in the missing subzone and planning_area values.
#
# The coordinates are written as floats, not strings: assigning strings into
# the numeric lat/lng columns either turns them into mixed object columns
# (breaking later numeric comparisons) or is rejected outright by newer
# pandas versions.
corrected_columns = ['property_type', 'lat', 'lng', 'subzone', 'planning_area']

# address -> (property_type, lat, lng, subzone, planning_area)
address_corrections = {
    "1 tessensohn road": ('condo', 1.3164313, 103.8575321, 'balestier', 'novena'),
    "38 lorong 32 geylang": ('condo', 1.31262, 103.88686, 'aljunied', 'geylang'),
    "5 jalan mutiara": ('condo', 1.29565, 103.82887, 'leonie hill', 'river valley'),
    "17 farrer drive": ('condo', 1.313259, 103.806622, 'holland road', 'bukit timah'),
}

# Make sure the coordinate columns can hold the corrected values at full
# precision before writing into them.
df_test_cleaned = df_test_cleaned.astype({'lat': 'float64', 'lng': 'float64'})

for bad_address, corrected_values in address_corrections.items():
    # Every row sharing the address receives the same corrected values.
    df_test_cleaned.loc[df_test_cleaned.address == bad_address, corrected_columns] = corrected_values

In [None]:
print(df_test_cleaned['property_type'].value_counts())
# Collapse "hdb 3 rooms", "hdb 4 rooms" and the like to "hdb", since the
# number of rooms can be obtained from "num_beds".
#
# The assignment goes through df_test_cleaned.loc rather than
# df_test_cleaned['property_type'].mask(..., inplace=True): the latter mutates
# a temporary Series and leaves df_test_cleaned unchanged when pandas
# Copy-on-Write is active. na=False keeps rows with a missing property_type
# out of the mask instead of producing a non-boolean indexer.
is_hdb = df_test_cleaned['property_type'].str.contains("hdb", na=False)
df_test_cleaned.loc[is_hdb, 'property_type'] = "hdb"
# The "land only" filter stays disabled for the test frame, so no rows are dropped.
#df_test_cleaned.drop(df_test_cleaned[df_test_cleaned['property_type'] == 'land only'].index, inplace = True)
print(df_test_cleaned['property_type'].value_counts())

In [None]:
# Show the value distribution of property_type and furnishing in the test frame.
for column_name in ('property_type', 'furnishing'):
    print(df_test_cleaned[column_name].value_counts())
# The filter below is commented out, so rows with furnishing "na"
# (3 records according to the original note) remain in the test frame.
#df_test_cleaned = df_test_cleaned[df_test_cleaned.furnishing != "na"]

In [None]:
# Snapshot of how often "condo" / "apartment" occur in the listing title
# versus the property_type column, taken before the columns are reconciled.
print("before changes")
for keyword in ("condo", "apartment"):
    print(keyword)
    print(df_test_cleaned["title"].str.contains(keyword).value_counts())
    print("\n")
print("property_type")
print(df_test_cleaned["property_type"].value_counts())


# The counts of condos and apartments in title and property_type do not match.

In [None]:
# When the listing title mentions "condo", overwrite property_type with it.
not_yet_condo = df_test_cleaned['property_type'] != "condo"
title_has_condo = df_test_cleaned['title'].str.contains('condo')
df_test_cleaned.loc[not_yet_condo & title_has_condo, ['property_type']] = "condo"

# Same for "apartment". This runs after the condo pass, so a title that
# mentions both words ends up labelled "apartment".
not_yet_apartment = df_test_cleaned['property_type'] != "apartment"
title_has_apartment = df_test_cleaned['title'].str.contains('apartment')
df_test_cleaned.loc[not_yet_apartment & title_has_apartment, ['property_type']] = "apartment"

In [None]:
# Same report as before the reconciliation, to compare the counts.
print("after changes")
test_titles = df_test_cleaned["title"]
for keyword in ("condo", "apartment"):
    keyword_counts = test_titles.str.contains(keyword).value_counts()
    print(keyword)
    print(keyword_counts)
    print("\n")
print("property_type")
print(df_test_cleaned["property_type"].value_counts())

In [None]:
#g.	Language translation

In [None]:
# h. Handle missing values: count the missing entries in every column.

df_test_cleaned.isna().sum()

In [None]:
#i. Features

#1. title - DROPPING
#2. address - DROPPING
#3. property_name - KEEP
#4. property_type - ordinal encoding
#5. tenure - one-hot encoding (reduce to 3 columns - freehold, 99-year, 999-year)
#6. built_year - handle null values
#6.1 numb_beds/num_baths/size_sqft - KEEP AS IT IS
#7. floor_level - ordinal encoding (DROPPING)
#8. furnishing - one-hot encoding
#9. available_unit_types -  (ON HOLD/DROPPING)
#10. total_num_units - DROPPING
#11. lat/lng - KEEP AS IT IS
#12. planning_area - one-hot encoding/adarsh to work on map
#13. subzone - one-hot encoding/Adarsh to work on map
#14. ADD FEATURES - number os schools/MRT stations/malls within a pre-fixed radius

In [None]:
#No need to run this unless all data has been cleaned.
# index=False: this path is also the file the test frame is loaded from, and
# it is read back without index_col. Writing the index would add another
# "Unnamed: 0" column every time the notebook is re-run.
df_test_cleaned.to_csv('../data/test_cleaned.csv', index=False)
df_test_cleaned.shape