In [6]:
import pandas as pd
import os
from sqlalchemy import create_engine
from MyCreds.mycreds import Capstone_AWS_SG
import numpy as np
import geopandas as gpd
import shapely
from geoalchemy2 import Geometry
from bs4 import BeautifulSoup
import re
from collections import defaultdict

In [7]:
engine = create_engine(f'postgresql+psycopg2://{Capstone_AWS_SG.username}:{Capstone_AWS_SG.password}@{Capstone_AWS_SG.host}/Capstone', echo=False)
conn = engine.connect()

### Load Resale Transactions

In [8]:
# by registration date

def create_resale_registration():

    base_dir = '../michael/data/resale-flat-prices/'
    df = pd.DataFrame()
    for file in os.listdir('../michael/data/resale-flat-prices/'):
        if "registration-date" in file and 'csv' in file:
            df = pd.concat([df, pd.read_csv(f'{base_dir}{file}')])

    return df

def clean_registration_df(df):

    df['month'] = pd.to_datetime(df['month'])

    df['n_rooms'] = df['flat_type'].str.extract('(\d)')
    df.fillna(value={'n_rooms':'5'},inplace=True)
    df['n_rooms'] = df['n_rooms'].astype('float')
    df['n_rooms'] = df['n_rooms'].astype('Int64')

    # Convert story range to min and max int columns
    df['storey_range_min'] = df['storey_range'].str[:2].astype(int)
    df['storey_range_max'] = df['storey_range'].str[-2:].astype(int)

    # Convert remaining lease years to int years and int months remaining columns
    df['remaining_lease_years'] = df['remaining_lease'].str.extract('(?P<remaining_lease_years>\d{2}) year[s]?')
    df[['remaining_lease_months', 'remaining_lease_month']] = df['remaining_lease'].str.extract('\d{2} year[s]? (?P<remaining_lease_months>\d{2}) month[s]?|(?P<remaining_lease_month>\d{2}) month[s]?')
    df.drop('remaining_lease_month', axis=1, inplace=True)
    df[['remaining_lease_years', 'remaining_lease_months']] = df[['remaining_lease_years', 'remaining_lease_months']].astype('float')
    df[['remaining_lease_years', 'remaining_lease_months']] = df[['remaining_lease_years', 'remaining_lease_months']].astype('Int64').copy()
    df['remaining_lease_years'].fillna(df['remaining_lease'], inplace=True)
    df.sort_values('month', ascending=True, inplace=True)
    df.reset_index(inplace=True, drop=True)
    df['remaining_lease_months'].fillna(pd.Series(np.where(df['remaining_lease'].isnull(), np.nan, 0)), inplace=True)
    return df

df_registration = create_resale_registration()
df_registration = clean_registration_df(df_registration)
df_registration

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,remaining_lease,n_rooms,storey_range_min,storey_range_max,remaining_lease_years,remaining_lease_months
0,2012-03-01,ANG MO KIO,2 ROOM,172,ANG MO KIO AVE 4,06 TO 10,45.0,Improved,1986,250000.0,,2,6,10,,
1,2012-03-01,SENGKANG,4 ROOM,184B,RIVERVALE CRES,11 TO 15,90.0,Model A,2003,436000.0,,4,11,15,,
2,2012-03-01,SENGKANG,4 ROOM,201B,COMPASSVALE DR,11 TO 15,90.0,Model A,2001,437000.0,,4,11,15,,
3,2012-03-01,SENGKANG,4 ROOM,311B,ANCHORVALE LANE,01 TO 05,95.0,Premium Apartment,2002,438000.0,,4,1,5,,
4,2012-03-01,SENGKANG,4 ROOM,259B,COMPASSVALE RD,01 TO 05,85.0,Model A2,2001,438000.0,,4,1,5,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210825,2022-03-01,CLEMENTI,3 ROOM,208B,CLEMENTI AVE 6,19 TO 21,68.0,Model A,2017,575000.0,94 years 08 months,3,19,21,94,8
210826,2022-03-01,CLEMENTI,3 ROOM,320,CLEMENTI AVE 4,04 TO 06,67.0,New Generation,1979,380888.0,56 years 02 months,3,4,6,56,2
210827,2022-03-01,CLEMENTI,3 ROOM,334,CLEMENTI AVE 2,07 TO 09,67.0,New Generation,1978,416000.0,55 years 05 months,3,7,9,55,5
210828,2022-03-01,GEYLANG,3 ROOM,3,JOO CHIAT RD,10 TO 12,74.0,Model A,1985,470000.0,62 years 10 months,3,10,12,62,10


In [9]:
def create_resale_approval():

    base_dir = '../michael/data/resale-flat-prices/'
    df = pd.DataFrame()
    for file in os.listdir('../michael/data/resale-flat-prices/'):
        if "approval-date" in file and 'csv' in file:
            df = pd.concat([df, pd.read_csv(f'{base_dir}{file}')])

    return df

def clean_approval_df(df):

    df['month'] = pd.to_datetime(df['month'])
    df['n_rooms'] = df['flat_type'].str.extract('(\d)')
    df.fillna(value={'n_rooms':'5'},inplace=True)
    df['n_rooms'] = df['n_rooms'].astype('float')
    df['n_rooms'] = df['n_rooms'].astype('Int64')
    # Convert story range to min and max int columns
    df['storey_range_min'] = df['storey_range'].str[:2].astype(int)
    df['storey_range_max'] = df['storey_range'].str[-2:].astype(int)

    df.sort_values('month', ascending=True, inplace=True)
    df.reset_index(inplace=True, drop=True)

    return df


df_approval = create_resale_approval()
df_approval = clean_approval_df(df_approval)
df_approval

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,n_rooms,storey_range_min,storey_range_max
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,1,10,12
1,1990-01-01,KALLANG/WHAMPOA,3 ROOM,44,BENDEMEER RD,04 TO 06,63.0,STANDARD,1981,31400.0,3,4,6
2,1990-01-01,KALLANG/WHAMPOA,3 ROOM,20,ST. GEORGE'S RD,04 TO 06,67.0,NEW GENERATION,1984,66500.0,3,4,6
3,1990-01-01,KALLANG/WHAMPOA,3 ROOM,14,KG ARANG RD,04 TO 06,103.0,NEW GENERATION,1984,77000.0,3,4,6
4,1990-01-01,KALLANG/WHAMPOA,3 ROOM,46,OWEN RD,01 TO 03,68.0,NEW GENERATION,1982,58000.0,3,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
656842,2012-02-01,GEYLANG,4 ROOM,319,UBI AVE 1,10 TO 12,84.0,Simplified,1985,383000.0,4,10,12
656843,2012-02-01,GEYLANG,4 ROOM,314,UBI AVE 1,01 TO 03,84.0,Simplified,1985,382000.0,4,1,3
656844,2012-02-01,GEYLANG,4 ROOM,343,UBI AVE 1,01 TO 03,84.0,Simplified,1986,380000.0,4,1,3
656845,2012-02-01,HOUGANG,3 ROOM,631,HOUGANG AVE 8,07 TO 09,60.0,Improved,1985,300000.0,3,7,9


In [6]:
# load into database

df_approval.to_sql('resale_price_based_on_approval_date', conn, if_exists='replace', index=False)
df_registration.to_sql('resale_price_based_on_registration_date', conn, if_exists='append', method='multi', index=False)

### Load Housing Price Index

In [7]:
resale_index = pd.read_csv('../michael/data/hdb-resale-price-index/housing-and-development-board-resale-price-index-1q2009-100-quarterly.csv')
resale_index

Unnamed: 0,quarter,index
0,1990-Q1,24.3
1,1990-Q2,24.4
2,1990-Q3,25.0
3,1990-Q4,24.7
4,1991-Q1,24.9
...,...,...
122,2020-Q3,133.9
123,2020-Q4,138.1
124,2021-Q1,142.2
125,2021-Q2,146.4


In [10]:
resale_index['year'] = resale_index['quarter'].str[:4].astype('float')
resale_index['year'] = resale_index['year'].astype('Int64')
resale_index['quarter_int'] = resale_index['quarter'].str[-1].astype('float')
resale_index['quarter_int'] = resale_index['quarter_int'].astype('Int64')
resale_index

Unnamed: 0,quarter,index,year,quarter_int
0,1990-Q1,24.3,1990,1
1,1990-Q2,24.4,1990,2
2,1990-Q3,25.0,1990,3
3,1990-Q4,24.7,1990,4
4,1991-Q1,24.9,1991,1
...,...,...,...,...
122,2020-Q3,133.9,2020,3
123,2020-Q4,138.1,2020,4
124,2021-Q1,142.2,2021,1
125,2021-Q2,146.4,2021,2


In [11]:
resale_index.to_sql('resale_price_index', conn, if_exists='replace', index=False)

### Load HDB Property Information

In [12]:
prop_info = pd.read_csv('../michael/data/hdb-property-information/hdb-property-information.csv')
prop_info.to_sql('hdb_property_info', conn, if_exists='replace', index=False)

### Load HDB Carpark Information

In [13]:
carpark = pd.read_csv('../michael/data/hdb-carpark-information/hdb-carpark-information.csv')
carpark.to_sql('hdb_carpark_info', conn, if_exists='replace', index=False)

### Load Postal Code Information - From Webscrape of OneMap

In [47]:
postal = pd.read_pickle('../michael/data/postal-codes/postal.pkl')
postal['building_id'] = postal.index
postal = postal[['building_id', 'address', 'blk_no', 'building', 'latitude', 'longitude', 'longtitude',
                 'postal', 'road_name', 'short_r_name', 'searchval', 'x', 'y']]
# ensure 6 digits by adding leading zero
postal['postal'] = postal['postal'].apply('{:0>6}'.format)
postal['address_to_match'] = postal['blk_no'] +' '+ postal['short_r_name']
postal

Unnamed: 0,building_id,address,blk_no,building,latitude,longitude,longtitude,postal,road_name,short_r_name,searchval,x,y,address_to_match
0,0,101A BAYFRONT AVENUE TEMPORARY SITE OFFICE SIN...,101A,TEMPORARY SITE OFFICE,1.275697,103.855652,103.855652,018895,BAYFRONT AVENUE,BAYFRONT AVE,TEMPORARY SITE OFFICE,30485.511362,28685.612665,101A BAYFRONT AVE
1,1,1 STRAITS BOULEVARD SINGAPORE CHINESE CULTURAL...,1,SINGAPORE CHINESE CULTURAL CENTRE,1.275829,103.849576,103.849576,018906,STRAITS BOULEVARD,STRAITS BOULEVARD,SINGAPORE CHINESE CULTURAL CENTRE,29809.365407,28700.236127,1 STRAITS BOULEVARD
2,2,11A STRAITS BOULEVARD TEMPORARY SITE OFFICE SI...,11A,TEMPORARY SITE OFFICE,1.274950,103.851665,103.851665,018907,STRAITS BOULEVARD,STRAITS BOULEVARD,TEMPORARY SITE OFFICE,30041.838898,28602.987244,11A STRAITS BOULEVARD
3,3,2 CENTRAL BOULEVARD IOI CENTRAL BOULEVARD TOWE...,2,IOI CENTRAL BOULEVARD TOWERS,1.279777,103.851513,103.851513,018916,CENTRAL BOULEVARD,CENTRAL BOULEVARD,IOI CENTRAL BOULEVARD TOWERS,30024.919852,29136.807026,2 CENTRAL BOULEVARD
4,4,21 PARK STREET DBS MARINA BAY MRT STATION SING...,21,DBS MARINA BAY MRT STATION,1.276427,103.854598,103.854598,018925,PARK STREET,PARK ST,DBS MARINA BAY MRT STATION,30368.205612,28766.381902,21 PARK ST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142881,142881,100A KRANJI LOOP SINGAPORE 887327,100A,NIL,1.433629,103.758648,103.758648,887327,KRANJI LOOP,KRANJI LOOP,100A KRANJI LOOP SINGAPORE 887327,19690.280996,46149.118386,100A KRANJI LOOP
142882,142882,A PASIR PANJANG ROAD SINGAPORE 887328,A,NIL,1.277170,103.795840,103.795840,887328,PASIR PANJANG ROAD,PASIR PANJANG RD,A PASIR PANJANG ROAD SINGAPORE 887328,23828.902741,28848.553424,A PASIR PANJANG RD
142883,142883,PULAU BUKOM SINGAPORE 903808,,NIL,1.235596,103.768645,103.768645,903808,PULAU BUKOM,PULAU BUKOM,PULAU BUKOM SINGAPORE 903808,20802.263821,24251.557450,PULAU BUKOM
142884,142884,GATE C7 AIRPORT CARGO ROAD CHANGI ANIMAL AND P...,GATE C7,CHANGI ANIMAL AND PLANT QUARANTINE CENTRE,1.375285,103.996737,103.996737,918104,AIRPORT CARGO ROAD,AIRPORT CARGO RD,CHANGI ANIMAL AND PLANT QUARANTINE CENTRE,46186.488419,39698.194002,GATE C7 AIRPORT CARGO RD


In [48]:
geo_postal = gpd.GeoDataFrame(postal, geometry=gpd.points_from_xy(postal['longitude'], postal['latitude']), crs='epsg:4326')
geo_postal

Unnamed: 0,building_id,address,blk_no,building,latitude,longitude,longtitude,postal,road_name,short_r_name,searchval,x,y,address_to_match,geometry
0,0,101A BAYFRONT AVENUE TEMPORARY SITE OFFICE SIN...,101A,TEMPORARY SITE OFFICE,1.275697,103.855652,103.855652,018895,BAYFRONT AVENUE,BAYFRONT AVE,TEMPORARY SITE OFFICE,30485.511362,28685.612665,101A BAYFRONT AVE,POINT (103.85565 1.27570)
1,1,1 STRAITS BOULEVARD SINGAPORE CHINESE CULTURAL...,1,SINGAPORE CHINESE CULTURAL CENTRE,1.275829,103.849576,103.849576,018906,STRAITS BOULEVARD,STRAITS BOULEVARD,SINGAPORE CHINESE CULTURAL CENTRE,29809.365407,28700.236127,1 STRAITS BOULEVARD,POINT (103.84958 1.27583)
2,2,11A STRAITS BOULEVARD TEMPORARY SITE OFFICE SI...,11A,TEMPORARY SITE OFFICE,1.274950,103.851665,103.851665,018907,STRAITS BOULEVARD,STRAITS BOULEVARD,TEMPORARY SITE OFFICE,30041.838898,28602.987244,11A STRAITS BOULEVARD,POINT (103.85167 1.27495)
3,3,2 CENTRAL BOULEVARD IOI CENTRAL BOULEVARD TOWE...,2,IOI CENTRAL BOULEVARD TOWERS,1.279777,103.851513,103.851513,018916,CENTRAL BOULEVARD,CENTRAL BOULEVARD,IOI CENTRAL BOULEVARD TOWERS,30024.919852,29136.807026,2 CENTRAL BOULEVARD,POINT (103.85151 1.27978)
4,4,21 PARK STREET DBS MARINA BAY MRT STATION SING...,21,DBS MARINA BAY MRT STATION,1.276427,103.854598,103.854598,018925,PARK STREET,PARK ST,DBS MARINA BAY MRT STATION,30368.205612,28766.381902,21 PARK ST,POINT (103.85460 1.27643)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142881,142881,100A KRANJI LOOP SINGAPORE 887327,100A,NIL,1.433629,103.758648,103.758648,887327,KRANJI LOOP,KRANJI LOOP,100A KRANJI LOOP SINGAPORE 887327,19690.280996,46149.118386,100A KRANJI LOOP,POINT (103.75865 1.43363)
142882,142882,A PASIR PANJANG ROAD SINGAPORE 887328,A,NIL,1.277170,103.795840,103.795840,887328,PASIR PANJANG ROAD,PASIR PANJANG RD,A PASIR PANJANG ROAD SINGAPORE 887328,23828.902741,28848.553424,A PASIR PANJANG RD,POINT (103.79584 1.27717)
142883,142883,PULAU BUKOM SINGAPORE 903808,,NIL,1.235596,103.768645,103.768645,903808,PULAU BUKOM,PULAU BUKOM,PULAU BUKOM SINGAPORE 903808,20802.263821,24251.557450,PULAU BUKOM,POINT (103.76865 1.23560)
142884,142884,GATE C7 AIRPORT CARGO ROAD CHANGI ANIMAL AND P...,GATE C7,CHANGI ANIMAL AND PLANT QUARANTINE CENTRE,1.375285,103.996737,103.996737,918104,AIRPORT CARGO ROAD,AIRPORT CARGO RD,CHANGI ANIMAL AND PLANT QUARANTINE CENTRE,46186.488419,39698.194002,GATE C7 AIRPORT CARGO RD,POINT (103.99674 1.37529)


In [50]:
geo_postal.to_postgis('sg_buildings_postal_geo', conn, if_exists='replace', index=False, dtype={'geometry': Geometry(geometry_type='POINT', srid=4326)})


### Load Preschool Information

In [35]:
import geopandas as gpd
preschools = gpd.read_file('../michael/data/pre-schools-location/pre-schools-location-geojson.geojson')
preschools

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.70086 1.33832 0.00000)
1,kml_2,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.89866 1.39044 0.00000)
2,kml_3,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.80681 1.43802 0.00000)
3,kml_4,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.78742 1.43344 0.00000)
4,kml_5,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.88863 1.39565 0.00000)
...,...,...,...
1920,kml_1921,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.95263 1.34910 0.00000)
1921,kml_1922,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.77738 1.44503 0.00000)
1922,kml_1923,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.84084 1.31410 0.00000)
1923,kml_1924,<center><table><tr><th colspan='2' align='cent...,POINT Z (103.82625 1.35732 0.00000)


In [36]:
des = preschools['Description']
centers_dict = defaultdict(dict)

for i, description in enumerate(des):
    soup = BeautifulSoup(des[i])
    table = soup.find_all('tr')[1:]

    centers_dict[i] = {re.findall('<th>(\w+)<\/th>\s<td>(.+)?<\/td>', str(row))[0][0]:
                             re.findall('<th>(\w+)<\/th>\s<td>(.+)?<\/td>', str(row))[0][1] for row in table}

preschool_descriptions = pd.DataFrame(centers_dict).T
preschool_descriptions

Unnamed: 0,CENTRE_NAME,CENTRE_CODE,ADDRESS,POSTAL_CODE,INC_CRC,FMEL_UPD_D
0,BRILLIANT TOTS PTE. LTD.,PT9334,"610, JURONG WEST STREET 65, #01 - 534, S 640610",640610,0523C7904478A63D,20200812235534
1,BUBBLESLAND PLAYHOUSE PTE LTD,PT7680,"238, COMPASSVALE WALK, #01 - 542, S 540238",540238,18BED05A501AA168,20200812235534
2,BUCKET HOUSE PRESCHOOL,PT9527,"39, WOODLANDS CLOSE, #01 - 62, MEGA@WOODLANDS,...",737856,C88B9AC31EE71BF6,20200812235534
3,BUMBLE BEE CHILD CARE CENTRE,PT3150,"369, WOODLANDS AVENUE 1, #01 - 853, S 730369",730369,64AB8FACA8F60129,20200812235534
4,BUSY BEES SINGAPORE PTE LTD,PT9117,"327B, ANCHORVALE ROAD, #01 - 322, S 542327",542327,E1B55AC65B9059E8,20200812235534
...,...,...,...,...,...,...
1920,PCF Sparkletots Preschool @ Tampines-Changkat ...,ST0261,"284 Tampines Street 22 Blk 285 #01-149 , S520284",520284,7CFD747E120FA0D5,20200812235535
1921,New Life Kindergarten,RC1838,"10 Marsiling Lane , S739147",739147,002D449DA41EB2ED,20200812235535
1922,Newton Kindergarten,RC1798,"16 Newton Road , S307995",307995,C4489713904CDA34,20200812235535
1923,P &amp; J Kindergarten,PT9661,"35 Windsor Park Road, Windsor Park , S574141",574141,50EBBF878EBACFB7,20200812235535


In [37]:
preschools = preschools.merge(preschool_descriptions, left_index=True, right_index=True)
preschools = preschools[['CENTRE_NAME', 'CENTRE_CODE', 'ADDRESS', 'POSTAL_CODE', 'INC_CRC', 'FMEL_UPD_D', 'geometry']]
preschools.columns = [col.lower() for col in preschools.columns]
preschools = preschools.to_crs(4326)
preschools['geometry'] = preschools['geometry'].map(lambda point: shapely.ops.transform(lambda x, y, z: (x, y), point))
preschools

Unnamed: 0,centre_name,centre_code,address,postal_code,inc_crc,fmel_upd_d,geometry
0,BRILLIANT TOTS PTE. LTD.,PT9334,"610, JURONG WEST STREET 65, #01 - 534, S 640610",640610,0523C7904478A63D,20200812235534,POINT (103.70086 1.33832)
1,BUBBLESLAND PLAYHOUSE PTE LTD,PT7680,"238, COMPASSVALE WALK, #01 - 542, S 540238",540238,18BED05A501AA168,20200812235534,POINT (103.89866 1.39044)
2,BUCKET HOUSE PRESCHOOL,PT9527,"39, WOODLANDS CLOSE, #01 - 62, MEGA@WOODLANDS,...",737856,C88B9AC31EE71BF6,20200812235534,POINT (103.80681 1.43802)
3,BUMBLE BEE CHILD CARE CENTRE,PT3150,"369, WOODLANDS AVENUE 1, #01 - 853, S 730369",730369,64AB8FACA8F60129,20200812235534,POINT (103.78742 1.43344)
4,BUSY BEES SINGAPORE PTE LTD,PT9117,"327B, ANCHORVALE ROAD, #01 - 322, S 542327",542327,E1B55AC65B9059E8,20200812235534,POINT (103.88863 1.39565)
...,...,...,...,...,...,...,...
1920,PCF Sparkletots Preschool @ Tampines-Changkat ...,ST0261,"284 Tampines Street 22 Blk 285 #01-149 , S520284",520284,7CFD747E120FA0D5,20200812235535,POINT (103.95263 1.34910)
1921,New Life Kindergarten,RC1838,"10 Marsiling Lane , S739147",739147,002D449DA41EB2ED,20200812235535,POINT (103.77738 1.44503)
1922,Newton Kindergarten,RC1798,"16 Newton Road , S307995",307995,C4489713904CDA34,20200812235535,POINT (103.84084 1.31410)
1923,P &amp; J Kindergarten,PT9661,"35 Windsor Park Road, Windsor Park , S574141",574141,50EBBF878EBACFB7,20200812235535,POINT (103.82625 1.35732)


In [38]:
preschools.to_postgis('preschools', conn, if_exists='replace', index=False,dtype={'geometry': Geometry(geometry_type='POINT', srid=4326)})

### Load Parks

In [28]:
parks = gpd.read_file('../michael/data/parks/parks-geojson.geojson')
parks = parks.to_crs(4326)
parks['geometry'] = parks['geometry'].map(lambda point: shapely.ops.transform(lambda x, y, z: (x, y), point))
parks

Unnamed: 0,Name,Description,geometry
0,kml_1,<center><table><tr><th colspan='2' align='cent...,POINT (103.96086 1.34618)
1,kml_2,<center><table><tr><th colspan='2' align='cent...,POINT (103.97898 1.38990)
2,kml_3,<center><table><tr><th colspan='2' align='cent...,POINT (103.92313 1.40991)
3,kml_4,<center><table><tr><th colspan='2' align='cent...,POINT (103.86697 1.37924)
4,kml_5,<center><table><tr><th colspan='2' align='cent...,POINT (103.82907 1.37276)
...,...,...,...
345,kml_346,<center><table><tr><th colspan='2' align='cent...,POINT (103.89036 1.36796)
346,kml_347,<center><table><tr><th colspan='2' align='cent...,POINT (103.98919 1.39017)
347,kml_348,<center><table><tr><th colspan='2' align='cent...,POINT (103.88595 1.35520)
348,kml_349,<center><table><tr><th colspan='2' align='cent...,POINT (103.90374 1.39164)


In [31]:
des = parks['Description']
parks_dict = defaultdict(dict)

for i, description in enumerate(des):
    soup = BeautifulSoup(des[i])
    table = soup.find_all('tr')[1:]

    parks_dict[i] = {re.findall('<th>(\w+)<\/th>\s<td>(.+)?<\/td>', str(row))[0][0]:
                           re.findall('<th>(\w+)<\/th>\s<td>(.+)?<\/td>', str(row))[0][1] for row in table}

parks_descriptions = pd.DataFrame(parks_dict).T
parks_descriptions.replace(r'^\s*$', np.nan, regex=True, inplace=True)
parks_descriptions.dropna(axis=1, inplace=True)

parks.rename({'Name': 'ID'}, axis=1, inplace=True)
parks_full = parks.merge(parks_descriptions, left_index=True, right_index=True)
parks_full

Unnamed: 0,ID,Description,geometry,LANDXADDRESSPOINT,LANDYADDRESSPOINT,NAME,INC_CRC,FMEL_UPD_D
0,kml_1,<center><table><tr><th colspan='2' align='cent...,POINT (103.96086 1.34618),42193.6328,36479.08,Jalan Pelatok Open Space,BC5EA394BD8CE9F8,20200218182414
1,kml_2,<center><table><tr><th colspan='2' align='cent...,POINT (103.97898 1.38990),44210.12,41314.54,Turnhouse Park,AB31CCDD36F93C1C,20200218182414
2,kml_3,<center><table><tr><th colspan='2' align='cent...,POINT (103.92313 1.40991),37994.87,43526.7266,Coney Island Park,7CAC664DFF94240C,20200218182414
3,kml_4,<center><table><tr><th colspan='2' align='cent...,POINT (103.86697 1.37924),31744.6465,40134.5625,Stratton Walk Playground,8FC92F17CF75A88A,20200218182414
4,kml_5,<center><table><tr><th colspan='2' align='cent...,POINT (103.82907 1.37276),27527.52,39418.4531,Leban Park,EB7459375A3D10F7,20200218182414
...,...,...,...,...,...,...,...,...
345,kml_346,<center><table><tr><th colspan='2' align='cent...,POINT (103.89036 1.36796),34348.5273,38888.0625,Realty Park,A7BB1BF496807A8B,20200218182414
346,kml_347,<center><table><tr><th colspan='2' align='cent...,POINT (103.98919 1.39017),45346.4766,41344.2461,Changi Point Promenade,5FEA22BCC48153DA,20200218182414
347,kml_348,<center><table><tr><th colspan='2' align='cent...,POINT (103.88595 1.35520),33857.832,37476.270000000004,Aroozoo Avenue Playground,ACBC9E152D19BD06,20200218182414
348,kml_349,<center><table><tr><th colspan='2' align='cent...,POINT (103.90374 1.39164),35836.5352,41506.1133,St. Anne's Wood Playground,43A081292A7AE024,20200218182414


In [32]:
parks_full = parks_full[['ID', 'LANDXADDRESSPOINT',
                         'LANDYADDRESSPOINT', 'NAME', 'INC_CRC', 'FMEL_UPD_D', 'geometry']]
parks_full.columns = [col.lower() for col in parks_full.columns]
parks_full

Unnamed: 0,id,landxaddresspoint,landyaddresspoint,name,inc_crc,fmel_upd_d,geometry
0,kml_1,42193.6328,36479.08,Jalan Pelatok Open Space,BC5EA394BD8CE9F8,20200218182414,POINT (103.96086 1.34618)
1,kml_2,44210.12,41314.54,Turnhouse Park,AB31CCDD36F93C1C,20200218182414,POINT (103.97898 1.38990)
2,kml_3,37994.87,43526.7266,Coney Island Park,7CAC664DFF94240C,20200218182414,POINT (103.92313 1.40991)
3,kml_4,31744.6465,40134.5625,Stratton Walk Playground,8FC92F17CF75A88A,20200218182414,POINT (103.86697 1.37924)
4,kml_5,27527.52,39418.4531,Leban Park,EB7459375A3D10F7,20200218182414,POINT (103.82907 1.37276)
...,...,...,...,...,...,...,...
345,kml_346,34348.5273,38888.0625,Realty Park,A7BB1BF496807A8B,20200218182414,POINT (103.89036 1.36796)
346,kml_347,45346.4766,41344.2461,Changi Point Promenade,5FEA22BCC48153DA,20200218182414,POINT (103.98919 1.39017)
347,kml_348,33857.832,37476.270000000004,Aroozoo Avenue Playground,ACBC9E152D19BD06,20200218182414,POINT (103.88595 1.35520)
348,kml_349,35836.5352,41506.1133,St. Anne's Wood Playground,43A081292A7AE024,20200218182414,POINT (103.90374 1.39164)


In [34]:
parks_full.to_postgis('parks_points', conn, if_exists='replace', index=False, dtype={'geometry': Geometry(geometry_type='POINT', srid= 4326)})

In [3]:
pos = gpd.read_file('../michael/data/parks-and-open-space/mp14-sdcp-pw-plan-parks-and-open-space-shp/G_MP14_PKWB_PARKS_PL.shp')
pos.drop('OID_1', axis=1, inplace=True)
pos = pos.to_crs(4326)
pos.to_postgis('parks_open_space_poly',conn, if_exists='replace', index=False, dtype={'geometry': Geometry(geometry_type='POLYGON', srid= 4326)})
pos

Unnamed: 0,OBJECTID,CLASSIFCTN,PRP_STATUS,INC_CRC,FMEL_UPD_D,X_ADDR,Y_ADDR,SHAPE_Leng,SHAPE_Area,geometry
0,1,NEIGHBOURHOOD PARK,EXISTING,592C7550177BF268,2016-07-18,27919.5872,45166.4263,283.063527,3193.830838,"POLYGON ((103.83311 1.42485, 103.83309 1.42459..."
1,2,NEIGHBOURHOOD PARK,EXISTING,E9A1AC47C9DC8698,2016-07-18,23205.8335,46915.3998,887.637083,19909.684922,"POLYGON ((103.78891 1.44025, 103.78891 1.44025..."
2,3,NEIGHBOURHOOD PARK,EXISTING,D7DF045D6805B00D,2016-07-18,24735.6118,47398.1171,503.291711,14221.679594,"POLYGON ((103.80415 1.44445, 103.80410 1.44443..."
3,4,NEIGHBOURHOOD PARK,EXISTING,F9D174E1E0E0111C,2016-07-18,24305.6678,46590.1864,942.984201,16383.964451,"POLYGON ((103.80170 1.43855, 103.80145 1.43844..."
4,5,NEIGHBOURHOOD PARK,EXISTING,C399D41D7CB9B4A5,2016-07-18,15074.8532,35773.5449,376.819317,1840.105171,"POLYGON ((103.71771 1.34016, 103.71765 1.34013..."
...,...,...,...,...,...,...,...,...,...,...
1102,1103,MARINE PARK,PLANNED,3A8CBA5EE0DFA259,2016-07-18,29449.7895,22378.3747,3421.757359,107711.294857,"POLYGON ((103.84818 1.21689, 103.84817 1.21680..."
1103,1104,NEIGHBOURHOOD PARK,PLANNED,5F7F93191F2E4D83,2016-07-18,24580.0774,29060.4320,339.151069,6169.308722,"POLYGON ((103.80260 1.27974, 103.80283 1.27924..."
1104,1105,REGIONAL PARK,EXISTING,740FE78FC19EC02A,2016-07-18,26065.4119,32926.0370,4962.018398,529242.527514,"POLYGON ((103.81547 1.32263, 103.81597 1.32258..."
1105,1106,REGIONAL PARK,PLANNED,3599F642FE503E51,2016-07-18,25464.4604,32890.8773,1522.745875,79886.842508,"POLYGON ((103.80990 1.31564, 103.81005 1.31550..."


In [5]:
water = gpd.read_file('../michael/data/waterbody/G_MP14_PKWB_WATERBODY_PL.shp')
water.drop('OID_1', axis=1, inplace=True)
water = water.to_crs(4326)
water.to_postgis('water_body_poly',conn, if_exists='replace', index=False, dtype={'geometry': Geometry(geometry_type='POLYGON', srid= 4326)})
water

Unnamed: 0,OBJECTID,INC_CRC,FMEL_UPD_D,X_ADDR,Y_ADDR,SHAPE_Leng,SHAPE_Area,geometry
0,1,6DD192AAE1C42AE8,2016-07-18,24689.4342,47413.9979,435.366560,3354.737405,"POLYGON ((103.80401 1.44584, 103.80339 1.44450..."
1,2,0BFB2A11BA4651B5,2016-07-18,19155.7404,42249.5335,435.186388,1751.024357,"POLYGON ((103.75293 1.39860, 103.75358 1.39846..."
2,3,863E730B86B85A58,2016-07-18,18891.3534,46899.5320,2399.249844,56117.381899,"POLYGON ((103.75061 1.44496, 103.75066 1.44478..."
3,4,CAF208B6890D3F6F,2016-07-18,19287.1631,43169.9860,215.227533,1754.483745,"POLYGON ((103.75474 1.40687, 103.75479 1.40704..."
4,5,BED6AC0E03AD389C,2016-07-18,15102.4748,46490.7800,1782.828610,6031.275924,"POLYGON ((103.72129 1.43796, 103.72061 1.43771..."
...,...,...,...,...,...,...,...,...
949,950,EA00A9ACEB652A39,2016-07-18,6130.9831,28442.2639,451.487124,3948.818891,"POLYGON ((103.63778 1.27340, 103.63778 1.27336..."
950,951,9F2A3669C833921E,2016-07-18,39252.4936,39243.0202,773.110355,10765.157217,"POLYGON ((103.93607 1.37127, 103.93593 1.37126..."
951,952,892D63CC7248F33D,2016-07-18,44764.4780,33300.9690,431.551699,6703.944494,"POLYGON ((103.98442 1.31808, 103.98382 1.31668..."
952,953,49B49270D79A034D,2016-07-18,43852.8830,34407.4732,664.006253,11933.914220,"POLYGON ((103.97589 1.32632, 103.97569 1.32613..."


### Load Other Location Features

Stuart's Notebooks

Run the following 
- food_centre_markets.ipynb
- load_conservation.ipynb
- load_fire_police_health_mall_supermarkets.ipynb
- load_manual_cluster_poi.ipynb
- load_road_mrt_bus_taxi.ipynb
- primary_schools.ipynb
- secondary_schools.ipynb

### Process and Merge Resale Transactions

In [10]:
def create_resale_prices(approval_df, registration_df):
    rs_df = pd.concat([approval_df, registration_df])

    rs_df['month'] = pd.to_datetime(rs_df['month'])
    rs_df['lease_commence_date'] = pd.to_datetime(rs_df['lease_commence_date'], format='%Y')

    rs_df['transaction_id'] = rs_df['month'].dt.strftime('%Y') + "_" + rs_df['month'].dt.strftime('%m') + "_" +  rs_df['block'] + "_" + rs_df['street_name'].str.replace(' ', '_') + "_" + rs_df['storey_range'].str.replace(' ', '_')

    # Remaining lease years =
    # if month >= 6
    # lease_commence_date + 99 - month
    # if month < 6
    # lease_commence_date + 98 - month
    rs_df['remaining_lease_years'] = np.where(rs_df['month'].dt.month >= 6, ((rs_df['lease_commence_date'].dt.year + 98) - rs_df['month'].dt.year), ((rs_df['lease_commence_date'].dt.year + 99) - rs_df['month'].dt.year))

    # Convert back to int year.
    rs_df['lease_commence_date'] = rs_df['lease_commence_date'].dt.year.astype(int)


    # Price per square foot and meter
    rs_df['price_per_sq_ft'] = rs_df['resale_price'] / (rs_df['floor_area_sqm'] * 3.28084) #3.28084 ft per meter
    rs_df['price_per_sq_m'] = rs_df['resale_price'] / rs_df['floor_area_sqm']

    # Price per square foot/meter and remaining lease years
    rs_df['price_per_sq_ft_per_lease_yr'] = rs_df['price_per_sq_ft'] / rs_df['remaining_lease_years']
    rs_df['price_per_sq_m_per_lease_yr'] = rs_df['price_per_sq_m'] / rs_df['remaining_lease_years']



    return rs_df

rs_df = create_resale_prices(df_approval, df_registration)
rs_df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,resale_price,...,storey_range_min,storey_range_max,remaining_lease,remaining_lease_years,remaining_lease_months,transaction_id,price_per_sq_ft,price_per_sq_m,price_per_sq_ft_per_lease_yr,price_per_sq_m_per_lease_yr
0,1990-01-01,ANG MO KIO,1 ROOM,309,ANG MO KIO AVE 1,10 TO 12,31.0,IMPROVED,1977,9000.0,...,10,12,,86,,1990_01_309_ANG_MO_KIO_AVE_1_10_TO_12,88.490320,290.322581,1.028957,3.375844
1,1990-01-01,KALLANG/WHAMPOA,3 ROOM,44,BENDEMEER RD,04 TO 06,63.0,STANDARD,1981,31400.0,...,4,6,,90,,1990_01_44_BENDEMEER_RD_04_TO_06,151.916186,498.412698,1.687958,5.537919
2,1990-01-01,KALLANG/WHAMPOA,3 ROOM,20,ST. GEORGE'S RD,04 TO 06,67.0,NEW GENERATION,1984,66500.0,...,4,6,,93,,1990_01_20_ST._GEORGE'S_RD_04_TO_06,302.525363,992.537313,3.252961,10.672444
3,1990-01-01,KALLANG/WHAMPOA,3 ROOM,14,KG ARANG RD,04 TO 06,103.0,NEW GENERATION,1984,77000.0,...,4,6,,93,,1990_01_14_KG_ARANG_RD_04_TO_06,227.860187,747.572816,2.450110,8.038417
4,1990-01-01,KALLANG/WHAMPOA,3 ROOM,46,OWEN RD,01 TO 03,68.0,NEW GENERATION,1982,58000.0,...,1,3,,91,,1990_01_46_OWEN_RD_01_TO_03,259.976462,852.941176,2.856884,9.372980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210825,2022-03-01,CLEMENTI,3 ROOM,208B,CLEMENTI AVE 6,19 TO 21,68.0,Model A,2017,575000.0,...,19,21,94 years 08 months,94,8,2022_03_208B_CLEMENTI_AVE_6_19_TO_21,2577.352859,8455.882353,27.418647,89.956195
210826,2022-03-01,CLEMENTI,3 ROOM,320,CLEMENTI AVE 4,04 TO 06,67.0,New Generation,1979,380888.0,...,4,6,56 years 02 months,56,2,2022_03_320_CLEMENTI_AVE_4_04_TO_06,1732.756100,5684.895522,30.942073,101.515991
210827,2022-03-01,CLEMENTI,3 ROOM,334,CLEMENTI AVE 2,07 TO 09,67.0,New Generation,1978,416000.0,...,7,9,55 years 05 months,55,5,2022_03_334_CLEMENTI_AVE_2_07_TO_09,1892.489492,6208.955224,34.408900,112.890095
210828,2022-03-01,GEYLANG,3 ROOM,3,JOO CHIAT RD,10 TO 12,74.0,Model A,1985,470000.0,...,10,12,62 years 10 months,62,10,2022_03_3_JOO_CHIAT_RD_10_TO_12,1935.891830,6351.351351,31.224062,102.441151


In [11]:
def update_street_abbreviations(resale_df):
    resale_df['address']=resale_df['block']+' '+resale_df['street_name']

    ##Deal with short forms
    # resale_df['address']=resale_df['address'].str.replace(' AVE ',' AVENUE ')
    # resale_df['address']=resale_df['address'].str.replace(' ST ',' STREET ')
    # resale_df['address']=resale_df['address'].str.replace(' NTH',' NORTH')
    # resale_df['address']=resale_df['address'].str.replace(' STH',' SOUTH')
    # resale_df['address']=resale_df['address'].str.replace(' RD',' ROAD')
    # resale_df['address']=resale_df['address'].str.replace(' UPP ',' UPPER ')
    # resale_df['address']=resale_df['address'].str.replace(' CTRL',' CENTRAL')
    # resale_df['address']=resale_df['address'].str.replace(' JLN ',' JALAN ')
    # resale_df['address']=resale_df['address'].str.replace('BT ','BUKIT ')
    # resale_df['address']=resale_df['address'].str.replace(' HTS',' HEIGHTS')
    # resale_df['address']=resale_df['address'].str.replace(' MKT ',' MARKET ')
    # resale_df['address']=resale_df['address'].str.replace('TG ','TANJONG ')
    # resale_df['address']=resale_df['address'].str.replace("C'WEALTH",'COMMONWEALTH')
    # resale_df['address']=resale_df['address'].str.replace("LOR ",'LORONG ')
    # resale_df['address']=resale_df['address'].str.replace("GDNS",'GARDENS')
    # resale_df['address']=resale_df['address'].str.replace('ST. ','SAINT ')
    # resale_df['address']=resale_df['address'].str.replace(' PK',' PARK')
    # resale_df['address']=resale_df['address'].str.replace(' DR ',' DRIVE ')
    # resale_df['address']=resale_df['address'].str.replace('KG ','KAMPONG ')
    # resale_df['address']=resale_df['address'].str.replace(' CL ',' CLOSE ')

    return resale_df


def sort_cols(resale_df):
    resale_df = resale_df[['transaction_id', 'month', 'block', 'town', 'street_name', 'address','storey_range', 'storey_range_min', 'storey_range_max',
      'flat_model', 'flat_type', 'n_rooms', 'floor_area_sqm',
       'lease_commence_date', 'remaining_lease', 'remaining_lease_years', 'remaining_lease_months',
      'resale_price', 'price_per_sq_ft', 'price_per_sq_m', 'price_per_sq_ft_per_lease_yr', 'price_per_sq_m_per_lease_yr']].sort_values('month', ascending=True)

    return resale_df

In [12]:
rs_df = update_street_abbreviations(rs_df)
rs_df = sort_cols(rs_df)
rs_df

Unnamed: 0,transaction_id,month,block,town,street_name,address,storey_range,storey_range_min,storey_range_max,flat_model,...,floor_area_sqm,lease_commence_date,remaining_lease,remaining_lease_years,remaining_lease_months,resale_price,price_per_sq_ft,price_per_sq_m,price_per_sq_ft_per_lease_yr,price_per_sq_m_per_lease_yr
0,1990_01_309_ANG_MO_KIO_AVE_1_10_TO_12,1990-01-01,309,ANG MO KIO,ANG MO KIO AVE 1,309 ANG MO KIO AVE 1,10 TO 12,10,12,IMPROVED,...,31.0,1977,,86,,9000.0,88.490320,290.322581,1.028957,3.375844
1010,1990_01_230_ANG_MO_KIO_AVE_3_07_TO_09,1990-01-01,230,ANG MO KIO,ANG MO KIO AVE 3,230 ANG MO KIO AVE 3,07 TO 09,7,9,NEW GENERATION,...,82.0,1978,,87,,60000.0,223.024383,731.707317,2.563499,8.410429
1009,1990_01_223_ANG_MO_KIO_AVE_1_10_TO_12,1990-01-01,223,ANG MO KIO,ANG MO KIO AVE 1,223 ANG MO KIO AVE 1,10 TO 12,10,12,NEW GENERATION,...,67.0,1978,,87,,37000.0,168.322383,552.238806,1.934740,6.347572
1008,1990_01_302_HOUGANG_AVE_5_01_TO_03,1990-01-01,302,HOUGANG,HOUGANG AVE 5,302 HOUGANG AVE 5,01 TO 03,1,3,MODEL A,...,131.0,1983,,92,,126000.0,293.166403,961.832061,3.186591,10.454696
1007,1990_01_309_ANG_MO_KIO_AVE_1_04_TO_06,1990-01-01,309,ANG MO KIO,ANG MO KIO AVE 1,309 ANG MO KIO AVE 1,04 TO 06,4,6,IMPROVED,...,31.0,1977,,86,,6000.0,58.993546,193.548387,0.685971,2.250563
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210529,2022_03_501B_YISHUN_ST_51_07_TO_09,2022-03-01,501B,YISHUN,YISHUN ST 51,501B YISHUN ST 51,07 TO 09,7,9,Improved,...,112.0,2018,94 years 11 months,95,11,652000.0,1774.371372,5821.428571,18.677593,61.278195
210528,2022_03_820_YISHUN_ST_81_07_TO_09,2022-03-01,820,YISHUN,YISHUN ST 81,820 YISHUN ST 81,07 TO 09,7,9,Improved,...,122.0,1988,65 years 06 months,65,6,585000.0,1461.540937,4795.081967,22.485245,73.770492
210527,2022_03_359_YISHUN_RING_RD_01_TO_03,2022-03-01,359,YISHUN,YISHUN RING RD,359 YISHUN RING RD,01 TO 03,1,3,Apartment,...,145.0,1988,65 years 05 months,65,5,682000.0,1433.610989,4703.448276,22.055554,72.360743
210538,2022_03_751_YISHUN_ST_72_04_TO_06,2022-03-01,751,YISHUN,YISHUN ST 72,751 YISHUN ST 72,04 TO 06,4,6,New Generation,...,91.0,1984,61 years 02 months,61,2,458000.0,1534.048303,5032.967033,25.148333,82.507656


In [13]:
rs_df[rs_df['remaining_lease_years'].isnull() == True]

Unnamed: 0,transaction_id,month,block,town,street_name,address,storey_range,storey_range_min,storey_range_max,flat_model,...,floor_area_sqm,lease_commence_date,remaining_lease,remaining_lease_years,remaining_lease_months,resale_price,price_per_sq_ft,price_per_sq_m,price_per_sq_ft_per_lease_yr,price_per_sq_m_per_lease_yr


In [15]:
rs_df.to_sql('resale_prices', conn, if_exists='replace', index=False)