## Import Libraries

In [109]:
import datetime
import math
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
import json
from bs4 import BeautifulSoup
import time
import zipfile
import requests
# import gdown
import pickle
import io
from datetime import date
from requests import Session
from geopy.geocoders import Nominatim

## Pre-set Variables

In [110]:
district_postal = {
    "1" : ["01", "02", "03", "04", "05", "06"],
    "2" : ["07", "08"],
    "3" : ["14", "15", "16"],
    "4" : ["09", "10"],
    "5" : ["11", "12", '13'],
    "6" : ["17"],
    "7" : ["18", "19"],
    "8" : ["20", "21"],
    "9" : ["22", "23"],
    "10" : ["24", "25", "26", "27"],
    "11" : ["28", "29", "30"],
    "12" : ["31", "32", "33"],
    "13" : ["34", "35", "36", "37"],
    "14" : ["38", "39", "40", "41"],
    "15" : ["42", "43", "44", "45"],
    "16" : ["46", "47", "48"],
    "17" : ["49", "50", "81"],
    "18" : ["51", "52"],
    "19" : ["53", "54", "55", "82"],
    "20" : ["56", "57"],
    "21" : ["58", "59"],
    "22" : ["60", "61", "62", "63", "64"],
    "23" : ["65", "66", "67", "68"],
    "24" : ["69", "70", "71"],
    "25" : ["72", "73"],
    "26" : ["77", "78"],
    "27" : ["75", "76"],
    "28" : ["79", "80"]
}

In [111]:
replace_floors = {
    '01 TO 03' : '01-05',
    '04 TO 06' : '01-05',
    '07 TO 09' : '06-10',
    '10 TO 12' : '11-15',
    '13 TO 15' : '11-15',
    '16 TO 18' : '16-20',
    '19 TO 21' : '16-20',
    '22 TO 24' : '21-25',
    '25 TO 27' : '26-30',
    '28 TO 30' : '26-30',
    '31 TO 33' : '31-35',
    '34 TO 36' : '31-35',
    '37 TO 39' : '36-40',
    '40 TO 42' : '41-45',
    '43 TO 45' : '41-45',
    '46 TO 48' : '46-50',
    '-' : '-'
}

## Import CSVs needed

In [112]:
bus_stops_url = 'https://raw.githubusercontent.com/nicolepng/BT4222/main/Data/bus_stops.csv' 
bus_stops = pd.read_csv(bus_stops_url)
bus_stops.drop(bus_stops.columns[0], axis=1, inplace=True)

In [113]:
amenities_url = 'https://raw.githubusercontent.com/nicolepng/BT4222/main/Data/ameneties_per_district.csv' 
ameneties_per_district = pd.read_csv(amenities_url)
ameneties_per_district.drop(ameneties_per_district.columns[0], axis=1, inplace=True)

In [114]:
avg_crime_locations_url = 'https://raw.githubusercontent.com/nicolepng/BT4222/main/Data/average_crimes_by_location_v3.csv' 
average_crimes_by_location_v3 = pd.read_csv(avg_crime_locations_url)
average_crimes_by_location_v3

Unnamed: 0,Neighbourhood,Postal,Offence,Number
0,Ang Mo Kio North,569784,Housebreaking,4.857143
1,Ang Mo Kio North,569784,Outrage Of Modesty,26.000000
2,Ang Mo Kio North,569784,Robbery,2.000000
3,Ang Mo Kio North,569784,Snatch Theft,2.714286
4,Ang Mo Kio North,569784,Theft Of Motor Vehicle,5.000000
...,...,...,...,...
210,[Total]Woodlands Police Division - Total,-,Housebreaking,0.285714
211,[Total]Woodlands Police Division - Total,-,Outrage Of Modesty,2.142857
212,[Total]Woodlands Police Division - Total,-,Robbery,0.000000
213,[Total]Woodlands Police Division - Total,-,Snatch Theft,0.000000


In [115]:
# swap dictionary mapping direction
postal_district = {k: oldk for oldk, oldv in district_postal.items() for k in oldv}

# create new column to obtain the 1st 2 characters of [Postal]
average_crimes_by_location_v3['postal prefix'] = average_crimes_by_location_v3['Postal'].astype(str).str[0:2]

# map postal to district code
average_crimes_by_location_v3['district']= average_crimes_by_location_v3['postal prefix'].map(postal_district) 

# get overall crime rate in each district
average_crimes_by_location_v3 = average_crimes_by_location_v3.groupby(['district']).agg({'Number':'sum'}).reset_index()

district_int = average_crimes_by_location_v3.district.astype(int)
district_int = pd.DataFrame(district_int)
average_crimes_by_location_v3['district'] = district_int

In [116]:
average_crimes_by_location_v3

Unnamed: 0,district,Number
0,10,35.428571
1,12,60.857143
2,14,167.714286
3,15,45.428571
4,16,130.0
5,18,163.857143
6,19,202.142857
7,2,200.0
8,20,122.714286
9,22,226.0


In [117]:
sentiment_url = 'https://raw.githubusercontent.com/nicolepng/BT4222/main/Data/combined_vader_sentiment.csv' 
sentiment = pd.read_csv(sentiment_url)
sentiment = sentiment[['district_num','weighted_sentiment','year']]
#sentiment.columns = ['district_num', 'weighted_sentiment', 'year']
sentiment

Unnamed: 0,district_num,weighted_sentiment,year
0,21,0.181969,2018
1,1,0.133391,2018
2,2,0.165917,2018
3,3,0.061551,2018
4,4,0.059997,2018
...,...,...,...
107,24,0.051782,2021
108,25,0.078996,2021
109,26,0.822100,2021
110,27,0.102094,2021


## Functions needed to get input

In [118]:
# Get district number
def get_postal_onemap(place, district_postal):
    start_code= "https://developers.onemap.sg/commonapi/search?returnGeom=Y&getAddrDetails=Y&pageNum=1&searchVal="+ place
    s_response = requests.get(start_code)
    s_data = json.loads(s_response.text)
    postal = None
    count = 0
    while postal is None:
        count += 1
        if count == 10:
            postal = None
            break
        for i in range(len(s_data['results'])):
            postal = s_data['results'][i]['POSTAL']
            try:
                postal = int(postal)
            except:
                continue
    for district, sub_dist in district_postal.items():
        if str(postal)[:2] in sub_dist:
            dist = district
    try:
        return dist
    except:
        return -1

In [119]:
# get latitude and longitude
def get_lat_long(street):
    geolocator = Nominatim(user_agent="newtestuserbtproj")
    location = geolocator.geocode({"street": street, "country": "Singapore"})
    coordinates = [location.latitude, location.longitude]
    return coordinates

In [120]:
# get number of bus stops
# Formula to calculate distance 
from math import cos, sqrt
R = 6371000 #radius of the Earth in m
def distance(lon1, lat1, lon2, lat2):
    x = (lon2 - lon1) * cos(0.5*(lat2+lat1))
    y = (lat2 - lat1)
    return R * sqrt( x*x + y*y )

def num_of_bus_stops(lat, long):
    busStops = bus_stops.to_dict(orient='records')
    # threshold of within 1km
    numOfStops = []
    buslist = list(filter(lambda d: distance(d["Longitude"], d["Latitude"], long, lat) <= 1000, busStops))
    return len(buslist)

In [121]:
# get num of schools
def num_schools(district):
    sch_list = ameneties_per_district.loc[ameneties_per_district['district'] == district]['school'].item()
    num_sch = sch_list.strip('][').split(',') 
    if " SINGAPORE'" in num_sch:
        num_sch.remove(" SINGAPORE'")
    return len(num_sch)

In [122]:
# get num of supermarkets
def num_supermarkets(district):
    market_list = ameneties_per_district.loc[ameneties_per_district['district'] == district]['supermarkets'].item()
    num_market = market_list.strip('][').split(',') 
    return len(num_market)

In [123]:
# get num of hawker
def num_hawker(district):
    hawker_list = ameneties_per_district.loc[ameneties_per_district['district'] == district]['hawkercentre'].item()
    num_hawker = hawker_list.strip('][').split(',') 
    return len(num_hawker)

In [124]:
# get crime number 
def crime_num(district):
    if len(average_crimes_by_location_v3[average_crimes_by_location_v3.district == district]) == 0:
        return -1
    else:
        return average_crimes_by_location_v3[average_crimes_by_location_v3.district == district].Number.item() 

In [125]:
# get sentiment score
def sentiment_score(district, year):
    return sentiment[(sentiment.district_num == district) & (sentiment.year == year)].weighted_sentiment.item()

## Actual Code

district  
street             
propertyType        
remaining_lease   
price              
school             
hawkercentre        
supermarkets       
Bus Stops Nearby   
crime_number        
latitude          
longitude           
floor_area_sqm     
floor_range        
sentiment
Street given, propertyType given, remaining lease given, square feet also given

In [126]:
street = input("Enter Street Name: ")

propertyType = input("Enter Type of Property: ")

remaining_lease = int(input("Enter num of years left: "))

floor_area_sqm = input("Area of House (Square Feet): ")

floor_range = input("Enter Floor Range: ")

year = int(input("Current Year? "))

Enter Street Name: Lengkong Tujoh
Enter Type of Property: Terrace
Enter num of years left: 999
Area of House (Square Feet): 3341.11456
Enter Floor Range: -
Current Year? 2021


In [127]:
district = get_postal_onemap(street, district_postal)

In [128]:
district

'14'

In [129]:
# Get all details 
#district = get_postal_onemap(street, district_postal)
coordinates = get_lat_long(street)
latitude = coordinates[0]
longitude = coordinates[1]
school = num_schools(int(district))
hawkercentre = num_hawker(int(district))
supermarkets = num_supermarkets(int(district))
bus_stops_nearby = num_of_bus_stops(latitude, longitude)
crime_number = crime_num(int(district))
score = sentiment_score(int(district), year)

In [130]:
# Create temp dataframe to fit into model
temp_column_names = ['district', 'street', 'propertyType', 'remaining_lease',
                    'school', 'hawkercentre', 'supermarkets', 'Bus Stops Neaby',
                    'crime_number', 'latitude', 'longitude', 'floor_area_sqm', 'floor_range',
                    'sentiment']
temp = pd.DataFrame(columns = temp_column_names)

In [131]:
temp = temp.append({'district': district, 'street': street, 'propertyType': propertyType, 
             'remaining_lease': remaining_lease,'school': school, 'hawkercentre': hawkercentre, 
             'supermarkets': supermarkets, 'Bus Stops Neaby': bus_stops_nearby,
            'crime_number': crime_number, 'latitude': latitude, 
             'longitude': longitude, 'floor_area_sqm': floor_area_sqm, 'floor_range': floor_range,
            'sentiment': score
}, ignore_index = True)

In [132]:
# Street, Property and Floor_range mapping
import pickle
street_mapping = pickle.load(open('/Users/vickiyew/Documents/BT4222/project/data/street_mapping.pickle', 'rb'))
floor_mapping = pickle.load(open('/Users/vickiyew/Documents/BT4222/project/data/floor_mapping.pickle', 'rb'))
property_mapping = pickle.load(open('/Users/vickiyew/Documents/BT4222/project/data/property_mapping.pickle', 'rb'))

In [133]:
temp['street'] = temp['street'].str.upper()
temp['street'] = temp['street'].map(street_mapping)
temp['propertyType'] = temp['propertyType'].map(property_mapping)
temp['floor_range'] = temp['floor_range'].map(floor_mapping)

In [134]:
temp

Unnamed: 0,district,street,propertyType,remaining_lease,school,hawkercentre,supermarkets,Bus Stops Neaby,crime_number,latitude,longitude,floor_area_sqm,floor_range,sentiment
0,14,1065,15,999,29,7,14,1,167.714286,1.328467,103.914929,3341.11456,0,0.247077


## Feature Cross (Latitude and Longitude)

In [135]:
combined = '/Users/vickiyew/Desktop/bt4222_data/combined.csv'
df = pd.read_csv(combined, index_col=0)
df = df.reset_index(drop=True)

# Label Encoding 
df['street'] = df['street'].astype('category')
street_dict = dict(zip(df['street'], df['street'].cat.codes))
df['street'] = df['street'].cat.codes

df['propertyType'] = df['propertyType'].astype('category')
property_dict = dict(zip(df['propertyType'], df['propertyType'].cat.codes))
df['propertyType'] = df['propertyType'].cat.codes

df['floor_range'] = df['floor_range'].astype('category')
floor_dict = dict(zip(df['floor_range'], df['floor_range'].cat.codes))
df['floor_range'] = df['floor_range'].cat.codes

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop('price', axis=1),
                                                    df['price'], 
                                                    test_size=0.2,
                                                    random_state=1)



In [136]:
_, longitude_boundaries = np.histogram(X_train['longitude'])
_, latitude_boundaries = np.histogram(X_train['latitude'])

def assign_bucket(x, ranges):
    if x <= ranges[0]:
        return 0
    
    for i in range(1, len(ranges) - 1):
        if x <= ranges[i]:
            return i - 1
    return len(ranges) - 2

X_train['lgt_discrete'] = X_train['longitude'].apply(lambda x: assign_bucket(x, longitude_boundaries))
X_train['lat_discrete'] = X_train['latitude'].apply(lambda x: assign_bucket(x, latitude_boundaries))
X_test['lgt_discrete'] = X_test['longitude'].apply(lambda x: assign_bucket(x, longitude_boundaries))
X_test['lat_discrete'] = X_test['latitude'].apply(lambda x: assign_bucket(x, latitude_boundaries))

In [137]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(categories='auto')
features_lat_lgn_train = enc.fit_transform(X_train[['lgt_discrete', 'lat_discrete']]) # fit and transform TRAIN
features_lat_lgn_test = enc.transform(X_test[['lgt_discrete', 'lat_discrete']]) # transform only TEST

In [138]:
features_to_drop = ['longitude', 'latitude', 'lgt_discrete', 'lat_discrete']
OHE_train = np.hstack((X_train.drop(features_to_drop, axis=1).values, features_lat_lgn_train.toarray()))
OHE_test = np.hstack((X_test.drop(features_to_drop, axis=1).values, features_lat_lgn_test.toarray()))

In [139]:
OHE_test[0]

array([2.00000000e+01, 1.94000000e+03, 8.00000000e+00, 8.08300000e+01,
       3.70000000e+01, 1.10000000e+01, 5.00000000e+00, 0.00000000e+00,
       1.22714286e+02, 1.41000000e+02, 1.00000000e+00, 7.01235837e-02,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00])

## >> do feature cross for temp

In [140]:
temp['lgt_discrete'] = temp['longitude'].apply(lambda x: assign_bucket(x, longitude_boundaries))
temp['lat_discrete'] = temp['latitude'].apply(lambda x: assign_bucket(x, latitude_boundaries))
features_lat_lgn_temp = enc.transform(temp[['lgt_discrete', 'lat_discrete']])

In [141]:
features_to_drop = ['longitude', 'latitude', 'lgt_discrete', 'lat_discrete']
OHE_temp = np.hstack((temp.drop(features_to_drop, axis=1).values, features_lat_lgn_temp.toarray()))

In [142]:
OHE_temp

array([['14', 1065, 15, 999, 29, 7, 14, 1, 167.71428571428572,
        '3341.11456', 0, 0.2470765840220384, 0.0, 0.0, 0.0, 0.0, 0.0,
        0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0,
        0.0, 0.0]], dtype=object)

## Load Model 
> saved_model.sav can be found in our Github Repo: main > Models
> Please download saved_model.sav and load it in the next cell:

In [143]:
# load model
loaded_model = pickle.load(open('/Users/vickiyew/Documents/BT4222/project/codebase/BT4222/Models/saved_model.sav', 'rb'))

## Actual Price Prediction - Predict on OHE_temp

In [144]:
print('Predicted Price Per Sqft: $' + str(loaded_model.predict(OHE_temp)[0]))

Predicted Price Per Sqft: $1601.4767
