In [178]:

import datetime
import math
import matplotlib.pyplot as plt 
import numpy as np
import pandas as pd
import json
from bs4 import BeautifulSoup
import time
import zipfile
import requests
#import gdown
import pickle
import io
from datetime import date
from requests import Session
from geopy.geocoders import Nominatim

In [179]:
# path = '/content/drive/My Drive/BT4222 Data'

## Pre-set Variables

In [180]:
district_postal = {
    "1" : ["01", "02", "03", "04", "05", "06"],
    "2" : ["07", "08"],
    "3" : ["14", "15", "16"],
    "4" : ["09", "10"],
    "5" : ["11", "12", '13'],
    "6" : ["17"],
    "7" : ["18", "19"],
    "8" : ["20", "21"],
    "9" : ["22", "23"],
    "10" : ["24", "25", "26", "27"],
    "11" : ["28", "29", "30"],
    "12" : ["31", "32", "33"],
    "13" : ["34", "35", "36", "37"],
    "14" : ["38", "39", "40", "41"],
    "15" : ["42", "43", "44", "45"],
    "16" : ["46", "47", "48"],
    "17" : ["49", "50", "81"],
    "18" : ["51", "52"],
    "19" : ["53", "54", "55", "82"],
    "20" : ["56", "57"],
    "21" : ["58", "59"],
    "22" : ["60", "61", "62", "63", "64"],
    "23" : ["65", "66", "67", "68"],
    "24" : ["69", "70", "71"],
    "25" : ["72", "73"],
    "26" : ["77", "78"],
    "27" : ["75", "76"],
    "28" : ["79", "80"]
}

In [181]:
replace_floors = {
    '01 TO 03' : '01-05',
    '04 TO 06' : '01-05',
    '07 TO 09' : '06-10',
    '10 TO 12' : '11-15',
    '13 TO 15' : '11-15',
    '16 TO 18' : '16-20',
    '19 TO 21' : '16-20',
    '22 TO 24' : '21-25',
    '25 TO 27' : '26-30',
    '28 TO 30' : '26-30',
    '31 TO 33' : '31-35',
    '34 TO 36' : '31-35',
    '37 TO 39' : '36-40',
    '40 TO 42' : '41-45',
    '43 TO 45' : '41-45',
    '46 TO 48' : '46-50',
    '-' : '-'
}

## Import CSVs needed

In [182]:
bus_stops_url = 'https://raw.githubusercontent.com/nicolepng/BT4222/main/Data/bus_stops.csv' 
bus_stops = pd.read_csv(bus_stops_url)
bus_stops.drop(bus_stops.columns[0], axis=1, inplace=True)

In [183]:
amenities_url = 'https://raw.githubusercontent.com/nicolepng/BT4222/main/Data/ameneties_per_district.csv' 
ameneties_per_district = pd.read_csv(amenities_url)
ameneties_per_district.drop(ameneties_per_district.columns[0], axis=1, inplace=True)

In [184]:
avg_crime_locations_url = 'https://raw.githubusercontent.com/nicolepng/BT4222/main/Data/average_crimes_by_location_v3.csv' 
average_crimes_by_location_v3 = pd.read_csv(avg_crime_locations_url)

In [185]:
# swap dictionary mapping direction
postal_district = {k: oldk for oldk, oldv in district_postal.items() for k in oldv}

# create new column to obtain the 1st 2 characters of [Postal]
average_crimes_by_location_v3['postal prefix'] = average_crimes_by_location_v3['Postal'].astype(str).str[0:2]

# map postal to district code
average_crimes_by_location_v3['district']= average_crimes_by_location_v3['postal prefix'].map(postal_district) 

# get overall crime rate in each district
average_crimes_by_location_v3 = average_crimes_by_location_v3.groupby(['district']).agg({'Number':'sum'}).reset_index()

district_int = average_crimes_by_location_v3.district.astype(int)
district_int = pd.DataFrame(district_int)
average_crimes_by_location_v3['district'] = district_int

In [186]:
sentiment_url = 'https://raw.githubusercontent.com/nicolepng/BT4222/main/Data/combined_vader_sentiment.csv' 
sentiment = pd.read_csv(sentiment_url)
sentiment = sentiment[['district_num','weighted_sentiment','year']]
#sentiment.columns = ['district_num', 'weighted_sentiment', 'year']
sentiment

Unnamed: 0,district_num,weighted_sentiment,year
0,21,0.181969,2018
1,1,0.133391,2018
2,2,0.165917,2018
3,3,0.061551,2018
4,4,0.059997,2018
...,...,...,...
107,24,0.051782,2021
108,25,0.078996,2021
109,26,0.822100,2021
110,27,0.102094,2021


In [187]:
# ! gdown --id 1EHquP_sYLLYOmi7oGam_tEyd1pWXWMT2

## note: model will be imported from local 

In [188]:
# import model
# https://drive.google.com/u/1/uc?id=1xrcwXEFJkEZYoYinpEXK54RAaMTHe7J-&export=download
# url = 'https://drive.google.com/u/1/uc?id=1xrcwXEFJkEZYoYinpEXK54RAaMTHe7J-'
# output = 'finalized_model_new.sav'
# gdown.download(url, output, quiet=False)

## Functions needed to get input

In [189]:
# Get district number
def get_postal_onemap(place, district_postal):
    start_code= "https://developers.onemap.sg/commonapi/search?returnGeom=Y&getAddrDetails=Y&pageNum=1&searchVal="+ place
    s_response = requests.get(start_code)
    s_data = json.loads(s_response.text)
    postal = None
    count = 0
    while postal is None:
        count += 1
        if count == 10:
            postal = None
            break
        for i in range(len(s_data['results'])):
            postal = s_data['results'][i]['POSTAL']
            try:
                postal = int(postal)
            except:
                continue
    for district, sub_dist in district_postal.items():
        if str(postal)[:2] in sub_dist:
            dist = district
    try:
        return dist
    except:
        return -1

In [190]:
# get latitude and longitude
def get_lat_long(street):
    geolocator = Nominatim(user_agent="newtestuserbtproj")
    location = geolocator.geocode({"street": street, "country": "Singapore"})
    coordinates = [location.latitude, location.longitude]
    return coordinates

In [191]:
# get number of bus stops
# Formula to calculate distance 
from math import cos, sqrt
R = 6371000 #radius of the Earth in m
def distance(lon1, lat1, lon2, lat2):
    x = (lon2 - lon1) * cos(0.5*(lat2+lat1))
    y = (lat2 - lat1)
    return R * sqrt( x*x + y*y )

def num_of_bus_stops(lat, long):
    busStops = bus_stops.to_dict(orient='records')
    # threshold of within 1km
    numOfStops = []
    buslist = list(filter(lambda d: distance(d["Longitude"], d["Latitude"], long, lat) <= 1000, busStops))
    return len(buslist)

In [192]:
# get num of schools
def num_schools(district):
    sch_list = ameneties_per_district.loc[ameneties_per_district['district'] == district]['school'].item()
    num_sch = sch_list.strip('][').split(',') 
    if " SINGAPORE'" in num_sch:
        num_sch.remove(" SINGAPORE'")
    return len(num_sch)

In [193]:
# get num of supermarkets
def num_supermarkets(district):
    market_list = ameneties_per_district.loc[ameneties_per_district['district'] == district]['supermarkets'].item()
    num_market = market_list.strip('][').split(',') 
    return len(num_market)

In [194]:
# get num of hawker
def num_hawker(district):
    hawker_list = ameneties_per_district.loc[ameneties_per_district['district'] == district]['hawkercentre'].item()
    num_hawker = hawker_list.strip('][').split(',') 
    return len(num_hawker)

In [195]:
# get crime number 
def crime_num(district):
    return average_crimes_by_location_v3[average_crimes_by_location_v3.district == district].Number.item() 

In [196]:
# get sentiment score
def sentiment_score(district, year):
    return sentiment[(sentiment.district_num == district) & (sentiment.year == year)].weighted_sentiment.item()

In [197]:
# get floor range


## Actual Code

district  
street             
propertyType        
remaining_lease   
price              
school             
hawkercentre        
supermarkets       
Bus Stops Nearby   
crime_number        
latitude          
longitude           
floor_area_sqm     
floor_range        
sentiment
Street given, propertyType given, remaining lease given, square feet also given

In [211]:
street = input("Enter Street Name: ")

propertyType = input("Enter Type of Property: ")

remaining_lease = int(input("Enter num of years left: "))

floor_area_sqm = input("Area of House (Square Metres): ")

floor_range = input("Enter Floor Range: ")

year = int(input("Current Year? "))

Enter Street Name: Jalan Khamis
Enter Type of Property: Semi-detached
Enter num of years left: 999
Area of House (Square Metres): 334.5
Enter Floor Range: -
Current Year? 2021


In [212]:
district = get_postal_onemap(street, district_postal)

In [213]:
district

'20'

In [214]:
# Get all details 
#district = get_postal_onemap(street, district_postal)
coordinates = get_lat_long(street)
latitude = coordinates[0]
longitude = coordinates[1]
school = num_schools(int(district))
hawkercentre = num_hawker(int(district))
supermarkets = num_supermarkets(int(district))
bus_stops_nearby = num_of_bus_stops(latitude, longitude)
crime_number = crime_num(int(district))
score = sentiment_score(int(district), year)

In [215]:
# Create temp dataframe to fit into model
# temp_column_names = ['district', 'street', 'propertyType', 'remaining_lease',
#                     'school', 'hawkercentre', 'supermarkets', 'Bus Stops Neaby',
#                     'crime_number', 'latitude', 'longitude', 'floor_area_sqm', 'floor_range',
#                     'sentiment']

temp_column_names = ['district', 'street', 'propertyType', 'remaining_lease', 'school',
       'hawkercentre', 'supermarkets', 'Bus Stops Nearby', 'crime_number',
       'latitude', 'longitude', 'floor_area_sqm', 'floor_range', 'sentiment']

temp = pd.DataFrame(columns = temp_column_names)

In [216]:
temp = temp.append({'district': district, 'street': street, 'propertyType': propertyType, 
             'remaining_lease': remaining_lease,'school': school, 'hawkercentre': hawkercentre, 
             'supermarkets': supermarkets, 'Bus Stops Neaby': bus_stops_nearby,
            'crime_number': crime_number, 'latitude': latitude, 
             'longitude': longitude, 'floor_area_sqm': floor_area_sqm, 'floor_range': floor_range,
            'sentiment': score
}, ignore_index = True)

In [217]:
# Street, Property and Floor_range mapping
import pickle
street_mapping = pickle.load(open('/Users/vickiyew/Documents/BT4222/project/data/street_mapping.pickle', 'rb'))
floor_mapping = pickle.load(open('/Users/vickiyew/Documents/BT4222/project/data/floor_mapping.pickle', 'rb'))
property_mapping = pickle.load(open('/Users/vickiyew/Documents/BT4222/project/data/property_mapping.pickle', 'rb'))

In [218]:
temp['street'] = temp['street'].str.upper()
temp['street'] = temp['street'].map(street_mapping)
temp['propertyType'] = temp['propertyType'].map(property_mapping)
temp['floor_range'] = temp['floor_range'].map(floor_mapping)

In [219]:
temp

Unnamed: 0,district,street,propertyType,remaining_lease,school,hawkercentre,supermarkets,Bus Stops Nearby,crime_number,latitude,longitude,floor_area_sqm,floor_range,sentiment,Bus Stops Neaby
0,20,776,11,999,37,11,5,,122.714286,1.353818,103.837695,334.5,0,0.129204,0.0


In [220]:
temp.drop(temp.columns[7], axis=1)

Unnamed: 0,district,street,propertyType,remaining_lease,school,hawkercentre,supermarkets,crime_number,latitude,longitude,floor_area_sqm,floor_range,sentiment,Bus Stops Neaby
0,20,776,11,999,37,11,5,122.714286,1.353818,103.837695,334.5,0,0.129204,0.0


In [221]:
temp = temp.reindex(columns=temp_column_names)

In [226]:
temp

Unnamed: 0,district,street,propertyType,remaining_lease,school,hawkercentre,supermarkets,Bus Stops Nearby,crime_number,latitude,longitude,floor_area_sqm,floor_range,sentiment
0,20,776,11,999,37,11,5,,122.714286,1.353818,103.837695,334.5,0,0.129204


## Run Model

In [222]:
# load model
loaded_model = pickle.load(open('/Users/vickiyew/Downloads/finalized_model.sav', 'rb'))

In [223]:
loaded_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=0.2, colsample_bytree=1, eval_metric='rmse',
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.1, max_delta_step=0,
             max_depth=10, min_child_weight=1, min_samples_leaf=2, missing=None,
             monotone_constraints='()', n_estimators=300, n_fold=5, n_jobs=1,
             nthread=1, num_parallel_tree=1, objective='reg:linear',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=0, silent=None, subsample=0.9, ...)

## Predict Results

In [224]:
import numpy as np
np.array(temp.loc[0].values.tolist()).astype(np.float)

array([2.00000000e+01, 7.76000000e+02, 1.10000000e+01, 9.99000000e+02,
       3.70000000e+01, 1.10000000e+01, 5.00000000e+00,            nan,
       1.22714286e+02, 1.35381770e+00, 1.03837695e+02, 3.34500000e+02,
       0.00000000e+00, 1.29203939e-01])

In [225]:
# Predicted Price
loaded_model.predict(np.array(temp.loc[0].values.tolist()).astype(np.float).reshape(1, -1))

ValueError: feature_names mismatch: ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31'] ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13']
expected f26, f16, f29, f25, f30, f19, f22, f27, f23, f24, f14, f20, f31, f18, f15, f21, f17, f28 in input data