In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

sns.set(rc={'figure.figsize':(10,7)})

In [3]:
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')

In [4]:
train['year'] = train.month.str[:4].astype('int')
train['price_per_area'] = train['resale_price']/train['floor_area_sqm']
train['lease_duration'] = train['year']-train['lease_commence_date']
train['storey'] = (train.storey_range.str[-2:].astype('int') + train.storey_range.str[:2].astype('int'))/2
train['flat_type'] = train['flat_type'].str.replace('-', ' ') # there are some flat_type encoded as 4-room and 4 room

# flat_type as ordinal variables (either this or the one after)
train.flat_type_ord = train.flat_type.map({
    '1_room':1,
    '2_room':2,
    '3_room':3,
    '4_room':4,
    '5_room':5,
    'executive':6,
    'multi_generation': 7
})

# flat_type as categorical variable (either this or the one before)
train = pd.get_dummies(train, columns=["flat_type"], drop_first=False)
train.drop(columns = 'flat_type_multi generation') # dropping this since it is the least frequent among all flat types


train.drop(columns = ['elevation', 'eco_category'], inplace=True) # confirm drop
train.drop(columns = ['month', 'storey_range', 'block', 'street_name', 'planning_area', 'lease_commence_date'], inplace=True) # to discuss

  from ipykernel import kernelapp as app


## joining with supplementary data

simple joins for
* comm
* hawker
* mall

count number of x within

https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula

1 deg lat = 110.574km
1 deg lng = 

In [6]:
# comm = pd.read_csv("../data/raw/sg-commerical-centres.csv")
# hawker = pd.read_csv("../data/raw/sg-gov-markets-hawker-centres.csv")
# mall = pd.read_csv("../data/raw/sg-shopping-malls.csv")
# pop = pd.read_csv("../data/raw/sg-population-demographics.csv") # calculate median age
# psch = pd.read_csv("../data/raw/sg-primary-schools.csv")
# ssch = pd.read_csv("../data/raw/sg-secondary-schools.csv")
stations = pd.read_csv("stations-dist-matrix.csv")

In [82]:
import numpy as np

def haversine_np(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km


## Distance metrics

Create distance matrix for all tables since haversine is not cheap to run.

In [60]:
def count_n_nearest(df, row):
    length = df.shape[0]
    lat = [row['latitude'] for i in range(length)]
    lng = [row['longitude'] for i in range(length)]
    return haversine_np(list(df['lng']), list(df['lat']), lng, lat)

In [217]:
overall = []
for index, row in train[['latitude', 'longitude']].iterrows():
    overall.append(count_n_nearest(ssch, row))
ssch_dist_matrix = pd.DataFrame.from_records(overall)
ssch_dist_matrix.to_csv("ssch-dist-matrix.csv")

In [218]:
overall = []
for index, row in train[['latitude', 'longitude']].iterrows():
    overall.append(count_n_nearest(psch, row))
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("psch-dist-matrix.csv")

In [219]:
overall = []
for index, row in train[['latitude', 'longitude']].iterrows():
    overall.append(count_n_nearest(mall, row))
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("mall-dist-matrix.csv")

In [None]:
overall = []
for index, row in train[['latitude', 'longitude']].iterrows():
    overall.append(count_n_nearest(hawker, row))
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("hawker-dist-matrix.csv")

In [None]:
overall = []
for index, row in train[['latitude', 'longitude']].iterrows():
    overall.append(count_n_nearest(comm, row))
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("comm-dist-matrix.csv")

In [223]:
# (ssch_dist_matrix < 1).sum(axis=1)

0         4
1         6
2         1
3         2
4         2
         ..
431727    3
431728    1
431729    3
431730    4
431731    2
Length: 431732, dtype: int64

## Processing population data

Calculate average age

In [61]:
def getage(row):
    if '+' in row:
        return int(row.replace('+', ''))
    else:
        nums = row.split('-')
        return (int(nums[1])+int(nums[0])) / 2.0

In [146]:
# ## pop age
# pop['age'] = pop['age_group'].apply(getage)

In [162]:
# df = pop[['subzone', 'age', 'count']].groupby(['subzone', 'age']).sum()
# df = df.reset_index()

In [175]:
# df['weight'] = df['age'] * df['count']
# df2 = df[['subzone', 'count', 'weight']].groupby(['subzone']).sum().reset_index()
# df2['avg'] = df2['weight']/df2['count']

In [177]:
# df2.to_csv("subzone-ave-age.csv")

## MRT distance metrix

Difference is that mrts that existed only after the sales are masked.

In [62]:
def count_n_nearest_mrt(df, row):
    length = df.shape[0]
    lat = [row['latitude'] for i in range(length)]
    lng = [row['longitude'] for i in range(length)]
    can_be_used = np.array([row['year'] for i in range(length)]) > df['opening_year']
    can_be_used = np.where(can_be_used == 0, 9999, can_be_used)
    return haversine_np(list(df['lng']), list(df['lat']), lng,lat) * can_be_used

In [83]:
overall = []
for index, row in train[['year', 'latitude', 'longitude']].iterrows():
    overall.append(count_n_nearest_mrt(stations, row))

In [84]:
overall

[array([8.23823984e+00, 2.52924330e+01, 1.49650217e+01, 2.27077927e+01,
        1.80662389e+01, 2.36860681e+01, 2.92435373e+05, 1.92933536e+01,
        9.29015543e+00, 2.92806765e+00, 2.31084110e+00, 1.92411184e+01,
        1.52944925e+01, 1.13680756e+00, 2.23093613e+01, 1.02484567e+01,
        1.21039798e+01, 3.79997995e+04, 2.38826890e+01, 2.43698874e+01,
        1.88706579e+01, 3.58889477e+05, 3.35725227e+05, 2.04729738e+01,
        3.48045722e+05, 1.16164112e+01, 3.57089255e+05, 1.99301930e+01,
        2.10007516e+05, 1.77866663e+01, 7.36521869e+00, 2.33388703e+01,
        2.82686006e+01, 2.30027455e+01, 2.16905183e+01, 5.93406140e+00,
        3.14752426e+05, 2.65551375e+01, 4.84574074e+00, 1.27159477e+01,
        6.30456812e+04, 7.49377474e+04, 8.31911369e+04, 9.19393348e+04,
        7.82276265e+04, 2.17593972e+05, 2.09420497e+05, 7.43349336e+04,
        8.16056768e+04, 8.04614733e+04, 7.91972774e+04, 5.79027085e+04,
        5.82208752e+04, 9.74809986e+04, 2.33117879e+05, 6.867713

In [85]:
stations_dist_matrix = pd.DataFrame.from_records(overall)

In [86]:
stations_dist_matrix.to_csv("stations-dist-matrix.csv")

In [87]:
stations_dist_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,156,157,158,159,160,161,162,163,164,165
0,8.238240,25.292433,14.965022,22.707793,18.066239,23.686068,292435.372580,19.293354,9.290155,2.928068,...,30458.574830,35424.813111,12.829428,74454.687763,15.195471,191368.897611,198231.775893,155958.160093,165265.397952,211403.796441
1,8.814102,20.371297,8.471771,16.349576,15.756714,17.729461,24.272304,16.153991,9.167521,8.062610,...,87817.731110,10.158189,9.328955,2.363647,14.178672,15.663789,17.883947,14.516850,98858.653134,145441.182814
2,8.291978,16.500102,5.566827,13.061111,12.673020,14.080235,20.402635,12.708784,8.116411,10.162817,...,11.029939,13.184694,6.108596,2.229299,11.845070,12.110316,14.880418,12.121867,7.814890,118397.958937
3,15.170839,4.624461,13.227826,11.865984,6.429123,9.041308,79937.353254,3.938219,13.989137,20.919087,...,217894.278929,249980.860797,9.227352,157603.196890,10.161521,33583.886189,71418.029823,99171.989329,155177.452198,145759.283281
4,18.117614,1.290161,12.644089,8.829755,10.500596,5.513174,5.171669,8.001287,16.998531,23.415529,...,243169.675104,27.315510,11.600738,17.155177,14.026016,7.187212,11.380985,138302.200891,143144.111661,121105.749782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,17.327730,11.770118,4.690429,3.415628,15.799224,6.158921,147185.983104,14.151594,16.676282,20.282192,...,211516.108818,23.187988,11.597435,123182.748802,17.445070,130980.015748,177592.658581,175054.971403,43876.993255,22074.383640
431728,8.377979,16.783642,5.656982,13.228443,12.955551,14.303505,20.682268,13.007048,8.243312,10.054884,...,109128.485395,13.008608,6.390588,2.068516,12.081293,12.411494,15.160158,12.362631,78204.516376,119319.716390
431729,4.804022,22.489168,13.623527,20.964988,14.700001,21.511486,26.426771,16.042397,5.856077,2.157101,...,30653.153114,6.005842,9.837300,5.961573,11.759498,159599.178496,164116.882339,121606.044727,156667.277848,198977.149949
431730,7.801887,19.411816,8.119934,15.882672,14.536634,17.053173,23.333417,14.966956,8.070285,7.822246,...,86155.290098,10.389688,8.130678,1.152829,12.969888,14.497921,16.660422,133036.394473,98566.668397,143282.764924


## Joining

In [108]:
comm = pd.read_csv("../data/processed/comm-dist-matrix.csv")
hawker = pd.read_csv("../data/processed/hawker-dist-matrix.csv")
mall = pd.read_csv("../data/processed/mall-dist-matrix.csv")
psch = pd.read_csv("../data/processed/psch-dist-matrix.csv")
ssch = pd.read_csv("../data/processed/ssch-dist-matrix.csv")


stations = pd.read_csv("../data/processed/stations-dist-matrix.csv")
pop = pd.read_csv("../data/processed/subzone-ave-age.csv") # calculate median age

In [109]:
stations

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,156,157,158,159,160,161,162,163,164,165
0,0,8.238240,25.292433,14.965022,22.707793,18.066239,23.686068,292435.372580,19.293354,9.290155,...,30458.574830,35424.813111,12.829428,74454.687763,15.195471,191368.897611,198231.775893,155958.160093,165265.397952,211403.796441
1,1,8.814102,20.371297,8.471771,16.349576,15.756714,17.729461,24.272304,16.153991,9.167521,...,87817.731110,10.158189,9.328955,2.363647,14.178672,15.663789,17.883947,14.516850,98858.653134,145441.182814
2,2,8.291978,16.500102,5.566827,13.061111,12.673020,14.080235,20.402635,12.708784,8.116411,...,11.029939,13.184694,6.108596,2.229299,11.845070,12.110316,14.880418,12.121867,7.814890,118397.958937
3,3,15.170839,4.624461,13.227826,11.865984,6.429123,9.041308,79937.353254,3.938219,13.989137,...,217894.278929,249980.860797,9.227352,157603.196890,10.161521,33583.886189,71418.029823,99171.989329,155177.452198,145759.283281
4,4,18.117614,1.290161,12.644089,8.829755,10.500596,5.513174,5.171669,8.001287,16.998531,...,243169.675104,27.315510,11.600738,17.155177,14.026016,7.187212,11.380985,138302.200891,143144.111661,121105.749782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,431727,17.327730,11.770118,4.690429,3.415628,15.799224,6.158921,147185.983104,14.151594,16.676282,...,211516.108818,23.187988,11.597435,123182.748802,17.445070,130980.015748,177592.658581,175054.971403,43876.993255,22074.383640
431728,431728,8.377979,16.783642,5.656982,13.228443,12.955551,14.303505,20.682268,13.007048,8.243312,...,109128.485395,13.008608,6.390588,2.068516,12.081293,12.411494,15.160158,12.362631,78204.516376,119319.716390
431729,431729,4.804022,22.489168,13.623527,20.964988,14.700001,21.511486,26.426771,16.042397,5.856077,...,30653.153114,6.005842,9.837300,5.961573,11.759498,159599.178496,164116.882339,121606.044727,156667.277848,198977.149949
431730,431730,7.801887,19.411816,8.119934,15.882672,14.536634,17.053173,23.333417,14.966956,8.070285,...,86155.290098,10.389688,8.130678,1.152829,12.969888,14.497921,16.660422,133036.394473,98566.668397,143282.764924


In [110]:
train['nbr_ssch_1k'] = (ssch < 1).sum(axis=1)
train['nbr_psch_1k'] = (psch < 1).sum(axis=1)
train['nbr_mall_1k'] = (mall < 1).sum(axis=1)
train['nbr_hawker_1k'] = (hawker < 1).sum(axis=1)
train['nbr_comm_1k'] = (comm < 1).sum(axis=1)
train['nbr_station_1k'] = (stations < 1).sum(axis=1) # need extra count for dups

In [111]:
train=train.join(pop[['subzone', 'avg']], lsuffix='subzone', rsuffix='subzone')

In [112]:
# drop redundant ones
train.drop(columns=['subzonesubzone', 'latitude', 'longitude', 'region'], inplace=True)

In [113]:

train = pd.get_dummies(train, columns=["town"], drop_first=False)
train = pd.get_dummies(train, columns=["flat_model"], drop_first=False)

In [8]:
train['nbr_station_1k'] = (stations < 1).sum(axis=1) # need extra count for dups

In [None]:
sns.lineplot(x=train['nbr_station_1k'], y=train['resale_price'])

In [92]:
train[train['nbr_station_1k']>8]

Unnamed: 0,town,floor_area_sqm,flat_model,latitude,longitude,subzone,region,resale_price,year,price_per_area,lease_duration,storey,flat_type_1 room,flat_type_2 room,flat_type_3 room,flat_type_4 room,flat_type_5 room,flat_type_executive,flat_type_multi generation,nbr_station_1k
5353,central area,60.0,improved,1.299009,103.852285,victoria,central region,405000.0,2018,6750.0,32,17.0,0,0,1,0,0,0,0,9
10446,central area,68.0,improved,1.300933,103.853991,victoria,central region,405000.0,2020,5955.882353,41,11.0,0,0,1,0,0,0,0,9
12279,central area,82.0,improved,1.298539,103.851856,victoria,central region,481500.0,2018,5871.95122,40,8.0,0,0,0,1,0,0,0,9
14973,central area,60.0,improved,1.300933,103.853991,victoria,central region,364500.0,2020,6075.0,41,11.0,0,0,1,0,0,0,0,9
21857,central area,60.0,improved,1.300933,103.853991,victoria,central region,379800.0,2018,6330.0,39,20.0,0,0,1,0,0,0,0,9
30146,central area,92.0,new generation,1.300357,103.853466,victoria,central region,649800.0,2018,7063.043478,34,8.0,0,0,0,1,0,0,0,9
34320,central area,60.0,improved,1.300933,103.853991,victoria,central region,373500.0,2018,6225.0,39,5.0,0,0,1,0,0,0,0,9
58240,central area,82.0,improved,1.296853,103.853348,bugis,central region,587700.0,2018,7167.073171,38,11.0,0,0,0,1,0,0,0,9
66869,central area,68.0,improved,1.299009,103.852285,victoria,central region,414000.0,2018,6088.235294,32,11.0,0,0,1,0,0,0,0,9
86785,central area,68.0,improved,1.300933,103.853991,victoria,central region,450000.0,2018,6617.647059,39,17.0,0,0,1,0,0,0,0,9


In [96]:
for i, d in enumerate(list(stations_dist_matrix.iloc[5353])):
    if d < 1: 
        print(i)

83
85
90
93
102
111
127
145
146


In [81]:
haversine_np(1.35197, 103.755, 1.319778,103.903252)

16.854069474473505