In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

sns.set(rc={'figure.figsize':(10,7)})

In [269]:
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')

In [270]:
train['year'] = train.month.str[:4].astype('int')
train['price_per_area'] = train['resale_price']/train['floor_area_sqm']
train['lease_duration'] = train['year']-train['lease_commence_date']
train['storey'] = (train.storey_range.str[-2:].astype('int') + train.storey_range.str[:2].astype('int'))/2
train['flat_type'] = train['flat_type'].str.replace('-', ' ') # there are some flat_type encoded as 4-room and 4 room

# flat_type as ordinal variables (either this or the one after)
train.flat_type_ord = train.flat_type.map({
    '1_room':1,
    '2_room':2,
    '3_room':3,
    '4_room':4,
    '5_room':5,
    'executive':6,
    'multi_generation': 7
})

# flat_type as categorical variable (either this or the one before)
train = pd.get_dummies(train, columns=["flat_type"], drop_first=False)
train.drop(columns = 'flat_type_multi generation') # dropping this since it is the least frequent among all flat types


train.drop(columns = ['elevation', 'eco_category'], inplace=True) # confirm drop
train.drop(columns = ['month', 'storey_range', 'block', 'street_name', 'planning_area', 'lease_commence_date'], inplace=True) # to discuss

  from ipykernel import kernelapp as app


## joining with supplementary data

simple joins for
* comm
* hawker
* mall

count number of x within

https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula

1 deg lat = 110.574km
1 deg lng = 

In [5]:
# comm = pd.read_csv("../data/raw/sg-commerical-centres.csv")
# hawker = pd.read_csv("../data/raw/sg-gov-markets-hawker-centres.csv")
# mall = pd.read_csv("../data/raw/sg-shopping-malls.csv")
# pop = pd.read_csv("../data/raw/sg-population-demographics.csv") # calculate median age
# psch = pd.read_csv("../data/raw/sg-primary-schools.csv")
# ssch = pd.read_csv("../data/raw/sg-secondary-schools.csv")
# stations = pd.read_csv("../data/raw/sg-train-stations.csv")

In [61]:
import numpy as np

def haversine_np(lat1, lon1, lat2, lon2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)

    All args must be of equal length.    

    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2.0)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2.0)**2

    c = 2 * np.arcsin(np.sqrt(a))
    km = 6367 * c
    return km


## Distance metrics

Create distance matrix for all tables since haversine is not cheap to run.

In [101]:
def count_n_nearest(df, row):
    length = df.shape[0]
    lat = [row['latitude'] for i in range(length)]
    lng = [row['longitude'] for i in range(length)]
    return haversine_np(list(df['lat']), list(df['lng']), lat, lng)

In [217]:
# overall = []
# for index, row in train[['latitude', 'longitude']].iterrows():
#     overall.append(count_n_nearest(ssch, row))

In [218]:
# ssch_dist_matrix = pd.DataFrame.from_records(overall)

In [219]:
# ssch_dist_matrix.to_csv("ssch-dist-matrix.csv")

In [223]:
# (ssch_dist_matrix < 1).sum(axis=1)

0         4
1         6
2         1
3         2
4         2
         ..
431727    3
431728    1
431729    3
431730    4
431731    2
Length: 431732, dtype: int64

## Processing population data

Calculate average age

In [145]:
def getage(row):
    if '+' in row:
        return int(row.replace('+', ''))
    else:
        nums = row.split('-')
        return (int(nums[1])+int(nums[0])) / 2.0

In [146]:
# ## pop age
# pop['age'] = pop['age_group'].apply(getage)

In [162]:
# df = pop[['subzone', 'age', 'count']].groupby(['subzone', 'age']).sum()
# df = df.reset_index()

In [175]:
# df['weight'] = df['age'] * df['count']
# df2 = df[['subzone', 'count', 'weight']].groupby(['subzone']).sum().reset_index()
# df2['avg'] = df2['weight']/df2['count']

In [177]:
# df2.to_csv("subzone-ave-age.csv")

## MRT distance metrix

Difference is that mrts that existed only after the sales are masked.

In [207]:
def count_n_nearest_mrt(df, row):
    length = df.shape[0]
    lat = [row['latitude'] for i in range(length)]
    lng = [row['longitude'] for i in range(length)]
    can_be_used = np.array([row['year'] for i in range(length)]) > df['opening_year']
    return haversine_np(list(df['lat']), list(df['lng']), lat, lng)* can_be_used

In [208]:
# overall = []
# for index, row in train[['year', 'latitude', 'longitude']].iterrows():
#     overall.append(count_n_nearest_mrt(stations, row))

In [210]:
# stations_dist_matrix = pd.DataFrame.from_records(overall)

In [211]:
# stations_dist_matrix.to_csv("stations-dist-matrix.csv")

In [225]:
stations_dist_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,156,157,158,159,160,161,162,163,164,165
0,8.238240,25.292433,14.965022,22.707793,18.066239,23.686068,0.000000,19.293354,9.290155,2.928068,...,0.000000,0.000000,12.829428,0.000000,15.195471,0.000000,0.000000,0.000000,0.00000,0.0
1,8.814102,20.371297,8.471771,16.349576,15.756714,17.729461,24.272304,16.153991,9.167521,8.062610,...,0.000000,10.158189,9.328955,2.363647,14.178672,15.663789,17.883947,14.516850,0.00000,0.0
2,8.291978,16.500102,5.566827,13.061111,12.673020,14.080235,20.402635,12.708784,8.116411,10.162817,...,11.029939,13.184694,6.108596,2.229299,11.845070,12.110316,14.880418,12.121867,7.81489,0.0
3,15.170839,4.624461,13.227826,11.865984,6.429123,9.041308,0.000000,3.938219,13.989137,20.919087,...,0.000000,0.000000,9.227352,0.000000,10.161521,0.000000,0.000000,0.000000,0.00000,0.0
4,18.117614,1.290161,12.644089,8.829755,10.500596,5.513174,5.171669,8.001287,16.998531,23.415529,...,0.000000,27.315510,11.600738,17.155177,14.026016,7.187212,11.380985,0.000000,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,17.327730,11.770118,4.690429,3.415628,15.799224,6.158921,0.000000,14.151594,16.676282,20.282192,...,0.000000,23.187988,11.597435,0.000000,17.445070,0.000000,0.000000,0.000000,0.00000,0.0
431728,8.377979,16.783642,5.656982,13.228443,12.955551,14.303505,20.682268,13.007048,8.243312,10.054884,...,0.000000,13.008608,6.390588,2.068516,12.081293,12.411494,15.160158,12.362631,0.00000,0.0
431729,4.804022,22.489168,13.623527,20.964988,14.700001,21.511486,26.426771,16.042397,5.856077,2.157101,...,0.000000,6.005842,9.837300,5.961573,11.759498,0.000000,0.000000,0.000000,0.00000,0.0
431730,7.801887,19.411816,8.119934,15.882672,14.536634,17.053173,23.333417,14.966956,8.070285,7.822246,...,0.000000,10.389688,8.130678,1.152829,12.969888,14.497921,16.660422,0.000000,0.00000,0.0


## Joining

In [271]:
comm = pd.read_csv("../data/processed/comm-dist-matrix.csv")
hawker = pd.read_csv("../data/processed/hawker-dist-matrix.csv")
mall = pd.read_csv("../data/processed/mall-dist-matrix.csv")
psch = pd.read_csv("../data/processed/psch-dist-matrix.csv")
ssch = pd.read_csv("../data/processed/ssch-dist-matrix.csv")


stations = pd.read_csv("../data/processed/stations-dist-matrix.csv")
pop = pd.read_csv("../data/processed/subzone-ave-age.csv") # calculate median age

In [272]:
train['nbr_ssch_1k'] = (ssch < 1).sum(axis=1)
train['nbr_psch_1k'] = (psch < 1).sum(axis=1)
train['nbr_mall_1k'] = (mall < 1).sum(axis=1)
train['nbr_hawker_1k'] = (hawker < 1).sum(axis=1)
train['nbr_comm_1k'] = (comm < 1).sum(axis=1)
train['nbr_station_1k'] = (stations < 1).sum(axis=1) # need extra count for dups

In [273]:
train=train.join(pop[['subzone', 'avg']], lsuffix='subzone', rsuffix='subzone')

In [274]:
# drop redundant ones
train.drop(columns=['subzonesubzone', 'latitude', 'longitude', 'region'], inplace=True)

In [275]:

train = pd.get_dummies(train, columns=["town"], drop_first=False)
train = pd.get_dummies(train, columns=["flat_model"], drop_first=False)

In [276]:
train

Unnamed: 0,floor_area_sqm,resale_price,year,price_per_area,lease_duration,storey,flat_type_1 room,flat_type_2 room,flat_type_3 room,flat_type_4 room,...,flat_model_multi generation,flat_model_new generation,flat_model_premium apartment,flat_model_premium apartment loft,flat_model_premium maisonette,flat_model_simplified,flat_model_standard,flat_model_terrace,flat_model_type s1,flat_model_type s2
0,118.0,209700.0,2001,1777.118644,12,2.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,110.0,402300.0,2014,3657.272727,11,11.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,112.0,351000.0,2020,3133.928571,16,2.0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,67.0,151200.0,2000,2256.716418,20,8.0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
4,73.0,318600.0,2013,4364.383562,28,8.0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,101.0,238500.0,2005,2361.386139,5,2.0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
431728,95.0,376200.0,2016,3960.000000,4,14.0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
431729,67.0,255600.0,2011,3814.925373,25,2.0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
431730,123.0,508500.0,2013,4134.146341,14,17.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
