## Generating distance matrix

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Apply seaborn's defaults and set matplotlib's default figure size to (10, 7).
sns.set(rc={'figure.figsize':(10,7)})

In [None]:
# Load the raw train and test tables. The cells below read their 'latitude' and
# 'longitude' columns (and 'year' for the MRT matrix).
train = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')

## Distance metrics

Precompute a distance matrix for each amenity table and save it to CSV, since haversine is not cheap to run.

In [None]:
import numpy as np

def haversine_np(lat1, lon1, lat2, lon2, radius_km=6367):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Adapted from
    https://stackoverflow.com/questions/27928/calculate-distance-between-two-latitude-longitude-points-haversine-formula

    Each coordinate argument may be a scalar or an array-like. Array-like
    arguments must have equal length (more precisely: shapes NumPy can
    broadcast together), and the computation is done element-wise.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : latitude / longitude of the second point(s), in degrees.
    radius_km : sphere radius in kilometres that the central angle is
        scaled by. Defaults to 6367, the value this function always used.

    Returns
    -------
    The distance(s) in kilometres: a NumPy scalar for scalar inputs,
    otherwise an array with the broadcast shape of the inputs.
    """
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can leave `a` marginally above 1 for
    # (near-)antipodal points, which would turn arcsin(sqrt(a)) into NaN.
    a = np.clip(a, 0.0, 1.0)

    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c


In [None]:
def count_n_nearest(df, row):
    """
    Return the haversine distance from one point to every row of `df`.

    Despite the name, nothing is counted or filtered here: the result is
    the full vector of distances, one entry per row of `df`, in `df` order.

    Parameters
    ----------
    df : table with 'lat' and 'lng' columns (decimal degrees).
    row : mapping / Series with 'latitude' and 'longitude' entries
        (decimal degrees) describing the query point.

    Returns
    -------
    Array of `df.shape[0]` distances, in the unit returned by
    `haversine_np` (kilometres).
    """
    # The query point is passed as two scalars and broadcast by NumPy against
    # the coordinate arrays, instead of being replicated into Python lists of
    # length len(df) on every call. dtype=float keeps an empty table working.
    return haversine_np(
        np.asarray(df['lat'], dtype=float),
        np.asarray(df['lng'], dtype=float),
        row['latitude'],
        row['longitude'],
    )

### Sec Schools Distance Matrix

In [None]:
# Secondary-school reference table; count_n_nearest reads its 'lat' and 'lng' columns.
ssch = pd.read_csv("../data/raw/sg-secondary-schools.csv")

In [None]:
# One distance vector per row of `train`: rows of the matrix follow `train`,
# columns follow `ssch`.
overall = [
    count_n_nearest(ssch, point)
    for _, point in train[['latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("ssch-dist-matrix.csv")

In [None]:
# Same as the train matrix, but for the rows of `test`: rows follow `test`,
# columns follow `ssch`.
overall = [
    count_n_nearest(ssch, point)
    for _, point in test[['latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("test-ssch-dist-matrix.csv")

### Pri Schools Distance Matrix

In [None]:
# Primary-school reference table; count_n_nearest reads its 'lat' and 'lng' columns.
psch = pd.read_csv("../data/raw/sg-primary-schools.csv")

In [None]:
# One distance vector per row of `train`: rows of the matrix follow `train`,
# columns follow `psch`.
overall = [
    count_n_nearest(psch, point)
    for _, point in train[['latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("psch-dist-matrix.csv")

In [None]:
# Same as the train matrix, but for the rows of `test`: rows follow `test`,
# columns follow `psch`.
overall = [
    count_n_nearest(psch, point)
    for _, point in test[['latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("test-psch-dist-matrix.csv")

### Mall Distance Matrix

In [None]:
# Shopping-mall reference table; count_n_nearest reads its 'lat' and 'lng' columns.
mall = pd.read_csv("../data/raw/sg-shopping-malls.csv")

In [None]:
# One distance vector per row of `train`: rows of the matrix follow `train`,
# columns follow `mall`.
overall = [
    count_n_nearest(mall, point)
    for _, point in train[['latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("mall-dist-matrix.csv")

In [None]:
# Same as the train matrix, but for the rows of `test`: rows follow `test`,
# columns follow `mall`.
overall = [
    count_n_nearest(mall, point)
    for _, point in test[['latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("test-mall-dist-matrix.csv")

### Commercial Center Distance Matrix

In [None]:
# Commercial-centre reference table; count_n_nearest reads its 'lat' and 'lng' columns.
# NOTE(review): the file name is spelled "commerical" -- presumably matching the file on disk; verify.
comm = pd.read_csv("../data/raw/sg-commerical-centres.csv")

In [None]:
# One distance vector per row of `train`: rows of the matrix follow `train`,
# columns follow `comm`.
overall = [
    count_n_nearest(comm, point)
    for _, point in train[['latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("comm-dist-matrix.csv")

In [None]:
# Same as the train matrix, but for the rows of `test`: rows follow `test`,
# columns follow `comm`.
overall = [
    count_n_nearest(comm, point)
    for _, point in test[['latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("test-comm-dist-matrix.csv")

### Hawker Center Distance Matrix

In [None]:
# Market / hawker-centre reference table; count_n_nearest reads its 'lat' and 'lng' columns.
hawker = pd.read_csv("../data/raw/sg-gov-markets-hawker-centres.csv")

In [None]:
# One distance vector per row of `train`: rows of the matrix follow `train`,
# columns follow `hawker`.
overall = [
    count_n_nearest(hawker, point)
    for _, point in train[['latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("hawker-dist-matrix.csv")

In [None]:
# Same as the train matrix, but for the rows of `test`: rows follow `test`,
# columns follow `hawker`.
overall = [
    count_n_nearest(hawker, point)
    for _, point in test[['latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("test-hawker-dist-matrix.csv")

## MRT distance matrix

The difference is that MRT stations that opened only after the sale are masked.

In [None]:
# Train-station reference table; count_n_nearest_mrt reads its 'lat', 'lng'
# and 'opening_year' columns.
stations = pd.read_csv("../data/raw/sg-train-stations.csv")

In [None]:
def count_n_nearest_mrt(df, row):
    """
    Return the haversine distance from one point to every station in `df`,
    with stations that were not yet open pushed far away.

    A station counts as usable only when `row['year']` is strictly greater
    than its 'opening_year'. The distance to every other station is
    multiplied by 999999, so it can never look like a nearby station.

    Parameters
    ----------
    df : table with 'lat', 'lng' (decimal degrees) and 'opening_year' columns.
    row : mapping / Series with 'year', 'latitude' and 'longitude' entries.

    Returns
    -------
    Array of `df.shape[0]` values, in `df` order: the distance in the unit
    returned by `haversine_np` (kilometres) for usable stations, and
    999999 times that distance for masked ones.
    """
    # Scalars broadcast against the station arrays, so the query point and
    # year no longer need to be replicated into lists of length len(df).
    distances = haversine_np(
        np.asarray(df['lat'], dtype=float),
        np.asarray(df['lng'], dtype=float),
        row['latitude'],
        row['longitude'],
    )
    already_open = np.asarray(row['year'] > df['opening_year'], dtype=bool)
    # Multiplier of 1 keeps the distance, 999999 masks it.
    # NOTE(review): masking by multiplication leaves a distance of exactly 0
    # unmasked, and a station opening in the same year as `row['year']` is
    # treated as not yet open -- confirm both are intended.
    penalty = np.where(already_open, 1, 999999)
    return distances * penalty

In [None]:
# One masked distance vector per row of `train`: rows of the matrix follow
# `train`, columns follow `stations`.
overall = [
    count_n_nearest_mrt(stations, point)
    for _, point in train[['year', 'latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("stations-dist-matrix.csv")

In [None]:
# Same as the train matrix, but for the rows of `test`: rows follow `test`,
# columns follow `stations`.
overall = [
    count_n_nearest_mrt(stations, point)
    for _, point in test[['year', 'latitude', 'longitude']].iterrows()
]
matrix = pd.DataFrame.from_records(overall)
matrix.to_csv("test-stations-dist-matrix.csv")