In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import geopy.distance
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the training set from the working directory and preview the first rows
# so the raw column names and value formats are visible before preprocessing.
df_train = pd.read_csv("train.csv")
df_train.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent
0,2021-09,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900
3,2021-08,pasir ris,250,Pasir Ris Street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850
4,2022-11,kallang/whampoa,34,Whampoa West,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100


In [3]:
# One encoder per categorical column. Each is fitted by its preprocess_* function
# when called with is_train=True and reused as-is for is_train=False.
# handle_unknown="ignore" makes a category that was not seen during fitting encode
# to an all-zero row on transform instead of raising, so data containing a value
# absent from the training data cannot abort preprocessing.
town_encoder = OneHotEncoder(handle_unknown="ignore")
flat_type_encoder = OneHotEncoder(handle_unknown="ignore")
flat_model_encoder = OneHotEncoder(handle_unknown="ignore")
planning_area_encoder = OneHotEncoder(handle_unknown="ignore")
region_encoder = OneHotEncoder(handle_unknown="ignore")

def preprocess_town(df, is_train):
    """One-hot encode the ``town`` column.

    Args:
        df: DataFrame containing a ``town`` column.
        is_train: when true, ``town_encoder`` is (re)fitted on this data before
            transforming; otherwise the already-fitted encoder is applied.

    Returns:
        Dense 2-D array with one row per input row and one column per category.
    """
    town_column = np.array(df["town"]).reshape(-1, 1)
    encode = town_encoder.fit_transform if is_train else town_encoder.transform
    # The encoder yields a sparse matrix; densify so it can be hstack-ed later.
    return encode(town_column).toarray()

def preprocess_flat_type(df, is_train):
    """One-hot encode the ``flat_type`` column after normalising its spelling.

    The raw data spells the same category both with a space and with a hyphen
    (e.g. ``"3 room"`` and ``"4-room"``). Hyphens are replaced by spaces before
    encoding so that both spellings share one category instead of each getting
    its own one-hot column.

    Args:
        df: DataFrame containing a string ``flat_type`` column.
        is_train: when true, ``flat_type_encoder`` is (re)fitted on this data
            before transforming; otherwise the already-fitted encoder is applied.

    Returns:
        Dense 2-D array with one row per input row and one column per
        normalised category.
    """
    normalized = df["flat_type"].str.replace("-", " ", regex=False)
    data = np.array(normalized).reshape(-1, 1)
    if is_train:
        data = flat_type_encoder.fit_transform(data)
    else:
        data = flat_type_encoder.transform(data)
    # The encoder yields a sparse matrix; densify so it can be hstack-ed later.
    return data.toarray()

def preprocess_flat_model(df, is_train):
    """One-hot encode the ``flat_model`` column.

    Args:
        df: DataFrame containing a ``flat_model`` column.
        is_train: when true, ``flat_model_encoder`` is (re)fitted on this data
            before transforming; otherwise the already-fitted encoder is applied.

    Returns:
        Dense 2-D array with one row per input row and one column per category.
    """
    model_column = np.array(df["flat_model"]).reshape(-1, 1)
    encode = flat_model_encoder.fit_transform if is_train else flat_model_encoder.transform
    # The encoder yields a sparse matrix; densify so it can be hstack-ed later.
    return encode(model_column).toarray()

def preprocess_floor_area_sqm(df, is_train):
    """Scale the ``floor_area_sqm`` column by 1/100 and return it as a column vector.

    Args:
        df: DataFrame containing a numeric ``floor_area_sqm`` column.
        is_train: unused; accepted so all preprocess_* functions share a signature.

    Returns:
        Array of shape ``(len(df), 1)`` holding ``floor_area_sqm / 100``.
    """
    area_values = np.array(df["floor_area_sqm"])
    scaled_area = area_values / 100
    return scaled_area.reshape(-1, 1)

def preprocess_lease_commence_date(df, is_train):
    """Turn ``lease_commence_date`` into a scaled offset from a fixed base year.

    Each value has 1965 subtracted and is then divided by 100.

    Args:
        df: DataFrame containing a numeric ``lease_commence_date`` column.
        is_train: unused; accepted so all preprocess_* functions share a signature.

    Returns:
        Array of shape ``(len(df), 1)`` holding ``(lease_commence_date - 1965) / 100``.
    """
    smallest_date = 1965
    commence_years = np.array(df["lease_commence_date"])
    years_since_base = commence_years - smallest_date
    return years_since_base.reshape(-1, 1) / 100

def preprocess_planning_area(df, is_train):
    """One-hot encode the ``planning_area`` column.

    Args:
        df: DataFrame containing a ``planning_area`` column.
        is_train: when true, ``planning_area_encoder`` is (re)fitted on this data
            before transforming; otherwise the already-fitted encoder is applied.

    Returns:
        Dense 2-D array with one row per input row and one column per category.
    """
    area_column = np.array(df["planning_area"]).reshape(-1, 1)
    encode = (
        planning_area_encoder.fit_transform
        if is_train
        else planning_area_encoder.transform
    )
    # The encoder yields a sparse matrix; densify so it can be hstack-ed later.
    return encode(area_column).toarray()

def preprocess_region(df, is_train):
    """One-hot encode the ``region`` column.

    Args:
        df: DataFrame containing a ``region`` column.
        is_train: when true, ``region_encoder`` is (re)fitted on this data before
            transforming; otherwise the already-fitted encoder is applied.

    Returns:
        Dense 2-D array with one row per input row and one column per category.
    """
    region_column = np.array(df["region"]).reshape(-1, 1)
    encode = region_encoder.fit_transform if is_train else region_encoder.transform
    # The encoder yields a sparse matrix; densify so it can be hstack-ed later.
    return encode(region_column).toarray()

def compute_distance(pt_a, pts):
    """Measure the distance from ``pt_a`` to every point in ``pts``.

    Args:
        pt_a: reference point as a ``(latitude, longitude)`` pair.
        pts: iterable of ``(latitude, longitude)`` points.

    Returns:
        List of distances in kilometres (the ``.km`` value of geopy's distance
        object), in the same order as ``pts``.
    """
    return [geopy.distance.distance(candidate, pt_a).km for candidate in pts]

# Coordinates of existing MRT stations, loaded once at definition time.
df_existing_mrts = pd.read_csv("auxiliary-data/auxiliary-data/sg-mrt-existing-stations.csv")
def get_distance_to_existing_stations(df, k=1):
    """Return the ``k`` smallest distances from each row of ``df`` to an existing MRT station.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns.
        k: how many of the nearest distances to keep per row.

    Returns:
        Array with one row per row of ``df`` and up to ``k`` ascending distances,
        each divided by 1000.
        NOTE(review): compute_distance already yields kilometres, so the extra
        ``/ 1000`` is either deliberate feature scaling or a unit mix-up; confirm.
    """
    station_coords = np.array(df_existing_mrts[["latitude", "longitude"]])

    nearest = []
    for idx in tqdm(range(len(df))):
        record = df.iloc[idx]
        origin = (record["latitude"], record["longitude"])
        ranked = sorted(compute_distance(origin, station_coords))
        nearest.append(ranked[:k])

    return np.array(nearest) / 1000

# Coordinates of planned MRT stations, loaded once at definition time.
df_planned_mrts = pd.read_csv("auxiliary-data/auxiliary-data/sg-mrt-planned-stations.csv")
def get_distance_to_planned_stations(df, k=1):
    """Return the ``k`` smallest distances from each row of ``df`` to a planned MRT station.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns.
        k: how many of the nearest distances to keep per row.

    Returns:
        Array with one row per row of ``df`` and up to ``k`` ascending distances,
        each divided by 1000.
        NOTE(review): compute_distance already yields kilometres, so the extra
        ``/ 1000`` is either deliberate feature scaling or a unit mix-up; confirm.
    """
    station_coords = np.array(df_planned_mrts[["latitude", "longitude"]])

    nearest = []
    for idx in tqdm(range(len(df))):
        record = df.iloc[idx]
        origin = (record["latitude"], record["longitude"])
        ranked = sorted(compute_distance(origin, station_coords))
        nearest.append(ranked[:k])

    return np.array(nearest) / 1000

# Coordinates of primary schools, loaded once at definition time.
df_primary_schools = pd.read_csv("auxiliary-data/auxiliary-data/sg-primary-schools.csv")
def get_distance_to_primary_schools(df, k=1):
    """Return the ``k`` smallest distances from each row of ``df`` to a primary school.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns.
        k: how many of the nearest distances to keep per row.

    Returns:
        Array with one row per row of ``df`` and up to ``k`` ascending distances,
        each divided by 1000.
        NOTE(review): compute_distance already yields kilometres, so the extra
        ``/ 1000`` is either deliberate feature scaling or a unit mix-up; confirm.
    """
    school_coords = np.array(df_primary_schools[["latitude", "longitude"]])

    nearest = []
    for idx in tqdm(range(len(df))):
        record = df.iloc[idx]
        origin = (record["latitude"], record["longitude"])
        ranked = sorted(compute_distance(origin, school_coords))
        nearest.append(ranked[:k])

    return np.array(nearest) / 1000

# Coordinates of shopping malls, loaded once at definition time.
df_shopping_malls = pd.read_csv("auxiliary-data/auxiliary-data/sg-shopping-malls.csv")
def get_distance_to_shopping_malls(df, k=1):
    """Return the ``k`` smallest distances from each row of ``df`` to a shopping mall.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns.
        k: how many of the nearest distances to keep per row.

    Returns:
        Array with one row per row of ``df`` and up to ``k`` ascending distances,
        each divided by 1000.
        NOTE(review): compute_distance already yields kilometres, so the extra
        ``/ 1000`` is either deliberate feature scaling or a unit mix-up; confirm.
    """
    mall_coords = np.array(df_shopping_malls[["latitude", "longitude"]])

    nearest = []
    for idx in tqdm(range(len(df))):
        record = df.iloc[idx]
        origin = (record["latitude"], record["longitude"])
        ranked = sorted(compute_distance(origin, mall_coords))
        nearest.append(ranked[:k])

    return np.array(nearest) / 1000

def preprocess(df, is_train, k=2):
    """Build the feature matrix (and target, for training data) from a raw frame.

    The per-column helpers all take ``is_train`` themselves, so a single code
    path serves both training and inference; the previous ``if``/``else`` held
    two identical copies of the same calls.

    Args:
        df: raw DataFrame with the columns read by the preprocess_* and
            get_distance_to_* helpers.
        is_train: forwarded to the helpers (the categorical ones fit their
            encoder when true) and controls whether a target is returned.
        k: number of nearest distances kept per amenity type.

    Returns:
        ``(X, y)`` where ``X`` is the horizontally stacked feature array and
        ``y`` is the ``monthly_rent`` column as an array when ``is_train`` is
        true, otherwise ``None``.
    """
    # Column order defines the feature layout of X; keep it stable between the
    # training and inference calls.
    features = [
        preprocess_town(df, is_train),
        preprocess_flat_type(df, is_train),
        preprocess_flat_model(df, is_train),
        preprocess_floor_area_sqm(df, is_train),
        preprocess_lease_commence_date(df, is_train),
        preprocess_planning_area(df, is_train),
        preprocess_region(df, is_train),
        get_distance_to_existing_stations(df, k=k),
        get_distance_to_planned_stations(df, k=k),
        get_distance_to_primary_schools(df, k=k),
        get_distance_to_shopping_malls(df, k=k),
    ]
    X = np.hstack(features)
    y = np.array(df["monthly_rent"]) if is_train else None
    return X, y

# Build and persist the training features.
# The frame read from train.csv was previously bound to `df_test`, a name that was
# never used here and mislabelled training data as test data; bind it to
# `df_train`, which is the frame actually preprocessed below.
df_train = pd.read_csv("train.csv")

X, y = preprocess(df_train, is_train=True)
np.save("X_train.npy", X)
np.save("y_train.npy", y)
print(X[0].tolist())
X.shape, y.shape

  3%|▎         | 1591/60000 [00:16<09:55, 98.15it/s]

In [None]:
# Build and persist the training features.
# The frame read from train.csv was previously bound to `df_test`, a name that was
# never used in this cell and mislabelled training data as test data; bind it to
# `df_train`, which is the frame actually preprocessed below.
df_train = pd.read_csv("train.csv")

X, y = preprocess(df_train, is_train=True)
np.save("X_train.npy", X)
np.save("y_train.npy", y)
print(X[0].tolist())
X.shape, y.shape

In [None]:
# Build and persist the test features using the encoders fitted on the training data.
df_test = pd.read_csv("test.csv")

X, y = preprocess(df_test, is_train=False)
np.save("X_test.npy", X)
# preprocess returns y=None when is_train is False; saving that would write a
# pickled None object array, so labels are only written when they exist.
if y is not None:
    np.save("y_test.npy", y)
print(X.shape)