In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import geopy.distance
from tqdm import tqdm


In [2]:
df_train = pd.read_csv("train.csv")
df_train.head()

Unnamed: 0,rent_approval_date,town,block,street_name,flat_type,flat_model,floor_area_sqm,furnished,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,monthly_rent
0,2021-09,jurong east,257,Jurong East Street 24,3 room,new generation,67.0,yes,1983,1.344518,103.73863,0.0,yuhua east,jurong east,west region,1600
1,2022-05,bedok,119,bedok north road,4-room,new generation,92.0,yes,1978,1.330186,103.938717,0.0,bedok north,bedok,east region,2250
2,2022-10,toa payoh,157,lorong 1 toa payoh,3-room,improved,67.0,yes,1971,1.332242,103.845643,0.0,toa payoh central,toa payoh,central region,1900
3,2021-08,pasir ris,250,Pasir Ris Street 21,executive,apartment,149.0,yes,1993,1.370239,103.962894,0.0,pasir ris drive,pasir ris,east region,2850
4,2022-11,kallang/whampoa,34,Whampoa West,3-room,improved,68.0,yes,1972,1.320502,103.863341,0.0,bendemeer,kallang,central region,2100


In [3]:
def compute_distance(pt_a, pts):
    res = []
    for pt in pts:
        dis = geopy.distance.distance(pt, pt_a).km
        res.append(dis)
    return res

df_existing_mrt = pd.read_csv("auxiliary-data/auxiliary-data/sg-mrt-existing-stations.csv")
def get_distance_to_existing_station(df, k=1):
    latitude = np.array(df_existing_mrt["latitude"]).reshape(-1, 1)
    longitude = np.array(df_existing_mrt["longitude"]).reshape(-1, 1)
    pts = np.hstack([latitude, longitude])

    res = []
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        pt_a = (row["latitude"], row["longitude"])
        distances = compute_distance(pt_a, pts)
        res.append([np.min(distances),])

    res = np.array(res)    
    return res

In [4]:
town_encoder = OneHotEncoder()
flat_type_encoder = OneHotEncoder()
flat_model_encoder = OneHotEncoder()
planning_area_encoder = OneHotEncoder()
region_encoder = OneHotEncoder()

def preprocess(df, is_train):
    if is_train:
        town = town_encoder.fit_transform(np.array(df["town"]).reshape(-1, 1)).toarray()
        flat_type = flat_type_encoder.fit_transform(np.array(df["flat_type"]).reshape(-1, 1)).toarray()
        flat_model = flat_model_encoder.fit_transform(np.array(df["flat_model"]).reshape(-1, 1)).toarray()
        floor_area_sqm = np.array(df["floor_area_sqm"]).reshape(-1, 1)
        planning_area = planning_area_encoder.fit_transform(np.array(df["planning_area"]).reshape(-1, 1)).toarray()
        region = region_encoder.fit_transform(np.array(df["region"]).reshape(-1, 1)).toarray()
        distance_to_existing_station = get_distance_to_existing_station(df, k=1)
    else:
        town = town_encoder.transform(np.array(df["town"]).reshape(-1, 1)).toarray()
        flat_type = flat_type_encoder.transform(np.array(df["flat_type"]).reshape(-1, 1)).toarray()
        flat_model = flat_model_encoder.transform(np.array(df["flat_model"]).reshape(-1, 1)).toarray()
        floor_area_sqm = np.array(df["floor_area_sqm"]).reshape(-1, 1)
        planning_area = planning_area_encoder.transform(np.array(df["planning_area"]).reshape(-1, 1)).toarray()
        region = region_encoder.transform(np.array(df["region"]).reshape(-1, 1)).toarray()
        distance_to_existing_station = get_distance_to_existing_station(df, k=1)

    features = [
        town, 
        flat_type, 
        flat_model, 
        floor_area_sqm, 
        planning_area, 
        region, 
        distance_to_existing_station
    ]
    X = np.hstack(features)
    if is_train:
        y = np.array(df["monthly_rent"])
        return X, y
    else:
        return X, None

X, y = preprocess(df_train, is_train=True)
X.shape, y.shape


100%|██████████| 60000/60000 [10:11<00:00, 98.17it/s]


((60000, 90), (60000,))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
for i in range(64, 65, 16):
    regressor = RandomForestRegressor(n_estimators = i, random_state = 0)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)
    print("N_Estimators: ", i, "MSE:", mean_absolute_error(y_pred, y_test))

N_Estimators:  64 MSE: 461.22310766599577


In [6]:
regressor = RandomForestRegressor(n_estimators = 64, random_state = 0)
regressor.fit(X, y)

In [7]:
df_test = pd.read_csv("test.csv")

X, y = preprocess(df_test, is_train=False)
print(X.shape)
y_pred = regressor.predict(X)
df = pd.DataFrame({
    "Id": [i for i in range(len(y_pred))],
    "Predicted": y_pred,
})
df.to_csv("out.csv", index=False, sep=",")

100%|██████████| 30000/30000 [05:06<00:00, 98.03it/s]


(30000, 90)
