In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this also hides pandas/sklearn deprecation warnings — confirm
# that is intended before relying on this notebook after library upgrades.
warnings.filterwarnings(action="ignore")

In [None]:
# Read the cleaned training and test tables from disk.
train_path = '../data/train_cleaned.csv'
test_path = '../data/test_cleaned.csv'

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:
# Check that planning_area -> subzone is one-to-many: count the distinct
# planning areas seen for each subzone, then list which counts occur.
# A lone 1 in the output means every subzone sits in exactly one planning area.
planning_areas_per_subzone = df_train.groupby('subzone')['planning_area'].nunique()
planning_areas_per_subzone.unique()

In [None]:
# Columns removed from both splits before any encoding takes place.
unused_cols = ['title', 'property_name', 'address', 'available_unit_types', 'lat', 'lng']
df_train = df_train.drop(columns=unused_cols)
df_test = df_test.drop(columns=unused_cols)

# Price per square foot of every training listing.
df_train["price_per_sqft"] = df_train["price"].div(df_train["size_sqft"])

# Mean training price per sqft of each subzone, as a two-column lookup table
# (subzone, price_avg_subzone), attached to every training row via its subzone.
subzone_means = df_train.groupby('subzone')['price_per_sqft'].mean()
price_avg = subzone_means.reset_index(name='price_avg_subzone')
df_train = pd.merge(df_train, price_avg, on='subzone')

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode ['property_type', 'furnishing', 'planning_area', 'tenure'].
# The encoder is fitted on the training split only and then reused for the
# test split so that both frames end up with exactly the same dummy columns.
cols = ['property_type', 'furnishing', 'planning_area', 'tenure']

# handle_unknown='ignore': a category that occurs only in the test split is
# encoded as all zeros for that feature instead of making transform() raise.
encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(df_train[cols])

# Dummy column names are "<source column>_<category>", in the encoder's output
# order. f-string formatting also copes with non-string categories (e.g. NaN),
# where plain string concatenation would raise a TypeError.
colnames = [
    f"{col}_{category}"
    for col, categories in zip(cols, encoder.categories_)
    for category in categories
]


def _one_hot_frame(frame, fitted_encoder, source_cols, dummy_names):
    """Return integer one-hot columns for ``frame[source_cols]``.

    The result carries ``frame``'s index so it can be concatenated column-wise
    with ``frame`` without misaligning rows.
    """
    dense = fitted_encoder.transform(frame[source_cols]).toarray()
    return pd.DataFrame(dense, columns=dummy_names, index=frame.index).astype(int)


# Train data: append the dummies and drop the original categorical columns.
X = _one_hot_frame(df_train, encoder, cols, colnames)
df_train = pd.concat([df_train, X], axis=1).drop(cols, axis=1)

# Test data: same treatment with the encoder fitted on the training split.
X = _one_hot_frame(df_test, encoder, cols, colnames)
df_test = pd.concat([df_test, X], axis=1).drop(cols, axis=1)


In [None]:
# Derive each listing's age in years from built_year, relative to 2022.
reference_year = 2022
for frame in (df_train, df_test):
    frame['age'] = reference_year - frame['built_year']

In [None]:
# Replace the 1s of the planning_area one-hot columns with the training-set
# average price/sqft of the subzone the row belongs to (0s stay 0).
cols = df_train.columns
cols = cols[cols.str.startswith('planning_area')]

# Fallback for test rows whose subzone has no training average (subzone unseen
# in training, or missing): the overall training mean price per sqft.
fallback_price_avg = df_train['price_per_sqft'].mean()

# Row-wise scaling: each row's dummies are multiplied by that row's subzone average.
df_train[cols] = df_train[cols].mul(df_train['price_avg_subzone'], axis=0)
df_train = df_train.drop(['subzone', 'price_avg_subzone', 'size_sqft', 'built_year'], axis=1)

# Left join: an inner join would silently drop every test row whose subzone
# does not occur in the training data, shrinking the test set.
df_test = df_test.merge(price_avg, on='subzone', how='left')
df_test['price_avg_subzone'] = df_test['price_avg_subzone'].fillna(fallback_price_avg)
df_test[cols] = df_test[cols].mul(df_test['price_avg_subzone'], axis=0)
# size_sqft is dropped from the training frame above but kept in the test frame.
df_test = df_test.drop(['subzone', 'price_avg_subzone', 'built_year'], axis=1)


In [None]:
# Preview the first five rows of the encoded test frame.
df_test.head(n=5)


In [None]:
# Persist both encoded splits as CSV. to_csv is called with its defaults, so
# the DataFrame index is written as the first (unnamed) column of each file.
encoded_outputs = (
    (df_train, '../data/train_encoded_1.csv'),
    (df_test, '../data/test_encoded_1.csv'),
)
for frame, out_path in encoded_outputs:
    frame.to_csv(out_path)