
# CS5228 Project - Team not found

# Data Preprocessing v2


In [1]:
# import packages
import numpy as np
import pandas as pd

In [2]:
# Column groups of the cleaned listing table. The prefixes presumably stand for
# numeric / binary / categorical / response -- confirm against the cleaning notebook.
num_cols = [
    "bedrooms", "bathrooms",
    "lat", "lng",
    "since_built_year", "no_of_units",
    "area_size", "since_listing_month",
]
bin_cols = ["name", "region", "additional_rooms", "freehold"]
cat_cols = ["model", "district", "planning_area", "subszone"]
res_col = ["price"]

# Guard: the three feature groups together must list exactly 16 columns.
feature_count = len(num_cols) + len(bin_cols) + len(cat_cols)
assert feature_count == 16


# Load the train / test tables written to ./data_clean_output.
train_csv_path = './data_clean_output/train_numeric_cat.csv'
test_csv_path = './data_clean_output/test_numeric_cat.csv'
train_df, test_df = pd.read_csv(train_csv_path), pd.read_csv(test_csv_path)

### label encode model  




### model: label encoding

In [3]:
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder

In [4]:
print(train_df.model.unique())

['condominium' 'apartment' 'executive condominium' 'landed']


In [5]:

# Ordinal encoding of `model`: the position of a label in this list is its integer code.
encode_order = ["executive condominium", "landed", "apartment", "condominium"]

for frame in (train_df, test_df):
    for code, label in enumerate(encode_order):
        frame.loc[frame['model'] == label, 'model'] = code
    # Any label missing from encode_order is still a string here, so the cast fails loudly.
    frame['model'] = frame['model'].astype('int')

In [6]:
# Both splits must contain exactly four distinct model codes after encoding.
for frame in (train_df, test_df):
    assert len(frame['model'].unique()) == 4

### district, planning_area, subszone  one hot encoding 

In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
def my_onehot_encoder(train_XY, test_X, onehot_col_names, scaler=None):
    """
    One-hot encode the given columns of a train/test pair.

    Every column of ``test_X`` that is not listed in ``onehot_col_names`` is
    carried over (optionally scaled). Each listed column is replaced by
    indicator columns named ``<column>_<i>``, where ``i`` is the position of
    the category in the encoder fitted on the training data. Categories that
    appear only in ``test_X`` are encoded as all zeros
    (``handle_unknown='ignore'``).

    Parameters
    ----------
    train_XY : pd.DataFrame
        Training data; must contain every column of ``test_X`` and have the
        response as its LAST column.
    test_X : pd.DataFrame
        Test features.
    onehot_col_names : list of str
        Columns to one-hot encode.
    scaler : object, optional
        Object exposing ``fit_transform`` / ``transform`` (e.g. a MinMaxScaler
        or StandardScaler instance). It is fitted on the training pass-through
        columns only and then applied to the test pass-through columns.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Encoded train frame (response appended as the last column) and
        encoded test frame. Both keep the row index of their input frame.
    """

    encoder = OneHotEncoder(handle_unknown='ignore')

    numeric_cols = [c for c in test_X.columns if c not in onehot_col_names]
    onehot_train_df, onehot_test_df = train_XY[numeric_cols].copy(), test_X[numeric_cols].copy()

    # keep train Y
    response_name = train_XY.columns[-1]
    train_Y = train_XY[[response_name]]

    # apply minmax or standard scaler
    if scaler:
        onehot_train_df[onehot_train_df.columns] = scaler.fit_transform(onehot_train_df)
        onehot_test_df[onehot_test_df.columns] = scaler.transform(onehot_test_df)

    # one-hot encoding
    for c_name in onehot_col_names:
        encoder.fit(train_XY[[c_name]])

        # Take the column count from the fitted encoder so the names always
        # match the width of the transformed matrix.
        n_categories = len(encoder.categories_[0])
        c_onehot_names = [f'{c_name}_{i}' for i in range(n_categories)]

        # Reuse the input index: pd.concat(axis=1) aligns on index labels, so a
        # fresh RangeIndex would misalign rows (and inject NaNs) whenever the
        # caller's frame is not indexed 0..n-1, e.g. after dropna().
        # train
        train_output = pd.DataFrame(
            encoder.transform(train_XY[[c_name]]).toarray(),
            columns=c_onehot_names,
            index=train_XY.index,
        )
        onehot_train_df = pd.concat([onehot_train_df, train_output], axis=1)
        # test
        test_output = pd.DataFrame(
            encoder.transform(test_X[[c_name]]).toarray(),
            columns=c_onehot_names,
            index=test_X.index,
        )
        onehot_test_df = pd.concat([onehot_test_df, test_output], axis=1)

    # append train Y
    onehot_train_df = pd.concat([onehot_train_df, train_Y], axis=1)

    return onehot_train_df, onehot_test_df

In [9]:
# One-hot encode the three location columns; all other columns pass through unscaled.
location_cols = ['district', 'planning_area', 'subszone']
onehot_train_df, onehot_test_df = my_onehot_encoder(train_df, test_df, location_cols)

In [10]:
# display(onehot_train_df)
# display(onehot_test_df)

In [12]:

# Export both one-hot frames without their first column.
# NOTE(review): column 0 is presumably the 'Unnamed: 0' row-id column carried in from the input CSV -- confirm.
onehot_exports = (
    ('./data_clean_output/train_numeric_bin_onehot.csv', onehot_train_df),
    ('./data_clean_output/test_numeric_bin_onehot.csv', onehot_test_df),
)
for out_path, frame in onehot_exports:
    frame.iloc[:, 1:].to_csv(out_path, index=False)

### district, planning_area, subszone target encoding 

In [17]:
import category_encoders as ce

# Work on copies so the target encoding below does not modify train_df / test_df.
temp_train_df = train_df.copy()
temp_test_df = test_df.copy()

In [18]:
# temp_train_df

In [19]:
# Target-encode each location column with its own encoder: fit on the training
# column against price, then apply the fitted mapping to the test column.
# NOTE(review): each encoder is fitted on the full training set, so the encoded
# training values are derived from the rows' own prices -- keep this in mind
# when cross-validating on these files.
for col in ('district', 'planning_area', 'subszone'):
    if col == 'district':
        # Only district is cast to object before fitting (presumably because it is
        # stored as a number and the encoder is built without an explicit column
        # list -- confirm).
        temp_train_df[[col]] = temp_train_df[[col]].astype('object')

    encoder = ce.TargetEncoder()
    temp_train_df[[col]] = encoder.fit_transform(temp_train_df[[col]], temp_train_df['price'])
    temp_test_df[[col]] = encoder.transform(temp_test_df[[col]])




In [21]:
# Export the target-encoded frames without their first column.
# NOTE(review): column 0 is presumably the 'Unnamed: 0' row-id column -- confirm.
target_exports = (
    ('./data_clean_output/train_numeric_bin_target.csv', temp_train_df),
    ('./data_clean_output/test_numeric_bin_target.csv', temp_test_df),
)
for out_path, frame in target_exports:
    frame.iloc[:, 1:].to_csv(out_path, index=False)

# Independent copies (first column still included) for the auxiliary-data join further down.
train_tar_df, test_tar_df = temp_train_df.copy(), temp_test_df.copy()

## Make use of auxiliary-data

### combine all auxiliary data together

In [22]:
# Auxiliary distance features (per the file names); the first CSV column becomes the row index.
train_distance = pd.read_csv(
    './data_clean_output/000-auxiliary-mrt-distance-weighted.csv',
    index_col=0,
)
test_distance = pd.read_csv(
    './data_clean_output/000-auxiliary-mrt-distance-weighted-test.csv',
    index_col=0,
)

In [23]:
# Auxiliary count features (per the file names); the first CSV column becomes the row index.
train_num = pd.read_csv(
    './data_clean_output/000-auxiliary-mrt-num-weighted.csv',
    index_col=0,
)
test_num = pd.read_csv(
    './data_clean_output/000-auxiliary-mrt-num-weighted-test.csv',
    index_col=0,
)

In [24]:
# Place the distance and count features side by side; rows are matched on the index.
train_aux_parts = [train_distance, train_num]
test_aux_parts = [test_distance, test_num]
train_aux = pd.concat(train_aux_parts, axis=1)
test_aux = pd.concat(test_aux_parts, axis=1)

In [25]:
# Sanity check: (rows, columns) of the combined auxiliary training features.
train_aux.shape

(26048, 16)

### combine one-hot data and auxiliary data together

In [26]:
# Join auxiliary and one-hot features column-wise; pd.concat matches rows on the index.
# Only the training frame drops rows that contain NaN after the join.
# NOTE(review): this assumes the auxiliary index and the row position of train_df /
# test_df identify the same listing -- confirm.
train_aux_oh = (
    pd.concat([train_aux, onehot_train_df], axis=1)
    .dropna()
    .drop(columns=['Unnamed: 0'])
)
test_aux_oh = (
    pd.concat([test_aux, onehot_test_df], axis=1)
    .drop(columns=['Unnamed: 0'])
)

train_aux_oh.to_csv('./data_clean_output/train_aux_oh.csv', index=False)
test_aux_oh.to_csv('./data_clean_output/test_aux_oh.csv', index=False)

In [27]:
# Show the joined auxiliary + one-hot frames, train first.
for frame in (train_aux_oh, test_aux_oh):
    display(frame)

Unnamed: 0,closeness_closest_mrt,betweenness_closest_mrt,closest_comercial,closest_hawker,closest_primary,closest_secondary,closest_mall,closest_mrt,closeness_num_mrt_1.5km,betweenness_num_mrt_1.5km,...,subszone_184,subszone_185,subszone_186,subszone_187,subszone_188,subszone_189,subszone_190,subszone_191,subszone_192,price
0,0.026394,0.008419,5.577145,4.061902,4.042344,4.409490,3.206053,3.397070,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5390000.0
1,0.299715,0.089016,0.408020,0.518271,0.234300,1.265343,0.339099,0.370460,0.565896,0.505815,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2310000.0
2,0.208124,0.061813,0.496095,0.647547,0.294125,1.075357,0.200503,0.533492,0.456412,0.474675,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5279500.0
3,0.130204,0.129623,1.845415,1.018270,0.483838,0.495765,1.680770,0.686674,0.447603,0.382574,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1843600.0
4,0.064783,0.039485,2.221437,0.618157,0.374952,0.625021,0.821359,1.323576,0.178316,0.115548,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1131900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22398,0.339242,0.015664,2.416836,1.209081,1.246248,1.650216,0.198658,0.310185,0.325306,0.156632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2090000.0
22399,0.086217,0.042654,3.004581,0.508551,0.246769,0.764453,1.093861,1.082635,0.280763,0.165943,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1650000.0
22400,0.113931,0.031138,1.504972,1.009305,0.257872,0.364757,0.466058,0.616290,0.263577,0.087457,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2090000.0
22401,0.050619,0.080994,1.278922,0.566675,0.848240,0.255925,0.522581,1.781516,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6449300.0


Unnamed: 0,closeness_closest_mrt,betweenness_closest_mrt,closest_comercial,closest_hawker,closest_primary,closest_secondary,closest_mall,closest_mrt,closeness_num_mrt_1.5km,betweenness_num_mrt_1.5km,...,subszone_183,subszone_184,subszone_185,subszone_186,subszone_187,subszone_188,subszone_189,subszone_190,subszone_191,subszone_192
0,0.170265,0.265983,1.659074,0.409342,0.977205,1.046077,1.129543,0.653039,0.420823,0.549154,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.104577,0.021741,2.404586,0.444060,1.462463,0.249407,0.969666,0.883631,0.277822,0.061204,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.150029,0.091443,1.714778,0.587086,0.890698,1.339162,1.467670,0.571525,0.178316,0.115548,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.051879,0.030169,1.851910,2.328437,0.707464,0.529325,2.311544,1.250516,0.064876,0.037727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.235992,0.064245,1.174624,0.326901,1.076387,1.678925,0.394807,0.409401,0.871747,0.374799,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0.205507,0.078125,1.234028,0.201581,0.753366,0.590887,0.430780,0.464698,1.113541,0.571727,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7496,0.181796,0.041782,1.769759,0.778493,1.349495,1.699094,0.921446,0.517440,0.590142,0.248702,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7497,0.192986,0.099167,4.532308,1.691462,0.804616,0.721210,0.636956,0.376113,0.736355,0.619808,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7498,0.383203,0.002731,1.481399,2.773995,0.631159,0.937855,0.769121,0.179380,0.284551,0.026294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### combine target encode data and auxiliary data together

In [28]:
# Join auxiliary and target-encoded features column-wise; pd.concat matches rows on the index.
# Only the training frame drops rows that contain NaN after the join.
# NOTE(review): this assumes the auxiliary index and the row position of the
# target-encoded frames identify the same listing -- confirm.
train_aux_tar = (
    pd.concat([train_aux, train_tar_df], axis=1)
    .dropna()
    .drop(columns=['Unnamed: 0'])
)
test_aux_tar = (
    pd.concat([test_aux, test_tar_df], axis=1)
    .drop(columns=['Unnamed: 0'])
)

train_aux_tar.to_csv('./data_clean_output/train_aux_tar.csv', index=False)
test_aux_tar.to_csv('./data_clean_output/test_aux_tar.csv', index=False)

In [29]:
# Show the joined auxiliary + target-encoded frames, train first.
for frame in (train_aux_tar, test_aux_tar):
    display(frame)

Unnamed: 0,closeness_closest_mrt,betweenness_closest_mrt,closest_comercial,closest_hawker,closest_primary,closest_secondary,closest_mall,closest_mrt,closeness_num_mrt_1.5km,betweenness_num_mrt_1.5km,...,subszone,lat,lng,freehold,since_built_year,no_of_units,area_size,since_listing_month,additional_rooms,price
0,0.026394,0.008419,5.577145,4.061902,4.042344,4.409490,3.206053,3.397070,0.000000,0.000000,...,5.809466e+06,1.239337,103.837487,0.0,11.0,151.0,2336.0,11.0,0.0,5390000.0
1,0.299715,0.089016,0.408020,0.518271,0.234300,1.265343,0.339099,0.370460,0.565896,0.505815,...,2.865934e+06,1.319533,103.847030,1.0,23.0,24.0,1259.0,11.0,0.0,2310000.0
2,0.208124,0.061813,0.496095,0.647547,0.294125,1.075357,0.200503,0.533492,0.456412,0.474675,...,2.865934e+06,1.315656,103.844445,1.0,10.0,235.0,1959.0,10.0,0.0,5279500.0
3,0.130204,0.129623,1.845415,1.018270,0.483838,0.495765,1.680770,0.686674,0.447603,0.382574,...,1.323385e+06,1.329367,103.905791,1.0,92.0,237.0,883.0,3.0,0.0,1843600.0
4,0.064783,0.039485,2.221437,0.618157,0.374952,0.625021,0.821359,1.323576,0.178316,0.115548,...,1.535318e+06,1.439199,103.829104,0.0,92.0,448.0,678.0,3.0,0.0,1131900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22398,0.339242,0.015664,2.416836,1.209081,1.246248,1.650216,0.198658,0.310185,0.325306,0.156632,...,2.278433e+06,1.338214,103.872090,0.0,62.0,667.0,958.0,4.0,0.0,2090000.0
22399,0.086217,0.042654,3.004581,0.508551,0.246769,0.764453,1.093861,1.082635,0.280763,0.165943,...,6.046682e+06,1.308991,103.838285,1.0,9.0,158.0,526.0,3.0,0.0,1650000.0
22400,0.113931,0.031138,1.504972,1.009305,0.257872,0.364757,0.466058,0.616290,0.263577,0.087457,...,4.278711e+06,1.293516,103.822235,0.0,6.0,109.0,893.0,7.0,0.0,2090000.0
22401,0.050619,0.080994,1.278922,0.566675,0.848240,0.255925,0.522581,1.781516,0.000000,0.000000,...,5.507400e+06,1.316263,103.825190,1.0,92.0,17.0,1561.0,3.0,0.0,6449300.0


Unnamed: 0,closeness_closest_mrt,betweenness_closest_mrt,closest_comercial,closest_hawker,closest_primary,closest_secondary,closest_mall,closest_mrt,closeness_num_mrt_1.5km,betweenness_num_mrt_1.5km,...,planning_area,subszone,lat,lng,freehold,since_built_year,no_of_units,area_size,since_listing_month,additional_rooms
0,0.170265,0.265983,1.659074,0.409342,0.977205,1.046077,1.129543,0.653039,0.420823,0.549154,...,2.535026e+06,2.723756e+06,1.313566,103.803218,1,62.0,638.0,710.0,4,0
1,0.104577,0.021741,2.404586,0.444060,1.462463,0.249407,0.969666,0.883631,0.277822,0.061204,...,2.184191e+06,2.917566e+06,1.298437,103.884408,1,6.0,130.0,1055.0,6,0
2,0.150029,0.091443,1.714778,0.587086,0.890698,1.339162,1.467670,0.571525,0.178316,0.115548,...,1.597532e+06,1.846157e+06,1.317851,103.908905,1,9.0,157.0,463.0,4,0
3,0.051879,0.030169,1.851910,2.328437,0.707464,0.529325,2.311544,1.250516,0.064876,0.037727,...,1.385825e+06,1.366258e+06,1.358005,103.965725,0,92.0,428.0,645.0,4,0
4,0.235992,0.064245,1.174624,0.326901,1.076387,1.678925,0.394807,0.409401,0.871747,0.374799,...,4.528126e+06,5.409189e+06,1.277474,103.849419,0,5.0,510.0,6200.0,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,0.205507,0.078125,1.234028,0.201581,0.753366,0.590887,0.430780,0.464698,1.113541,0.571727,...,3.734419e+06,3.183496e+06,1.289610,103.841276,0,23.0,492.0,743.0,4,0
7496,0.181796,0.041782,1.769759,0.778493,1.349495,1.699094,0.921446,0.517440,0.590142,0.248702,...,2.184191e+06,2.917566e+06,1.296610,103.867069,0,25.0,737.0,1345.0,5,0
7497,0.192986,0.099167,4.532308,1.691462,0.804616,0.721210,0.636956,0.376113,0.736355,0.619808,...,1.344970e+06,1.284003e+06,1.375255,103.757531,0,92.0,74.0,1033.0,3,1
7498,0.383203,0.002731,1.481399,2.773995,0.631159,0.937855,0.769121,0.179380,0.284551,0.026294,...,1.425021e+06,1.262095e+06,1.398588,103.874632,0,1.0,735.0,581.0,4,1


## normalisation & standardisation 

https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/


- Normalization (min-max scaling) is a good choice when you know that your data does not follow a Gaussian distribution. It is useful for algorithms that make no assumption about the distribution of the data, such as K-Nearest Neighbors and Neural Networks.
- Standardization, on the other hand, can be helpful when the data follows a Gaussian distribution, although this is not a strict requirement. Unlike normalization, standardization has no bounding range, so outliers are not compressed into a fixed interval; note, however, that outliers still influence the mean and standard deviation used for scaling.

In [30]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from typing import AnyStr, Callable

In [31]:
def my_scaler(train_XY:pd.DataFrame, test_X:pd.DataFrame, scaler:Callable):
    """
    Scale the feature columns of a train/test pair with a single fitted scaler.

    The scaler is fitted on the training features only and then applied to
    both frames. Columns of ``train_XY`` that do not appear in ``test_X``
    (i.e. the response) are returned unchanged.

    Parameters
    ----------
    train_XY : pd.DataFrame
        Training data; must contain every column of ``test_X`` plus the response.
    test_X : pd.DataFrame
        Test features. Its columns define which columns are scaled and in
        which order they are passed to the scaler.
    scaler : object
        Unfitted scaler whose ``fit(X)`` returns the fitted object and which
        exposes ``transform(X)`` (e.g. ``MinMaxScaler()``). Despite the
        ``Callable`` annotation, an estimator instance is expected, not a function.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Scaled copies of ``train_XY`` and ``test_X``; the inputs are not modified.

    Raises
    ------
    KeyError
        If a column of ``test_X`` is missing from ``train_XY``.
    """

    col_names = list(test_X.columns)

    norm_train_df, norm_test_df = train_XY.copy(), test_X.copy()

    # Select the training features by name, in test_X's column order. Taking
    # them positionally (everything but the last column) would write scaled
    # values back under the wrong column names if the two frames ever
    # disagreed on column order.
    train_X = train_XY[col_names]

    normalizer = scaler.fit(train_X)
    norm_train_X = normalizer.transform(train_X)
    norm_test_X = normalizer.transform(test_X)

    norm_train_df[col_names] = norm_train_X
    norm_test_df[col_names] = norm_test_X

    return norm_train_df, norm_test_df



### for aux + target encoding

In [32]:
# aux + target encoding: min-max normalise (scaler fitted on the training features)
minmax_pair = my_scaler(train_aux_tar, test_aux_tar, MinMaxScaler())
train_aux_tar_norm, test_aux_tar_norm = minmax_pair

train_aux_tar_norm.to_csv('./data_clean_output/train_aux_tar_norm.csv', index=False)
test_aux_tar_norm.to_csv('./data_clean_output/test_aux_tar_norm.csv', index=False)


# train_aux_tar_norm.describe()

In [36]:
# aux + target encoding: standardise (scaler fitted on the training features)
standard_pair = my_scaler(train_aux_tar, test_aux_tar, StandardScaler())
train_aux_tar_stand, test_aux_tar_stand = standard_pair

train_aux_tar_stand.to_csv('./data_clean_output/train_aux_tar_stand.csv', index=False)
test_aux_tar_stand.to_csv('./data_clean_output/test_aux_tar_stand.csv', index=False)


### for aux + onehot encoding

In [37]:

# Split each frame by position: the first 29 columns are scaled, the remaining
# columns (one-hot indicators, plus `price` at the end of the training frame)
# are left as they are.
# NOTE(review): 29 is presumably the 16 auxiliary columns plus the pass-through
# listing columns -- re-check this number whenever the input schema changes.
train_nu, train_oh = train_aux_oh.iloc[:, :29], train_aux_oh.iloc[:, 29:]
test_nu, test_oh = test_aux_oh.iloc[:, :29], test_aux_oh.iloc[:, 29:]

############# normalize
# Fit the min-max scaler on the training block only, then apply it to both blocks.
scaler = MinMaxScaler()
train_nu_norm = train_nu.copy()
test_nu_norm = test_nu.copy()
train_nu_norm[train_nu_norm.columns] = scaler.fit_transform(train_nu)
test_nu_norm[test_nu_norm.columns] = scaler.transform(test_nu)

# append onehot back
train_aux_oh_norm = pd.concat([train_nu_norm, train_oh], axis=1)
test_aux_oh_norm = pd.concat([test_nu_norm, test_oh], axis=1)

train_aux_oh_norm.to_csv('./data_clean_output/train_aux_oh_norm.csv', index=False)
test_aux_oh_norm.to_csv('./data_clean_output/test_aux_oh_norm.csv', index=False)







Unnamed: 0,closeness_closest_mrt,betweenness_closest_mrt,closest_comercial,closest_hawker,closest_primary,closest_secondary,closest_mall,closest_mrt,closeness_num_mrt_1.5km,betweenness_num_mrt_1.5km,...,bathrooms,region,lat,lng,freehold,since_built_year,no_of_units,area_size,since_listing_month,additional_rooms
0,0.026394,0.008419,5.577145,4.061902,4.042344,4.409490,3.206053,3.397070,0.000000,0.000000,...,4.0,1.0,1.239337,103.837487,0.0,11.0,151.0,2336.0,11.0,0.0
1,0.299715,0.089016,0.408020,0.518271,0.234300,1.265343,0.339099,0.370460,0.565896,0.505815,...,3.0,1.0,1.319533,103.847030,1.0,23.0,24.0,1259.0,11.0,0.0
2,0.208124,0.061813,0.496095,0.647547,0.294125,1.075357,0.200503,0.533492,0.456412,0.474675,...,3.0,1.0,1.315656,103.844445,1.0,10.0,235.0,1959.0,10.0,0.0
3,0.130204,0.129623,1.845415,1.018270,0.483838,0.495765,1.680770,0.686674,0.447603,0.382574,...,2.0,0.0,1.329367,103.905791,1.0,92.0,237.0,883.0,3.0,0.0
4,0.064783,0.039485,2.221437,0.618157,0.374952,0.625021,0.821359,1.323576,0.178316,0.115548,...,1.0,0.0,1.439199,103.829104,0.0,92.0,448.0,678.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22398,0.339242,0.015664,2.416836,1.209081,1.246248,1.650216,0.198658,0.310185,0.325306,0.156632,...,3.0,1.0,1.338214,103.872090,0.0,62.0,667.0,958.0,4.0,0.0
22399,0.086217,0.042654,3.004581,0.508551,0.246769,0.764453,1.093861,1.082635,0.280763,0.165943,...,1.0,1.0,1.308991,103.838285,1.0,9.0,158.0,526.0,3.0,0.0
22400,0.113931,0.031138,1.504972,1.009305,0.257872,0.364757,0.466058,0.616290,0.263577,0.087457,...,2.0,1.0,1.293516,103.822235,0.0,6.0,109.0,893.0,7.0,0.0
22401,0.050619,0.080994,1.278922,0.566675,0.848240,0.255925,0.522581,1.781516,0.000000,0.000000,...,3.0,1.0,1.316263,103.825190,1.0,92.0,17.0,1561.0,3.0,0.0


In [39]:
############## standardise
# Fit the standard scaler on the training block only, then apply it to both
# blocks; the one-hot block is re-attached unscaled.
scaler = StandardScaler()
train_nu_stand = train_nu.copy()
test_nu_stand = test_nu.copy()
train_nu_stand[train_nu_stand.columns] = scaler.fit_transform(train_nu)
test_nu_stand[test_nu_stand.columns] = scaler.transform(test_nu)

# append onehot back
train_aux_oh_stand = pd.concat([train_nu_stand, train_oh], axis=1)
test_aux_oh_stand = pd.concat([test_nu_stand, test_oh], axis=1)

train_aux_oh_stand.to_csv('./data_clean_output/train_aux_oh_stand.csv', index=False)
test_aux_oh_stand.to_csv('./data_clean_output/test_aux_oh_stand.csv', index=False)



In [40]:
# Show the standardised auxiliary + one-hot frames, train first.
for frame in (train_aux_oh_stand, test_aux_oh_stand):
    display(frame)

Unnamed: 0,closeness_closest_mrt,betweenness_closest_mrt,closest_comercial,closest_hawker,closest_primary,closest_secondary,closest_mall,closest_mrt,closeness_num_mrt_1.5km,betweenness_num_mrt_1.5km,...,subszone_184,subszone_185,subszone_186,subszone_187,subszone_188,subszone_189,subszone_190,subszone_191,subszone_192,price
0,-0.034450,-0.590997,3.635126,4.299375,5.949928,5.128132,4.426602,5.173777,-1.233481,-1.140868,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5390000.0
1,-0.009890,-0.366370,-1.351794,-0.611457,-0.984635,0.432037,-0.693816,-0.549078,0.426612,0.686961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2310000.0
2,-0.018120,-0.442185,-1.266824,-0.432305,-0.875692,0.148276,-0.941351,-0.240810,0.105433,0.574435,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5279500.0
3,-0.025122,-0.253196,0.034935,0.081451,-0.530218,-0.717402,1.702426,0.048835,0.079590,0.241614,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1843600.0
4,-0.031000,-0.504413,0.397702,-0.473033,-0.728503,-0.524346,0.167506,1.253119,-0.710379,-0.723320,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1131900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22398,-0.006338,-0.570805,0.586214,0.345880,0.858154,1.006884,-0.944646,-0.663049,-0.279175,-0.574856,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2090000.0
22399,-0.029074,-0.495583,1.153241,-0.624927,-0.961928,-0.316091,0.654199,0.797536,-0.409845,-0.541209,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1650000.0
22400,-0.026584,-0.527678,-0.293508,0.069026,-0.941709,-0.913076,-0.467065,-0.084251,-0.460262,-0.824830,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2090000.0
22401,-0.032273,-0.388727,-0.511590,-0.544378,0.133368,-1.075628,-0.366114,2.119012,-1.233481,-1.140868,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6449300.0


Unnamed: 0,closeness_closest_mrt,betweenness_closest_mrt,closest_comercial,closest_hawker,closest_primary,closest_secondary,closest_mall,closest_mrt,closeness_num_mrt_1.5km,betweenness_num_mrt_1.5km,...,subszone_183,subszone_184,subszone_185,subszone_186,subszone_187,subszone_188,subszone_189,subszone_190,subszone_191,subszone_192
0,-0.021522,0.126846,-0.144839,-0.762413,0.368218,0.104543,0.717927,-0.014764,0.001031,0.843574,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.027425,-0.553869,0.574396,-0.714300,1.251887,-1.085363,0.432384,0.421250,-0.418472,-0.919698,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.023340,-0.359605,-0.091098,-0.516092,0.210686,0.542293,1.321827,-0.168895,-0.710379,-0.723320,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.032160,-0.530379,0.041201,1.897105,-0.122988,-0.667277,2.828997,1.114972,-1.043163,-1.004538,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.015616,-0.435406,-0.612212,-0.876661,0.548831,1.049764,-0.594320,-0.475447,1.323844,0.213518,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7495,-0.018355,-0.396724,-0.554902,-1.050332,-0.039399,-0.575329,-0.530073,-0.370888,2.033164,0.925146,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7496,-0.020486,-0.498012,-0.038054,-0.250837,1.046169,1.079888,0.346263,-0.271162,0.497737,-0.242150,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7497,-0.019480,-0.338078,2.627118,1.014373,0.053929,-0.380678,-0.161840,-0.538389,0.926663,1.098891,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7498,-0.002388,-0.606849,-0.316250,2.514569,-0.261943,-0.057098,0.074208,-0.910381,-0.398733,-1.045850,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## PCA for feature reduction

In [41]:
from sklearn.decomposition import PCA

In [42]:
def my_pca(train_XY, test_X, pca_func, scaler=None):
    """
    Project the features of a train/test pair with a decomposition fitted on train.

    ``pca_func`` is fitted on every column of ``train_XY`` except the last
    (the response) and then used to transform both the training features and
    ``test_X``. The resulting components are named ``PC_0 .. PC_{d-1}``.

    Parameters
    ----------
    train_XY : pd.DataFrame
        Training data with the response as the LAST column.
    test_X : pd.DataFrame
        Test features, laid out like the training features.
    pca_func : object
        Unfitted transformer exposing ``fit(X)`` and ``transform(X)``
        (e.g. ``PCA(0.95)``).
    scaler : optional
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Training components with the response appended as the last column,
        and test components. Both keep the row index of their input frame.
    """

    # keep train Y
    response_name = train_XY.columns[-1]
    train_Y = train_XY[[response_name]]

    # get train X
    train_X = train_XY.iloc[:, :-1]

    # fit training X
    pca_func.fit(train_X)
    # transform train X and test X
    train_pca_X = pca_func.transform(train_X)
    test_pca_X = pca_func.transform(test_X)

    # to dataframe
    d = train_pca_X.shape[1]
    col_names = [f'PC_{i}' for i in range(d)]

    # Reuse the input indexes: pd.concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex would pair components with the wrong response rows (and
    # inject NaNs) whenever train_XY is not indexed 0..n-1, e.g. after dropna().
    train_pca_X = pd.DataFrame(train_pca_X, columns=col_names, index=train_XY.index)
    test_pca_X = pd.DataFrame(test_pca_X, columns=col_names, index=test_X.index)

    # append train Y
    train_pca_XY = pd.concat([train_pca_X, train_Y], axis=1)

    return train_pca_XY, test_pca_X

In [43]:
# norm + pca: a float passed to PCA is the share of variance to keep (here 0.95)
norm_pca_pair = my_pca(train_aux_tar_norm, test_aux_tar_norm, PCA(0.95))
train_aux_tar_norm_pca, test_aux_tar_norm_pca = norm_pca_pair

train_aux_tar_norm_pca.to_csv('./data_clean_output/train_aux_tar_norm_pca.csv', index=False)
test_aux_tar_norm_pca.to_csv('./data_clean_output/test_aux_tar_norm_pca.csv', index=False)


# stand + pca: same threshold, applied to the standardised frames
stand_pca_pair = my_pca(train_aux_tar_stand, test_aux_tar_stand, PCA(0.95))
train_aux_tar_stand_pca, test_aux_tar_stand_pca = stand_pca_pair
train_aux_tar_stand_pca.to_csv('./data_clean_output/train_aux_tar_stand_pca.csv',index=False)
test_aux_tar_stand_pca.to_csv('./data_clean_output/test_aux_tar_stand_pca.csv',index=False)