# Initialization

In [1]:
import preprocess

# Preprocess both CSV variants up front; the resulting objects expose the
# X_*/Y_* splits and Y_inverse_transform used by the model cells below.
# NOTE(review): the meaning of sequence_length=0 is defined in preprocess.py — confirm there.
print("************* Preprocessing original dataset *************")
original_dataset = preprocess.DatasetPreprocess(
    "FinalData.csv",
    sequence_length=0,
    is_dataset_reshaped=False,
)

print("\n************* Preprocessing reshaped dataset *************")
reshaped_dataset = preprocess.DatasetPreprocess(
    "FinalData_Reshaped.csv",
    sequence_length=0,
    is_dataset_reshaped=True,
)


import sklearn.linear_model
import numpy as np
from sklearn.neural_network import MLPRegressor

def print_metrics(model, Y_inverse_transform, X, Y):
    """Print MSE, RMSE and MAE of ``model`` on ``(X, Y)``.

    Every metric is printed twice: once on the values as given, and once
    after both predictions and targets have been passed through
    ``Y_inverse_transform`` (the "Original scale" lines).

    Parameters
    ----------
    model
        Any object exposing ``predict(X)``.
    Y_inverse_transform : callable
        Applied to the predictions and to the targets before the
        "Original scale" metrics are computed.
    X
        Features, forwarded unchanged to ``model.predict``.
    Y : array-like
        Targets. A 1-D sequence is treated as a single column.

    Raises
    ------
    ValueError
        If predictions and targets do not have the same shape once both
        have been promoted to at least 2-D.
    """
    Y_hat = np.asarray(model.predict(X))
    if Y_hat.ndim < 2:
        Y_hat = Y_hat[:, None]

    # Promote the targets the same way as the predictions. Without this, a
    # 1-D Y subtracted from (n, 1) predictions broadcasts to an (n, n)
    # matrix and every metric below is silently wrong.
    Y = np.asarray(Y)
    if Y.ndim < 2:
        Y = Y[:, None]
    if Y_hat.shape != Y.shape:
        raise ValueError(
            "Prediction shape {} does not match target shape {}".format(Y_hat.shape, Y.shape)
        )

    # Compute each residual once; the original recomputed the inverse
    # transform separately for the MSE and the MAE.
    residual = Y_hat - Y
    orig_residual = Y_inverse_transform(Y_hat) - Y_inverse_transform(Y)

    MSE = np.mean(residual ** 2)
    orig_MSE = np.mean(orig_residual ** 2)
    print("\tMSE:", MSE)
    print("\tOriginal scale MSE:", orig_MSE)

    print("\n\tRMSE:", np.sqrt(MSE))
    print("\tOriginal scale RMSE:", np.sqrt(orig_MSE))

    print("\n\tMAE:", np.mean(np.abs(residual)))
    print("\tOriginal scale MAE:", np.mean(np.abs(orig_residual)))

def print_all_metrics(model, dataset):
    """Report metrics for the train, validation and test splits of ``dataset``.

    For each split a heading is printed and ``print_metrics`` is called with
    the dataset's ``Y_inverse_transform`` and that split's ``X_*`` / ``Y_*``
    attributes.
    """
    sections = (
        ("Train metrics:", "train"),
        ("\nValidation metrics:", "val"),
        ("\nTest metrics:", "test"),
    )
    for heading, split in sections:
        print(heading)
        print_metrics(
            model,
            dataset.Y_inverse_transform,
            getattr(dataset, "X_" + split),
            getattr(dataset, "Y_" + split),
        )

************* Preprocessing original dataset *************
categorical_variables: ['mrt_station', 'hour', 'status', 'day_in_a_week', 'month']
metro_flow index:  [1] 
categorical indices:  [0, 2, 10, 26, 27] 
numerical indices:  [3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]


7 unique mrt_station:	 ['中山', '北投', '北門', '古亭', '士林', '大橋頭', '松山']
mrt_station is a string!
24 unique hour:	 [0, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 2, 3, 4]
5 unique status:	 ['良好', '普通', '對敏感族群不健康', nan, '對所有族群不健康']
status is a string!
7 unique day_in_a_week:	 [2, 3, 4, 5, 6, 0, 1]
12 unique month:	 [6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5]
Processing string categories
	First occurance of 中山: 0
	First occurance of 北投: 1
	First occurance of 北門: 2
	First occurance of 古亭: 3
	First occurance of 士林: 4
	First occurance of 大橋頭: 5
	First occurance of 松山: 6
	First occurance of 良好: 0
	First occurance of 普通: 54
	First occurance of 對敏感族群不健康: 8025
	First 

  return function_base._ureduce(a, func=_nanmedian, keepdims=keepdims,
  return _nanquantile_unchecked(


Y_test_raw mean:  	 2309.976980965029
Y_test_raw median:	 1760.0
Y_test_raw std:   	 2051.8194300373616


# Sklearn LinearRegression - original dataset

In [2]:
# Training: fit a linear regression on the original dataset's training split.
# The targets are flattened with np.ravel before fitting.
LR = sklearn.linear_model.LinearRegression(n_jobs=-1).fit(
    original_dataset.X_train,
    np.ravel(original_dataset.Y_train),
)

# Evaluation: print train / validation / test metrics for the fitted model.
print_all_metrics(LR, original_dataset)

Train metrics:
	MSE: 0.16411246282561986
	Original scale MSE: 685651.3664878189

	RMSE: 0.40510796440655156
	Original scale RMSE: 828.0406792469914

	MAE: 0.2817718927164729
	Original scale MAE: 575.9417487124706

Validation metrics:
	MSE: 0.1992902363849208
	Original scale MSE: 832621.8530410704

	RMSE: 0.4464193503701657
	Original scale RMSE: 912.4811521566187

	MAE: 0.30151310421198946
	Original scale MAE: 616.2927850093064

Test metrics:
	MSE: 0.23655225958321435
	Original scale MSE: 988300.2011940563

	RMSE: 0.4863663841007254
	Original scale RMSE: 994.1328891018827

	MAE: 0.32336636172997824
	Original scale MAE: 660.9608433760754


# Sklearn LinearRegression - reshaped dataset

In [3]:
# Training: fit a linear regression on the reshaped dataset's training split.
# Unlike the original-dataset cell, Y_train is passed as-is (no np.ravel).
LR = sklearn.linear_model.LinearRegression(n_jobs=-1).fit(
    reshaped_dataset.X_train,
    reshaped_dataset.Y_train,
)

# Evaluation: print train / validation / test metrics for the fitted model.
print_all_metrics(LR, reshaped_dataset)

Train metrics:
	MSE: 0.04111878035586768
	Original scale MSE: 117630.11954971934

	RMSE: 0.20277766236907774
	Original scale RMSE: 342.9724763734247

	MAE: 0.13448463512076367
	Original scale MAE: 220.35458160372357

Validation metrics:
	MSE: 0.1117463203669921
	Original scale MSE: 291242.74579741806

	RMSE: 0.3342847893144289
	Original scale RMSE: 539.6691076923137

	MAE: 0.2021691317811276
	Original scale MAE: 325.99686152105414

Test metrics:
	MSE: 0.09498425688054864
	Original scale MSE: 240725.14405229915

	RMSE: 0.3081951603782068
	Original scale RMSE: 490.63748741030696

	MAE: 0.21444958734371672
	Original scale MAE: 340.92698509771645


# Sklearn MLP - original dataset

In [4]:
# Training: single-hidden-layer MLP on the original dataset's training split.
# - hidden_layer_sizes is written as a real one-element tuple; `(10000)` is
#   just the int 10000 in parentheses.
# - random_state is pinned so the fit (and the metrics printed below) are
#   reproducible across Restart & Run All. Metrics recorded from earlier
#   unseeded runs will not match exactly.
MLP = MLPRegressor(
    max_iter=2000,
    hidden_layer_sizes=(10000,),
    random_state=42,
).fit(original_dataset.X_train, np.ravel(original_dataset.Y_train))

# Evaluation: print train / validation / test metrics for the fitted model.
print_all_metrics(MLP, original_dataset)

Train metrics:
	MSE: 0.01730874334933838
	Original scale MSE: 72314.8219539614

	RMSE: 0.1315626974082638
	Original scale RMSE: 268.91415350249116

	MAE: 0.07195235365162225
	Original scale MAE: 147.0706108639159

Validation metrics:
	MSE: 0.07155279610540236
	Original scale MSE: 298943.0027494203

	RMSE: 0.26749354404434206
	Original scale RMSE: 546.7568040266351

	MAE: 0.1807632307682581
	Original scale MAE: 369.4800436903195

Test metrics:
	MSE: 0.07875882948372227
	Original scale MSE: 329049.3490179047

	RMSE: 0.28064003542567173
	Original scale RMSE: 573.628232410073

	MAE: 0.19104452391837765
	Original scale MAE: 390.4950068891639


# Sklearn MLP - reshaped dataset

In [5]:
# Training: single-hidden-layer MLP on the reshaped dataset's training split.
# - hidden_layer_sizes is written as a real one-element tuple; `(10000)` is
#   just the int 10000 in parentheses.
# - random_state is pinned so the fit (and the metrics printed below) are
#   reproducible across Restart & Run All. Metrics recorded from earlier
#   unseeded runs will not match exactly.
MLP = MLPRegressor(
    max_iter=2000,
    hidden_layer_sizes=(10000,),
    random_state=42,
).fit(reshaped_dataset.X_train, reshaped_dataset.Y_train)

# Evaluation: print train / validation / test metrics for the fitted model.
print_all_metrics(MLP, reshaped_dataset)

Train metrics:
	MSE: 0.0027649284353395537
	Original scale MSE: 8925.031539691243

	RMSE: 0.05258258680722691
	Original scale RMSE: 94.47238506405586

	MAE: 0.038610080363767084
	Original scale MAE: 65.21900988839606

Validation metrics:
	MSE: 0.054362414741242815
	Original scale MSE: 141260.41282386414

	RMSE: 0.2331574891382278
	Original scale RMSE: 375.84626221882814

	MAE: 0.14351751676608004
	Original scale MAE: 238.99639601140163

Test metrics:
	MSE: 0.07883269756393908
	Original scale MSE: 182757.1142687961

	RMSE: 0.28077161103633513
	Original scale RMSE: 427.5010108395021

	MAE: 0.18599792667577048
	Original scale MAE: 295.76378269751325
