In [1]:
# Model with default parameters
# 1: LinearRegression
# 2: RandomForestRegressor

# Metrics
# 1: Mean Squared Error (MSE)
# 2: Mean Absolute Error (MAE)
# 3: Root Mean Squared Error (RMSE)

In [2]:
# from sklearn.metrics import get_scorer_names
# get_scorer_names()

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

def root_mean_squared_error(y_true, y_pred):
    """Return the root mean squared error (RMSE) of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, passed together with ``y_true`` to
        ``mean_squared_error``.

    Returns
    -------
    The square root of ``mean_squared_error(y_true, y_pred)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [202]:
def run(X, y, kfold=None, random_state=42):
    """Fit a linear regression and a random forest on (X, y) and print MAE, MSE and RMSE.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values aligned with the rows of ``X``.
    kfold : int or cross-validation splitter, optional
        When truthy it is forwarded as ``cv`` to ``cross_validate`` and the
        mean score over the folds is printed for each metric. When falsy
        (the default ``None``) the models are scored on one 80/20 hold-out
        split instead.
    random_state : int, default 42
        Seed for the hold-out split and for the random forest. The default
        reproduces the split this function always used (seed 42); seeding the
        forest as well makes its printed metrics repeatable between runs.

    Returns
    -------
    None
        The results are only printed.
    """
    models = (
        ('Linear Regression', LinearRegression()),
        ('Random Forest Regression', RandomForestRegressor(random_state=random_state)),
    )

    # NOTE: the ':5f' format below means "minimum width 5, default 6 decimals";
    # it is kept as-is so the printed numbers look exactly as before.
    if kfold:
        scorers = (
            ('MAE', 'neg_mean_absolute_error'),
            ('MSE', 'neg_mean_squared_error'),
            ('RMSE', 'neg_root_mean_squared_error'),
        )
        scorer_names = [scorer for _, scorer in scorers]
        for title, model in models:
            print(title)
            # One cross-validation pass per model: every metric is computed on
            # the same fitted estimators instead of refitting once per metric.
            cv_scores = cross_validate(model, X, y, scoring=scorer_names, cv=kfold)
            for label, scorer in scorers:
                # The scorers are negated errors, so flip the sign back.
                mean_error = np.mean(-cv_scores['test_' + scorer])
                print(f'{label} {kfold}: {mean_error:5f}')
            print('')
    else:
        # NOTE(review): train_test_split shuffles rows by default; if the rows
        # of X are overlapping time windows, neighbouring windows can land on
        # both sides of the split -- confirm this is acceptable.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state
        )
        metrics = (
            ('MAE', mean_absolute_error),
            ('MSE', mean_squared_error),
            ('RMSE', root_mean_squared_error),
        )
        for title, model in models:
            print(title)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            for label, metric in metrics:
                print(f'{label}: {metric(y_test, y_pred):5f}')
            print('')

In [4]:
# Load data
# The first two rows of the sheet are skipped and no row is used as a header;
# the column names are assigned explicitly afterwards.
# NOTE(review): H_* / R_* look like two measured quantities at three stations
# (KG, LT, DH) -- confirm their meaning and units against the data source.
columns = [
    'Year', 'Month', 'Day',
    'H_KG', 'H_LT', 'H_DH',
    'R_KG', 'R_LT', 'R_DH',
]
df = pd.read_excel(
    'Dataset_floodseason_76_20_NEW_3.11.xlsx',
    skiprows=[0, 1],
    header=None,
)
df.columns = columns
df

Unnamed: 0,Year,Month,Day,H_KG,H_LT,H_DH,R_KG,R_LT,R_DH
0,1976,9,1,5.79,-0.10,0.02,0.0,0.0,0.0
1,1976,9,2,5.75,-0.11,0.00,0.0,0.0,0.0
2,1976,9,3,5.73,-0.12,0.00,0.0,0.0,0.0
3,1976,9,4,5.74,-0.14,-0.02,0.0,0.0,0.0
4,1976,9,5,5.74,-0.14,-0.01,0.0,12.0,0.0
...,...,...,...,...,...,...,...,...,...
5485,2020,12,27,6.36,0.32,0.13,0.0,0.0,0.0
5486,2020,12,28,6.33,0.27,0.03,0.4,0.0,0.1
5487,2020,12,29,6.30,0.21,0.06,0.0,0.0,0.0
5488,2020,12,30,6.28,0.14,0.27,7.6,0.4,2.4


In [224]:
# Case 1
# x: R (t-3, t-2, t-1, t) => 12
# y: H (t+1)

In [238]:
# Build one sample per 4-row window: the R_* columns of rows k..k+3 are
# flattened row by row into 12 features, and the target is H_KG of row k+4.
window = 4
r_cols = ['R_KG', 'R_LT', 'R_DH']
max_len = len(df) - window

r_values = df[r_cols].to_numpy(dtype=float)
# Row k of this index matrix holds the positions k, k+1, k+2, k+3.
window_rows = np.arange(max_len)[:, None] + np.arange(window)
X = r_values[window_rows].reshape(max_len, window * len(r_cols))

y = df.iloc[window:].H_KG.values

In [239]:
# No kfold argument is given, so run() scores both models on its single 80/20 hold-out split.
run(X, y)

Linear Regression
MAE: 0.395710
MSE: 0.364533
RMSE: 0.603765

Random Forest Regression
MAE: 0.381218
MSE: 0.319073
RMSE: 0.564865


In [240]:
# Case 2
# x: H (t-3, t-2, t-1, t) | R (t-3, t-2, t-1, t) => 24
# y: H (t+1)

In [241]:
# Build one sample per 4-row window: the R_* and H_* columns of rows k..k+3
# are flattened row by row into 24 features, and the target is H_KG of row k+4.
window = 4
lag_cols = ['R_KG', 'R_LT', 'R_DH', 'H_KG', 'H_LT', 'H_DH']
max_len = len(df) - window

lag_values = df[lag_cols].to_numpy(dtype=float)
# Row k of this index matrix holds the positions k, k+1, k+2, k+3.
window_rows = np.arange(max_len)[:, None] + np.arange(window)
X = lag_values[window_rows].reshape(max_len, window * len(lag_cols))

y = df.iloc[window:].H_KG.values

In [242]:
# No kfold argument is given, so run() scores both models on its single 80/20 hold-out split.
run(X, y)

Linear Regression
MAE: 0.248055
MSE: 0.204958
RMSE: 0.452723

Random Forest Regression
MAE: 0.252419
MSE: 0.217520
RMSE: 0.466390


In [230]:
# Case 3
# x: H (t-3, t-2, t-1, t) | R (t-3, t-2, t-1, t, t+1) => 27
# y: H (t+1)

In [243]:
# Build one sample per 4-row window: 24 features from the R_* and H_* columns
# of rows k..k+3 (flattened row by row), followed by the 3 R_* values of row
# k+4. The target is H_KG of that same row k+4.
window = 4
lag_cols = ['R_KG', 'R_LT', 'R_DH', 'H_KG', 'H_LT', 'H_DH']
next_cols = ['R_KG', 'R_LT', 'R_DH']
max_len = len(df) - window

# Row k of this index matrix holds the positions k, k+1, k+2, k+3.
window_rows = np.arange(max_len)[:, None] + np.arange(window)
lag_part = df[lag_cols].to_numpy(dtype=float)[window_rows].reshape(
    max_len, window * len(lag_cols)
)
# Rows 4..end line up one-to-one with the windows starting at rows 0..max_len-1.
next_part = df[next_cols].to_numpy(dtype=float)[window:]
X = np.hstack((lag_part, next_part))

y = df.iloc[window:].H_KG.values

In [244]:
# No kfold argument is given, so run() scores both models on its single 80/20 hold-out split.
run(X, y)

Linear Regression
MAE: 0.151919
MSE: 0.081719
RMSE: 0.285866

Random Forest Regression
MAE: 0.130299
MSE: 0.067391
RMSE: 0.259598
