In [2]:
# Model with default parameters
# 1: LinearRegression
# 2: RandomForestRegressor

# Metrics
# 1: Mean Squared Error (MSE)
# 2: Mean Absolute Error (MAE)
# 3: Root Mean Squared Error (RMSE)

In [3]:
# from sklearn.metrics import get_scorer_names
# get_scorer_names()

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [5]:
def run(X, y, kfold=None, random_state=42):
    """Fit and score a linear regression and a random forest, printing MAE, MSE and RMSE.

    Parameters
    ----------
    X, y
        Feature matrix and target vector; passed unchanged to scikit-learn.
    kfold : optional
        When truthy, it is forwarded as ``cv`` to ``cross_val_score`` and the
        mean of the per-fold scores is printed for each metric. When falsy
        (the default), the data is split once 80/20 and the metrics are
        computed on the held-out 20 %.
    random_state : int, optional
        Seed used for the random forest and for the hold-out split. The
        default of 42 is the seed the hold-out split has always used; seeding
        the forest as well makes the printed numbers repeatable across runs.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    models = (
        ('Linear Regression', LinearRegression()),
        ('Random Forest Regression', RandomForestRegressor(random_state=random_state)),
    )

    if kfold:
        # scikit-learn scorers follow "greater is better", so error metrics are
        # exposed negated; flip the sign back before averaging.
        cv_metrics = (
            ('MAE', 'neg_mean_absolute_error'),
            ('MSE', 'neg_mean_squared_error'),
            ('RMSE', 'neg_root_mean_squared_error'),
        )
        for title, model in models:
            print(title)
            for label, scoring in cv_metrics:
                # The reported RMSE is the mean of the per-fold RMSE values,
                # not the square root of the mean MSE.
                fold_errors = -cross_val_score(model, X, y, scoring=scoring, cv=kfold)
                print(f'{label} {kfold}: {np.mean(fold_errors):5f}')
            print('')
    else:
        # NOTE(review): train_test_split shuffles rows by default. If the rows
        # are overlapping time windows, near-identical samples can land on both
        # sides of the split - confirm that this is acceptable for the study.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=random_state
        )
        holdout_metrics = (
            ('MAE', mean_absolute_error),
            ('MSE', mean_squared_error),
            ('RMSE', root_mean_squared_error),
        )
        for title, model in models:
            print(title)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            for label, metric in holdout_metrics:
                print(f'{label}: {metric(y_test, y_pred):5f}')
            print('')

In [32]:
# Load data
# The first two spreadsheet rows are skipped and no header row is parsed;
# the column names are assigned explicitly afterwards.
columns = ['Year', 'Month', 'Day', 'H_KG', 'H_LT', 'H_DH', 'R_KG', 'R_LT', 'R_DH']
df = pd.read_excel(
    'Dataset_floodseason_76_20_NEW_3.11.xlsx',
    skiprows=[0, 1],
    header=None,
)
df.columns = columns
years = df['Year'].unique()
df

Unnamed: 0,Year,Month,Day,H_KG,H_LT,H_DH,R_KG,R_LT,R_DH
0,1976,9,1,5.79,-0.10,0.02,0.0,0.0,0.0
1,1976,9,2,5.75,-0.11,0.00,0.0,0.0,0.0
2,1976,9,3,5.73,-0.12,0.00,0.0,0.0,0.0
3,1976,9,4,5.74,-0.14,-0.02,0.0,0.0,0.0
4,1976,9,5,5.74,-0.14,-0.01,0.0,12.0,0.0
...,...,...,...,...,...,...,...,...,...
5485,2020,12,27,6.36,0.32,0.13,0.0,0.0,0.0
5486,2020,12,28,6.33,0.27,0.03,0.4,0.0,0.1
5487,2020,12,29,6.30,0.21,0.06,0.0,0.0,0.0
5488,2020,12,30,6.28,0.14,0.27,7.6,0.4,2.4


In [224]:
# Case 1
# x: R (t-3, t-2, t-1, t) => 12
# y: H (t+1)

In [55]:
# Build sliding-window samples: each row of X holds k consecutive days of the
# R_* columns (flattened day by day), and y is H_KG on the day after the window.
years = df.Year.unique()
k = 4
feature_cols = ['R_KG', 'R_LT', 'R_DH']
X = []
y = []

# Windows are built per year so that no sample spans two different years.
for year in years:
    dt = df[df.Year == year]
    n_samples = len(dt) - k
    if n_samples <= 0:
        # Not enough rows for one window plus a target.
        continue

    # Slice one NumPy array per year instead of calling DataFrame.iloc for every window.
    values = dt[feature_cols].to_numpy(dtype=float)
    X.append(np.stack([values[i:i + k].ravel() for i in range(n_samples)]))
    y.append(dt['H_KG'].to_numpy()[k:])

X = np.vstack(X)
y = np.hstack(y)

In [56]:
# Score both models on the X / y built above; kfold is left at its default,
# so run() evaluates on its single hold-out split.
run(X, y)

Linear Regression
MAE: 0.411358
MSE: 0.426563
RMSE: 0.653118

Random Forest Regression
MAE: 0.386804
MSE: 0.365419
RMSE: 0.604499



In [240]:
# Case 2
# x: H (t-3, t-2, t-1, t) | R (t-3, t-2, t-1, t) => 24
# y: H (t+1)

In [57]:
# Build sliding-window samples: each row of X holds k consecutive days of the
# R_* and H_* columns (flattened day by day), and y is H_KG on the day after
# the window.
years = df.Year.unique()
k = 4
feature_cols = ['R_KG', 'R_LT', 'R_DH', 'H_KG', 'H_LT', 'H_DH']
X = []
y = []

# Windows are built per year so that no sample spans two different years.
for year in years:
    dt = df[df.Year == year]
    n_samples = len(dt) - k
    if n_samples <= 0:
        # Not enough rows for one window plus a target.
        continue

    # Slice one NumPy array per year instead of calling DataFrame.iloc for every window.
    values = dt[feature_cols].to_numpy(dtype=float)
    X.append(np.stack([values[i:i + k].ravel() for i in range(n_samples)]))
    y.append(dt['H_KG'].to_numpy()[k:])

X = np.vstack(X)
y = np.hstack(y)

In [58]:
# Score both models on the X / y built above; kfold is left at its default,
# so run() evaluates on its single hold-out split.
run(X, y)

Linear Regression
MAE: 0.270892
MSE: 0.262311
RMSE: 0.512163

Random Forest Regression
MAE: 0.272269
MSE: 0.276305
RMSE: 0.525648



In [230]:
# Case 3
# x: H (t-3, t-2, t-1, t) | R (t-3, t-2, t-1, t, t+1) => 27
# y: H (t+1)

In [59]:
# Build sliding-window samples: each row of X holds k consecutive days of the
# R_* and H_* columns (flattened day by day) followed by the R_* values of the
# target day itself; y is H_KG on that target day.
years = df.Year.unique()
k = 4
history_cols = ['R_KG', 'R_LT', 'R_DH', 'H_KG', 'H_LT', 'H_DH']
target_day_cols = ['R_KG', 'R_LT', 'R_DH']
X = []
y = []

# Windows are built per year so that no sample spans two different years.
for year in years:
    dt = df[df.Year == year]
    n_samples = len(dt) - k
    if n_samples <= 0:
        # Not enough rows for one window plus a target.
        continue

    # Slice NumPy arrays per year instead of calling DataFrame.iloc twice for every window.
    history = dt[history_cols].to_numpy(dtype=float)
    target_day = dt[target_day_cols].to_numpy(dtype=float)
    X.append(np.stack([
        # Row i + k is the day being predicted, so these R_* values are
        # same-day inputs for the target.
        np.concatenate((history[i:i + k].ravel(), target_day[i + k]))
        for i in range(n_samples)
    ]))
    y.append(dt['H_KG'].to_numpy()[k:])

X = np.vstack(X)
y = np.hstack(y)

In [60]:
# Score both models on the X / y built above; kfold is left at its default,
# so run() evaluates on its single hold-out split.
run(X, y)

Linear Regression
MAE: 0.147491
MSE: 0.073049
RMSE: 0.270276

Random Forest Regression
MAE: 0.126608
MSE: 0.064787
RMSE: 0.254532

