In [8]:
# Models (default parameters)
# 1: LinearRegression
# 2: RandomForestRegressor

# Metrics
# 1: Mean Squared Error (MSE)
# 2: Mean Absolute Error (MAE)
# 3: Root Mean Squared Error (RMSE)

# Data
# - No preprocessing

In [43]:
# from sklearn.metrics import get_scorer_names
# get_scorer_names()

In [284]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of ``mean_squared_error(y_true, y_pred)``.

    Args:
        y_true: Ground-truth target values, forwarded unchanged to
            ``mean_squared_error``.
        y_pred: Predicted target values, forwarded unchanged to
            ``mean_squared_error``.

    Returns:
        ``np.sqrt`` applied to the mean squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [249]:
# Load data: the first two spreadsheet rows are skipped and no row is used as
# a header, so the column names are assigned explicitly afterwards.
columns = [
    'Year', 'Month', 'Day',
    'Mực nước KG', 'Mực nước LT', 'Mực nước DH',
    'Lượng mưa KG', 'Lượng mưa LT', 'Lượng mưa DH',
]
df = pd.read_excel(
    'Dataset_floodseason_76_20_NEW_3.11.xlsx',
    header=None,
    skiprows=range(2),
)
df.columns = columns
df

Unnamed: 0,Year,Month,Day,Mực nước KG,Mực nước LT,Mực nước DH,Lượng mưa KG,Lượng mưa LT,Lượng mưa DH
0,1976,9,1,5.79,-0.10,0.02,0.0,0.0,0.0
1,1976,9,2,5.75,-0.11,0.00,0.0,0.0,0.0
2,1976,9,3,5.73,-0.12,0.00,0.0,0.0,0.0
3,1976,9,4,5.74,-0.14,-0.02,0.0,0.0,0.0
4,1976,9,5,5.74,-0.14,-0.01,0.0,12.0,0.0
...,...,...,...,...,...,...,...,...,...
5485,2020,12,27,6.36,0.32,0.13,0.0,0.0,0.0
5486,2020,12,28,6.33,0.27,0.03,0.4,0.0,0.1
5487,2020,12,29,6.30,0.21,0.06,0.0,0.0,0.0
5488,2020,12,30,6.28,0.14,0.27,7.6,0.4,2.4


In [293]:
def _print_metrics(metrics, tag=''):
    """Print one ``NAME<tag>: value`` line per entry of ``metrics`` (six decimals)."""
    for name, value in metrics.items():
        print(f'{name}{tag}: {value:.6f}')


def _save_plot(plot_func, data, path, **plot_kwargs):
    """Draw ``data`` with a seaborn axes-level function on a fresh figure, save it, close it."""
    fig, ax = plt.subplots()
    plot_func(data=data, ax=ax, **plot_kwargs)
    fig.savefig(path, dpi=300)
    # Close instead of clearing so no empty figure is left behind for display.
    plt.close(fig)


def run(df=df, feature_columns=columns[4:9], label_column=columns[3], kfold=None, show=False, random_state=42):
    """Evaluate LinearRegression and RandomForestRegressor and print MAE / MSE / RMSE.

    Two modes are supported:

    * ``kfold`` truthy: each model is cross-validated once with ``cv=kfold``
      and the mean of every metric over the folds is printed. All three
      metrics come from the same fitted models.
    * ``kfold`` falsy: an 80/20 ``train_test_split`` is made, each model is
      fitted on the training part and scored on the test part.

    Note: the defaults ``df``, ``feature_columns`` and ``label_column`` are
    evaluated once, when this ``def`` is executed, from the module-level
    ``df`` and ``columns``. Re-run this cell after reloading the data if the
    defaults should follow it.

    Args:
        df: DataFrame holding the feature columns, the label column and,
            when ``show`` is used, a ``'Year'`` column.
        feature_columns: Names of the columns used as model inputs.
        label_column: Name of the column to predict.
        kfold: Value passed as ``cv`` to cross-validation; ``None`` (or any
            falsy value) selects the hold-out split instead.
        show: Hold-out mode only. When true, predictions for *all* rows of
            ``df`` (training rows included) are compared with
            ``label_column`` in a density plot saved as ``density.png`` and
            a per-year mean line plot saved as ``lineplot.png``. Ignored in
            k-fold mode.
        random_state: Seed used for the train/test split and for the random
            forest, so repeated runs print the same numbers.

    Returns:
        None. Results are printed; plots are written to the working directory.
    """
    X = df[feature_columns].values
    y = df[label_column].values
    lr_model = LinearRegression()
    rfr_model = RandomForestRegressor(random_state=random_state)
    named_models = [
        ('Linear Regression', lr_model),
        ('Random Forest Regression', rfr_model),
    ]

    if kfold:
        scorers = {
            'MAE': 'neg_mean_absolute_error',
            'MSE': 'neg_mean_squared_error',
            'RMSE': 'neg_root_mean_squared_error',
        }
        for position, (title, model) in enumerate(named_models):
            if position:
                print('')
            print(title)
            # One cross-validation pass yields every metric from the same fits.
            cv_scores = cross_validate(model, X, y, scoring=list(scorers.values()), cv=kfold)
            # The scorers are negated errors; flip the sign back before averaging.
            fold_means = {
                name: np.mean(-cv_scores[f'test_{scorer}'])
                for name, scorer in scorers.items()
            }
            _print_metrics(fold_means, tag=f' {kfold}')
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    for position, (title, model) in enumerate(named_models):
        if position:
            print('')
        print(title)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        _print_metrics({
            'MAE': mean_absolute_error(y_test, y_pred),
            'MSE': mean_squared_error(y_test, y_pred),
            'RMSE': root_mean_squared_error(y_test, y_pred),
        })

    if not show:
        return

    # Work on a copy so the caller's frame does not gain prediction columns.
    plot_df = df.copy()
    plot_df['LR_predicted'] = lr_model.predict(X)
    plot_df['RFR_predicted'] = rfr_model.predict(X)
    # Plot the column that was actually modelled, not a fixed one.
    plot_columns = [label_column, 'LR_predicted', 'RFR_predicted']

    _save_plot(sns.kdeplot, plot_df[plot_columns], 'density.png', fill=True)

    # Yearly means of the observed and predicted series keep all three lines comparable.
    yearly = plot_df.groupby('Year')[plot_columns].mean()
    _save_plot(sns.lineplot, yearly, 'lineplot.png', palette="tab10", linewidth=2)

In [294]:
# Hold-out evaluation with plots; for 10-fold cross-validation use instead:
# run(df, kfold=10)
run(df=df, show=True)

Linear Regression
MAE: 0.234910
MSE: 0.128280
RMSE: 0.358162

Random Forest Regression
MAE: 0.219696
MSE: 0.109523
RMSE: 0.330943


<Figure size 640x480 with 0 Axes>