In [4]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
import plotly.express as px
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.feature_selection import RFE
from sklearn.metrics import mean_squared_error
from scipy import stats

In [5]:
monthly_dataset = pd.read_parquet("e:/flux_ET_dataset/monthly_dataset_3sep.parquet", engine='pyarrow')
budyko_train = pd.read_excel("e:/Term2/WERI/ML_Budyko/3- result/train_test_result_Budyko_11Nov.xlsx", sheet_name = "train")
budyko_test = pd.read_excel("e:/Term2/WERI/ML_Budyko/3- result/train_test_result_Budyko_11Nov.xlsx", sheet_name = "test")

In [6]:
monthly_dataset = monthly_dataset.drop(["evaporation_from_bare_soil_max",
                                        "evaporation_from_bare_soil_min",
                                        "evaporation_from_bare_soil_sum",
                                        "evaporation_from_open_water_surfaces_excluding_oceans_max",
                                        "evaporation_from_open_water_surfaces_excluding_oceans_min",
                                        "evaporation_from_open_water_surfaces_excluding_oceans_sum",
                                        "evaporation_from_the_top_of_canopy_max",
                                        "evaporation_from_the_top_of_canopy_min",
                                        "evaporation_from_the_top_of_canopy_sum",
                                        "evaporation_from_vegetation_transpiration_max",
                                        "evaporation_from_vegetation_transpiration_min",
                                        "evaporation_from_vegetation_transpiration_sum",
                                        "forecast_albedo_min",
                                        "lake_bottom_temperature",
                                        "lake_bottom_temperature_max",
                                        "lake_bottom_temperature_min",
                                        "lake_ice_depth",
                                        "lake_ice_depth_max",
                                        "lake_ice_depth_min",
                                        "lake_ice_temperature",
                                        "lake_ice_temperature_max",
                                        "lake_ice_temperature_min",
                                        "lake_mix_layer_depth",
                                        "lake_mix_layer_depth_max",
                                        "lake_mix_layer_depth_min",
                                        "lake_mix_layer_temperature",
                                        "lake_mix_layer_temperature_max",
                                        "lake_mix_layer_temperature_min",
                                        "lake_shape_factor",
                                        "lake_shape_factor_max",
                                        "lake_shape_factor_min",
                                        "lake_total_layer_temperature",
                                        "lake_total_layer_temperature_max",
                                        "lake_total_layer_temperature_min",
                                        "potential_evaporation_max",
                                        "potential_evaporation_min",
                                        "potential_evaporation_sum",
                                        "runoff_max",
                                        "runoff_min",
                                        "runoff_sum",
                                        "skin_reservoir_content",
                                        "skin_reservoir_content_max",
                                        "skin_reservoir_content_min",
                                        "skin_temperature",
                                        "skin_temperature_max",
                                        "skin_temperature_min",
                                        "snow_albedo",
                                        "snow_albedo_max",
                                        "snow_albedo_min",
                                        "snow_cover",
                                        "snow_cover_max",
                                        "snow_cover_min",
                                        "snow_density",
                                        "snow_density_max",
                                        "snow_density_min",
                                        "snow_depth",
                                        "snow_depth_max",
                                        "snow_depth_min",
                                        "snow_depth_water_equivalent",
                                        "snow_depth_water_equivalent_max",
                                        "snow_depth_water_equivalent_min",
                                        "snow_evaporation_max",
                                        "snow_evaporation_min",
                                        "snow_evaporation_sum",
                                        "snowfall_max",
                                        "snowfall_min",
                                        "snowfall_sum",
                                        "snowmelt_max",
                                        "snowmelt_min",
                                        "snowmelt_sum",
                                        "sub_surface_runoff_max",
                                        "sub_surface_runoff_min",
                                        "sub_surface_runoff_sum",
                                        "surface_latent_heat_flux_max",
                                        "surface_latent_heat_flux_min",
                                        "surface_latent_heat_flux_sum",
                                        "surface_net_solar_radiation_min",
                                        "surface_net_thermal_radiation_max",
                                        "surface_net_thermal_radiation_min",
                                        "surface_net_thermal_radiation_sum",
                                        "surface_runoff_max",
                                        "surface_runoff_min",
                                        "surface_runoff_sum",
                                        "surface_sensible_heat_flux_min",
                                        "surface_sensible_heat_flux_sum",
                                        "surface_solar_radiation_downwards_min",
                                        "surface_solar_radiation_downwards_sum",
                                        "surface_thermal_radiation_downwards_sum",
                                        "temperature_of_snow_layer",
                                        "temperature_of_snow_layer_max",
                                        "temperature_of_snow_layer_min",
                                        "total_evaporation_max",
                                        "total_evaporation_min",
                                        "total_evaporation_sum",
                                        "total_precipitation_min",
                                        "total_precipitation_sum",
                                        "u_component_of_wind_10m",
                                        "u_component_of_wind_10m_min",
                                        "v_component_of_wind_10m",
                                        "v_component_of_wind_10m_min",
                                        "leaf_area_index_low_vegetation_max",
                                        "leaf_area_index_low_vegetation_min",
                                        "leaf_area_index_high_vegetation_max",
                                        "leaf_area_index_high_vegetation_min",
                                        "forecast_albedo_max",
                                        "soil_temperature_level_1_max",
                                        "soil_temperature_level_1_min",
                                        "soil_temperature_level_2_max",
                                        "soil_temperature_level_2_min",
                                        "soil_temperature_level_3_max",
                                        "soil_temperature_level_3_min",
                                        "soil_temperature_level_4_min",
                                        "soil_temperature_level_4_max",
                                        "volumetric_soil_water_layer_1_max",
                                        "volumetric_soil_water_layer_1_min",
                                        "volumetric_soil_water_layer_2_max",
                                        "volumetric_soil_water_layer_2_min",
                                        "volumetric_soil_water_layer_3_max",
                                        "volumetric_soil_water_layer_3_min",
                                        "volumetric_soil_water_layer_4_max",
                                        "volumetric_soil_water_layer_4_min",
                                        "swe"], axis = 1)

X = monthly_dataset.drop(['ET_fill',
                          'Site_ID',
                          'General_classification',
                          'Land_cover_details',
                          'Land_cover_type',
                          'eto_hargreaves',
                          'time',
                          'date',
                          'aet',
                          'pet',
                          'delta_s',
                          'def'], axis =1)

y = monthly_dataset['ET_fill']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)


In [7]:
def calculate_metrics(actual, predicted):

    actual = actual
    predicted = predicted

    rmse = np.sqrt(mean_squared_error(actual, predicted))

    nse = 1 - (np.sum((actual - predicted) ** 2) / np.sum((actual - np.mean(actual)) ** 2))

    nrmse = rmse / (np.max(actual) - np.min(actual))

    r = stats.pearsonr(predicted,actual)[0]
    r2= r**2

    mean_value = sum(predicted) / len(predicted)
    var_value = sum(pow(x-mean_value,2) for x in predicted) / len(predicted)  # variance
    std  = math.sqrt(var_value)  # standard deviation

    mbe = np.mean(predicted - actual)  # Mean Bias Error

    return rmse, nse, nrmse, r2, r, std, mbe

In [10]:
best_model_rf = RandomForestRegressor(
    max_depth= None,
    max_leaf_nodes= 20,
    min_samples_split= 5,
    n_estimators= 50,
    random_state= None)

selector_rf = RFE(estimator=best_model_rf, n_features_to_select = 3)
selector_rf.fit(X_train, y_train)

selected_features_rf = X.columns[selector_rf.support_]

y_train_pred_rf = selector_rf.predict(X_train)
y_test_pred_rf = selector_rf.predict(X_test)

In [26]:
rmse_train, nse_train, nrmse_train, r2_train, r_train, std_train, mbe_train = calculate_metrics(actual=y_train, predicted=y_train_pred_rf)
print("RandombForest Regressor :")
print(f"RMSE train : {rmse_train}\n",
      f"NSE train : {nse_train}\n",
      f"NRMSE train : {nrmse_train}\n",
      f"R2 train : {r2_train}\n",
      f"R train : {r_train}\n",
      f"std train : {std_train}\n",
      f"MBE train : {mbe_train}")
rmse_test, nse_test, nrmse_test, r2_test, r_test, std_test, mbe_test = calculate_metrics(actual=y_test, predicted=y_test_pred_rf)
print("----------------------------------------")

print(f"RMSE test : {rmse_test}\n",
      f"NSE test : {nse_test}\n",
      f"NRMSE test : {nrmse_test}\n",
      f"R2 test : {r2_test}\n",
      f"R test : {r_test}\n",
      f"std test : {std_test}\n",
      f"MBE test : {mbe_test}")


RandombForest Regressor :
RMSE train : 3.814783797300799
 NSE train : 0.8396438912628077
 NRMSE train : 0.06866681102508222
 R2 train : 0.8585706904585748
 R train : 0.9265908970298461
 std train : 7.516680007588366
 MBE train : 0.0242568840143343
----------------------------------------
RMSE test : 4.996721269039347
 NSE test : 0.7081347650369934
 NRMSE test : 0.11938009442387058
 R2 test : 0.7129253037026407
 R test : 0.844349041393807
 std test : 8.123092597449197
 MBE test : 0.5580124408133759


In [11]:
best_model_GB = GradientBoostingRegressor(
    learning_rate= 0.1,
    max_depth= 6,
    n_estimators= 200,
    subsample= 0.75)


selector_GB = RFE(estimator=best_model_GB, n_features_to_select = 5)
selector_GB.fit(X_train, y_train)

selected_features_GB = X.columns[selector_GB.support_]

y_train_pred_GB = selector_GB.predict(X_train)
y_test_pred_GB = selector_GB.predict(X_test)

In [28]:
rmse_train, nse_train, nrmse_train, r2_train, r_train, std_train, mbe_train = calculate_metrics(actual=y_train, predicted=y_train_pred_GB)
print("Gradient Boosting Regressor :")
print(f"RMSE train : {rmse_train}\n",
      f"NSE train : {nse_train}\n",
      f"NRMSE train : {nrmse_train}\n",
      f"R2 train : {r2_train}\n",
      f"R train : {r_train}\n",
      f"std train : {std_train}\n",
      f"MBE train : {mbe_train}")
rmse_test, nse_test, nrmse_test, r2_test, r_test, std_test, mbe_test = calculate_metrics(actual=y_test, predicted=y_test_pred_GB)
print("----------------------------------------")

print(f"RMSE test : {rmse_test}\n",
      f"NSE test : {nse_test}\n",
      f"NRMSE test : {nrmse_test}\n",
      f"R2 test : {r2_test}\n",
      f"R test : {r_test}\n",
      f"std test : {std_test}\n",
      f"MBE test : {mbe_test}")

Gradient Boosting Regressor :
RMSE train : 0.03300256270736279
 NSE train : 0.9999879983493748
 NRMSE train : 0.0005940522077223283
 R2 train : 0.999988945735466
 R train : 0.9999944728524583
 std train : 9.517040903239868
 MBE train : -0.00018555811326147857
----------------------------------------
RMSE test : 4.849285896042127
 NSE test : 0.7251044570471292
 NRMSE test : 0.11585761482131901
 R2 test : 0.7337645855742578
 R test : 0.856600598630574
 std test : 8.672438854961191
 MBE test : 0.4227129919887


In [12]:
best_model_Ada = AdaBoostRegressor(
    learning_rate = 0.25,
    loss = 'linear',
    n_estimators = 50)

selector_Ada = RFE(estimator=best_model_Ada, n_features_to_select = 6)
selector_Ada.fit(X_train, y_train)

selected_features_Ada = X.columns[selector_Ada.support_]

y_train_pred_Ada = selector_Ada.predict(X_train)
y_test_pred_Ada = selector_Ada.predict(X_test)

In [29]:
rmse_train, nse_train, nrmse_train, r2_train, r_train, std_train, mbe_train = calculate_metrics(actual=y_train, predicted=y_train_pred_Ada)
print("Ada Boost Regressor :")
print(f"RMSE train : {rmse_train}\n",
      f"NSE train : {nse_train}\n",
      f"NRMSE train : {nrmse_train}\n",
      f"R2 train : {r2_train}\n",
      f"R train : {r_train}\n",
      f"std train : {std_train}\n",
      f"MBE train : {mbe_train}")
rmse_test, nse_test, nrmse_test, r2_test, r_test, std_test, mbe_test = calculate_metrics(actual=y_test, predicted=y_test_pred_Ada)
print("----------------------------------------")

print(f"RMSE test : {rmse_test}\n",
      f"NSE test : {nse_test}\n",
      f"NRMSE test : {nrmse_test}\n",
      f"R2 test : {r2_test}\n",
      f"R test : {r_test}\n",
      f"std test : {std_test}\n",
      f"MBE test : {mbe_test}")

Ada Boost Regressor :
RMSE train : 4.945477802459704
 NSE train : 0.7304978935602164
 NRMSE train : 0.08901951138896022
 R2 train : 0.7885704463982166
 R train : 0.8880148908651344
 std train : 6.322133404394461
 MBE train : 0.8376244036322996
----------------------------------------
RMSE test : 5.797031705587225
 NSE test : 0.6071529726754192
 NRMSE test : 0.1385008598897144
 R2 test : 0.6345219645589353
 R test : 0.796568869940908
 std test : 6.1185712381716515
 MBE test : 0.8840490729807127


In [13]:
best_model_ExtraTrees = ExtraTreesRegressor(
    bootstrap = True,
    max_depth = 10,
    max_features = 0.5,
    min_impurity_decrease = 0.1,
    min_samples_split = 5,
    n_estimators = 50)

selector_ExtraTrees = RFE(estimator=best_model_ExtraTrees, n_features_to_select = 10)
selector_ExtraTrees.fit(X_train, y_train)

selected_features_ExtraTrees = X.columns[selector_ExtraTrees.support_]

y_train_pred_ExtraTrees = selector_ExtraTrees.predict(X_train)
y_test_pred_ExtraTrees = selector_ExtraTrees.predict(X_test)

In [30]:
rmse_train, nse_train, nrmse_train, r2_train, r_train, std_train, mbe_train = calculate_metrics(actual=y_train, predicted=y_train_pred_ExtraTrees)
print("Extra Trees Regressor :")
print(f"RMSE train : {rmse_train}\n",
      f"NSE train : {nse_train}\n",
      f"NRMSE train : {nrmse_train}\n",
      f"R2 train : {r2_train}\n",
      f"R train : {r_train}\n",
      f"std train : {std_train}\n",
      f"MBE train : {mbe_train}")
rmse_test, nse_test, nrmse_test, r2_test, r_test, std_test, mbe_test = calculate_metrics(actual=y_test, predicted=y_test_pred_ExtraTrees)
print("----------------------------------------")

print(f"RMSE test : {rmse_test}\n",
      f"NSE test : {nse_test}\n",
      f"NRMSE test : {nrmse_test}\n",
      f"R2 test : {r2_test}\n",
      f"R test : {r_test}\n",
      f"std test : {std_test}\n",
      f"MBE test : {mbe_test}")

Extra Trees Regressor :
RMSE train : 4.034943239419053
 NSE train : 0.8206007968186767
 NRMSE train : 0.07262972153603131
 R2 train : 0.8523738153645447
 R train : 0.9232409302909749
 std train : 7.106715204585316
 MBE train : 0.18086115984534887
----------------------------------------
RMSE test : 5.574055452853982
 NSE test : 0.6367925987787949
 NRMSE test : 0.1331735813259667
 R2 test : 0.6500050150926686
 R test : 0.8062288850522962
 std test : 6.503614619330074
 MBE test : 0.47083203019825953


In [14]:
y_train_budyko = budyko_train["observed"]
y_train_pred_budyko = budyko_train["simulated"]

y_test_budyko = budyko_test["observed"]
y_test_pred_budyko = budyko_test["simulated"]

In [31]:
rmse_train, nse_train, nrmse_train, r2_train, r_train, std_train, mbe_train = calculate_metrics(actual=y_train_budyko, predicted=y_train_pred_budyko)
print("Budyko :")
print(f"RMSE train : {rmse_train}\n",
      f"NSE train : {nse_train}\n",
      f"NRMSE train : {nrmse_train}\n",
      f"R2 train : {r2_train}\n",
      f"R train : {r_train}\n",
      f"std train : {std_train}\n",
      f"MBE train : {mbe_train}")
rmse_test, nse_test, nrmse_test, r2_test, r_test, std_test, mbe_test = calculate_metrics(actual=y_test_budyko, predicted=y_test_pred_budyko)
print("----------------------------------------")

print(f"RMSE test : {rmse_test}\n",
      f"NSE test : {nse_test}\n",
      f"NRMSE test : {nrmse_test}\n",
      f"R2 test : {r2_test}\n",
      f"R test : {r_test}\n",
      f"std test : {std_test}\n",
      f"MBE test : {mbe_test}")

Budyko :
RMSE train : 0.0
 NSE train : 1.0
 NRMSE train : 0.0
 R2 train : 0.9999999999999998
 R train : 0.9999999999999999
 std train : 7.010037006122805
 MBE train : 0.0
----------------------------------------
RMSE test : 3.4672084036473785
 NSE test : 0.7553646168653753
 NRMSE test : 0.1094915539500959
 R2 test : 0.7702716596704442
 R test : 0.8776512175519636
 std test : 7.001546559029022
 MBE test : 0.10694043669431008


In [16]:
fig = px.scatter(
    x = y_train_pred_rf,
    y = y_train,
)

fig.add_shape(
    type='line',
    x0 = 0,
    y0 = 0,
    x1 = 60,
    y1 = 60,
    line = dict(
        color = 'Red',
    )
)

fig.update_layout(
    autosize = False,
    width = 600,
    height = 600,
    # xaxis_range = [0, 12],
    # yaxis_range = [0, 12],
    xaxis_title = dict(text = 'ET Predicted'),
    yaxis_title = dict(text = 'ET')
)

fig.show()