# Uncertainty analysis

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from tqdm.notebook import tqdm
import plotly.express as px
import numpy as np
import pandas as pd
import os

# Switch the working directory to the project root so that the relative
# "MS2 Results/..." paths used for reading and writing below resolve correctly.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
os.chdir("/home/rooda/Dropbox/Patagonia/")

## Data 

In [None]:
# values of the "Variable" column that are analysed one after another
variables = [
    "melt_on_glacier",
    "total_runoff",
]

# categorical columns used as predictors of each basin's signature
sources = [
    "Outline",
    "Climate",
    "Volume",
    "GCM",
    "SSP",
    "BCM",
]

# table written by datasets_signature.ipynb; the first CSV column becomes the row index
dataset = pd.read_csv(
    "MS2 Results/dataset_hydro_signatures.csv",
    index_col=0,
)

# distinct labels found in the row index (one per metric)
metrics = dataset.index.unique()

In [None]:
def _importance_shares(features, target, feature_names):
    """Return a one-row DataFrame with each feature's share of the total importance.

    A random forest is fitted on 90 % of the samples; the permutation importance
    (scored with the negative RMSE, 30 repeats) is evaluated on the held-out 10 %.
    The mean importances are then divided by their sum.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.1, random_state=123
    )

    # random forest regression model
    forest = RandomForestRegressor(n_estimators=500, random_state=123, n_jobs=-1)
    forest.fit(x_train, y_train)

    # permutation importance on the held-out samples
    result = permutation_importance(
        forest,
        x_test,
        y_test,
        scoring="neg_root_mean_squared_error",
        n_repeats=30,
        random_state=123,
        n_jobs=-1,
    )

    raw = pd.DataFrame([result.importances_mean], columns=feature_names)

    # NOTE(review): the shares are only well defined when the importances are
    # non-negative and their sum is non-zero — confirm this holds for all basins.
    return raw.div(raw.sum(axis=1), axis=0)


importance_frames = []

for variable in tqdm(variables):
    # rows of the current variable; this selection does not depend on the metric
    per_variable = dataset[dataset["Variable"] == variable]

    for metric in tqdm(metrics, leave=False):

        # drop every column that contains missing values
        # (original note: only basins with at least one glacier)
        data = per_variable.loc[metric].dropna(axis=1)

        # categorical variables, encoded as ordinal codes
        x = OrdinalEncoder().fit_transform(data[sources])

        # NOTE(review): assumes the first 7 columns are not basins
        # (presumably "Variable" plus the six source columns) — confirm.
        for basin in tqdm(data.columns[7:], leave=False):

            # one model per basin
            shares = _importance_shares(x, data[basin].values, sources)
            importance_frames.append(
                shares.assign(ID=basin, Metric=metric, Variable=variable)
            )

variable_importance = pd.concat(importance_frames)
variable_importance = variable_importance[
    [
        "ID",
        "Variable",
        "Metric",
        "Outline",
        "Climate",
        "Volume",
        "GCM",
        "SSP",
        "BCM",
    ]
]
variable_importance.to_csv("MS2 Results/feature_importance_rmse.csv", index=False)