# Compute metrics on CWatM data

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from pathlib import Path

from tqdm import tqdm
import pandas as pd
import dataframe_image as dfi
import matplotlib.pyplot as plt

import src.visualization.visualize as visualize
import src.data.utils as utils
from src.dependence_measures.compare import compute_bivariate_scores

In [10]:
# Folder with the processed datasets (resolved relative to the working directory).
PROCESSED_DATA_FOLDER_PATH = Path("../data/processed")

# Columns selected from all_land_df. Every candidate is currently disabled,
# so the list is empty; uncomment an entry to include it.
INPUTS_COLUMNS = [
    # "porosity",
    # "firstStorDepth",
    # "percolationImp",
    # "tanslope",
    # "maxRootDepth_forest",
    # "maxRootDepth_grassland"
]

# Columns selected from forcings_land_df.
FORCINGS_COLUMNS = ["pr", "tas"]

# Columns selected from outputs_land_df.
OUTPUTS_COLUMNS = ["evap-total", "potevap", "qr", "qtot"]

## Load data

In [11]:
# Read the three CWatM land tables from the "CWatM_data" sub-folder of the
# processed-data directory.
all_land_df = pd.read_parquet(
    PROCESSED_DATA_FOLDER_PATH / "CWatM_data" / "all_land.parquet"
)
forcings_land_df = pd.read_parquet(
    PROCESSED_DATA_FOLDER_PATH / "CWatM_data" / "forcings_land.parquet"
)
outputs_land_df = pd.read_parquet(
    PROCESSED_DATA_FOLDER_PATH / "CWatM_data" / "outputs_land.parquet"
)

In [None]:
# Place the selected input, forcing and output columns side by side in a
# single frame; rows are matched on the frames' indexes.
data_df = pd.concat(
    [
        all_land_df[INPUTS_COLUMNS],
        forcings_land_df[FORCINGS_COLUMNS],
        outputs_land_df[OUTPUTS_COLUMNS],
    ],
    axis="columns",
)
data_df

## Compute metrics

In [None]:
# Persistence switches for the cells below.
# NOTE(review): in the visible cells STORE_RESULTS is only referenced from
# commented-out code, so toggling it currently has no effect — confirm.
STORE_RESULTS = True
# When true, the analysis section re-reads the scores from the CSV on disk.
LOAD_RESULTS = True

# Debug aid: uncomment to restrict the run to the first 1000 rows.
# data_df = data_df.iloc[:1000]
data_df.shape

In [None]:
# Score every (input, output) column pair, using the static inputs together
# with the forcings as the input side.
# NOTE(review): dst_file_path is presumably where the scores are written; the
# analysis section reads the same CSV path back — confirm against
# compute_bivariate_scores().
scores_df = compute_bivariate_scores(
    data_df,
    input_cols=[*INPUTS_COLUMNS, *FORCINGS_COLUMNS],
    output_cols=OUTPUTS_COLUMNS,
    dst_file_path=PROCESSED_DATA_FOLDER_PATH / "bivariate_metrics" / "CWatM" / "scores_all.csv",
    return_all=True,
)

In [None]:
# Manual CSV export, kept for reference. The compute_bivariate_scores() call
# in this notebook already receives this same path as dst_file_path.
# if STORE_RESULTS:
#     scores_df.to_csv(PROCESSED_DATA_FOLDER_PATH.joinpath("bivariate_metrics", "CWatM", "scores_all.csv"))

# Display the computed scores.
scores_df

### Compute baseline

In [None]:
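# TODO: Update to new compute_bivariate_scores().
# NOTE: the call below passes only INPUTS_COLUMNS (currently empty) as input_cols,
# whereas the main computation uses INPUTS_COLUMNS + FORCINGS_COLUMNS.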

# N_OF_SHUFFLES = 20

# shuffle_scores_df_list = []

# for _ in range(N_OF_SHUFFLES):
#     shuffle_data_df = utils.shuffle_data(data_df)

#     shuffle_scores_df = compute_bivariate_scores(shuffle_data_df, input_cols=INPUTS_COLUMNS, output_cols=OUTPUTS_COLUMNS)

#     if STORE_RESULTS:
#         import uuid
        
#         shuffle_scores_folder = PROCESSED_DATA_FOLDER_PATH.joinpath("bivariate_metrics", "CWatM", "shuffled_baseline")
#         shuffle_scores_df.to_csv(shuffle_scores_folder.joinpath(f"scores_{uuid.uuid4()}.csv"))

#     shuffle_scores_df_list.append(shuffle_scores_df)

### Analyse results

In [None]:
# Optionally swap the in-memory scores for the copy stored on disk, restoring
# the ("input", "output") pair as the index.
if LOAD_RESULTS:
    scores_df = pd.read_csv(
        PROCESSED_DATA_FOLDER_PATH / "bivariate_metrics" / "CWatM" / "scores_all.csv",
        index_col=["input", "output"],
    )

scores_df

In [None]:
# Plot the listed metrics with the project's plotting helper, passing "MIC"
# as the sort key.
fig = visualize.plot_metric_values_and_rank(
    scores_df=scores_df,
    metrics=[
        "pearson",
        "spearman",
        "mutual information (sklearn)",
        "normalized mutual information",
        "MIC",
    ],
    sort_values_by="MIC",
)

# Saving the figure is currently disabled.
# if STORE_RESULTS:
#     fig.savefig("../reports/figures/CWatM/metrics_values_and_rank.png")