# Compute metrics on Gnann data

In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from pathlib import Path

from tqdm import tqdm
import pandas as pd
import dataframe_image as dfi
import matplotlib.pyplot as plt

import src.visualization.visualize as visualize
import src.data.utils as utils
from src.dependence_measures.compare import compute_bivariate_scores

In [5]:
# Root folder with the pre-processed CSV inputs; stored metric tables go below it.
PROCESSED_DATA_FOLDER_PATH = Path("..") / "data" / "processed"

# Presumably the name of the column holding each row's domain (not used in the cells shown here).
DOMAINS_COLUMN = "domain"

# Model identifier: selects the input CSV and the per-model results sub-folder.
MODEL = "pcr-globwb"

# Columns handed to the bivariate score computation as inputs and outputs.
INPUTS_COLUMNS = [
    "pr",
    "netrad",
]
OUTPUTS_COLUMNS = [
    "evap",
    "potevap",
    "qr",
    "qtot",
]

## Load Gnann data

In [None]:
# Read the pre-processed Gnann table of the selected model. index_col=False
# keeps pandas from using the first column as the index.
gnann_csv_path = PROCESSED_DATA_FOLDER_PATH / f"gnann_data_{MODEL}.csv"
data_df = pd.read_csv(gnann_csv_path, index_col=False)
data_df

In [None]:
# (number of rows, number of columns) of the loaded table.
data_df.shape

## Compute bivariate metrics

In [8]:
# When True, the analysis cells read the stored score tables back from disk.
LOAD_RESULTS = True
# When True, computed score tables, styled tables and figures are written to disk.
STORE_RESULTS = True

# Uncomment to run on the first 1000 rows only:
# data_df = data_df.iloc[:1000]
# data_df.shape

### On all data

In [None]:
# Bivariate scores between the input and output columns, on the full table.
scores_df = compute_bivariate_scores(
    data_df,
    input_cols=INPUTS_COLUMNS,
    output_cols=OUTPUTS_COLUMNS,
)

In [None]:
if STORE_RESULTS:
    # to_csv does not create missing folders, so make sure the per-model
    # results folder exists before writing into it.
    scores_folder = PROCESSED_DATA_FOLDER_PATH.joinpath("bivariate_metrics", MODEL)
    scores_folder.mkdir(parents=True, exist_ok=True)
    scores_df.to_csv(scores_folder.joinpath("scores_all.csv"))

scores_df

In [None]:
# Render every cell with two decimals for display and export.
scores_df_styled = scores_df.style.format(lambda x: f"{x:.2f}")#.background_gradient(cmap="OrRd", axis=0)

display(scores_df_styled)
if STORE_RESULTS:
    table_path = f"../reports/tables/gnann_data/bivariate_metrics_{MODEL}.png"
    # Create the target folder first so the export does not fail on a fresh checkout.
    Path(table_path).parent.mkdir(parents=True, exist_ok=True)
    dfi.export(scores_df_styled, table_path, dpi=300)

#### Compute baseline

In [None]:
import uuid

# Number of shuffled copies of the data used to build the baseline.
N_OF_SHUFFLES = 20

# Folder collecting one CSV per shuffled run; resolved once, outside the loop.
shuffle_scores_folder = PROCESSED_DATA_FOLDER_PATH.joinpath("bivariate_metrics", MODEL, "shuffled_baseline")
if STORE_RESULTS:
    # to_csv does not create missing folders.
    shuffle_scores_folder.mkdir(parents=True, exist_ok=True)

shuffle_scores_df_list = []

for _ in range(N_OF_SHUFFLES):
    shuffle_data_df = utils.shuffle_data(data_df)

    shuffle_scores_df = compute_bivariate_scores(shuffle_data_df, input_cols=INPUTS_COLUMNS, output_cols=OUTPUTS_COLUMNS)

    if STORE_RESULTS:
        # A random file name lets repeated runs add to the stored baseline
        # instead of overwriting earlier shuffles.
        shuffle_scores_df.to_csv(shuffle_scores_folder.joinpath(f"scores_{uuid.uuid4()}.csv"))

    shuffle_scores_df_list.append(shuffle_scores_df)

In [None]:
# Number of shuffled-baseline score tables currently held in memory.
len(shuffle_scores_df_list)

#### Analyse results

In [None]:
if LOAD_RESULTS:
    # Replace the in-memory scores with the stored table, restoring the
    # (input, output) index from its two columns.
    stored_scores_path = PROCESSED_DATA_FOLDER_PATH / "bivariate_metrics" / MODEL / "scores_all.csv"
    scores_df = pd.read_csv(stored_scores_path, index_col=["input", "output"])

scores_df

In [None]:
if LOAD_RESULTS:
    # Rebuild the baseline from every CSV found (recursively) in the
    # shuffled-baseline folder, one score table per file.
    shuffle_data_folder = PROCESSED_DATA_FOLDER_PATH / "bivariate_metrics" / MODEL / "shuffled_baseline"
    shuffle_scores_df_list = [
        pd.read_csv(csv_path, index_col=["input", "output"])
        for csv_path in shuffle_data_folder.rglob("*.csv")
    ]

len(shuffle_scores_df_list)

In [103]:
# scores_df_styled = scores_df[["pearson"]].sort_values("pearson", ascending=False).style.format(lambda x: f"{x:.2f}")#.background_gradient(cmap="OrRd", axis=0)

# display(scores_df_styled)
# dfi.export(scores_df_styled, f"../reports/tables/gnann_data/bivariate_metrics_{MODEL}_pearson.png", dpi=300)

# scores_df_styled = scores_df[["spearman"]].sort_values("spearman", ascending=False).style.format(lambda x: f"{x:.2f}")#.background_gradient(cmap="OrRd", axis=0)

# display(scores_df_styled)
# dfi.export(scores_df_styled, f"../reports/tables/gnann_data/bivariate_metrics_{MODEL}_spearman.png", dpi=300)

# scores_df_styled = scores_df[["MIC"]].sort_values("MIC", ascending=False).style.format(lambda x: f"{x:.2f}")#.background_gradient(cmap="OrRd", axis=0)

# display(scores_df_styled)
# dfi.export(scores_df_styled, f"../reports/tables/gnann_data/bivariate_metrics_{MODEL}_MIC.png", dpi=300)

In [None]:
# Plot the selected metrics, ordered by their MIC score.
fig = visualize.plot_metric_values_and_rank(
    scores_df=scores_df,
    metrics=["pearson", "spearman", "mutual information (sklearn)", "MIC"],
    sort_values_by="MIC"
)

if STORE_RESULTS:
    figure_path = "../reports/figures/gnann_data/metrics_values_and_rank.png"
    # savefig does not create missing folders, so create the target folder first.
    Path(figure_path).parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(figure_path)

In [159]:
# fig = visualize.plot_metric_values_and_rank_with_shuffle(
#     scores_df=scores_df,
#     metrics=["pearson", "spearman", "mutual information (sklearn)", "MIC"],
#     shuffle_scores_df_list=shuffle_scores_df_list,
#     sort_values_by="MIC"
# )

# if STORE_RESULTS:
#     fig.savefig("../reports/figures/gnann_data/metrics_values_and_rank.png")

In [None]:
# Metrics compared against the shuffled baseline.
baseline_metrics = [
    "pearson",
    "spearman",
    "mutual information (sklearn)",
    "MIC",
    "MAS",
    "MEV",
    "MCN_general",
]

fig = visualize.plot_metric_baseline_and_value(
    scores_df=scores_df,
    metrics=baseline_metrics,
    shuffle_scores_df_list=shuffle_scores_df_list,
    sort_values_by="MIC",
)


In [None]:
# Keep the five rows with the highest MIC and look only at the MAS and MEV
# columns (other candidates left out for now: "MIC", "MCN_general").
top_by_mic = scores_df.sort_values("MIC", ascending=False)
scores_df_subsample = top_by_mic[["MAS", "MEV"]].iloc[:5]

# Rank of each kept row within every metric column.
scores_df_subsample_rank = scores_df_subsample.rank(axis=0)

fig, axis = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
value_ax, rank_ax = axis

# Left panel: raw metric values, one line per kept row.
value_ax.plot(scores_df_subsample.T, marker="o")
value_ax.set(title="Metrics value", xlabel="Metric", ylabel="Value")

# Right panel: the same rows shown by rank, with a legend naming each line.
rank_labels = list(scores_df_subsample_rank.T.columns)
rank_ax.plot(scores_df_subsample_rank.T, label=rank_labels, marker="o")
rank_ax.set(title="Metrics rank", xlabel="Metric", ylabel="Rank")
rank_ax.legend(loc="lower right")

plt.tight_layout()
plt.show()

# if STORE_RESULTS:
#     fig.savefig("../reports/figures/gnann_data/metrics_values_and_rank.png")