# Compute measures of dependence on CWatM data

In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from pathlib import Path

from tqdm import tqdm
import numpy as np
import pandas as pd
import dataframe_image as dfi
import matplotlib.pyplot as plt

import src.visualization.visualize as visualize
import src.data.utils as utils
from src.dependence_measures.compare import compute_bivariate_scores

In [None]:
# Root folder of the processed data: the CWatM parquet tables are read from
# it and the destination paths handed to compute_bivariate_scores are built
# underneath it.
PROCESSED_DATA_FOLDER_PATH = Path("../data/processed")


## Load data

In [None]:
# The three parquet tables live side by side in the "CWatM_data" sub-folder.
cwatm_data_dir = PROCESSED_DATA_FOLDER_PATH / "CWatM_data"

all_land_df = pd.read_parquet(cwatm_data_dir / "all_land.parquet")
forcings_land_df = pd.read_parquet(cwatm_data_dir / "forcings_land.parquet")
outputs_land_df = pd.read_parquet(cwatm_data_dir / "outputs_land.parquet")

# Put the columns of the three tables next to each other (aligned on the index).
data_df = pd.concat([all_land_df, forcings_land_df, outputs_land_df], axis="columns")
data_df

In [None]:
# Column groups are taken directly from the source tables: all columns of
# all_land_df are inputs, all columns of forcings_land_df are forcings and
# all columns of outputs_land_df are outputs.
INPUTS_COLUMNS, FORCINGS_COLUMNS, OUTPUTS_COLUMNS = (
    table.columns.tolist()
    for table in (all_land_df, forcings_land_df, outputs_land_df)
)

# NOTE(review): the names below were commented out in the notebook and look
# like the remains of earlier hand-written column lists; kept for reference.
#   inputs:   "porosity", "firstStorDepth", "percolationImp", "tanslope",
#             "maxRootDepth_forest", "maxRootDepth_grassland"
#   forcings: "pr", "tas", "tasmax", "tasmin", "ps", "rlds", "rsds",
#             "sfcwind", "hurs", "huss"
#   outputs:  "evap-total", "potevap", "qr", "qtot"

# Uncomment to restrict the run to the first 1000 rows.
# data_df = data_df.iloc[:1000]

data_df.shape

## Compute measures - Global

In [None]:

# Score the full dataset chunk by chunk: one input column at a time, combined
# with each quarter of the forcing columns, always against all output columns.
# Every call receives the same destination file.
global_scores_path = PROCESSED_DATA_FOLDER_PATH / "bivariate_metrics" / "CWatM" / "measures_global.csv"
input_chunks = np.array_split(INPUTS_COLUMNS, len(INPUTS_COLUMNS))  # one column per chunk
forcing_chunks = np.array_split(FORCINGS_COLUMNS, 4)

for input_chunk in input_chunks:
    for forcing_chunk in forcing_chunks:
        scores_df = compute_bivariate_scores(
            data_df,
            input_cols=[*input_chunk.tolist(), *forcing_chunk.tolist()],
            output_cols=OUTPUTS_COLUMNS,
            dst_file_path=global_scores_path,
            return_all=True,
        )


In [None]:
# scores_df is reassigned on every iteration of the loop above, so this shows
# the value returned by the last compute_bivariate_scores call.
scores_df

### Compute measures of shuffled data

In [None]:
N_OF_SHUFFLES = 20

# Same chunked computation as for the real data, but run on the output of
# utils.shuffle_data(data_df); every shuffle gets its own destination file in
# the "shuffled" sub-folder.
shuffled_scores_dir = PROCESSED_DATA_FOLDER_PATH / "bivariate_metrics" / "CWatM" / "shuffled"
input_chunks = np.array_split(INPUTS_COLUMNS, len(INPUTS_COLUMNS))  # one column per chunk
forcing_chunks = np.array_split(FORCINGS_COLUMNS, 4)

for shuffled_id in range(N_OF_SHUFFLES):
    shuffled_data_df = utils.shuffle_data(data_df)
    shuffle_scores_path = shuffled_scores_dir / f"measures_global-{shuffled_id}.csv"

    for input_chunk in input_chunks:
        for forcing_chunk in forcing_chunks:
            shuffled_scores_df = compute_bivariate_scores(
                shuffled_data_df,
                input_cols=[*input_chunk.tolist(), *forcing_chunk.tolist()],
                output_cols=OUTPUTS_COLUMNS,
                dst_file_path=shuffle_scores_path,
                return_all=False,
            )


## Compute measures - Gnann regions

In [None]:
RAW_DATA_FOLDER_PATH = Path("../data/raw")

# Column of domains.csv that is exposed below under the shorter name "region".
region_source_column = "domain_days_below_1_0.08_aridity_netrad"

domains_csv_path = RAW_DATA_FOLDER_PATH / "ISIMIP_2b_aggregated_variables" / "domains.csv"
domains_df = pd.read_csv(domains_csv_path)[["lon", "lat", region_source_column]]

# One row per (lon, lat) pair, holding the region label of that location.
regions_df = (
    domains_df
    .rename(columns={region_source_column: "region"})
    .set_index(["lon", "lat"])
)

regions = regions_df["region"].unique()

In [None]:
# Per-region scores: keep the rows of data_df whose index label is listed
# under the region in regions_df, then score every combination of an input
# chunk and a forcing chunk against all output columns.
region_input_chunks = np.array_split(INPUTS_COLUMNS, 25)
region_forcing_chunks = np.array_split(FORCINGS_COLUMNS, 2)
region_scores_dir = PROCESSED_DATA_FOLDER_PATH / "bivariate_metrics" / "CWatM"

for region in regions:
    in_region = regions_df["region"] == region
    # Only index labels present in both tables can be looked up in data_df.
    region_indices = set(regions_df.loc[in_region].index).intersection(data_df.index)

    print(region, len(region_indices))

    region_data_df = data_df.loc[list(region_indices)]
    region_scores_path = region_scores_dir / f"measures_{region}.csv"

    for input_chunk in region_input_chunks:
        for forcing_chunk in region_forcing_chunks:
            scores_df = compute_bivariate_scores(
                region_data_df,
                input_cols=[*input_chunk.tolist(), *forcing_chunk.tolist()],
                output_cols=OUTPUTS_COLUMNS,
                dst_file_path=region_scores_path,
                return_all=True,
            )


### Compute measures of shuffled data

In [None]:
# Distinct values of regions_df["region"]; these are the labels that can be
# passed as `region` in the cells below.
regions

In [None]:
def compute_shuffled_scores_region(data_df: pd.DataFrame,
                                   regions_df: pd.DataFrame,
                                   region,
                                   n_shuffles: int) -> None:
    """Compute bivariate scores on shuffled data restricted to one region.

    The rows of ``data_df`` whose index labels are listed under ``region`` in
    ``regions_df`` are selected. For each of the ``n_shuffles`` repetitions
    the selection is passed through ``utils.shuffle_data`` and
    ``compute_bivariate_scores`` is called once per combination of an input
    chunk (``INPUTS_COLUMNS`` split in 25) and a forcing chunk
    (``FORCINGS_COLUMNS`` split in 2), always against ``OUTPUTS_COLUMNS``.
    All calls of one repetition receive the same destination path,
    ``<processed>/bivariate_metrics/CWatM/shuffled/measures_<region>-<id>.csv``.

    Relies on the notebook-level names ``INPUTS_COLUMNS``,
    ``FORCINGS_COLUMNS``, ``OUTPUTS_COLUMNS`` and
    ``PROCESSED_DATA_FOLDER_PATH``.

    Args:
        data_df: Table holding the input, forcing and output columns; its
            index must use the same labels as ``regions_df``.
        regions_df: Table with a ``"region"`` column, indexed like
            ``data_df``.
        region: Value of the ``"region"`` column selecting the rows to use.
        n_shuffles: Number of shuffled repetitions to compute.

    Returns:
        None. The values returned by ``compute_bivariate_scores`` are not
        used.
    """
    in_region = regions_df["region"] == region
    # Only index labels present in both tables can be looked up in data_df.
    region_indices = set(regions_df[in_region].index).intersection(data_df.index)
    region_data_df = data_df.loc[list(region_indices)]

    # The column chunks and the output folder do not depend on the shuffle,
    # so they are built once instead of inside the innermost loop.
    inputs_columns_splits = np.array_split(INPUTS_COLUMNS, 25)
    forcings_columns_splits = np.array_split(FORCINGS_COLUMNS, 2)
    shuffled_scores_dir = PROCESSED_DATA_FOLDER_PATH.joinpath("bivariate_metrics",
                                                              "CWatM",
                                                              "shuffled")

    for shuffled_id in range(n_shuffles):
        shuffled_region_data_df = utils.shuffle_data(region_data_df)
        dst_file_path = shuffled_scores_dir.joinpath(f"measures_{region}-{shuffled_id}.csv")

        for inputs_columns_split in inputs_columns_splits:

            for forcings_columns_split in forcings_columns_splits:

                input_cols = inputs_columns_split.tolist() + forcings_columns_split.tolist()

                compute_bivariate_scores(shuffled_region_data_df,
                                         input_cols=input_cols,
                                         output_cols=OUTPUTS_COLUMNS,
                                         dst_file_path=dst_file_path,
                                         return_all=False)


#### Regions

In [None]:
# Shuffled-data scores for the "wet warm" region.
REGION, N_OF_SHUFFLES = "wet warm", 20

compute_shuffled_scores_region(
    data_df=data_df,
    regions_df=regions_df,
    region=REGION,
    n_shuffles=N_OF_SHUFFLES,
)

In [None]:
# Shuffled-data scores for the "dry warm" region.
REGION, N_OF_SHUFFLES = "dry warm", 20

compute_shuffled_scores_region(
    data_df=data_df,
    regions_df=regions_df,
    region=REGION,
    n_shuffles=N_OF_SHUFFLES,
)

In [None]:
# Shuffled-data scores for the "dry cold" region.
REGION, N_OF_SHUFFLES = "dry cold", 20

compute_shuffled_scores_region(
    data_df=data_df,
    regions_df=regions_df,
    region=REGION,
    n_shuffles=N_OF_SHUFFLES,
)

In [None]:
# Shuffled-data scores for the "wet cold" region.
REGION, N_OF_SHUFFLES = "wet cold", 20

compute_shuffled_scores_region(
    data_df=data_df,
    regions_df=regions_df,
    region=REGION,
    n_shuffles=N_OF_SHUFFLES,
)