# Explore Gnann's paper data

In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from pathlib import Path
from functools import reduce

import pandas as pd

from src.data import gnann_data
from src.features import gnann_features

In [2]:
# Folders, given relative to the notebook's working directory.
RAW_DATA_FOLDER_PATH = Path("..") / "data" / "raw" / "ISIMIP_2b_aggregated_variables"
PROCESSED_DATA_FOLDER_PATH = Path("..") / "data" / "processed"

# Model name; used as the sub-folder of the raw data holding the output files
# and as part of the exported CSV file name.
MODEL = "pcr-globwb"

# Names passed as `files` to the CSV loader.
FORCINGS_FILES = ["pr", "netrad_median"]
OUTPUTS_FILES = ["evap", "potevap", "qr", "qtot"]
DOMAINS_FILE = ["domains"]

# Domain column: the name kept from the domains file, and the short name it is
# mapped to in the renamer handed to the preprocessing step.
DOMAINS_COLUMN_SOURCE = "domain_days_below_1_0.08_aridity_netrad"
DOMAINS_COLUMN = "domain"

# Columns offered on the x axis (forcings) and y axis (outputs) of the plots.
FORCINGS_COLUMNS = ["pr", "netrad"]
OUTPUTS_COLUMNS = ["evap", "potevap", "qr", "qtot"]

## Load data for a model

In [3]:
# Forcings and the domains file are read from the root of the raw-data folder;
# the model outputs are read from the sub-folder named after MODEL.
forcings_df = gnann_data.load_and_merge_geo_csv_to_df(
    data_path=RAW_DATA_FOLDER_PATH,
    files=FORCINGS_FILES,
)

outputs_df = gnann_data.load_and_merge_geo_csv_to_df(
    data_path=RAW_DATA_FOLDER_PATH.joinpath(MODEL),
    files=OUTPUTS_FILES,
)

domains_df = gnann_data.load_and_merge_geo_csv_to_df(
    data_path=RAW_DATA_FOLDER_PATH,
    files=DOMAINS_FILE,
    cols_to_keep=[DOMAINS_COLUMN_SOURCE],
)

# Inner-join the three tables left to right on the "lat"/"lon" columns, so only
# coordinate pairs present in every table remain.
data_df = forcings_df
for next_df in (outputs_df, domains_df):
    data_df = pd.merge(left=data_df,
                       right=next_df,
                       how="inner",
                       on=["lat", "lon"])

In [None]:
# Quick look at the first rows of the merged table.
data_df.head()

In [None]:
# Hand the project's preprocessing step a mapping from the long source domain
# column name to the short one; the returned table replaces `data_df`.
domain_column_renamer = {DOMAINS_COLUMN_SOURCE: DOMAINS_COLUMN}
data_df = gnann_data.preprocess_data(
    df=data_df,
    columns_renamer=domain_column_renamer,
)
data_df.head()

In [6]:
# Create the processed-data folder if it is missing so the export also works
# on a fresh checkout, then write the table without the index column.
PROCESSED_DATA_FOLDER_PATH.mkdir(parents=True, exist_ok=True)
data_df.to_csv(PROCESSED_DATA_FOLDER_PATH.joinpath(f"gnann_data_{MODEL}.csv"), index=False)

## Visualize data

In [None]:
from src.visualization import visualize

# Scatter plot with dropdowns (going by the helper's name): the forcing columns
# are the x choices, the output columns the y choices, and the initial view is
# "pr" against "potevap".
visualize.plot_scatter_with_dropdown(
    df=data_df,
    valid_x=FORCINGS_COLUMNS,
    valid_y=OUTPUTS_COLUMNS,
    default_x="pr",
    default_y="potevap",
)

In [25]:
from itertools import product
from tqdm import tqdm

import matplotlib.pyplot as plt

def display_individual_scatterplots(df,
                                    dst_path: Path,
                                    valid_x=FORCINGS_COLUMNS,
                                    valid_y=OUTPUTS_COLUMNS,
                                    ):
    """Draw and save one scatter plot per (input, output) column pair.

    Args:
        df: Table containing every column named in `valid_x` and `valid_y`.
        dst_path: Folder the PNG files are written to; it is created if it
            does not exist. An empty value (``""`` or ``None``) falls back to
            ``../reports/figures/gnann_data``, the folder that used to be
            hard-coded, so existing calls keep writing to the same place.
        valid_x: Column names plotted on the x axis.
        valid_y: Column names plotted on the y axis.

    Each figure is saved as ``<input>_<output>.png`` at 300 dpi.
    """
    # Honour the caller's destination; keep the historical folder as fallback.
    figures_dir = Path(dst_path) if dst_path else Path("../reports/figures/gnann_data")
    figures_dir.mkdir(parents=True, exist_ok=True)

    # Materialise the pairs so tqdm knows the total and can show a real bar.
    combinations = list(product(valid_x, valid_y))

    for input_col, output_col in tqdm(combinations, desc="Computing input-output combinations"):

        fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))

        # One semi-transparent point per row of `df`.
        axis.scatter(df[input_col], df[output_col],
                     alpha=0.2)
        axis.set_title(f"Input '{input_col}' - Output '{output_col}'")
        axis.set_xlabel(input_col)
        axis.set_ylabel(output_col)

        fig.tight_layout()

        fig.savefig(figures_dir / f"{input_col}_{output_col}.png", dpi=300)
        # NOTE(review): figures are deliberately not closed here, presumably so
        # the notebook backend still renders them after the cell runs.


In [None]:
# Draw one scatter plot per forcing/output column pair; the PNG files land in
# the project's figures folder.
display_individual_scatterplots(df=data_df,
                                dst_path=Path("../reports/figures/gnann_data"))

## EDA

In [None]:
import plotly.express as px

In [None]:


# Histogram of the forcing and output columns, drawn with plotly express.
eda_columns = [*FORCINGS_COLUMNS, *OUTPUTS_COLUMNS]
px.histogram(data_df[eda_columns])