# Explore Gnann's paper data

In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from pathlib import Path
from functools import reduce

import pandas as pd

from src.data import gnann_data
from src.features import gnann_features

In [None]:
# --- Locations on disk (relative to the notebook's working directory) ---
RAW_DATA_FOLDER_PATH = Path("..") / "data" / "raw" / "ISIMIP_2b_aggregated_variables"
PROCESSED_DATA_FOLDER_PATH = Path("..") / "data" / "processed"

# --- File names handed to the loader through its ``files=`` argument ---
FORCINGS_FILES = [
    "pr",
    "netrad_median",
]
OUTPUTS_FILES = [
    "evap",
    "potevap",
    "qr",
    "qtot",
]
DOMAINS_FILE = ["domains"]

# Column requested from the domains file, and the name it is mapped to
# through ``columns_renamer`` in the preprocessing step.
DOMAINS_COLUMN_SOURCE = "domain_days_below_1_0.08_aridity_netrad"
DOMAINS_COLUMN = "domain"

# Model whose outputs are explored; used as the sub-folder of the raw data
# folder and in the name of the exported CSV.
MODEL = "pcr-globwb"

# --- Column names used when plotting the prepared data frame ---
FORCINGS_COLUMNS = [
    "pr",
    "netrad",
]
OUTPUTS_COLUMNS = [
    "evap",
    "potevap",
    "qr",
    "qtot",
]

## Load data for a model

In [None]:
def _inner_join_on_coordinates(left_df, right_df):
    """Inner-join two frames on their shared ``lat`` and ``lon`` columns."""
    return pd.merge(left=left_df, right=right_df, how="inner", on=["lat", "lon"])


# Forcings are read straight from the raw data folder.
forcings_df = gnann_data.load_and_merge_geo_csv_to_df(
    data_path=RAW_DATA_FOLDER_PATH,
    files=FORCINGS_FILES,
)

# Model outputs are read from the MODEL sub-folder of the raw data folder.
outputs_df = gnann_data.load_and_merge_geo_csv_to_df(
    data_path=RAW_DATA_FOLDER_PATH / MODEL,
    files=OUTPUTS_FILES,
)

# From the domains file only the DOMAINS_COLUMN_SOURCE column is requested.
domains_df = gnann_data.load_and_merge_geo_csv_to_df(
    data_path=RAW_DATA_FOLDER_PATH,
    files=DOMAINS_FILE,
    cols_to_keep=[DOMAINS_COLUMN_SOURCE],
)

# Fold the three frames into one, keeping only the (lat, lon) pairs present
# in all of them.
data_df = reduce(_inner_join_on_coordinates, [forcings_df, outputs_df, domains_df])

In [None]:
# Preview the first rows of the merged frame before any preprocessing.
data_df.head()

In [None]:
# Run the project's preprocessing step, mapping the long source column name
# of the domains to the short DOMAINS_COLUMN name, then preview the result.
data_df = gnann_features.preprocess_data(
    df=data_df,
    columns_renamer={DOMAINS_COLUMN_SOURCE: DOMAINS_COLUMN},
)
data_df.head()

In [None]:
# ``DataFrame.to_csv`` does not create missing parent directories, so make
# sure the processed-data folder exists before writing (no-op when it does).
PROCESSED_DATA_FOLDER_PATH.mkdir(parents=True, exist_ok=True)

# Persist the prepared frame for this model, without the index column.
data_df.to_csv(PROCESSED_DATA_FOLDER_PATH.joinpath(f"gnann_data_{MODEL}.csv"), index=False)

## Visualize data

In [None]:
import plotly.express as px

from src.visualization import visualize

In [None]:
# Scatter plot starting on pr vs. evap; the forcing columns are offered as
# x choices and the output columns as y choices.
visualize.plot_scatter_with_dropdown(
    df=data_df,
    default_x="pr",
    default_y="evap",
    valid_x=FORCINGS_COLUMNS,
    valid_y=OUTPUTS_COLUMNS,
)

In [None]:
# Pairwise scatter plots of every forcing and output column.
px.scatter_matrix(data_frame=data_df[[*FORCINGS_COLUMNS, *OUTPUTS_COLUMNS]])

## EDA

In [None]:
import plotly.express as px

In [None]:


# Histogram over the forcing and output columns of the prepared frame.
px.histogram(data_frame=data_df[[*FORCINGS_COLUMNS, *OUTPUTS_COLUMNS]])