In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("..")

from pathlib import Path

from tqdm import tqdm
import numpy as np
import pandas as pd
import dataframe_image as dfi
import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle

import src.visualization.visualize as visualize

In [None]:
# Folder with the processed datasets, relative to the notebook's working directory.
PROCESSED_DATA_FOLDER_PATH = Path("..", "data", "processed")


## Load data

In [None]:
# The three tables are stored as separate parquet files in the same sub-folder.
cwatm_data_folder = PROCESSED_DATA_FOLDER_PATH.joinpath("CWatM_data")

all_land_df, forcings_land_df, outputs_land_df = (
    pd.read_parquet(cwatm_data_folder.joinpath(f"{table_name}.parquet"))
    for table_name in ("all_land", "forcings_land", "outputs_land")
)

# Put the three tables side by side (column-wise concatenation).
data_df = pd.concat([all_land_df, forcings_land_df, outputs_land_df], axis=1)
data_df

In [None]:
# Keep track of which columns of data_df came from which source table.
INPUTS_COLUMNS = all_land_df.columns.tolist()
FORCINGS_COLUMNS = forcings_land_df.columns.tolist()
OUTPUTS_COLUMNS = outputs_land_df.columns.tolist()

## Explore "chanleng - potevap" relation

<div>
<img src="../reports/figures/CWatM_data/scatterplots/chanleng_potevap.png" width="1000"/>
</div>

In [None]:
input_col = "chanleng"
output_col = "potevap"


# Hand-picked boundary curves; each one is drawn together with the vertical
# cut-off of the same colour (red, orange, green).
def line1(x_array):
    """Quadratic boundary curve (drawn in red)."""
    return 1/450000 * (x_array - 10000)**2


def line2(x_array):
    """Quadratic boundary curve (drawn in orange)."""
    return 1/1200000 * (x_array - 10000)**2


def line3(x_array):
    """Cubic boundary curve (drawn in green)."""
    return 1/53000000000 * (x_array - 20000)**3


# Vertical cut-offs on the input axis, one per boundary curve.
vline1 = 30000
vline2 = 40000
vline3 = 58000

# Shared vertical extent for the cut-off lines and the y-axis limits.
y_min, y_max = -100, 3400

marker_style = MarkerStyle(marker=".", fillstyle="full")

fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
fig.suptitle(f"Input '{input_col}' - Output '{output_col}'")

axis.scatter(
    x=data_df[input_col],
    y=data_df[output_col],
    marker=marker_style,
    s=20,
    edgecolor="none",
    alpha=0.25,
)

# Sorted copy of the input values, so the curves are drawn left to right.
X = np.sort(data_df[input_col].to_numpy())

boundary_colors = ("red", "orange", "green")

for boundary_fn, boundary_color in zip((line1, line2, line3), boundary_colors):
    axis.plot(X, boundary_fn(X), c=boundary_color)

for cutoff, boundary_color in zip((vline1, vline2, vline3), boundary_colors):
    axis.vlines(cutoff, y_min, y_max, color=boundary_color)

axis.set_xlim([7000, 80000])
axis.set_ylim([y_min, y_max])

axis.set_xlabel(input_col)
axis.set_ylabel(output_col)

fig.tight_layout()
fig.show()

In [None]:
def _peel_region(remaining_df, max_chanleng, boundary_fn):
    """Split `remaining_df` into (rows inside the region, rows left over).

    A row is inside the region when its "chanleng" is below `max_chanleng`
    and its "potevap" lies above `boundary_fn(chanleng)`. Rows for which the
    comparison is False (including comparisons against NaN) stay in the
    left-over frame.
    """
    in_region = ((remaining_df["chanleng"] < max_chanleng) &
                 (remaining_df["potevap"] > boundary_fn(remaining_df["chanleng"])))
    return remaining_df[in_region], remaining_df[~in_region]


# Peel the regions off one after the other: each region is searched only among
# the rows not claimed by an earlier one. Boolean masks keep the original row
# order and do not depend on the index being unique.
region_1, remaining_df = _peel_region(data_df, vline1, line1)
region_2, remaining_df = _peel_region(remaining_df, vline2, line2)
region_3, remaining_df = _peel_region(remaining_df, vline3, line3)
region_4 = remaining_df

####

# `.assign` returns new frames, so data_df is left untouched and no
# SettingWithCopyWarning is raised for writing to a slice.
region_1 = region_1.assign(region=1)
region_2 = region_2.assign(region=2)
region_3 = region_3.assign(region=3)
region_4 = region_4.assign(region=4)

all_regions_df = pd.concat([region_1, region_2, region_3, region_4], axis=0)
regions_df = all_regions_df[["region"]].copy()


## Visualize the four regions

### On a scatterplot

In [None]:

import matplotlib.patheffects as mpe

regions = regions_df["region"].unique()

# Colour is looked up by region id (same mapping as in
# display_individual_scatterplots) rather than by position in `regions`,
# so a missing region cannot shift the colours of the others.
palette = {1: "red", 2: "blue", 3: "green", 4: "purple"}
marker_style = MarkerStyle(marker=".",
                           fillstyle="full")
# White halo that keeps the trend lines readable on top of the point cloud.
outline = mpe.withStroke(linewidth=4, foreground='white')

fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
fig.suptitle(f"Input '{input_col}' - Output '{output_col}'")

for region in regions:

    # Rows of data_df whose index also appears in this region of regions_df,
    # sorted by the input so the rolling mean runs along the x-axis.
    region_index = regions_df.index[regions_df["region"] == region]
    region_data_df = (data_df.loc[data_df.index.isin(region_index)]
                      .sort_values(input_col, ascending=True))

    axis.scatter(x=region_data_df[input_col], y=region_data_df[output_col],
                 label=f"Region {region}",
                 c=palette[region],
                 marker=marker_style,
                 s=10,
                 edgecolor="none",
                 alpha=0.25,
                 )

    # Centred, unweighted rolling mean over 3000 consecutive points.
    # The leading underscore in the label keeps the line out of the legend.
    axis.plot(region_data_df[input_col],
              region_data_df[output_col].rolling(window=3000, center=True).mean(),
              c=palette[region],
              path_effects=[outline],
              label=f"_{region}"
              )

axis.set_xlabel(input_col)
axis.set_ylabel(output_col)
axis.label_outer()

fig.tight_layout()
# Leave room below the axes for the figure-level legend.
fig.subplots_adjust(bottom=0.13)
legend = fig.legend(markerscale=3,
                    loc="lower center",
                    ncol=4)

# The points are drawn semi-transparent; make the legend markers opaque.
for legobj in legend.legend_handles:
    legobj.set_alpha(1)
fig.show()


# "CWatM_data" (capital W) matches the folder name used everywhere else in this
# notebook; the previous "CwatM_data" spelling points at a different directory
# on case-sensitive filesystems.
fig.savefig(f"../reports/figures/CWatM_data/scatterplots_regions_chanleng/{input_col}_{output_col}.png", dpi=300)

### On a world map

In [None]:
import cartopy
import cartopy.crs as ccrs
import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle

fig = plt.figure(figsize=(12, 6))
ax = plt.axes(projection=ccrs.PlateCarree())

# Base map: coastlines plus dotted country borders.
ax.coastlines()
ax.add_feature(cartopy.feature.BORDERS, linestyle=':')

marker_style = MarkerStyle(marker=".", fillstyle="full")

alpha = 0.5
size = 30

# One scatter layer per region, drawn in this order.
region_layers = (
    (region_1, 'red', 'Region 1'),
    (region_2, 'blue', 'Region 2'),
    (region_3, 'green', 'Region 3'),
    (region_4, 'purple', 'Region 4'),
)

for _region_df, _color, _label in region_layers:
    # 'lon' / 'lat' are read as columns after resetting the index.
    _coords = _region_df.reset_index()
    ax.scatter(_coords['lon'], _coords['lat'],
               color=_color, label=_label,
               alpha=alpha, transform=ccrs.PlateCarree(),
               marker=marker_style, s=size, edgecolor="none",
               )

plt.show()


In [None]:

# 2x2 grid of PlateCarree maps, one panel per region.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 10),
                         subplot_kw={'projection': ccrs.PlateCarree()})
axes = axes.flatten()

# Panel title / legend label -> (rows of that region, marker colour).
region_specs = {
    'Region 1': (region_1, 'red'),
    'Region 2': (region_2, 'blue'),
    'Region 3': (region_3, 'green'),
    'Region 4': (region_4, 'purple'),
}

marker_style = MarkerStyle(marker=".", fillstyle="full")
alpha = 0.25
size = 3

for ax, (label, (region, color)) in zip(axes, region_specs.items()):
    # Base map: coastlines plus dotted country borders.
    ax.coastlines()
    ax.add_feature(cartopy.feature.BORDERS, linestyle=':')

    # 'lon' / 'lat' are read as columns after resetting the index.
    coords = region.reset_index()
    ax.scatter(coords['lon'], coords['lat'],
               color=color, label=label,
               alpha=alpha, transform=ccrs.PlateCarree(),
               marker=marker_style, s=size, edgecolor="none")

    ax.set_title(label)
    ax.legend(loc='lower left')

# Tighten the layout so the panels do not overlap, then render.
plt.tight_layout()
plt.show()

## Make scatterplots with the new regions

In [None]:
from itertools import product
from tqdm import tqdm

import matplotlib.pyplot as plt
from matplotlib.markers import MarkerStyle

def display_individual_scatterplots(df: pd.DataFrame,
                                    dst_path: Path,
                                    valid_x,
                                    valid_y,
                                    regions_df = None,
                                    regions_2x2 = True
                                    ):
    """Save one scatterplot per (x column, y column) pair as a PNG file.

    Parameters
    ----------
    df : pd.DataFrame
        Data to plot; must contain every column named in `valid_x` / `valid_y`.
    dst_path : Path
        Output folder. It is created if missing. Each figure is written to
        ``<dst_path>/<x>_<y>.png`` at 300 dpi.
    valid_x, valid_y : iterable of column names
        Every combination of one `valid_x` and one `valid_y` column is plotted.
    regions_df : pd.DataFrame, optional
        Frame with a "region" column, indexed like `df`. When given, points are
        coloured per region (ids 1-4) and a figure-level legend is added.
        When None, a single uncoloured scatterplot is produced.
    regions_2x2 : bool
        Only used when `regions_df` is given. True: one panel per region on a
        2x2 grid with shared axes. False: all regions on a single axis, each
        with a centred rolling-mean line (window of 3000 points).

    Raises
    ------
    ValueError
        If `regions_2x2` is True and `regions_df` holds more than four regions.
    """
    import matplotlib.patheffects as mpe

    marker_style = MarkerStyle(marker=".",
                               fillstyle="full")
    # White halo that keeps the trend lines readable on top of the point cloud.
    outline = mpe.withStroke(linewidth=4, foreground='white')
    palette = {1: "red",
               2: "blue",
               3: "green",
               4: "purple"}

    dst_path = Path(dst_path)
    dst_path.mkdir(parents=True, exist_ok=True)

    # The rows belonging to each region do not depend on the plotted columns,
    # so select them once, from the `df` argument (not from a notebook global).
    region_frames = {}
    if regions_df is not None:
        for region in regions_df["region"].unique():
            region_index = regions_df.index[regions_df["region"] == region]
            region_frames[region] = df.loc[df.index.isin(region_index)]

        if regions_2x2 and len(region_frames) > 4:
            raise ValueError("regions_2x2=True supports at most 4 regions, "
                             f"got {len(region_frames)}")

    combinations = list(product(valid_x, valid_y))

    for input_col, output_col in tqdm(combinations, desc="Computing input-output combinations"):

        if regions_df is None:
            fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))

            axis.scatter(x=df[input_col], y=df[output_col],
                         marker=marker_style,
                         edgecolor="none",
                         s=30,
                         alpha=0.25,
                         )

            axis.set_xlabel(input_col)
            axis.set_ylabel(output_col)

        elif regions_2x2:
            fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(8, 6),
                                     sharex=True, sharey=True, constrained_layout=False)

            for panel, (region, region_data_df) in zip(axes.flatten(), region_frames.items()):
                panel.scatter(x=region_data_df[input_col], y=region_data_df[output_col],
                              label=region,
                              c=palette[region],
                              marker=marker_style,
                              s=10,
                              edgecolor="none",
                              alpha=0.25,
                              )

                panel.set_xlabel(input_col)
                panel.set_ylabel(output_col)
                panel.label_outer()

        else:
            fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))

            for region, region_data_df in region_frames.items():
                axis.scatter(x=region_data_df[input_col], y=region_data_df[output_col],
                             label=region,
                             c=palette[region],
                             marker=marker_style,
                             s=10,
                             edgecolor="none",
                             alpha=0.25,
                             )

                # Sort along the x-axis so the rolling mean follows the input.
                sorted_df = region_data_df.sort_values(input_col, ascending=True)

                # The leading underscore in the label keeps the trend line
                # out of the legend.
                axis.plot(sorted_df[input_col],
                          sorted_df[output_col].rolling(window=3000, center=True).mean(),
                          c=palette[region],
                          path_effects=[outline],
                          label=f"_{region}"
                          )

            axis.set_xlabel(input_col)
            axis.set_ylabel(output_col)
            axis.label_outer()

        fig.suptitle(f"Input '{input_col}' - Output '{output_col}'")

        fig.tight_layout()
        if regions_df is not None:
            # Leave room below the axes for the figure-level legend.
            fig.subplots_adjust(bottom=0.13)
            legend = fig.legend(markerscale=3,
                                loc="lower center",
                                ncol=4)

            # The points are semi-transparent; make the legend markers opaque.
            for legobj in legend.legend_handles:
                legobj.set_alpha(1)

        fig.savefig(dst_path.joinpath(f"{input_col}_{output_col}.png"), dpi=300)

        # Close this specific figure so a long run does not accumulate figures.
        plt.close(fig)


In [None]:
# All regions overlaid on a single axis per input/output pair (regions_2x2=False).
overlay_figures_path = Path("../reports/figures/CWatM_data/scatterplots_regions_chanleng")

display_individual_scatterplots(
    df=data_df,
    dst_path=overlay_figures_path,
    valid_x=[*INPUTS_COLUMNS, *FORCINGS_COLUMNS],
    valid_y=OUTPUTS_COLUMNS,
    regions_df=regions_df,
    regions_2x2=False,
)

In [None]:
# One panel per region on a 2x2 grid for each input/output pair (regions_2x2=True).
grid_figures_path = Path("../reports/figures/CWatM_data/scatterplots_regions_2x2_chanleng")

display_individual_scatterplots(
    df=data_df,
    dst_path=grid_figures_path,
    valid_x=[*INPUTS_COLUMNS, *FORCINGS_COLUMNS],
    valid_y=OUTPUTS_COLUMNS,
    regions_df=regions_df,
    regions_2x2=True,
)

## Compute the measures based on the new regions

In [None]:
from src.dependence_measures.compare import compute_bivariate_scores

# Dedicated names for this section: re-binding INPUTS_COLUMNS / FORCINGS_COLUMNS
# here would silently change what the scatterplot cells above produce when they
# are re-run after this cell.
MEASURE_INPUT_COLUMNS = ["chanleng"]  # full set: list(all_land_df.columns)
MEASURE_FORCING_COLUMNS = []          # full set: list(forcings_land_df.columns)
OUTPUTS_COLUMNS = list(outputs_land_df.columns)

# Number of chunks the input / forcing columns are split into per call.
N_INPUT_CHUNKS = 25
N_FORCING_CHUNKS = 2

regions = regions_df["region"].unique()

for region in regions:

    # Rows of data_df whose index also appears in this region of regions_df.
    region_index = regions_df.index[regions_df["region"] == region]
    region_data_df = data_df.loc[data_df.index.isin(region_index)]

    print(region, len(region_data_df))

    dst_file_path = PROCESSED_DATA_FOLDER_PATH.joinpath("bivariate_metrics",
                                                        "CWatM",
                                                        f"measures_chanleng_{region}.csv")

    # np.array_split always returns the requested number of chunks, padding
    # with empty ones when there are fewer columns than chunks. Skip chunk
    # pairs that are empty or repeat a column set already processed for this
    # region, so the scorer is not called again on the same destination file.
    # NOTE(review): how compute_bivariate_scores treats an existing
    # dst_file_path is not visible from this notebook - confirm.
    processed_column_sets = set()

    for inputs_columns_split in np.array_split(MEASURE_INPUT_COLUMNS, N_INPUT_CHUNKS):

        for forcings_columns_split in np.array_split(MEASURE_FORCING_COLUMNS, N_FORCING_CHUNKS):

            input_cols = inputs_columns_split.tolist() + forcings_columns_split.tolist()

            column_set_key = tuple(input_cols)
            if not input_cols or column_set_key in processed_column_sets:
                continue
            processed_column_sets.add(column_set_key)

            scores_df = compute_bivariate_scores(region_data_df,
                                                 input_cols=input_cols,
                                                 output_cols=OUTPUTS_COLUMNS,
                                                 dst_file_path=dst_file_path,
                                                 return_all=True)


## Use a classifier to understand these regions

In [None]:
# Target: the region label of every row.
y = all_regions_df["region"].to_numpy()

# Features: every remaining column except "chanleng", plus the index levels,
# which reset_index() turns into leading columns.
X = all_regions_df.drop(columns=["region", "chanleng"]).reset_index()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest hyper-parameters gathered in one place; random_state fixes the seed.
rf_params = {
    "n_estimators": 40,
    "max_depth": 10,
    "min_samples_split": 20,
    "min_samples_leaf": 5,
    "max_features": 0.25,
    "random_state": 23,
    "verbose": 2,
}

rf_cls = RandomForestClassifier(**rf_params)

rf_cls.fit(X, y)

In [None]:
from sklearn.metrics import roc_auc_score

# One-vs-rest multi-class ROC AUC, computed on the same X / y the forest was
# fitted on (an in-sample score).
class_probabilities = rf_cls.predict_proba(X)

auc_score = roc_auc_score(y_true=y, y_score=class_probabilities, multi_class="ovr")

auc_score

In [None]:
# pd.DataFrame(data=rf_cls.feature_importances_,
#              index=rf_cls.feature_names_in_).sort_values(by=0, ascending=True).tail(15).plot.barh(legend=False)

In [None]:
# Per-region median of the "ups" and "chanman" columns.
all_regions_df[["region", "ups", "chanman"]].groupby("region").median()

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance shuffles each feature at random; fixing random_state
# (same seed as the forest) makes the importances reproducible across runs.
perm_importance = permutation_importance(estimator=rf_cls,
                                         X=X,
                                         y=y,
                                         n_repeats=10,
                                         random_state=23)

In [None]:
# Mean permutation importance per feature; the 15 largest go on the bar chart.
importance_df = pd.DataFrame(data=perm_importance.importances_mean,
                             index=rf_cls.feature_names_in_)
top_importance_df = importance_df.sort_values(by=0, ascending=True).tail(15)
top_importance_df.plot.barh(legend=False)

In [None]:
import matplotlib.patheffects as mpe

input_col = "ups"
output_col = "potevap"

regions = regions_df["region"].unique()

# Colour is looked up by region id (same mapping as in
# display_individual_scatterplots) rather than by position in `regions`,
# so a missing region cannot shift the colours of the others.
palette = {1: "red", 2: "blue", 3: "green", 4: "purple"}
marker_style = MarkerStyle(marker=".",
                           fillstyle="full")
# White halo that keeps the trend lines readable on top of the point cloud.
outline = mpe.withStroke(linewidth=4, foreground='white')

fig, axis = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))
fig.suptitle(f"Input '{input_col}' - Output '{output_col}'")

for region in regions:

    # Rows of data_df whose index also appears in this region of regions_df,
    # sorted by the input so the rolling mean runs along the x-axis.
    region_index = regions_df.index[regions_df["region"] == region]
    region_data_df = (data_df.loc[data_df.index.isin(region_index)]
                      .sort_values(input_col, ascending=True))

    axis.scatter(x=region_data_df[input_col], y=region_data_df[output_col],
                 label=f"Region {region}",
                 c=palette[region],
                 marker=marker_style,
                 s=10,
                 edgecolor="none",
                 alpha=0.25,
                 )

    # Centred, unweighted rolling mean over 3000 consecutive points.
    # The leading underscore in the label keeps the line out of the legend.
    axis.plot(region_data_df[input_col],
              region_data_df[output_col].rolling(window=3000, center=True).mean(),
              c=palette[region],
              path_effects=[outline],
              label=f"_{region}"
              )

axis.set_xlabel(input_col)
axis.set_ylabel(output_col)
axis.label_outer()

# Restrict the view to the lower part of the input range.
axis.set_xlim([-100, 30000])

fig.tight_layout()
# Leave room below the axes for the figure-level legend.
fig.subplots_adjust(bottom=0.13)
legend = fig.legend(markerscale=3,
                    loc="lower center",
                    ncol=4)

# The points are drawn semi-transparent; make the legend markers opaque.
for legobj in legend.legend_handles:
    legobj.set_alpha(1)
fig.show()

In [None]:
# Region label assigned to every row.
all_regions_df["region"]

In [None]:
# Fractional part of each longitude plotted against the row's region label;
# the very low alpha turns point density into shading.
lon_fraction = all_regions_df.index.get_level_values("lon") % 1
plt.scatter(x=lon_fraction, y=all_regions_df["region"], alpha=0.005)

In [None]:
# Fractional part of each latitude. Uses all_regions_df, like the longitude
# cell above: `region` is only a leftover loop variable holding a region id
# after the plotting loops, so it has no `.index` to read from.
all_regions_df.index.get_level_values("lat") % 1