# XGBoost with variable combinations:

Trains an XGBoost model on every combination of monthly temperature (`t2m`) and total precipitation (`tp`) variables, in order to find the combinations of months that perform best.

## Setting up:
### Imports:

In [None]:

from matplotlib import pyplot as plt
import os
import numpy as np
import pandas as pd
import re
import warnings
import re
import seaborn as sns
import itertools
from ast import literal_eval
import matplotlib

import math
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
from scripts.xgb_helpers import *
from scripts.stakes_processing import *
from scripts.xgb_input import *
from scripts.xgb_model import *
from scripts.plots_clean import *
from scripts.xgb_metrics import *
from scripts.xgb_model_varcomb import *

from scripts.PDD_model_modules import *
from scripts.PDD_model_calibration import *
from scripts.PDD_helpers import *

warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2


### Constants:

In [None]:
# Fix the random seed through the project helper seed_all:
seed_all(SEED)
print('Seed:', SEED)

# Hyper-parameter grid handed to the XGBoost run functions:
param_grid = dict(
    learning_rate=np.arange(0.01, 0.2, 0.01),
    n_estimators=np.arange(50, 300, 15),
    max_depth=np.arange(3, 10, 1),
)

# Monthly temperature (t2m) and precipitation (tp) feature names, Oct to Sep:
feature_list = [
    't2m_Oct', 't2m_Nov', 't2m_Dec', 't2m_Jan', 't2m_Feb', 't2m_Mar',
    't2m_Apr', 't2m_May', 't2m_June', 't2m_July', 't2m_Aug', 't2m_Sep',
    'tp_Oct', 'tp_Nov', 'tp_Dec', 'tp_Jan', 'tp_Feb', 'tp_Mar',
    'tp_Apr', 'tp_May', 'tp_June', 'tp_July', 'tp_Aug', 'tp_Sep',
]

# Plot colours:
color_palette = sns.color_palette("husl", 13)
colors = np.tile("#8CA6D9", 6)
palette_grays = sns.color_palette(colors)

INPUT_TYPE = "MeteoSuisse"

# Cross-validation set-up: five folds when KFOLD is enabled, one fold otherwise
KFOLD = True
NUM_FOLDS, FOLD = (5, 'kfold') if KFOLD else (1, 'single_fold')

# Matplotlib style sheet applied to every figure of the notebook:
path_style_sheet = 'scripts/example.mplstyle'
plt.style.use(path_style_sheet)

## Pre-processing:

In [None]:
# Stake counts and stake file names per glacier
glStakesNum, glStakes = get_StakesNum(path_GLAMOS_csv)
glStakes_sorted = sorted(glStakesNum.items(), key=lambda item: item[1])

# Total number of stakes over all glaciers
num_stakes = sum(count for _, count in glStakes_sorted)
print('Total number of stakes:', num_stakes)
print('Number of stakes per glacier:\n', glStakes_sorted)

# Glacier names:
glaciers = list(glStakes.keys())
# Filter through getStakesNyears with N=20
# (presumably keeps stakes with at least 20 years of measurements - confirm in helper)
(
    glStakes_20years,
    glStakes_20years_sorted,
    glStakes_20years_all,
) = getStakesNyears(glaciers,
                    glStakes,
                    path_glacattr,
                    path_era5_stakes,
                    input_type=INPUT_TYPE,
                    N=20)
n_glaciers_kept = len(glStakes_20years.keys())
print('After preprocessing:\n----\nNumber of glaciers:', n_glaciers_kept)
print('Number of stakes:', len(glStakes_20years_all))

In [None]:
# Give each stake a new name "<glacier>-P<k>", where k ranks the stakes of a
# glacier by elevation (P1 is the lowest stake):
glaciers = list(glStakes_20years.keys())
s_end, gl_mb = {}, {}
start_years, end_years = [], []
stakes_per_el = {}
var = "b_a_fix"
for gl in glaciers:
    height = {}
    for stake in glStakes_20years[gl]:
        # Stake identifier: file name without ".csv" and its last 3 characters
        fileName = re.split(".csv", stake)[0][:-3]
        df_stake = read_stake_csv(path_glacattr, stake, COI)
        df_stake = df_stake.sort_values(by="date_fix0")

        # Drop rows with vaw_id 0 (category 0) and measurements from 2021 onwards
        df_stake = df_stake[df_stake.vaw_id > 0]
        df_stake = df_stake[df_stake.date_fix0.dt.year < 2021]

        # Year of each remaining measurement, in chronological order
        years = [timestamp.year for timestamp in df_stake.date_fix0]
        start_years.append(years[0])
        end_years.append(years[-1])

        s_end[fileName] = years
        # Mass balance divided by 1000 (change to m w.e.)
        gl_mb[fileName] = df_stake[var].values / (1000)
        # Height of the first record is taken as the stake elevation
        height[fileName] = df_stake.height.iloc[0]

    # Stakes of this glacier from lowest to highest
    print(height)
    ranked = pd.Series(height).sort_values(ascending=True)
    stakes_per_el[gl] = list(ranked.index.values)

rename_stakes = {
    stake: f"{GLACIER_CORRECT[gl]}-P{i+1}"
    for gl in stakes_per_el.keys()
    for i, stake in enumerate(stakes_per_el[gl])
}

rename_stakes

## XGBoost - run multi combinations:

In [None]:
# temperature variables
# One 't2m_<month>' name per month, October through September
t2m_vars = [
    't2m_' + month
    for month in ('Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr', 'May',
                  'June', 'July', 'Aug', 'Sep')
]
# precipitation variables: the same months with the 'tp_' prefix
tp_vars = [name.replace('t2m_', 'tp_', 1) for name in t2m_vars]

In [None]:
# Get all combinations of months: (powerset so not consecutive)
def powerset(original_list):
    """Return every non-empty subset of ``original_list`` as a list of lists.

    Subsets are enumerated by increasing bitmask: bit ``k`` of the mask
    decides whether ``original_list[k]`` belongs to the subset, so elements
    keep their original relative order inside each subset.
    """
    n_items = len(original_list)
    # Mask 0 is the empty subset, which is excluded, hence the start at 1.
    return [
        [item for pos, item in enumerate(original_list) if (mask >> pos) & 1]
        for mask in range(1, 2**n_items)
    ]


# All non-empty subsets of the monthly temperature variables in t2m_vars
combinations = powerset(t2m_vars)
# Display the number of subsets together with one example subset
len(combinations), combinations[15]

In [None]:
# Get all consecutive combinations of length max 6:
def consecutive_combinations(iterable, consec):
    """Return all contiguous slices of ``iterable`` that are ``consec`` long.

    Slices are listed from left to right; the result is empty when
    ``consec`` exceeds ``len(iterable)``.
    """
    last_start = len(iterable) - consec
    return [
        iterable[start:start + consec] for start in range(last_start + 1)
    ]


# Windows of 1 to 6 consecutive MONTH_VAL keys, turned into variable names
# once with the 't2m_' prefix and once with the 'tp_' prefix.
iterable = list(MONTH_VAL.keys())
consec_t2m, consec_tp = [], []
for window_len in range(1, 7):
    for window in consecutive_combinations(iterable, window_len):
        month_names = [MONTH_VAL[key] for key in window]
        consec_t2m.append(['t2m_' + name for name in month_names])
        consec_tp.append(['tp_' + name for name in month_names])

# Every pairing of one temperature window with one precipitation window
combinations_t2m_tp = list(itertools.product(consec_t2m, consec_tp))
len(combinations_t2m_tp), combinations_t2m_tp[200:210]

In [None]:
# Check if all stakes were already run:
# Folder holding one result file per stake that was already run
path_multi = f'../../data/MB_modeling/XGBoost/ind_stakes/kfold/{INPUT_TYPE}/multi_combi/sum_prec_all/'
# Stake id = first two '_'-separated parts of the file name, with the last
# four characters of the second part (the extension) removed
stakes_processed = [
    '_'.join([parts[0], parts[1][:-4]])
    for parts in (re.split('_', fname) for fname in os.listdir(path_multi))
]
# Compare against the full stake list through the project helper Diff
remaining_stakes = Diff(list(stakes_processed), list(glStakes_20years_all))
remaining_stakes

In [None]:
! ls ../../data/MB_modeling/XGBoost/ind_stakes/kfold/MeteoSuisse/monthly/t2m_tp/match_annual/

In [None]:
# Because we're not running the models with grid search,
# we're getting the best hp from the standard XGBoost run (with all 12 months)
var_xg_monthly, metrics_monthly = assembleXGStakes(
    path_save_xgboost_stakes +
    f'{FOLD}/{INPUT_TYPE}/monthly/t2m_tp/match_annual/', glStakes_20years_all, rename_stakes)
# Hyper-parameters read from the assembled metrics
# (presumably learning rate, number of estimators and max depth - confirm in assembleXGStakes)
hp_lr = metrics_monthly['hp_lr']
hp_ne = metrics_monthly['hp_ne']
hp_md = metrics_monthly['hp_md']

# Run XGBoost for every (t2m, tp) combination in combinations_t2m_tp.
# The run is skipped unless RUN is set to True.
RUN = False
# NOTE(review): this path hard-codes 'kfold' and 'MeteoSuisse' instead of using FOLD and INPUT_TYPE
path_multi = '../../data/MB_modeling/XGBoost/ind_stakes/kfold/MeteoSuisse/multi_combi/sum_prec_all_mae/'
if RUN:
    runXGBoost_varcomb(
        combinations_t2m_tp,
        hp_lr,
        hp_ne,
        hp_md,
        glStakes_20years,
        param_grid,  # grid for HP search
        path_multi,
        mb_match='annual',
        input_type=INPUT_TYPE,
        log=False,
        empty_folder=False,
        tp_sum=True)

## Analyse best combinations:

In [None]:
# Flat list of stake ids ("<glacier>_<stake>") over all retained glaciers
allStakes = np.concatenate(
    [glStakes_20years[gl] for gl in glStakes_20years.keys()])
allStakes = [
    '_'.join(re.split('_', f)[:2]) if len(re.split('_', f)) > 1 else
    re.split('_', f)[0] + '_' + re.split('_', f)[1] for f in allStakes
]

# Check if all stakes were processed:
path_multi = path_save_xgboost_stakes + f'{FOLD}/{INPUT_TYPE}/multi_combi/sum_prec_all_mae/'
stakes_processed = [
    '_'.join([parts[0], parts[1][:-4]])
    for parts in (re.split('_', fname) for fname in os.listdir(path_multi))
]
remaining_stakes = Diff(list(stakes_processed), list(allStakes))

# Group the processed stakes per glacier, stored as "<stake id>_mb.csv"
glProcessed = {}
for stake in stakes_processed:
    glacier = re.split('_', stake)[0]
    updateDic(glProcessed, glacier, stake + '_mb.csv')

glProcessed

In [None]:
# Read the metrics of every (t2m, tp) combination for each processed stake,
# stack them into one long dataframe and give each combination a hash:
stakes, glacier, rmse, rmse_val, rmse_train, t2m, tp = [], [], [], [], [], [], []
mae, mae_val, mae_train = [], [], []
for gl in glProcessed.keys():
    for stake in glProcessed[gl]:
        # read multicombi (file name = stake file name without "_mb.csv")
        stakeName = re.split(".csv", stake)[0][:-3]
        var_df = pd.read_csv(path_multi + f'{stakeName}.csv',
                             converters={
                                 "t2m": literal_eval,
                                 "tp": literal_eval
                             })
        if 'test_rmse' not in var_df.columns:
            # Without metric columns this file cannot contribute rows. Skipping
            # it keeps every column list below at the same total length
            # (previously the id columns grew while the metric columns did
            # not, using a stale or undefined N_combi).
            print(f'Skipping {stakeName}: no metric columns found')
            continue

        # Number of combinations stored for this stake
        N_combi = len(var_df)
        rmse.append(var_df['test_rmse'])
        rmse_val.append(var_df['val_rmse'])
        rmse_train.append(var_df['train_rmse'])
        mae.append(var_df['test_mae'])
        mae_val.append(var_df['val_mae'])
        mae_train.append(var_df['train_mae'])

        stakes.append(np.tile(stakeName, N_combi))
        glacier.append(np.tile(gl, N_combi))
        t2m.append(var_df['t2m'])
        tp.append(var_df['tp'])

# Metrics are divided by 1000 (presumably mm w.e. to m w.e. - confirm units)
dfAllStakes = pd.DataFrame({
    'glaciers': np.concatenate(glacier),
    'stakes': np.concatenate(stakes),
    'test_rmse': np.concatenate(rmse)/(1000),
    'val_rmse': np.concatenate(rmse_val)/(1000),
    'train_rmse': np.concatenate(rmse_train)/(1000),
    'test_mae': np.concatenate(mae)/(1000),
    'val_mae': np.concatenate(mae_val)/(1000),
    'train_mae': np.concatenate(mae_train)/(1000),
    't2m': np.concatenate(t2m),
    'tp': np.concatenate(tp)
})
# One identifier per (t2m, tp) pair, computed by the project helper makeCombNum
dfAllStakes['t2m-tp-hash'] = [
    makeCombNum(t2m_combi, tp_combi)
    for t2m_combi, tp_combi in zip(dfAllStakes['t2m'], dfAllStakes['tp'])
]
print('Number of unique hashes:', len(dfAllStakes['t2m-tp-hash'].unique()))
dfAllStakes.head(2)


### Best for individual stakes:
#### Fifty best:

In [None]:
# Error metric used to rank combinations, and its validation column name
METRIC = 'mae'
VAL_METRIC = '_'.join(['val', METRIC])

In [None]:
# Rank the combinations of each stake on VAL_METRIC and keep the N=50 best
# (ranking details live in the project helper NBestCombinations)
feature_importdf_50best, dfWeights_50best = NBestCombinations(dfAllStakes,
                                                              INVERSE_MONTH_POS,
                                                              t2m_vars,
                                                              tp_vars,
                                                              N=50,
                                                              type=VAL_METRIC)

# Aggregate over all stakes:
dfWeights_Mean_50best = dfWeights_50best.groupby(['feature', 'month'
                                                  ]).sum().reset_index()
# NOTE(review): 50 is N above; 30 is presumably the number of stakes - confirm,
# it is hard-coded and would need updating if the stake list changes
dfWeights_Mean_50best['freq_var'] = dfWeights_Mean_50best['weight'] / (50 * 30)
dfWeights_Mean_50best['type'] = np.tile('50 best', len(dfWeights_Mean_50best))
dfWeights_Mean_50best.head(2)

In [None]:
# One panel per stake (labelled with its renamed id), bars per month coloured by feature
dfWeights_50best['stakes_new'] = dfWeights_50best['stakes'].apply(lambda x: rename_stakes[x]) 
# Weight divided by N=50 (presumably the share of the 50 best combinations
# that contain the month - confirm against NBestCombinations)
dfWeights_50best['freq_var'] = dfWeights_50best['weight'] / 50
g = sns.FacetGrid(
    dfWeights_50best,
    col="stakes_new",
    col_wrap=6,
    hue="feature",
)
g.map(sns.barplot,
      "month",
      "freq_var",
      orient='v',
      order=INVERSE_MONTH_POS.keys(),
      alpha=0.5)
# Strip axis labels, rotate month ticks and title each panel with its stake name
for col_val, ax in g.axes_dict.items():
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.tick_params(axis="x", rotation=90)
    ax.set_title(col_val)
g.add_legend()

In [None]:
# Same plot as for all stakes, restricted to stakes whose new name contains 'Basodino'
dfWeights_50best['stakes_new'] = dfWeights_50best['stakes'].apply(lambda x: rename_stakes[x]) 
dfWeights_50best['freq_var'] = dfWeights_50best['weight'] / 50
dfWeights_50best_subset = dfWeights_50best[dfWeights_50best.stakes_new.apply(lambda x: 'Basodino' in x)].sort_values(by = 'stakes_new')
g = sns.FacetGrid(
    dfWeights_50best_subset,
    col="stakes_new",
    col_wrap=3,
    hue="feature",
)
g.map(sns.barplot,
      "month",
      "freq_var",
      orient='v',
      order=INVERSE_MONTH_POS.keys(),
      alpha=0.5)
# Strip axis labels, rotate month ticks and title each panel with its stake name
for col_val, ax in g.axes_dict.items():
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.tick_params(axis="x", rotation=90)
    ax.set_title(col_val)
g.add_legend()

In [None]:
# Same plot as for all stakes, restricted to stakes whose new name contains 'Aletsch'
dfWeights_50best['stakes_new'] = dfWeights_50best['stakes'].apply(lambda x: rename_stakes[x]) 
dfWeights_50best['freq_var'] = dfWeights_50best['weight'] / 50
dfWeights_50best_subset = dfWeights_50best[dfWeights_50best.stakes_new.apply(lambda x: 'Aletsch' in x)].sort_values(by = 'stakes_new')
g = sns.FacetGrid(
    dfWeights_50best_subset,
    col="stakes_new",
    col_wrap=3,
    hue="feature",
)
g.map(sns.barplot,
      "month",
      "freq_var",
      orient='v',
      order=INVERSE_MONTH_POS.keys(),
      alpha=0.5)
# Strip axis labels, rotate month ticks and title each panel with its stake name
for col_val, ax in g.axes_dict.items():
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.tick_params(axis="x", rotation=90)
    ax.set_title(col_val)
g.add_legend()

#### 1 % highest:

In [None]:
# NOTE(review): hard-coded total number of (t2m, tp) combinations,
# presumably len(combinations_t2m_tp) - confirm if the combination set changes
N_combinations = 3249
# Size of the best 1 %, rounded up
N_01best = math.ceil(N_combinations * 1 / 100)
print('Number of 1% best combinations:', N_01best)
feature_importdf_percbest, dfWeights_percbest = NBestCombinations(
    dfAllStakes, INVERSE_MONTH_POS, t2m_vars, tp_vars, N=N_01best, type=VAL_METRIC)
# Aggregate over all stakes:
dfWeights_Mean_perc = dfWeights_percbest.groupby(['feature', 'month'
                                                  ]).sum().reset_index()
# This label is matched again when the panels are coloured in a later cell
dfWeights_Mean_perc['type'] = np.tile(f'1% ({N_01best}) best',
                                      len(dfWeights_Mean_perc))
# NOTE(review): 30 is presumably the number of stakes - confirm
dfWeights_Mean_perc['freq_var'] = dfWeights_Mean_perc['weight'] / (N_01best *
                                                                   30)

In [None]:
# One panel per stake for the 1 % best combinations: weight divided by N_01best
dfWeights_percbest['freq_var'] = dfWeights_percbest['weight'] / N_01best
dfWeights_percbest['stakes_new'] = dfWeights_percbest['stakes'].apply(lambda x: rename_stakes[x]) 
# Sorting is done on the original stake ids, not on the renamed labels
dfWeights_percbest.sort_values(by='stakes', inplace=True)
g = sns.FacetGrid(
    dfWeights_percbest,
    col="stakes_new",
    col_wrap=6,
    hue="feature",
)
g.map(sns.barplot,
      "month",
      "freq_var",
      orient='v',
      order=INVERSE_MONTH_POS.keys(),
      alpha=0.5)
# Strip axis labels and rotate month ticks (panel titles are left at the seaborn default)
for col_val, ax in g.axes_dict.items():
    ax.set_ylabel('')
    ax.set_xlabel('')
    ax.tick_params(axis="x", rotation=90)
g.add_legend()

In [None]:
# Month frequencies aggregated over stakes: one row of panels per selection
# ("1% best" and "50 best"), one column per feature.
dfWeights_all = pd.concat([dfWeights_Mean_perc, dfWeights_Mean_50best], axis=0)
g = sns.FacetGrid(
    dfWeights_all,
    col="feature",
    row='type',
    height=2.5,
    aspect=1.5,
)
g.map(sns.barplot,
      "month",
      "freq_var",
      orient='v',
      order=INVERSE_MONTH_POS.keys(),
      color='#4d4d4d')
colors = ['#e7e5f1', '#fef5e9']

# Background colour per row, keyed by the 'type' labels. The 1 % label is
# rebuilt from N_01best so that it keeps matching the label stored in the
# dataframe when the number of combinations changes (it used to be the
# literal '1% (33) best').
facecolor_per_type = {
    f'1% ({N_01best}) best': colors[0],
    '50 best': colors[1],
}

# Keys of axes_dict are (row value, column value) tuples, i.e. (type, feature)
for col_val, ax in g.axes_dict.items():
    ax.set_ylabel('Frequency of month', fontsize = 15)
    ax.set_xlabel('')
    ax.set_ylim(top=1)
    ax.tick_params(axis="x", rotation=90)
    if col_val[0] in facecolor_per_type:
        ax.set_facecolor(facecolor_per_type[col_val[0]])
    ax.set_title('')
    ax.grid()
g.add_legend()

### Best over stakes:

In [None]:
# Take best average over all stakes:
# Mean of every metric per (t2m, tp) combination over all stakes, best first.
# numeric_only=True keeps the string/list columns (glaciers, stakes, t2m, tp)
# out of the mean; without it pandas >= 2.0 raises a TypeError on them.
avgAllStakes = dfAllStakes.groupby('t2m-tp-hash').mean(
    numeric_only=True).sort_values(by=VAL_METRIC)
avgAllStakes.head(2)

In [None]:
# Range of the stake-averaged validation metric over all combinations
avgAllStakes[VAL_METRIC].min(), avgAllStakes[VAL_METRIC].max()

In [None]:
# Single best combination according to NBestCombinations_avgStakes (N=1)
# (presumably ranked on the metric averaged over stakes - confirm in helper)
feature_importdf_1best, dfWeights_1best = NBestCombinations_avgStakes(
    dfAllStakes, INVERSE_MONTH_POS, t2m_vars, tp_vars, N=1, type=VAL_METRIC)
# Division by N=1 kept for symmetry with the other selections
dfWeights_1best['freq_var'] = dfWeights_1best['weight'] / 1
g = sns.FacetGrid(
    dfWeights_1best,
    col="feature",
    height=2.5,
    aspect=1.5,
)
g.map(sns.barplot,
      "month",
      "freq_var",
      orient='v',
      order=INVERSE_MONTH_POS.keys(),
      color='#4d4d4d')
# Common y label, y axis capped at 1, rotated month ticks
for col_val, ax in g.axes_dict.items():
    ax.set_ylabel('Frequency of month', fontsize = 15)
    ax.set_xlabel('')
    ax.set_ylim(top=1)
    ax.tick_params(axis="x", rotation=90)


#### Fig 5b-e: Feature importance

In [None]:
# Get weights and feature importance over 50 and 1% best combinations:
feature_importdf_50all, dfWeights_50all = NBestCombinations_avgStakes(
    dfAllStakes, INVERSE_MONTH_POS, t2m_vars, tp_vars, N=50, type=VAL_METRIC)
# Weight divided by the number of selected combinations (50)
dfWeights_50all['freq_var'] = dfWeights_50all['weight'] / 50
dfWeights_50all['type'] = np.tile('50 best', len(dfWeights_50all))

# Size of the best 1 %, rounded up (relies on N_combinations from an earlier cell)
N_01best = math.ceil(N_combinations * 1 / 100)
feature_importdf_percall, dfWeights_percall = NBestCombinations_avgStakes(
    dfAllStakes, INVERSE_MONTH_POS, t2m_vars, tp_vars, N=N_01best, type=VAL_METRIC)

# This label is matched again when the panels are coloured in the next cell
dfWeights_percall['type'] = np.tile(f'1% ({N_01best}) best',
                                    len(dfWeights_percall))
dfWeights_percall['freq_var'] = dfWeights_percall['weight'] / N_01best

In [None]:
# Month frequencies of the best combinations averaged over stakes: one row of
# panels per selection ("1% best" and "50 best"), one column per feature.
dfWeights_all = pd.concat([dfWeights_percall, dfWeights_50all], axis=0)
g = sns.FacetGrid(
    dfWeights_all,
    col="feature",
    row='type',
    height=2.5,
    aspect=1.5,
)
g.map(sns.barplot,
      "month",
      "freq_var",
      orient='v',
      order=INVERSE_MONTH_POS.keys(),
      color='#4d4d4d')
colors = ['#e7e5f1', '#fef5e9']

# The 1 % label is rebuilt from N_01best so that it keeps matching the label
# stored in dfWeights_percall when the number of combinations changes
# (it used to be the literal '1% (33) best').
label_perc = f'1% ({N_01best}) best'

# Keys of axes_dict are (row value, column value) tuples, i.e. (type, feature)
for col_val, ax in g.axes_dict.items():
    ax.set_ylabel('Frequency of month', fontsize = 15)
    ax.set_xlabel('')
    ax.set_ylim(top=1)
    ax.tick_params(axis="x", rotation=90)
    if col_val[0] == label_perc:
        # NOTE(review): val_mae is assigned here but not used in this cell
        val_mae = feature_importdf_percall[VAL_METRIC].unique()
        ax.set_facecolor(colors[0])
    if col_val[0] == '50 best':
        val_mae = feature_importdf_50all[VAL_METRIC].unique()
        ax.set_facecolor(colors[1])
    ax.set_title('')
    ax.grid()
g.add_legend()

#### Fig 5a: distribution of feature importance

In [None]:
# Baseline: MAE obtained when each stake is predicted by the mean of its own
# test targets.
feat_test = var_xg_monthly['feat_test']
mae_mean = []
for stake in feat_test.keys():
    target = np.concatenate(feat_test[stake]['target_test'])
    # Constant prediction equal to the mean target of this stake
    pred_mean = np.full(len(target), np.mean(target))
    mae_mean.append(mean_absolute_error(target, pred_mean))

# Average over all stakes, divided by 1000 like the other metrics:
mae_mean = np.mean(mae_mean)/(1000)
mae_mean

In [None]:
# Unique VAL_METRIC values stored with the 1 % best and the 50 best selections
# (presumably one threshold value per selection - confirm in NBestCombinations_avgStakes)
perc, fifty = dfWeights_percall[VAL_METRIC].unique(), dfWeights_50all[VAL_METRIC].unique()
perc, fifty

In [None]:
# Histogram (with KDE) of the stake-averaged validation metric over all combinations
ax = plt.subplot(111)
perc, fifty = dfWeights_percall[VAL_METRIC].unique(), dfWeights_50all[VAL_METRIC].unique()
sns.histplot(avgAllStakes, x=VAL_METRIC, kde=True, ax=ax, color = '#4d4d4d', alpha = 0.5)
ax.set_xlabel('Validation MAE [m w.e.]')

# colors = get_cmap_hex(cm.devon, 10)
colors = ['#b2abd2', '#fee0b6']
# Shaded bands: from 0 to perc[0], then from perc[0] to fifty[0].
# NOTE(review): the band height of 300 is hard-coded, not derived from the histogram
rect1 = matplotlib.patches.Rectangle((0,0), perc[0], 300, color=colors[0], alpha = 0.5)
rect2 = matplotlib.patches.Rectangle((perc[0],0), fifty[0]-perc[0], 300, color=colors[1], alpha = 0.5)
ax.add_patch(rect1)
ax.add_patch(rect2)
# Vertical line at the mean-prediction baseline MAE.
# NOTE(review): a label is set but no legend is drawn in this cell
ax.axvline(x=mae_mean, color='grey', linestyle='--', label='Mean measured MB')

In [None]:
# Distribution of the validation metric over all combinations, one panel per stake
g = sns.FacetGrid(dfAllStakes, col="stakes", col_wrap=6)
# NOTE(review): palette is passed without a hue variable - confirm it has any effect
g.map(sns.kdeplot, VAL_METRIC, palette=color_palette, fill=True
      #kde = True)
      )
# Label the x axis with the metric name and drop the y label
for col_val, ax in g.axes_dict.items():
    ax.set_xlabel(VAL_METRIC)
    ax.set_ylabel('')

### Subset of best months:

In [None]:
# Best (t2m, tp) combination: April-September temperature with
# October-February precipitation.
best_combi = [(
    ['t2m_Apr', 't2m_May', 't2m_June', 't2m_July', 't2m_Aug', 't2m_Sep'],
    ['tp_Oct', 'tp_Nov', 'tp_Dec', 'tp_Jan', 'tp_Feb'],
)]
# Month part of each variable name (text after the first underscore)
best_months_t2m = [name.split('_')[1] for name in best_combi[0][0]]
best_months_tp = [name.split('_')[1] for name in best_combi[0][1]]


# Get all consecutive combinations of length max 6:
def consecutive_combinations(iterable, consec):
    """Return all contiguous slices of ``iterable`` that are ``consec`` long."""
    last_start = len(iterable) - consec
    return [
        iterable[start:start + consec] for start in range(last_start + 1)
    ]


# Every window of 1 to 6 consecutive months inside the best temperature months
iterable = list(best_months_t2m)
consec_t2m = [
    ['t2m_' + month for month in window]
    for width in range(1, 7)
    for window in consecutive_combinations(iterable, width)
]
# Precipitation months are kept fixed
consec_tp = ['tp_Oct', 'tp_Nov', 'tp_Dec', 'tp_Jan', 'tp_Feb']

In [None]:
# Validation MAE of every temperature window combined with the fixed
# precipitation months: mean over stakes (for sorting) and per-stake values
# (for the boxplot).
# The list of hashes is called combi_hashes so the builtin hash() is not shadowed.
val_mae, vals_mae, combi_hashes = [], [], []
for t2m_combi in consec_t2m:
    # Rows (one per stake) of this exact (t2m, tp) combination, selected once
    is_combi = (dfAllStakes['t2m'].apply(lambda x: x == t2m_combi)
                & dfAllStakes['tp'].apply(lambda x: x == consec_tp))
    combi_df = dfAllStakes[is_combi]
    # Selecting the metric column before the mean avoids averaging the
    # string/list columns, which raises a TypeError on pandas >= 2.0
    val_mae.append(
        combi_df.groupby('t2m-tp-hash')['val_mae'].mean().values[0])
    vals_mae.append(combi_df['val_mae'].values)
    combi_hashes.append(combi_df['t2m-tp-hash'].values[0])

df = pd.DataFrame({
    'val_mae': val_mae,
    'vals_mae': vals_mae,
    't2m_combi': consec_t2m,
    'hash': combi_hashes
}).sort_values(by='val_mae')

# One row per (combination, stake) so that seaborn can draw one box per combination
df_expl = df[['hash', 'vals_mae']].explode('vals_mae')
ax = plt.subplot(1, 1, 1)
sns.boxplot(data=df_expl, x='hash', y='vals_mae', ax=ax, showmeans=True, order=df['hash'].values)

# Replace the hash tick labels by the temperature months they stand for
t2mlabels = []
for xlabel in ax.get_xticklabels():
    tick_hash = int(xlabel.get_text())
    t2mlabels.append(df[df['hash'] == tick_hash].t2m_combi.values[0])

ax.set_xticklabels(t2mlabels, rotation=90)

In [None]:
# Test MAE of every temperature window combined with the fixed precipitation
# months: mean over stakes (for sorting) and per-stake values (for the boxplot).
# The list of hashes is called combi_hashes so the builtin hash() is not shadowed.
test_mae, vals_mae, combi_hashes = [], [], []
for t2m_combi in consec_t2m:
    # Rows (one per stake) of this exact (t2m, tp) combination, selected once
    is_combi = (dfAllStakes['t2m'].apply(lambda x: x == t2m_combi)
                & dfAllStakes['tp'].apply(lambda x: x == consec_tp))
    combi_df = dfAllStakes[is_combi]
    # Selecting the metric column before the mean avoids averaging the
    # string/list columns, which raises a TypeError on pandas >= 2.0
    test_mae.append(
        combi_df.groupby('t2m-tp-hash')['test_mae'].mean().values[0])
    vals_mae.append(combi_df['test_mae'].values)
    combi_hashes.append(combi_df['t2m-tp-hash'].values[0])

df = pd.DataFrame({
    'test_mae': test_mae,
    'vals_mae': vals_mae,
    't2m_combi': consec_t2m,
    'hash': combi_hashes
}).sort_values(by='test_mae')

# One row per (combination, stake) so that seaborn can draw one box per combination
df_expl = df[['hash', 'vals_mae']].explode('vals_mae')
ax = plt.subplot(1, 1, 1)
sns.boxplot(data=df_expl, x='hash', y='vals_mae', ax=ax, showmeans=True,order=df['hash'].values)

# Replace the hash tick labels by the temperature months they stand for
t2mlabels = []
for xlabel in ax.get_xticklabels():
    tick_hash = int(xlabel.get_text())
    t2mlabels.append(df[df['hash'] == tick_hash].t2m_combi.values[0])

ax.set_xticklabels(t2mlabels, rotation=90)

## Train miniML-MB:

In [None]:
# Get hyper parameters for training all combinations
var_xg_monthly, metrics_monthly = assembleXGStakes(
    path_save_xgboost_stakes +
    f'{FOLD}/{INPUT_TYPE}/monthly/t2m_tp/match_annual/', glStakes_20years_all, rename_stakes, rename = False)
hp_lr = metrics_monthly['hp_lr']
hp_ne = metrics_monthly['hp_ne']
hp_md = metrics_monthly['hp_md']

hp_lr, hp_ne, hp_md

#### Normal miniML-MB

In [None]:
# Variable combination used for miniML-MB: May-August temperature with
# October-February precipitation
best_combi = [(['t2m_May', 't2m_June', 't2m_July', 't2m_Aug'], [
    'tp_Oct',
    'tp_Nov',
    'tp_Dec',
    'tp_Jan',
    'tp_Feb',
])]

# Month part of each variable name (text after the first underscore)
best_months_t2m = [re.split('_', combi)[1] for combi in best_combi[0][0]]
best_months_tp = [re.split('_', combi)[1] for combi in best_combi[0][1]]

# Equal weight (1) for every selected month
weights_t2m = np.ones(len(best_months_t2m))
weights_tp = np.ones(len(best_months_tp))

# Run XGBoost with best combinations of t2m variables
# NOTE(review): RUN is True and empty_folder=True here, so executing this cell
# presumably clears the output folder before training - confirm in runXGBoost_one_varcomb
RUN = True
if RUN:
    runXGBoost_one_varcomb(
        best_combi,
        hp_lr,
        hp_ne,
        hp_md,
        glStakes_20years,
        param_grid,  # grid for HP search
        weights_t2m,
        weights_tp,
        mb_match='annual',
        input_type=INPUT_TYPE,
        log=False,
        empty_folder=True,
        grid_search=False,
        input_vars={
            "t2m": "temperature",
            "tp": "precipitation"
        })

#### miniML-MB with PDD instead of T:

In [None]:
# Same month selection as the normal miniML-MB run
best_combi = [(['t2m_May', 't2m_June', 't2m_July', 't2m_Aug'], [
    'tp_Oct',
    'tp_Nov',
    'tp_Dec',
    'tp_Jan',
    'tp_Feb',
])]

# Month part of each variable name (text after the first underscore)
best_months_t2m = [re.split('_', combi)[1] for combi in best_combi[0][0]]
best_months_tp = [re.split('_', combi)[1] for combi in best_combi[0][1]]

# Equal weight (1) for every selected month
weights_t2m = np.ones(len(best_months_t2m))
weights_tp = np.ones(len(best_months_tp))

# Run XGBoost with PDD instead of t2m
# (only input_vars differs from the normal run: "pdd" is mapped to "temperature")
RUN = False
if RUN:
    runXGBoost_one_varcomb(
        best_combi,
        hp_lr,
        hp_ne,
        hp_md,
        glStakes_20years,
        param_grid,  # grid for HP search
        weights_t2m,
        weights_tp,
        mb_match='annual',
        input_type=INPUT_TYPE,
        log=False,
        empty_folder=True,
        grid_search=False,
        input_vars={
            "pdd": "temperature",
            "tp": "precipitation"
        })

#### miniML-MB with weighted T and P:

In [None]:
# Same month selection as the normal miniML-MB run
best_combi = [(
    ['t2m_May', 't2m_June', 't2m_July', 't2m_Aug'],
    ['tp_Oct', 'tp_Nov', 'tp_Dec', 'tp_Jan', 'tp_Feb'],
)]

best_months_t2m = [re.split('_', combi)[1] for combi in best_combi[0][0]]
best_months_tp = [re.split('_', combi)[1] for combi in best_combi[0][1]]

# Month frequencies of the "50 best" selection, split per feature
is_50best = dfWeights_all['type'] == '50 best'
weights_all_t2m = dfWeights_all[(dfWeights_all['feature'] == 't2m') & is_50best]
weights_all_tp = dfWeights_all[(dfWeights_all['feature'] == 'tp') & is_50best]

# Frequencies of the selected months only.
# NOTE(review): values follow the row order of dfWeights_all, not the order of
# best_combi - confirm both orders agree before using them as weights
weights_t2m = weights_all_t2m.loc[
    weights_all_t2m['month'].isin(best_months_t2m), 'freq_var'].values
weights_tp = weights_all_tp.loc[
    weights_all_tp['month'].isin(best_months_tp), 'freq_var'].values

weights_t2m, weights_tp

In [None]:
# Run XGBoost with best combinations of t2m variables
# (uses the frequency-based weights_t2m / weights_tp computed in the previous cell;
# skipped unless RUN is set to True)
RUN = False
if RUN:
    runXGBoost_one_varcomb(
        best_combi,
        hp_lr,
        hp_ne,
        hp_md,
        glStakes_20years,
        param_grid,  # grid for HP search
        weights_t2m,
        weights_tp,
        mb_match='annual',
        input_type=INPUT_TYPE,
        log=False,
        empty_folder=True,
        grid_search=False,
        input_vars={
            "t2m": "temperature",
            "tp": "precipitation"
        })

## Clustering on stakes' combinations:

In [None]:
# One colour per MONTH_VAL entry; keys are used as 1-based palette positions
color_palette = sns.color_palette("husl", len(MONTH_VAL.keys()))
palette = {
    MONTH_VAL[ind]: color_palette[ind - 1]
    for ind in MONTH_VAL.keys()
}

# Column order of the clustering table: temperature months, then precipitation months
feature_list = [
    't2m_Oct', 't2m_Nov', 't2m_Dec', 't2m_Jan', 't2m_Feb', 't2m_Mar',
    't2m_Apr', 't2m_May', 't2m_June', 't2m_July', 't2m_Aug', 't2m_Sep',
    'tp_Oct', 'tp_Nov', 'tp_Dec', 'tp_Jan', 'tp_Feb', 'tp_Mar',
    'tp_Apr', 'tp_May', 'tp_June', 'tp_July', 'tp_Aug', 'tp_Sep',
]

In [None]:
# dfWeights = dfWeights_percbest

# NOTE: this is an alias, not a copy - the column added and the in-place sort
# below also modify dfWeights_50best.
dfWeights = dfWeights_50best

# Add a month key for clustering
dfWeights['month_key'] = [
    INVERSE_MONTH_POS[month] for month in dfWeights.month
]
dfWeights.sort_values(by=[
    'stakes',
    'feature',
    'month_key',
], inplace=True)

# Transform dataframe for clustering so that there is one column per variable:
# one row per stake holding its freq_var values in the sorted
# (feature, month_key) order, which is assumed to match feature_list.
# Built with a single concat (the loop used to be duplicated verbatim, the
# first copy being overwritten by the second).
stake_names = dfWeights.stakes.unique()
df_cluster = pd.concat(
    [
        pd.DataFrame(dfWeights[dfWeights.stakes == stake].freq_var.values)
        for stake in stake_names
    ],
    axis=1).transpose()
df_cluster.columns = feature_list
df_cluster['stake'] = stake_names
df_cluster.set_index('stake', inplace=True)
df_cluster.head(2)

### Clustering on TP & T2M:

In [None]:
# K-means++:
# Here cluster on all columns of df_cluster (t2m and tp month frequencies)
X = df_cluster.values
scl = StandardScaler()
# NOTE(review): Xnorm is computed but not used below - the elbow fit runs on the unscaled X
Xnorm = scl.fit_transform(X)
# Kmeans params
kmeans_params = {
    'init': 'k-means++',
    'max_iter': 300,
    'n_init': 10,
    'random_state': SEED
}
# Elbow method:
model = KMeans(**kmeans_params)
visualizer = KElbowVisualizer(model, k=(1, 11), timings=False)
visualizer.fit(X)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure

#### Fig 11: Clustering of T & P

In [None]:
# Applying K-Means to the dataset:
N_c = 3  # number of clusters
kmeans = KMeans(n_clusters=N_c, **kmeans_params)
y_kmeans = kmeans.fit_predict(X)
DF_cluster = df_cluster.copy()
DF_cluster['cluster'] = y_kmeans

# Assemble in a dataframe where columns are months (for plotting):
# month columns are renamed to their position 0..11 (Oct = 0, ..., Sep = 11)
month_order = [
    'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July',
    'Aug', 'Sep'
]

# T2m:
t2m_positions = {'t2m_' + month: pos for pos, month in enumerate(month_order)}
df_cluster_t2m = DF_cluster[list(t2m_positions) + ['cluster']].rename(
    columns=t2m_positions)
df_cluster_t2m['feature'] = np.tile('t2m', len(df_cluster_t2m))

# TP:
tp_positions = {'tp_' + month: pos for pos, month in enumerate(month_order)}
df_cluster_tp = DF_cluster[list(tp_positions) + ['cluster']].rename(
    columns=tp_positions)
df_cluster_tp['feature'] = np.tile('tp', len(df_cluster_tp))

# Stack both features on top of each other
df_clusters_per_feat = pd.concat([df_cluster_t2m, df_cluster_tp], axis=0)

PlotFeatClusters(df_clusters_per_feat)

In [None]:
# Get attributes of clusters:
# for every stake, its mean elevation and position, training statistics and
# cluster number (stored 1-based: k-means label + 1).
cl_elev, clnb, cl_lat, cl_lon, stakes, glaciers, glshort = [], [], [], [], [], [], []
training_mb, training_y, len_training = [], [], []
for cl_nb in DF_cluster['cluster'].unique():
    cl_stakes = DF_cluster[DF_cluster['cluster'] == cl_nb].index
    for stake in cl_stakes:
        f_stake = read_stake_csv(path_glacattr, f'{stake}_mb.csv')
        train_feats = var_xg_monthly['feat_train'][stake]
        name_parts = re.split('_', stake)

        cl_elev.append(np.mean(f_stake.height))
        cl_lat.append(np.mean(f_stake.lat))
        cl_lon.append(np.mean(f_stake.lon))
        # Number of training targets divided by the number of folds
        len_training.append(len(train_feats['target']) / NUM_FOLDS)
        # mean training mb
        training_mb.append(np.mean(train_feats['target']))
        # Mean of the training 'time' values, truncated to an integer
        training_y.append(int(np.mean(train_feats['time'])))
        clnb.append(cl_nb+1)
        stakes.append(stake)
        glaciers.append(name_parts[0])
        glshort.append(GL_SHORT[name_parts[0].title()] + '_' + name_parts[1])

df_info = pd.DataFrame({
    'elevation': cl_elev,
    'lon': cl_lon,
    'lat': cl_lat,
    'training_mb': training_mb,
    'training_time': training_y,
    'training_length': len_training,
    'cluster': clnb,
    'stakes': stakes,
    'glaciers': glaciers,
    'glshort': glshort
})
df_info["cluster"] = df_info["cluster"].astype("category")

# Per-cluster means. Only the two numeric columns that are needed get
# averaged: taking the mean of the whole frame would also hit the string
# columns, which raises a TypeError on pandas >= 2.0.
cluster_means = df_info.groupby('cluster')[['elevation', 'training_mb']].mean()
mean_df = pd.DataFrame({
    'cluster':
    range(1, N_c+1),
    'mean_el':
    cluster_means['elevation'],
    'mean_mb':
    cluster_means['training_mb']
})
# Cluster numbers in df_info run from 1 to the number of clusters, so the loop
# starts at 1 (starting at 0 printed an empty cluster 0 and never the last one).
for clusterNb in range(1, df_info['cluster'].nunique() + 1):
    stakes_c0 = df_info[df_info['cluster'] == clusterNb].stakes.values
    print(f'Stakes of cluster {clusterNb}:\n {stakes_c0}')
plotClusterStats(df_info, mean_df)

### Clustering on TP:

In [None]:
# K-means++:
# Here cluster on total precipitation only
X = df_cluster[[
    'tp_Oct', 'tp_Nov', 'tp_Dec', 'tp_Jan', 'tp_Feb', 'tp_Mar', 'tp_Apr',
    'tp_May', 'tp_June', 'tp_July', 'tp_Aug', 'tp_Sep'
]].values
scl = StandardScaler()
# NOTE(review): Xnorm is computed but not used below - the elbow fit runs on the unscaled X
Xnorm = scl.fit_transform(X)
# Kmeans params
kmeans_params = {
    'init': 'k-means++',
    'max_iter': 300,
    'n_init': 10,
    'random_state': SEED
}
# Elbow method:
model = KMeans(**kmeans_params)
visualizer = KElbowVisualizer(model, k=(1, 11))
visualizer.fit(X)  # Fit the data to the visualizer
visualizer.show()  # Finalize and render the figure

In [None]:
# Applying K-Means to the dataset:
N_c = 3  # number of clusters
kmeans = KMeans(n_clusters=N_c, **kmeans_params)
y_kmeans = kmeans.fit_predict(X)
DF_cluster = df_cluster.copy()
DF_cluster['cluster'] = y_kmeans

# Assemble in a dataframe where columns are months (for plotting):
# month columns are renamed to their position 0..11 (Oct = 0, ..., Sep = 11)
month_order = [
    'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July',
    'Aug', 'Sep'
]

# T2m:
t2m_positions = {'t2m_' + month: pos for pos, month in enumerate(month_order)}
df_cluster_t2m = DF_cluster[list(t2m_positions) + ['cluster']].rename(
    columns=t2m_positions)
df_cluster_t2m['feature'] = np.tile('t2m', len(df_cluster_t2m))

# TP:
tp_positions = {'tp_' + month: pos for pos, month in enumerate(month_order)}
df_cluster_tp = DF_cluster[list(tp_positions) + ['cluster']].rename(
    columns=tp_positions)
df_cluster_tp['feature'] = np.tile('tp', len(df_cluster_tp))

# Stack both features on top of each other
df_clusters_per_feat = pd.concat([df_cluster_t2m, df_cluster_tp], axis=0)
df_clusters_per_feat.head(2)

In [None]:
# Pass the stacked t2m/tp cluster table (df_clusters_per_feat, built in the
# previous cell) to the plotting helper.
# NOTE(review): PlotFeatClusters is not defined in this notebook; presumably
# it comes from one of the scripts.* star imports - see its definition for
# the figure layout.
PlotFeatClusters(df_clusters_per_feat)

#### Attributes per cluster:

In [None]:
# Short glacier codes, keyed by the lower-case glacier name (the part of a
# stake id before the first "_").
GL_SHORT = {
    'basodino': 'BAS',
    'gries': 'GRI',
    'schwarzberg': 'SCH',
    'aletsch': 'ALE',
    'limmern': 'LIM',
    'clariden': 'CLA',
    'allalin': 'ALL',
    'silvretta': 'SIL',
    'hohlaub': 'HOH',
    'pers': 'PERS',
    'corbassiere': 'COR',
    'plattalva': 'PLA',
    'gietro': 'GIE'
}

# Get attributes of clusters (one entry per stake in every list):
cl_elev, clnb, cl_lat, cl_lon, stakes, glaciers, glshort = [], [], [], [], [], [], []
training_mb, training_y, len_training = [], [], []
for cl_nb in DF_cluster['cluster'].unique():
    cl_stakes = DF_cluster[DF_cluster['cluster'] == cl_nb].index
    for stake in cl_stakes:
        f_stake = read_stake_csv(path_glacattr, f'{stake}_mb.csv')
        stake_train = var_xg_monthly['feat_train'][stake]
        # Split the stake id once instead of three times; only the first
        # two "_"-separated parts are used.
        stake_parts = stake.split('_')
        glacier_name, stake_label = stake_parts[0], stake_parts[1]

        cl_elev.append(np.mean(f_stake.height))
        cl_lat.append(np.mean(f_stake.lat))
        cl_lon.append(np.mean(f_stake.lon))
        # number of training targets divided by the number of folds
        len_training.append(len(stake_train['target']) / NUM_FOLDS)
        # mean training mb
        training_mb.append(np.mean(stake_train['target']))
        # mean of the training 'time' values, truncated to an int
        training_y.append(int(np.mean(stake_train['time'])))
        clnb.append(cl_nb)
        stakes.append(stake)
        glaciers.append(glacier_name)
        glshort.append(GL_SHORT[glacier_name] + '_' + stake_label)

df_info = pd.DataFrame({
    'elevation': cl_elev,
    'lon': cl_lon,
    'lat': cl_lat,
    'training_mb': training_mb,
    'training_time': training_y,
    'training_length': len_training,
    'cluster': clnb,
    'stakes': stakes,
    'glaciers': glaciers,
    'glshort': glshort
})
df_info["cluster"] = df_info["cluster"].astype("category")

# Per-cluster means. Only the two numeric columns that are needed are
# aggregated: .mean() on the whole grouped frame also tries to average the
# string columns (stakes, glaciers, glshort), which raises a TypeError with
# pandas >= 2.0.
cluster_means = df_info.groupby('cluster')[['elevation', 'training_mb']].mean()
mean_df = pd.DataFrame({
    'cluster': range(0, N_c),
    'mean_el': cluster_means['elevation'],
    'mean_mb': cluster_means['training_mb']
})

# Print the stakes of every cluster; looping over N_c keeps this in sync
# with the number of clusters instead of hard-coding clusters 0, 1 and 2.
for cluster_id in range(N_c):
    stakes_in_cluster = df_info[df_info['cluster'] == cluster_id].stakes.values
    print(f'Stakes of cluster {cluster_id}:\n {stakes_in_cluster}')

plotClusterStats(df_info, mean_df)


### Clustering on T2M:

In [None]:
# K-means++ on temperature only:
# the feature matrix is made of the twelve monthly t2m_* columns (Oct -> Sep).
X = df_cluster[[
    f't2m_{month}'
    for month in ('Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr', 'May',
                  'June', 'July', 'Aug', 'Sep')
]].values

# Standardised copy of the feature matrix.
# NOTE(review): the elbow search below is fitted on the raw X, not on Xnorm
# - confirm that clustering on unscaled values is intended.
scl = StandardScaler()
Xnorm = scl.fit_transform(X)

# KMeans settings shared by the elbow search and the final clustering.
kmeans_params = dict(
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=SEED,
)

# Elbow method to help choose the number of clusters:
model = KMeans(**kmeans_params)
visualizer = KElbowVisualizer(model, k=(1, 11))
visualizer.fit(X)  # fit the data to the visualizer
visualizer.show()  # finalize and render the figure

In [None]:
# Applying K-Means to the dataset:
N_c = 3  # number of clusters
kmeans = KMeans(n_clusters=N_c, **kmeans_params)
y_kmeans = kmeans.fit_predict(X)
DF_cluster = df_cluster.copy()
DF_cluster['cluster'] = y_kmeans

# Assemble in a dataframe where columns are months (for plotting).
# Month suffixes in column order: position i becomes the integer column
# name i in the per-feature tables below.
month_order = [
    'Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'July',
    'Aug', 'Sep'
]

# T2m:
# rename() without inplace returns a new frame, so the added 'feature'
# column is never written into a slice of DF_cluster (the original in-place
# edits on the slice trigger pandas' SettingWithCopyWarning).
t2m_to_idx = {f't2m_{month}': idx for idx, month in enumerate(month_order)}
df_cluster_t2m = DF_cluster[list(t2m_to_idx) + ['cluster']].rename(
    columns=t2m_to_idx)
df_cluster_t2m['feature'] = 't2m'

# TP:
tp_to_idx = {f'tp_{month}': idx for idx, month in enumerate(month_order)}
df_cluster_tp = DF_cluster[list(tp_to_idx) + ['cluster']].rename(
    columns=tp_to_idx)
df_cluster_tp['feature'] = 'tp'

# Stack both features row-wise; the 'feature' column tells them apart.
# (The former mid-cell `df_clusters_per_feat.head(2)` was dropped: only the
# last expression of a notebook cell is displayed, so it had no effect.)
df_clusters_per_feat = pd.concat([df_cluster_t2m, df_cluster_tp], axis=0)

PlotFeatClusters(df_clusters_per_feat)

In [None]:
# Get attributes of clusters (one entry per stake in every list):
cl_elev, clnb, cl_lat, cl_lon, stakes, glaciers, glshort = [], [], [], [], [], [], []
training_mb, training_y, len_training = [], [], []
for cl_nb in DF_cluster['cluster'].unique():
    cl_stakes = DF_cluster[DF_cluster['cluster'] == cl_nb].index
    for stake in cl_stakes:
        f_stake = read_stake_csv(path_glacattr, f'{stake}_mb.csv')
        stake_train = var_xg_monthly['feat_train'][stake]
        # Split the stake id once instead of three times; only the first
        # two "_"-separated parts are used. GL_SHORT is looked up with the
        # glacier part exactly as it appears in the stake id (no .title()).
        stake_parts = stake.split('_')
        glacier_name, stake_label = stake_parts[0], stake_parts[1]

        cl_elev.append(np.mean(f_stake.height))
        cl_lat.append(np.mean(f_stake.lat))
        cl_lon.append(np.mean(f_stake.lon))
        # number of training targets divided by the number of folds
        len_training.append(len(stake_train['target']) / NUM_FOLDS)
        # mean training mb
        training_mb.append(np.mean(stake_train['target']))
        # mean of the training 'time' values, truncated to an int
        training_y.append(int(np.mean(stake_train['time'])))
        clnb.append(cl_nb)
        stakes.append(stake)
        glaciers.append(glacier_name)
        glshort.append(GL_SHORT[glacier_name] + '_' + stake_label)

df_info = pd.DataFrame({
    'elevation': cl_elev,
    'lon': cl_lon,
    'lat': cl_lat,
    'training_mb': training_mb,
    'training_time': training_y,
    'training_length': len_training,
    'cluster': clnb,
    'stakes': stakes,
    'glaciers': glaciers,
    'glshort': glshort
})
df_info["cluster"] = df_info["cluster"].astype("category")

# Per-cluster means. Only the two numeric columns that are needed are
# aggregated: .mean() on the whole grouped frame also tries to average the
# string columns (stakes, glaciers, glshort), which raises a TypeError with
# pandas >= 2.0.
cluster_means = df_info.groupby('cluster')[['elevation', 'training_mb']].mean()
mean_df = pd.DataFrame({
    'cluster': range(0, N_c),
    'mean_el': cluster_means['elevation'],
    'mean_mb': cluster_means['training_mb']
})

# Print the stakes of every cluster; looping over N_c keeps this in sync
# with the number of clusters instead of hard-coding clusters 0, 1 and 2.
for cluster_id in range(N_c):
    stakes_in_cluster = df_info[df_info['cluster'] == cluster_id].stakes.values
    print(f'Stakes of cluster {cluster_id}:\n {stakes_in_cluster}')

plotClusterStats(df_info, mean_df)