# Basic exploration with simple libraries (scikit-learn)

The non-production code in this notebook is used to explore the data and create a dataset for training machine learning models. It is not intended for production use.  
As a primer, we've already decided that Gaussian processes are the best choice for our problem, since we don't have enough data to train a deep learning model.

Main questions: 
1) Can we use the aspect-flipped data (S180) to train a model that can predict the permafrost distribution for N180 (true output)?
   - the aspect of S180 is flipped, meaning that it is effectively predicting for a mirror image (i.e., Southern Hemisphere). This would give us a bit more data to train with. 
2) How well can out-of-the-box scikit-learn models perform on this data?
   - extremely randomized trees

# Imports

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# general libraries
import dotenv
import fsspec
import glob
import os
import pathlib
import site
import sys
import warnings
from loguru import logger
# data handling libraries
import geopandas as gpd
import numpy as np
import pandas as pd
import rioxarray as rxr
import xarray as xr
# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
# machine learning libraries
import gpytorch
from sklearn import gaussian_process as gp
from sklearn import model_selection 
from sklearn import ensemble 
from sklearn import preprocessing
from sklearn import metrics
from sklearn import pipeline
import torch

import cryogrid_pytools as cg
import pamir_mlpermafrost as pamir

dotenv.load_dotenv()

# Loading data

In [None]:
# Name of the experiment; it selects which cluster configuration is loaded.
experiment = 'k1500-pamir_N180-exp1'

# Only the cluster configuration path depends on `experiment`.
fname_cryogrid = f'simplecache::s3://spi-pamir-cryogrid/processed-cluster_config/cluster_config-{experiment}.zarr/'
# The remaining paths are fixed, so they are plain strings (no f-prefix needed).
fname_spatial = 'simplecache::s3://spi-pamir-cryogrid/processed-cluster_config/spatial_variables-710w365s750e400n-100m.zarr/'
fname_training = '../data/training/training_data-k1500-pamir_ns180-expX.parquet'

In [None]:
# Column the models are trained to predict.
y_name = 'ground_temp_2m'

# Predictor columns, in the order they are passed to the models.
# 'latitude' and 'longitude' are currently excluded.
x_names = [
    'altitude', 'slope_angle',
    'aspect_cos', 'aspect_sin',
    'albedo', 'emissivity',
    'stratigraphy_index',
    'temperature', 'precipitation',
    'snow_melt_doy',
]

## Training data

In [None]:
# Row selection on the two-level index: the 'S180' entries of the first level
# and labels 50001..61501 of the second level (label slices include both ends).
train_rows = pd.IndexSlice[['S180'], 50001:61501]

df_training = (
    pd.read_parquet(fname_training)
    .pipe(pamir.data.dem_utils.calc_aspect_cos_sin)
    [[y_name] + x_names]
    .dropna()
    # both axes are spelled out so the row indexer cannot be mistaken for columns
    .loc[train_rows, :]
)

# Feature matrix and target vector used for fitting.
df_X = df_training[x_names]
df_y = df_training[y_name]

## Inference data

In [None]:
# Open both zarr stores while silencing UserWarning only; every other warning
# category is still shown. The filter is restored when the block exits.
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)

    ds_cryogrid = xr.open_zarr(fname_cryogrid, storage_options=pamir.data.s3.fsspec_kwargs)
    ds_spatial = xr.open_zarr(fname_spatial, storage_options=pamir.data.s3.fsspec_kwargs)

    # load the cluster labels into memory once
    da_cluster_labels = ds_cryogrid.cluster_labels.compute()

In [None]:
# Rename the spatial variables so they match the feature names in `x_names`.
rename_dict = dict(
    albedo_modis='albedo',
    emissivity_aster='emissivity',
    surface_index='stratigraphy_index',
)

# Full domain; for a quick test a window can be cut out with e.g.
#   .isel(y=slice(1000, 2500), x=slice(500, 2000))
ds_inference = ds_spatial.rename(rename_dict)

# Flatten to a table, add the aspect cos/sin columns, keep only the model
# features and drop every row with a missing value.
df_inference = (
    pamir.data.dem_utils.calc_aspect_cos_sin(ds_inference.to_dataframe())
    [x_names]
    .dropna()
)

In [None]:
# Write the inference features to disk. The target folder is created first so
# the write does not fail when the folder does not exist yet.
fname_inference = pathlib.Path('../data/inference/inference_data-pamir_N180.parquet')
fname_inference.parent.mkdir(parents=True, exist_ok=True)
df_inference.to_parquet(fname_inference)

In [None]:
# Number of complete 10,000-row batches in the inference table.
len(df_inference) // 10_000

## Creating mapped target

In [None]:
key = y_name

# Target values of the 'S180' rows as a 1-D array. The `tag` dimension is
# renamed to `index` and its labels are reduced to their last four digits.
# NOTE(review): presumably those four digits are the cluster id — confirm.
# NOTE(review): the values come from 'S180' rows while the cluster labels below
# belong to `experiment` — confirm this pairing is intended.
cluster_values = (
    df_training[key]
    .loc['S180']
    .loc[50000:60000]
    .to_xarray()
    .rename(tag='index')
    .assign_coords(index=lambda x: x.index.astype(str).str[-4:].astype(int))
)

# Reuse the labels that were already loaded instead of computing them again,
# then align the result with the inference grid via nearest-neighbour lookup.
da_y = cg.spatial_clusters.map_gridcells_to_clusters(
    cluster_values,
    da_cluster_labels,
).reindex_like(ds_inference, method='nearest')

# Cells above 3100 altitude with a positive surface index.
mask = (ds_spatial.altitude > 3100) & (ds_spatial.surface_index > 0)

# Modelling 

In [None]:
def make_pipeline(q, random_state=42):
    """Build the regression pipeline that is fitted in this notebook.

    Parameters
    ----------
    q : float
        Quantile level. Currently unused: the active estimator is a random
        forest fitted without any quantile setting. The argument is kept so
        existing calls keep working.
    random_state : int, default 42
        Seed handed to the random forest so repeated runs build the same model.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Single-step pipeline wrapping a ``RandomForestRegressor``.
    """
    return pipeline.make_pipeline(
        # Disabled alternative, kept for reference:
        # preprocessing.StandardScaler(),
        # ensemble.HistGradientBoostingRegressor(
        #     max_iter=450,
        #     max_depth=5,
        #     learning_rate=0.1,
        #     min_samples_leaf=10,
        #     max_leaf_nodes=42,
        #     random_state=42,
        #     validation_fraction=0.2,
        #     max_features=1., 
        #     categorical_features=[df_X.columns.get_loc('stratigraphy_index')],  # Use index of categorical feature
        # )
        ensemble.RandomForestRegressor(
            n_jobs=-1,
            n_estimators=48,
            min_samples_leaf=4,
            random_state=random_state,
        )
    )

In [None]:
# Models to fit, keyed by the name their prediction receives later on.
# Disabled entries:
#   'q25': make_pipeline(0.25)
#   'q50': make_pipeline(0.5)
#   'q75': make_pipeline(0.75)
models = dict(mse=make_pipeline(0.5))

In [None]:
scores = {}     # R² on the training rows themselves (optimistic by construction)
scores_cv = {}  # mean R² over 5 cross-validation folds
for q in models:
    model = models[q]

    print(f"Training model for quantile {q}")
    # cross_val_score fits clones of `model`, so the estimator itself is only
    # fitted by the explicit call below (on all training rows)
    scores_cv[q] = model_selection.cross_val_score(model, df_X, df_y, cv=5).mean()

    model.fit(df_X, df_y)
    scores[q] = model.score(df_X, df_y)

# Show both scores side by side, one row per model.
pd.DataFrame({'train_r2': scores, 'cv_r2': scores_cv})

In [None]:
# Importances are read from the last pipeline step (the estimator), so this
# keeps working if a preprocessing step is put in front of it. The model is
# looked up by name instead of relying on a leftover loop variable.
pd.Series(models['mse'][-1].feature_importances_, index=x_names).sort_values(ascending=False)

In [None]:
def predict_to_xarray(model, df_inference):
    """Run `model` on `df_inference` and return the predictions as xarray.

    The predictions are wrapped in a Series named 'yhat' that carries the
    index of `df_inference`, and that Series is converted with `to_xarray()`.
    """
    predictions = pd.Series(
        model.predict(df_inference),
        index=df_inference.index,
        name='yhat',
    )
    return predictions.to_xarray()

In [None]:
# Predict with every model; each result is named after its key in `models`
# so the merged dataset holds one variable per model.
predictions = []
for q, model in models.items():
    print(f"Predicting for quantile {q}")
    predictions.append(predict_to_xarray(model, df_inference).rename(q))

da_yhat = xr.merge(predictions)

In [None]:
# iqr = da_yhat.q75 - da_yhat.q25
# iqr.where(mask).plot.imshow(robust=True,  vmin=0, cmap='viridis', size=10, aspect=1.2)

In [None]:
# Boolean masks for land-cover codes 1 and 9.
# NOTE(review): the meaning of the codes is taken from the variable names only — confirm.
water = ds_spatial['land_cover'] == 1
ice = ds_spatial['land_cover'] == 9

In [None]:
# Map of the 'mse' prediction with water and ice cells blanked out.
img_yhat = (
    da_yhat['mse']
    .where(~(water | ice))
    .astype(float)
    .plot.imshow(robust=True, cmap='Spectral_r', size=10, aspect=1.2)
)
img_yhat.figure.set_dpi(400)

In [None]:
# Map of the mapped target, drawn with the colour limits of the prediction
# map so the two figures can be compared directly.
img_y = (
    da_y
    .where(~(water | ice))
    .plot.imshow(cmap='Spectral_r', size=10, aspect=1.2)
)
img_y.set_clim(img_yhat.get_clim())
img_y.figure.set_dpi(400)

In [None]:
# Prediction minus target, shown with water and ice cells blanked out.
diff = (da_yhat['mse'] - da_y).astype(float)
img = (
    diff
    .where(~(water | ice))
    .plot.imshow(cmap='Spectral_r', size=10, aspect=1.2)
)
img.figure.set_dpi(400)

In [None]:
# Store the residual (prediction minus mapped target) as a new variable on
# `ds_spatial` so it can be tabulated together with the spatial variables.
# NOTE(review): this adds a variable to the loaded dataset in place; cells that
# read `ds_spatial` after this one will see `resid` as well.
ds_spatial['resid'] = da_yhat.mse - da_y

In [None]:
# Flatten the residual and the variables it is compared against into one table.
resid_columns = ['altitude', 'resid', 'aspect', 'temperature']
a = ds_spatial[resid_columns].to_dataframe()

In [None]:
# Joint histogram of the residual against altitude.
sns.jointplot(
    data=a,
    x='altitude',
    y='resid',
    kind='hist',
    bins=100,
    cmap='mako',
    cbar=False,
    dropna=True,
)

In [None]:
# Joint histogram of the residual against aspect. `dropna=True` removes rows
# where either value is missing, so the marginal histograms describe the same
# rows as the joint one (and the call matches the altitude plot).
sns.jointplot(
    data=a,
    x='aspect',
    y='resid',
    kind='hist',
    bins=100,
    cmap='mako',
    cbar=False,
    dropna=True,
)

In [None]:
# Joint histogram of the residual against temperature. `dropna=True` removes
# rows where either value is missing, so the marginal histograms describe the
# same rows as the joint one (and the call matches the altitude plot).
sns.jointplot(
    data=a,
    x='temperature',
    y='resid',
    kind='hist',
    bins=100,
    cmap='mako',
    cbar=False,
    dropna=True,
)