In [1]:
from IPython.display import display, HTML
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from pathlib import Path

from time import sleep

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, multilabel_confusion_matrix
from sklearn.utils import shuffle

from importlib import reload
import sentinel_utils
import keras_model_creator

import itertools

import tensorflow as tf

from data_generator import DataGenerator

In [2]:
# %load_ext tensorboard
# %tensorboard --logdir=$tensorboard_dir
# tensorboard_dir = str(model_dir.joinpath('tensorboard_logs'))

List all bands in the data files and select which bands to use.

In [3]:
# Sentinel band names: B2 through B8, followed by B8A, B11 and B12.
sentinel_bands = [*(f'B{n}' for n in range(2, 9)), 'B8A', 'B11', 'B12']
# SoilGrids layer names.
soilgrids_band = [
    'bdod', 'cec', 'cfvo', 'clay', 'nitrogen', 'ocd',
    'ocs', 'phh2o', 'sand', 'silt', 'soc',
]
# Full band list: Sentinel bands, then elevation, then the SoilGrids layers.
# NOTE(review): presumably this order mirrors the data files - confirm.
all_bands = [*sentinel_bands, 'Elevation', *soilgrids_band]

# Alternative, smaller selection kept for reference:
# selected_sentinel = ['B2', 'B5', 'B8', 'B12']
selected_bands = sentinel_bands
# Position of each selected band within `all_bands`.
bands = list(map(all_bands.index, selected_bands))

In [4]:
# Settings shared by every run; loss, batch size and base filter count are
# also embedded in the name of each model directory.
loss = 'binary_crossentropy'
batch_size = 64
base_filters = 32
# Shards are read from ~/sentinel_data/shards.
shards_dir = Path.home() / 'sentinel_data' / 'shards'

# Keyword arguments that stay the same for every KerasModelCreator call.
fixed_params = {
    'shards_dir': shards_dir,
    'bands': bands,
    'loss': loss,
    'batch_size': batch_size,
    'base_filters': base_filters,
    'dropout': 0.2,
    'epochs': 10,
    'overwrite': False,
    'verbose': 0,
}

Select the season(s) to use and calculate the mean and standard deviation for each band if required. These are used to normalise the batches.

Select the classes to use based on a minimum number of occurrences. This also removes labels that do not have any selected classes.

In [None]:
# Re-import the project modules so that edits made on disk are picked up.
reload(sentinel_utils)
reload(keras_model_creator)

all_seasons = ['03', '06', '09', '12']
# Every non-empty subset of the seasons, shortest subsets first.
season_combinations = itertools.chain.from_iterable(
    map(
        lambda size: itertools.combinations(all_seasons, size),
        range(1, len(all_seasons) + 1),
    )
)

# Materialise the combinations so the progress bar knows the total.
pbar = tqdm(list(season_combinations))
for seasons in pbar:
    # Label the progress bar with the combination currently being processed.
    pbar.set_description('-'.join(seasons))

    utils = sentinel_utils.SentinelUtils(
        seasons,
        shards_dir,
        all_bands,
        sample_shards=40000,
        min_occurrences=20000,
        overwrite_existing=False,
    )

    # One directory per configuration, named after the loss, the two
    # dimensions of `selected_classes`, the band count, the seasons, the
    # batch size and the base filter count.
    model_dir = Path('models').joinpath(
        '-'.join(
            str(part)
            for part in (
                loss,
                len(utils.selected_classes.index),
                utils.selected_classes.shape[1],
                len(bands),
                '_'.join(seasons),
                batch_size,
                base_filters,
            )
        )
    )
    model_dir.mkdir(parents=True, exist_ok=True)

    # Per-combination arguments are layered on top of the fixed ones.
    changing_params = {
        'utils': utils,
        'model_dir': model_dir,
        'seasons': seasons,
    }
    params = {**fixed_params, **changing_params}
    # NOTE(review): `run()` presumably trains (or loads) the model and returns
    # it with a test-set generator - confirm in keras_model_creator.
    model, testing_generator = keras_model_creator.KerasModelCreator(**params).run()

  0%|          | 0/15 [00:00<?, ?it/s]

Previous training:


epoch,accuracy,val_accuracy,auc,val_auc,loss,val_loss,macrof1score,val_macrof1score,microf1score,val_microf1score,prc,val_prc,precision,val_precision,recall,val_recall,weightedf1score,val_weightedf1score
1,0.4736531674861908,0.5002003312110901,0.8763253688812256,0.901329219341278,0.2839558720588684,0.3287153840065002,0.4992201328277588,0.533677339553833,0.6338666081428528,0.6788405179977417,0.7057992815971375,0.7666475772857666,0.700661301612854,0.7397071719169617,0.5786988735198975,0.6272292733192444,0.6190537214279175,0.6603562831878662
2,0.5104445219039917,0.5287460088729858,0.9000358581542969,0.9088353514671326,0.2630595564842224,0.3193894326686859,0.5586109161376953,0.6021475195884705,0.678867757320404,0.7017511129379272,0.7536153197288513,0.7827010154724121,0.7197539210319519,0.7293932437896729,0.6423771977424622,0.6761277318000793,0.6686074137687683,0.6980912685394287
epoch,accuracy,val_accuracy,auc,val_auc,loss,val_loss,macrof1score,val_macrof1score,microf1score,val_microf1score,prc,val_prc,precision,val_precision,recall,val_recall,weightedf1score,val_weightedf1score


I0000 00:00:1721916545.432566    9438 service.cc:145] XLA service 0x7f8eac0023d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1721916545.432612    9438 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Ti, Compute Capability 8.9
I0000 00:00:1721916560.589018    9438 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.




  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

  0%|          | 0/40000 [00:00<?, ?it/s]

Evaluate the model for given years and save the results in the model's directory.

In [None]:
# for eval_year in [2017, 2020, 2021, 2022, 2023]:
#     eval_generator = DataGenerator(
#         utils.selected_classes.index, shuffle=False, year=eval_year, **params)
#     preds_path = model_dir.joinpath(f"preds_{eval_year}.npy")
#     if preds_path.is_file():
#         y_pred = np.load(preds_path)
#     else:
#         y_pred = model.predict(x=eval_generator, verbose=1)
#         np.save(preds_path, y_pred)

#     y_true = utils.selected_classes.iloc[:y_pred.shape[0]].to_numpy()
#     cm = confusion_matrix(y_true.flatten(), (y_pred > 0.5).flatten().astype(int))
#     plot = ConfusionMatrixDisplay(confusion_matrix=cm).plot()
#     display(plot)

Visualise the confusion matrix for each class.

In [None]:
# import matplotlib.pyplot as plt

# class_names = utils.selected_classes.columns

# f, axes = plt.subplots(4, 2, figsize=(25, 30))
# axes = axes.ravel()
# for label in range(y_true.shape[1]):
#     cm = confusion_matrix(y_true[..., label].astype(int), (y_pred[..., label] > 0.5).astype(int))
#     disp = ConfusionMatrixDisplay(cm)
#     disp.plot(ax=axes[label], values_format='.4g')
#     disp.ax_.set_title(f'{class_names[label]}')
#     if label < 25:
#         disp.ax_.set_xlabel('')
#     if label % 5 != 0:
#         disp.ax_.set_ylabel('')
#     disp.im_.colorbar.remove()

# plt.subplots_adjust(wspace=0.2, hspace=0.001)
# f.colorbar(disp.im_, ax=axes)
# plt.show()


In [None]:
# import tensorflow
# tensorflow.keras.utils.plot_model(model, show_shapes=True)
# model.summary()

Visualise the model.

In [None]:
# import tensorflow
# tensorflow.keras.utils.plot_model(
#     model, to_file=model_dir.joinpath('model.png'),
#     show_shapes=True, show_layer_activations=True,
# )

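Correlate year-on-year changes in the model's predictions with year-on-year changes in climate variables. (TODO: consider moving this to a separate notebook.)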

In [None]:
# import plotly.express as px

# class_names = utils.selected_classes.columns
# indices = utils.selected_classes.index

# preds_path = model_dir.joinpath(f'preds_2020.npy')
# y_prev = np.load(preds_path)

# weather_prev = pd.read_csv(Path('weather_data', 'era5_2020.csv'))
# eval_years = [2021, 2022, 2023]
# corrs = []

# for eval_year in eval_years:
#     preds_path = model_dir.joinpath(f'preds_{eval_year}.npy')
#     y_pred = np.load(preds_path)
#     y_diff = pd.DataFrame(y_pred - y_prev, columns=class_names)
    
#     weather = pd.read_csv(Path('weather_data', f'era5_{eval_year}.csv'))
#     weather_diff = ((weather - weather_prev)
#                     .loc[indices]
#                     .iloc[:y_pred.shape[0]]
#                     .iloc[y_diff.index])
    
#     corr = y_diff.join(weather_diff).corr(method='pearson').round(2)
#     corrs.append(corr)

#     y_prev = y_pred
#     weather_prev = weather

# fig = px.imshow(
#     np.array(corrs),
#     animation_frame=0,
#     labels=dict(color="Corr coef"),
#     x=corrs[0].index,
#     y=corrs[0].columns,
#     title='Annual correlation heatmap',
#     text_auto=True, aspect='auto', zmin=0, height=500
# )
# fig.layout.sliders[0]['currentvalue']['prefix'] = ''
# for year, step in zip(eval_years, fig.layout.sliders[0].steps):
#     step.label = str(year)

# fig