In [1]:
from IPython.display import display, HTML
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from pathlib import Path

from time import sleep

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, multilabel_confusion_matrix
from sklearn.utils import shuffle

from importlib import reload
import sentinel_utils
import keras_model_creator

from data_generator import DataGenerator

In [2]:
# Sentinel band names: B2 through B8, followed by B8A, B11 and B12.
sentinel_bands = ['B%d' % number for number in range(2, 9)]
sentinel_bands += ['B8A', 'B11', 'B12']

# Soil band names, kept in the order they appear in the combined band list.
soilgrids_band = [
    'bdod', 'cec', 'cfvo', 'clay', 'nitrogen', 'ocd',
    'ocs', 'phh2o', 'sand', 'silt', 'soc',
]

# Combined layout: sentinel bands, then 'Elevation', then the soil bands.
all_bands = [*sentinel_bands, 'Elevation', *soilgrids_band]

# Every band is selected; `bands` holds each selected band's position in all_bands.
selected_bands = all_bands
bands = list(map(all_bands.index, selected_bands))
bands

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [3]:
# Pick up edits made to sentinel_utils since it was first imported.
reload(sentinel_utils)

# Feature shards live under ~/sentinel_data/shards/features_<year><season>/.
shards_dir = Path.home().joinpath('sentinel_data').joinpath('shards')

sentinel_shards = []
seasons = ['06']
year = 2017
for s in seasons:
    # glob() yields paths in whatever order the filesystem returns them, so
    # sort first: otherwise the seeded shuffle below starts from a different
    # list on each machine/run and the "fixed" sample is not reproducible.
    path_list = sorted(shards_dir.joinpath(f'features_{year}{s}').glob('feature_*.npy'))
    sentinel_shards.extend(path_list)

all_labels = pd.read_csv(Path('data').joinpath('full_dummies.csv'))

# Upper bound on the number of shard files handed to SentinelUtils.
sample_shards = 20000

# NOTE(review): overwrite_existing=False presumably reuses results computed on
# an earlier run — confirm whether they must be regenerated now that the shard
# sample is deterministic.
utils = sentinel_utils.SentinelUtils(
    all_labels, all_bands, seasons,
    shuffle(sentinel_shards, random_state=42)[:sample_shards],
    min_occurrences=5000, overwrite_existing=False
)

# Index of the label table kept by SentinelUtils; used downstream as shard ids.
shard_ids = utils.selected_classes.index

Calculating feature statistics...


  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

Dropped 225 columns, 4132 rows


In [4]:
loss = 'binary_focal_crossentropy'
batch_size = 32
tag = 'grouped'

# The directory name records the run configuration, dash-separated:
# loss, sample count, class count, band count, year, seasons, batch size, tag.
model_dir = Path('models') / '-'.join(
    str(part) for part in (
        loss,
        len(shard_ids),
        utils.selected_classes.shape[1],
        len(bands),
        year,
        '_'.join(seasons),
        batch_size,
        tag,
    )
)
model_dir.mkdir(parents=True, exist_ok=True)
model_dir

PosixPath('models/binary_focal_crossentropy-246437-17-22-2017-06-32-grouped')

In [None]:
# Pick up edits made to keras_model_creator since it was first imported.
reload(keras_model_creator)

# Keyword arguments for KerasModelCreator; later cells reuse this dict.
params = {
    'utils': utils,
    'shards_dir': shards_dir,
    'shard_ids': shard_ids,
    'model_dir': model_dir,
    'seasons': seasons,
    'year': year,
    'bands': bands,
    'loss': loss,
    'batch_size': batch_size,
    'base_filters': 32,
    'dropout': 0.2,
    'epochs': 20,
    'overwrite': False,
}

# run() hands back the model together with a generator for the test split.
model, testing_generator = keras_model_creator.KerasModelCreator(
    **params
).run()

Building model...
Fitting...
Epoch 1/20


I0000 00:00:1720694560.038749   14033 service.cc:145] XLA service 0xb551490 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1720694560.038930   14033 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Ti, Compute Capability 8.9
I0000 00:00:1720694573.909027   14033 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m7388/7388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1463s[0m 195ms/step - accuracy: 0.2683 - auc: 0.7950 - loss: 0.0518 - macrof1score: 0.1600 - microf1score: 0.3439 - prc: 0.3891 - precision: 0.5113 - recall: 0.2592 - weightedf1score: 0.3245 - val_accuracy: 0.4313 - val_auc: 0.9095 - val_loss: 0.0604 - val_macrof1score: 0.2391 - val_microf1score: 0.5502 - val_prc: 0.6189 - val_precision: 0.7310 - val_recall: 0.4411 - val_weightedf1score: 0.4868 - learning_rate: 0.0010
Epoch 2/20
[1m1372/7388[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m19:12[0m 192ms/step - accuracy: 0.3681 - auc: 0.8875 - loss: 0.0388 - macrof1score: 0.2305 - microf1score: 0.4873 - prc: 0.5684 - precision: 0.6723 - recall: 0.3821 - weightedf1score: 0.4435

In [None]:
# For each evaluation year: load cached predictions (or compute and cache
# them), then plot one confusion matrix pooled over every class.
for eval_year in [2020, 2021, 2022]:
    # Override the year on a copy so the shared `params` dict is left
    # untouched and re-running other cells still sees the training year.
    eval_params = {**params, 'year': eval_year}
    preds_path = model_dir.joinpath(f"preds_{eval_year}.npy")
    if preds_path.is_file():
        y_pred = np.load(preds_path)
    else:
        # Only build the generator when predictions actually need computing.
        # NOTE(review): `params` also carries a 'shard_ids' key alongside the
        # positional shard_ids — confirm DataGenerator's signature accepts both.
        eval_generator = DataGenerator(shard_ids, shuffle=False, **eval_params)
        y_pred = model.predict(x=eval_generator, verbose=1)
        np.save(preds_path, y_pred)

    # Compare against the leading label rows only, as many as were predicted.
    # Assumes label rows are in the same order as the predictions — TODO confirm.
    y_true = utils.selected_classes.iloc[:y_pred.shape[0]].to_numpy()

    # Flatten samples x classes into a single binary problem, thresholded at 0.5.
    cm = confusion_matrix(y_true.flatten(), (y_pred > 0.5).flatten().astype(int))
    plot = ConfusionMatrixDisplay(confusion_matrix=cm).plot()
    # Label each figure so the per-year matrices can be told apart.
    plot.ax_.set_title(f'Confusion matrix (all classes) - {eval_year}')
    display(plot)

In [None]:
# import matplotlib.pyplot as plt

# class_names = utils.selected_classes.columns

# f, axes = plt.subplots(6, 5, figsize=(25, 30))
# axes = axes.ravel()
# for label in range(y_true.shape[1]):
#     cm = confusion_matrix(y_true[..., label].astype(int), (y_pred[..., label] > 0.5).astype(int))
#     disp = ConfusionMatrixDisplay(cm)
#     disp.plot(ax=axes[label], values_format='.4g')
#     disp.ax_.set_title(f'{class_names[label]}')
#     if label < 25:
#         disp.ax_.set_xlabel('')
#     if label % 5 != 0:
#         disp.ax_.set_ylabel('')
#     disp.im_.colorbar.remove()

# plt.subplots_adjust(wspace=0.2, hspace=0.001)
# f.colorbar(disp.im_, ax=axes)
# plt.show()


In [None]:
import tensorflow

# plot_model's return value is not the last expression of this cell, so it
# must be passed to display() explicitly or the diagram is never rendered.
display(tensorflow.keras.utils.plot_model(model, show_shapes=True))

# Print the layer-by-layer summary below the diagram.
model.summary()

In [None]:
# Write the architecture diagram (with shapes and activations) into model_dir.
tensorflow.keras.utils.plot_model(
    model,
    to_file=model_dir / 'model.png',
    show_shapes=True,
    show_layer_activations=True,
)

In [None]:
# import tensorflow.keras.backend as K
# K.eval(model.optimizer.learning_rate)

In [None]:
# for eval_year in [2020, 2021]:
#     params['year'] = eval_year
#     eval_generator = DataGenerator(shard_ids, shuffle=False, **params)
#     model.evaluate(x=eval_generator, verbose=1, return_dict=True)
    
#     eval_path = model_dir.joinpath(f'eval_{eval_year}.csv')
#     if eval_path.is_file():
#         print(pd.read_csv(eval_path))
#     else:
#         r = model.evaluate(x=testing_generator, verbose=1, return_dict=True)
#         df = pd.DataFrame.from_dict(r, orient='index', columns=['score'])
#         df.index.name = 'metric'
#         df.to_csv(eval_path)
#         print(df)

In [None]:
# import subprocess
# subprocess.run(['sudo', 'shutdown', 'now'])