In [1]:
from importlib import reload
from pathlib import Path
from time import sleep

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, multilabel_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tqdm.notebook import tqdm

import sentinel_utils
import keras_model_creator

2024-07-02 22:05:46.535512: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-02 22:05:46.537071: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-02 22:05:46.616989: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-02 22:05:46.950029: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Channel names grouped by source: Sentinel bands, elevation, SoilGrids layers.
sentinel_bands = ['B' + str(n) for n in range(2, 9)]
sentinel_bands += ['B8A', 'B11', 'B12']
soilgrids_band = [
    'bdod', 'cec', 'cfvo', 'clay', 'nitrogen', 'ocd',
    'ocs', 'phh2o', 'sand', 'silt', 'soc',
]
all_bands = [*sentinel_bands, 'Elevation', *soilgrids_band]

# Currently every channel is selected; `bands` holds the position of each
# selected channel within `all_bands`.
selected_bands = all_bands
bands = list(map(all_bands.index, selected_bands))
bands

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [6]:
reload(sentinel_utils)

shards_dir = Path.home().joinpath('sentinel_data').joinpath('shards')

# Gather the feature shard files of every requested season of `year`.
sentinel_shards = []
seasons = ['03', '06']
year = 2017
for s in seasons:
    # Path.glob() yields entries in arbitrary, filesystem-dependent order.
    # Sorting gives the seeded shuffle below a fixed input order, so the same
    # shards are sampled on every run and on every machine.
    path_list = sorted(shards_dir.joinpath(f'features_{year}{s}').glob('feature_*.npy'))
    sentinel_shards.extend(path_list)

all_labels = pd.read_csv(Path('data').joinpath('full_dummies.csv'))

# Upper bound on the number of shard files handed to SentinelUtils.
sample_shards = 20000
# NOTE(review): overwrite_existing=False presumably reuses previously computed
# results; confirm they are refreshed when the sampled shard set changes.
utils = sentinel_utils.SentinelUtils(
    all_labels, all_bands,
    shuffle(sentinel_shards, random_state=42)[:sample_shards],
    min_occurrences=5000, overwrite_existing=False
)
shard_ids = utils.selected_classes.index

Calculating feature statistics...


  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

  0%|          | 0/250569 [00:00<?, ?it/s]

Dropped 212 columns, 12772 rows


In [7]:
loss = 'binary_crossentropy'

# One output directory per experiment configuration. The directory name
# encodes: loss, number of shard ids, number of selected class columns,
# number of bands, year, and the seasons joined by underscores.
n_classes = utils.selected_classes.shape[1]
run_name = '-'.join([
    loss,
    str(len(shard_ids)),
    str(n_classes),
    str(len(bands)),
    str(year),
    '_'.join(seasons),
])
model_dir = Path('models') / run_name
model_dir.mkdir(parents=True, exist_ok=True)
model_dir

PosixPath('models/binary_crossentropy-237797-30-22-2017-03_06')

In [None]:
reload(keras_model_creator)

# Keyword arguments forwarded unchanged to KerasModelCreator.
# NOTE(review): overwrite=True presumably retrains an existing model; the
# later cells reuse eval.csv / preds.npy / true.npy from model_dir when those
# files exist - confirm they cannot go stale after a retrain.
params = {
    'utils': utils,
    'shards_dir': shards_dir,
    'shard_ids': shard_ids,
    'model_dir': model_dir,
    'seasons': seasons,
    'year': year,
    'bands': bands,
    'loss': loss,
    'batch_size': 64,
    'base_filters': 8,
    'dropout': 0.2,
    'epochs': 10,
    'overwrite': True,
}

creator = keras_model_creator.KerasModelCreator(**params)
model, testing_generator = creator.run()

Building model...
Fitting...
Epoch 1/10
[1m   9/3559[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m16:51[0m 285ms/step - accuracy: 0.0414 - auc: 0.4945 - loss: 0.1621 - macrof1score: 0.0053 - microf1score: 0.0068 - prc: 0.0637 - precision: 0.0225 - recall: 0.0040 - weightedf1score: 0.0064        

In [None]:
# Evaluate the model on the test generator once and cache the scores next to
# the model; later runs load the cached table instead of re-evaluating.
eval_path = model_dir.joinpath('eval.csv')
if eval_path.is_file():
    # index_col=0 restores the metric names as the index, so the cached table
    # prints the same way as a freshly computed one (no 'Unnamed: 0' column).
    df = pd.read_csv(eval_path, index_col=0)
else:
    r = model.evaluate(x=testing_generator, verbose=1, return_dict=True)
    df = pd.DataFrame.from_dict(r, orient='index', columns=['score'])
    df.to_csv(eval_path)
print(df)

In [None]:
# Predictions for the test generator, cached as preds.npy inside model_dir.
preds_path = model_dir / 'preds.npy'
if not preds_path.is_file():
    # No cache yet: run inference and persist the result for later runs.
    y_pred = model.predict(x=testing_generator, verbose=1)
    np.save(preds_path, y_pred)
else:
    y_pred = np.load(preds_path)

In [None]:
# Ground-truth labels for the test generator, cached as true.npy in model_dir.
labels_path = model_dir.joinpath('true.npy')
total_batches = len(testing_generator)
if labels_path.is_file():
    y_true = np.load(labels_path)
else:
    # NOTE(review): assumes the generator yields batches in the same order as
    # during model.predict, so rows of y_true line up with y_pred - confirm.
    y_true = []
    # Exactly `total_batches` batches are collected, so that is the bar total.
    for i, (x, y) in enumerate(tqdm(testing_generator, total=total_batches)):
        y_true.append(y)
        # Stop explicitly after the last batch rather than relying on the
        # generator to signal exhaustion.
        if i + 1 >= total_batches:
            break
    y_true = np.vstack(y_true)
    np.save(labels_path, y_true)

In [None]:
# Pool every (sample, class) decision into a single binary confusion matrix.
# NOTE(review): the cutoff here is 0.3 while the per-class matrices further
# down use 0.5 - confirm the difference is intentional.
true_flat = y_true.astype(int).ravel()
pred_flat = (y_pred > 0.3).astype(int).ravel()
cm = confusion_matrix(true_flat, pred_flat)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()

In [None]:
# One confusion matrix per class, drawn on a shared grid of subplots.
class_names = utils.selected_classes.columns

# Derive the grid from the number of label columns instead of assuming
# exactly 30 classes (6 x 5); 30 classes still yields a 6 x 5 grid of the
# same figure size as before.
n_labels = y_true.shape[1]
n_cols = 5
n_rows = -(-n_labels // n_cols)  # ceiling division
f, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))
axes = axes.ravel()
for label in range(n_labels):
    # labels=[0, 1] keeps every matrix 2x2 even if a class has only one
    # value present in both the truth and the thresholded predictions.
    cm = confusion_matrix(
        y_true[..., label].astype(int),
        (y_pred[..., label] > 0.5).astype(int),
        labels=[0, 1],
    )
    disp = ConfusionMatrixDisplay(cm)
    disp.plot(ax=axes[label], values_format='.4g')
    disp.ax_.set_title(f'{class_names[label]}')
    # Keep the x label only on panels that have no panel directly below.
    if label + n_cols < n_labels:
        disp.ax_.set_xlabel('')
    # Keep the y label only on the first column.
    if label % n_cols != 0:
        disp.ax_.set_ylabel('')
    # Drop the per-panel colorbar; a single shared one is added below.
    disp.im_.colorbar.remove()

# Hide grid slots left over when n_labels is not a multiple of n_cols.
for unused_ax in axes[n_labels:]:
    unused_ax.set_visible(False)

plt.subplots_adjust(wspace=0.2, hspace=0.001)
# The shared colorbar is built from the last panel's image.
f.colorbar(disp.im_, ax=axes)
plt.show()


In [None]:
# tf.keras.utils.plot_model(model, show_shapes=True)
# model.summary()

In [None]:
# import subprocess
# subprocess.run(['sudo', 'shutdown', 'now'])