In [1]:
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

from pathlib import Path

from time import sleep

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, multilabel_confusion_matrix
from sklearn.utils import shuffle

from importlib import reload
import sentinel_utils
import keras_model_creator

2024-06-30 11:04:14.296232: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-30 11:04:14.296704: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-30 11:04:14.299370: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-30 11:04:14.325390: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Band names: B2..B8, then B8A, B11 and B12, with 'Elevation' appended last.
sentinel_bands = [*(f'B{number}' for number in range(2, 9)), 'B8A', 'B11', 'B12']
all_bands = [*sentinel_bands, 'Elevation']

# Every band is selected for now; point this at a subset of `all_bands` to use fewer.
selected_bands = all_bands

# Position of each selected band within `all_bands`.
# NOTE(review): presumably this matches the channel order of the feature shards — confirm.
bands = list(map(all_bands.index, selected_bands))

In [3]:
# Re-execute the helper module so edits to it are picked up without a kernel restart.
reload(sentinel_utils)

shards_dir = Path.home().joinpath('sentinel_data').joinpath('shards')

data_tag = '202003'


def sort_key(shard_path):
    """Return the integer after the last underscore of the stem (feature_12.npy -> 12)."""
    return int(shard_path.stem.split('_')[-1])


# glob() yields files in arbitrary order; sorting numerically first makes the
# seeded shuffle below reproducible across machines and runs.
features_dir = shards_dir.joinpath(f'features_{data_tag}')
feature_shards = sorted(features_dir.glob('feature_*.npy'), key=sort_key)
if not feature_shards:
    # Fail here with a clear message instead of an IndexError further down
    # when the first shard is loaded to read its shape.
    raise FileNotFoundError(f'No feature_*.npy shards found in {features_dir}')

all_labels = pd.read_csv(Path('data').joinpath('full_dummies.csv'))

# Upper bound on how many (shuffled) shards are handed to SentinelUtils.
# NOTE(review): the recorded output reports 237797 shard ids, more than this cap,
# so presumably a cached selection is reused while overwrite_existing=False — confirm.
sample_shards = 50000
utils = sentinel_utils.SentinelUtils(
    shuffle(feature_shards, random_state=42)[:sample_shards],
    all_labels, min_occurrences=5000,  # presumably drops classes rarer than this — confirm in sentinel_utils
    overwrite_existing=False
)

shard_ids = utils.selected_classes.index

In [4]:
architecture = 'simple'
loss = 'binary_crossentropy'
normal_type = 'zscore_clip'

# Only the shape is needed, so memory-map the shard rather than reading the
# whole array into memory. `dim` is the first two axes of one shard
# (presumably height x width — TODO confirm).
dim = np.load(feature_shards[0], mmap_mode='r').shape[:2]

# One directory per configuration: architecture, loss, number of shard ids,
# number of columns in `utils.selected_classes`, and normalisation type.
model_dir = Path('models').joinpath(
    f'{architecture}-{loss}-{len(shard_ids)}-{utils.selected_classes.shape[1]}-{normal_type}'
)
model_dir.mkdir(parents=True, exist_ok=True)
model_dir

PosixPath('models/simple-binary_crossentropy-237797-30-zscore_clip')

In [None]:
# Re-execute the model-building module so local edits take effect.
reload(keras_model_creator)

# Keyword arguments for KerasModelCreator, collected in one place so the run
# configuration can be read (and tweaked) at a glance.
params = {
    'normal_type': normal_type,
    'utils': utils,
    'dim': dim,
    'shards_dir': shards_dir,
    'data_tag': data_tag,
    'shard_ids': shard_ids,
    'model_dir': model_dir,
    'bands': bands,
    'architecture': architecture,
    'loss': loss,
    'batch_size': 64,
    'base_filters': 32,
    'dropout': 0.2,
    'epochs': 5,
    'overwrite': False,
}

# run() hands back the model together with the generator used for testing below.
model, testing_generator = keras_model_creator.KerasModelCreator(**params).run()

Building model...
Fitting...
Epoch 1/5
[1m3559/3559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m965s[0m 270ms/step - accuracy: 0.2271 - auc: 0.8473 - loss: 0.1925 - macrof1score: 0.1245 - microf1score: 0.3184 - prc: 0.3925 - precision: 0.5926 - recall: 0.2181 - weightedf1score: 0.2661 - val_accuracy: 0.3263 - val_auc: 0.9206 - val_loss: 0.1544 - val_macrof1score: 0.1852 - val_microf1score: 0.4360 - val_prc: 0.5505 - val_precision: 0.6821 - val_recall: 0.3204 - val_weightedf1score: 0.3574 - learning_rate: 0.0010
Epoch 2/5
[1m3559/3559[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1198s[0m 336ms/step - accuracy: 0.3074 - auc: 0.9143 - loss: 0.1578 - macrof1score: 0.1919 - microf1score: 0.4245 - prc: 0.5261 - precision: 0.6806 - recall: 0.3085 - weightedf1score: 0.3550 - val_accuracy: 0.3387 - val_auc: 0.9298 - val_loss: 0.1476 - val_macrof1score: 0.1976 - val_microf1score: 0.4540 - val_prc: 0.5834 - val_precision: 0.7242 - val_recall: 0.3307 - val_weightedf1score: 0.3710 - learni

In [None]:
# Evaluate on the test generator once and cache the scores next to the model.
eval_path = model_dir.joinpath('eval.csv')
if eval_path.is_file():
    # index_col=0 restores the metric names as the index, so the cached table
    # prints exactly like a freshly computed one (no stray 'Unnamed: 0' column).
    df = pd.read_csv(eval_path, index_col=0)
else:
    eval_metrics = model.evaluate(x=testing_generator, verbose=1, return_dict=True)
    df = pd.DataFrame.from_dict(eval_metrics, orient='index', columns=['score'])
    df.to_csv(eval_path)
print(df)

In [None]:
# Predicted scores for the test generator, cached as an .npy file in the model directory.
preds_path = model_dir.joinpath('preds.npy')
if not preds_path.is_file():
    # No cache yet: run inference and store the result for later sessions.
    y_pred = model.predict(x=testing_generator, verbose=1)
    np.save(preds_path, y_pred)
else:
    y_pred = np.load(preds_path)

In [None]:
# Ground-truth labels for the test generator, cached as an .npy file in the model directory.
# NOTE(review): y_true lines up with y_pred only if the generator yields batches
# in the same order on every pass (no reshuffling) — confirm in keras_model_creator.
labels_path = model_dir.joinpath('true.npy')
total_batches = len(testing_generator)
if labels_path.is_file():
    y_true = np.load(labels_path)
else:
    y_true = []
    # Pairing the generator with a bounded range stops after exactly
    # `total_batches` batches even if the generator never signals exhaustion,
    # and gives the progress bar the correct total.
    for _, (_, y) in zip(tqdm(range(total_batches)), testing_generator):
        y_true.append(y)
    y_true = np.vstack(y_true)
    np.save(labels_path, y_true)

In [None]:
# Pool every (sample, class) decision into a single binary confusion matrix.
# NOTE(review): this cut-off is 0.3 while the per-class plots further down use 0.5 — confirm which is intended.
pooled_threshold = 0.3
true_flat = y_true.astype(int).ravel()
pred_flat = (y_pred > pooled_threshold).astype(int).ravel()
cm = confusion_matrix(true_flat, pred_flat)
ConfusionMatrixDisplay(confusion_matrix=cm).plot()

In [None]:
import matplotlib.pyplot as plt

class_names = utils.selected_classes.columns

# One confusion matrix per class, laid out on a grid that is sized from the
# actual number of label columns instead of assuming exactly 30 classes.
n_classes = y_true.shape[1]
n_cols = 5
n_rows = -(-n_classes // n_cols)  # ceiling division
per_class_threshold = 0.5

# squeeze=False keeps `axes` a 2-D array even for a single row or column.
f, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows), squeeze=False)
axes = axes.ravel()
for label in range(n_classes):
    cm = confusion_matrix(
        y_true[..., label].astype(int),
        (y_pred[..., label] > per_class_threshold).astype(int),
        labels=[0, 1],  # always 2x2, even if a class is never present/predicted
    )
    disp = ConfusionMatrixDisplay(cm)
    disp.plot(ax=axes[label], values_format='.4g', colorbar=False)
    disp.ax_.set_title(f'{class_names[label]}')
    # Keep x labels on the bottom row and y labels on the first column only.
    if label < (n_rows - 1) * n_cols:
        disp.ax_.set_xlabel('')
    if label % n_cols != 0:
        disp.ax_.set_ylabel('')

# Hide grid cells left over when the class count is not a multiple of n_cols.
for unused_ax in axes[n_classes:]:
    unused_ax.set_visible(False)

plt.subplots_adjust(wspace=0.2, hspace=0.001)
# The shared colorbar is built from the last class's image only, so its scale
# does not describe the colours of the other panels.
f.colorbar(disp.im_, ax=axes)
plt.show()


In [None]:
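# Optional: inspect the architecture. `tf` is not imported in the cells above, so add
# `import tensorflow as tf` before uncommenting the plot_model line.
# tf.keras.utils.plot_model(model, show_shapes=True)
# model.summary()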

In [None]:
# Optional: power the machine off once the run has finished (needs sudo rights).
# import subprocess
# subprocess.run(['sudo', 'shutdown', 'now'])