In [1]:
from tqdm.notebook import tqdm
from IPython.display import display, HTML

import numpy as np
import pandas as pd

import plotly.graph_objects as go

import datetime
from pathlib import Path

import intake

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import itertools

from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras import backend as K

from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam, RMSprop, Nadam, SGD, AdamW

2024-06-10 21:17:37.383404: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Band names of interest: B2..B8, the three extra named bands, then the
# TCI colour channels. 13 entries in total (presumably the 13 input channels
# used further down; TODO confirm, the list is not referenced elsewhere here).
selected_bands = [
    *(f'B{band_no}' for band_no in range(2, 9)),
    'B8A', 'B11', 'B12',
    'TCI_R', 'TCI_G', 'TCI_B',
]

In [3]:
# Output folder for saved models, training logs and backup checkpoints.
model_dir = Path('models')
model_dir.mkdir(parents=True, exist_ok=True)

# Folder holding the feature_*.npy / label_*.npy files read during training.
data_dir = Path('/sentinel_data') / 'shards'


def sort_key(x):
    'Return the integer after the last underscore of a path stem (feature_12 -> 12).'
    return int(x.stem.split('_')[-1])


# Feature files ordered numerically by suffix rather than alphabetically.
feature_filepaths = sorted(data_dir.glob('feature_*.npy'), key=sort_key)

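The `DataGenerator` class below loads feature/label `.npy` files from disk one batch at a time. It is adapted from https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly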

In [4]:
class DataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that reads batches of ``.npy`` samples from disk.

    For every sample ID the files ``feature_{ID}.npy`` and ``label_{ID}.npy``
    are loaded from ``data_dir``.

    Args:
        list_IDs: indexable collection of sample IDs.
        data_dir: directory containing the ``.npy`` files; must provide
            ``joinpath`` (e.g. a ``pathlib.Path``).
        batch_size: number of samples per batch.
        dim: shape of one feature array without its trailing channel axis.
        n_channels: size of the trailing channel axis of one feature array.
        n_classes: length of one label vector.
        shuffle: if truthy, the sample order is reshuffled at construction
            and every time ``on_epoch_end`` is called.
        **kwargs: forwarded to ``tf.keras.utils.Sequence.__init__``.
    """

    def __init__(self, list_IDs, data_dir, batch_size=32, dim=(100,100), n_channels=13,
                 n_classes=242, shuffle=True, **kwargs):
        super().__init__(**kwargs)
        self.dim = dim
        self.batch_size = batch_size
        self.data_dir = data_dir
        self.list_IDs = list_IDs
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.shuffle = shuffle

        # Build the initial (optionally shuffled) index order.
        self.on_epoch_end()

    def __len__(self):
        """Number of full batches per epoch; a trailing partial batch is dropped."""
        return len(self.list_IDs) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` arrays for batch number ``index``."""
        start = index * self.batch_size
        batch_positions = self.indexes[start:start + self.batch_size]

        # Translate positions in the (possibly shuffled) order into sample IDs.
        batch_ids = [self.list_IDs[k] for k in batch_positions]

        return self.__data_generation(batch_ids)

    def on_epoch_end(self):
        """Reset the sample order, reshuffling it when ``shuffle`` is enabled."""
        self.indexes = np.arange(len(self.list_IDs))
        if self.shuffle:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_IDs_temp):
        """Load the feature and label arrays for the given sample IDs.

        Returns:
            Tuple ``(X, y)`` with shapes ``(n, *dim, n_channels)`` and
            ``(n, n_classes)``, where ``n == len(list_IDs_temp)``.
        """
        # Size the buffers by the IDs actually supplied so that no row is ever
        # left holding uninitialised memory from np.empty.
        n_samples = len(list_IDs_temp)
        X = np.empty((n_samples, *self.dim, self.n_channels))
        y = np.empty((n_samples, self.n_classes))

        for i, ID in enumerate(list_IDs_temp):
            # Read from the directory given to the constructor rather than
            # from a notebook-level global of the same name.
            X[i,] = np.load(self.data_dir.joinpath(f'feature_{ID}.npy'))
            y[i] = np.load(self.data_dir.joinpath(f'label_{ID}.npy'))

        return X, y

The F1 metric below is computed from counts pooled over every label of every sample in a batch. It is adapted from https://medium.com/@matrixB/modified-cross-entropy-loss-for-multi-label-classification-with-class-a8afede21eb9

In [5]:
# Empty Keras' global custom-object registry before registering the metric
# (presumably so that re-running this cell starts from a clean registry).
tf.keras.utils.get_custom_objects().clear()
@tf.keras.utils.register_keras_serializable()
def custom_f1_score(y_true, y_pred):
    """F1 score from counts summed over all elements of the batch.

    Labels are cast to float32; labels, predictions and their product are
    rounded before being summed into the three counts. ``K.epsilon()`` is
    added to each denominator to avoid division by zero.

    Args:
        y_true: ground-truth tensor.
        y_pred: predicted scores, same shape as ``y_true``.

    Returns:
        Scalar tensor ``2 * precision * recall / (precision + recall)``.
    """
    eps = K.epsilon()
    labels = tf.cast(y_true, dtype='float32')

    # Counts pooled over the whole batch.
    hits = K.sum(K.round(labels * y_pred))
    actual_positives = K.sum(K.round(labels))
    flagged_positives = K.sum(K.round(y_pred))

    recall = hits / (actual_positives + eps)
    precision = hits / (flagged_positives + eps)

    return (2 * precision * recall) / (precision + recall + eps)

In [6]:
class KerasModelCreator:
    """Builds, trains and resumes the sigmoid-output Keras models of this notebook."""

    def topless_vgg(self, input_shape, output_shape, metrics):
        """Return a compiled, randomly initialised VGG16 backbone with a sigmoid head.

        Args:
            input_shape: shape of one input sample, passed to ``VGG16``.
            output_shape: number of units of the final ``Dense`` layer.
            metrics: metrics passed to ``compile``.

        Returns:
            The compiled ``tf.keras.Sequential`` model.
        """
        m = tf.keras.Sequential()
        # include_top=False builds no classifier inside VGG16, so `classes` and
        # `classifier_activation` would have no effect; the head is added below.
        vgg = tf.keras.applications.VGG16(
            include_top=False,
            weights=None,
            input_shape=input_shape,
        )
        m.add(vgg)
        m.add(Flatten())
        m.add(Dense(output_shape, activation='sigmoid'))
        # Sigmoid units score each class independently, so use the same
        # per-class binary cross-entropy as build_model.
        m.compile(optimizer='adam', loss='binary_crossentropy', metrics=metrics)
        return m

    def display_logger(self, log_file, metrics):
        """Display the metrics table of a previous run, if its CSV log exists.

        Does nothing when ``log_file`` is missing or empty.

        Args:
            log_file: path of the CSV training log.
            metrics: metric identifiers (strings or callables) whose columns,
                together with their ``val_`` counterparts, are shown.
        """
        if not log_file.is_file():
            return

        try:
            history = pd.read_csv(log_file)
        except pd.errors.EmptyDataError:
            # An empty log (e.g. left behind by a run interrupted before its
            # first epoch finished) has nothing to show and must not stop a
            # new run from starting.
            return

        metric_names = [m if isinstance(m, str) else m.__name__ for m in metrics]
        val_metrics = ['val_loss'] + ['val_' + x for x in metric_names]
        history = history[['epoch', 'loss'] + metric_names + val_metrics]
        # The log counts epochs from 0; show them 1-based.
        history = history.assign(epoch=history['epoch'] + 1)
        print('Previous training:')
        display(HTML(history.to_html(index=False)))

    def define_callbacks_and_logger(self, model_path, model_savepoint, log_file, metrics):
        """Create the callbacks used by ``run``.

        Args:
            model_path: file the best model is checkpointed to.
            model_savepoint: directory used by ``BackupAndRestore``.
            log_file: CSV file that per-epoch metrics are appended to.
            metrics: accepted for interface compatibility; not used here.

        Returns:
            List of Keras callbacks: backup/restore, CSV logging, best-model
            checkpointing and learning-rate reduction.
        """
        # NOTE(review): both monitors below watch 'val_accuracy'. With
        # multi-hot targets Keras may resolve the 'accuracy' string to
        # categorical accuracy; confirm that this is the intended quantity.
        callbacks = [
            # Epoch-level backup that is kept after training finishes so that
            # a later call with more epochs can continue from it.
            tf.keras.callbacks.BackupAndRestore(
                model_savepoint, save_freq='epoch', delete_checkpoint=False
            ),
            tf.keras.callbacks.CSVLogger(log_file, append=True),
            # Only save once val_accuracy beats both 0.4 and the best so far.
            tf.keras.callbacks.ModelCheckpoint(
                model_path, monitor='val_accuracy', save_best_only=True,
                save_freq='epoch', initial_value_threshold=0.4,
                verbose=1,
            ),
            tf.keras.callbacks.ReduceLROnPlateau(
                monitor='val_accuracy', factor=0.6, patience=2, min_lr=3e-6,
                verbose=1,
            ),
        ]
        return callbacks

    def build_model(self, output_shape, input_shape, metrics):
        """Return a compiled small CNN: Conv2D -> BatchNorm -> Dense -> sigmoid head.

        Note the argument order: ``output_shape`` comes before ``input_shape``.

        Args:
            output_shape: number of units of the final ``Dense`` layer.
            input_shape: shape of one input sample.
            metrics: metrics passed to ``compile``.
        """
        m = tf.keras.Sequential()
        m.add(Input(input_shape))

        m.add(Conv2D(
            filters=32, kernel_size=3, padding='valid', activation='relu',
        ))
        m.add(BatchNormalization())

        m.add(Flatten())

        m.add(Dense(64, activation='relu'))
        m.add(BatchNormalization())

        m.add(Dense(output_shape, activation='sigmoid'))

        m.compile(optimizer='adam', loss='binary_crossentropy', metrics=metrics)

        return m

    def run(self, IDs, model_path, batch_size=10, epochs=10, overwrite=False):
        """Train a model on the given sample IDs, resuming earlier work if present.

        With ``overwrite=False`` an existing model file at ``model_path`` is
        loaded instead of building a new model, and the ``BackupAndRestore``
        callback can continue from the last backup. If ``epochs`` does not
        exceed what was already completed, ``fit`` presumably has nothing
        left to run (this method performs no separate evaluation step).
        With ``overwrite=True`` the model file, the log and the files in the
        backup directory are deleted first.

        The 90/10 train/validation split uses a fixed ``random_state`` so
        that continued runs see the same split; keep it (and ``IDs``)
        unchanged between continuations.

        Args:
            IDs: sample IDs to train and validate on.
            model_path: ``pathlib.Path`` of the model file; the log file and
                backup directory are derived from it.
            batch_size: batch size for both generators.
            epochs: total number of epochs passed to ``fit``.
            overwrite: delete previous artefacts and start from scratch.

        Returns:
            The trained Keras model.
        """
        model_savepoint = model_path.parent.joinpath(model_path.stem)
        log_file = model_path.with_suffix('.log')

        metrics = ['accuracy', custom_f1_score, 'recall', 'precision', 'auc']

        if overwrite:
            for f in [model_path, log_file] + list(model_savepoint.glob('*')):
                f.unlink(missing_ok=True)

        self.display_logger(log_file, metrics)

        callbacks = self.define_callbacks_and_logger(
            model_path, model_savepoint, log_file, metrics)

        params = {
            'dim': (100, 100),
            'batch_size': batch_size,
            'n_classes': 242,
            'n_channels': 13,
            'shuffle': True
        }
        input_shape = (*params['dim'], params['n_channels'])

        training_ids, validation_ids = train_test_split(IDs, train_size=0.9, random_state=42)

        # NOTE: `data_dir` is the notebook-level variable, not an argument.
        training_generator = DataGenerator(training_ids, data_dir, **params)
        validation_generator = DataGenerator(validation_ids, data_dir, **params)

        if model_path.is_file():
            model = tf.keras.models.load_model(model_path)
        else:
            model = self.build_model(params['n_classes'], input_shape, metrics)

        # Alternative backbone:
        # model = self.topless_vgg(input_shape, params['n_classes'], metrics)

        model.fit(
            x=training_generator,
            validation_data=validation_generator,
            epochs=epochs,
            callbacks=callbacks
        )

        return model

In [7]:
# Fail early with a clear message: with no feature files the indexing below
# would otherwise raise an opaque IndexError.
if not feature_filepaths:
    raise FileNotFoundError('No feature_*.npy files found; check data_dir.')

# Text after the last underscore of every feature file stem, in sorted order.
parts = [f.stem.split('_')[-1] for f in feature_filepaths]

# The model file is named after the first and last suffix it was trained on.
model_name = f'conv_parts_{parts[0]}_to_{parts[-1]}.keras'
model_path = model_dir.joinpath(model_name)

# Integer sample IDs in a fixed, reproducible shuffled order.
IDs = shuffle([int(part) for part in parts], random_state=42)

In [None]:
%%time
# Train for up to 50 epochs in batches of 100, keeping any previous model,
# log and backup (overwrite=False). The returned model is the cell's output.
KerasModelCreator().run(IDs, model_path, batch_size=100, epochs=50, overwrite=False)

Previous training:


epoch,loss,accuracy,custom_f1_score,recall,precision,auc,val_loss,val_accuracy,val_custom_f1_score,val_recall,val_precision,val_auc
1,0.19639,0.707986,0.695076,0.81279,0.081795,0.962778,0.013933,0.897083,0.935285,0.879192,1.0,1.0
2,0.004631,0.867037,1.0,1.0,1.0,1.0,0.002081,0.876042,1.0,1.0,1.0,1.0
3,0.001433,0.866574,1.0,1.0,1.0,1.0,0.000891,0.876875,1.0,1.0,1.0,1.0
4,0.000691,0.861505,1.0,1.0,1.0,1.0,0.000482,0.876875,1.0,1.0,1.0,1.0
5,0.000397,0.857824,1.0,1.0,1.0,1.0,0.000297,0.876875,1.0,1.0,1.0,1.0
6,0.00027,0.858495,1.0,1.0,1.0,1.0,0.000222,0.876875,1.0,1.0,1.0,1.0
7,0.000207,0.861181,1.0,1.0,1.0,1.0,0.000167,0.876875,1.0,1.0,1.0,1.0
8,0.000166,0.860602,1.0,1.0,1.0,1.0,0.00014,0.876875,1.0,1.0,1.0,1.0
9,0.00014,0.859028,1.0,1.0,1.0,1.0,0.000122,0.876875,1.0,1.0,1.0,1.0
10,0.00012,0.855301,1.0,1.0,1.0,1.0,0.000106,0.876875,1.0,1.0,1.0,1.0


2024-06-10 21:17:39.859450: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 78675968 exceeds 10% of free system memory.
2024-06-10 21:17:39.882037: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 78675968 exceeds 10% of free system memory.
2024-06-10 21:17:39.897713: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 78675968 exceeds 10% of free system memory.
2024-06-10 21:17:39.964638: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 78675968 exceeds 10% of free system memory.
2024-06-10 21:17:39.978913: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 78675968 exceeds 10% of free system memory.


Epoch 17/50


2024-06-10 21:18:03.345209: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:2: Filling up shuffle buffer (this may take a while): 4 of 8
2024-06-10 21:18:17.923502: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


[1m  5/432[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22:59[0m 3s/step - accuracy: 0.8392 - auc: 1.0000 - custom_f1_score: 1.0000 - loss: 6.8272e-05 - precision: 1.0000 - recall: 1.0000

In [None]:
# keep_indices = np.where(y.sum(axis=1) > 0)[0]
# y = y[keep_indices]
# X = X[keep_indices]

In [None]:
# tf.keras.models.load_model(model_path).summary()

In [None]:
# import subprocess
# subprocess.run(['sudo', 'shutdown', 'now'])