In [1]:
from tqdm.notebook import tqdm
from IPython.display import display, HTML

import numpy as np
import pandas as pd

import plotly.graph_objects as go

import datetime
from pathlib import Path

import intake

from sklearn.model_selection import train_test_split

from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Conv3D, Activation, BatchNormalization, \
                                     Dropout, Flatten, Dense

2024-05-03 16:00:04.442154: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Open the intake catalog that sits one directory above the notebook and
# read the 'treesat_multi' entry into `df`.
catalog_path = Path('../catalog.yml')
catalog = intake.open_catalog(catalog_path)
source = catalog.treesat_multi
df = source.read()

In [3]:
# Band names of interest: B2..B8, then the remaining named bands and the
# three TCI colour channels.
selected_bands = [
    *(f'B{x}' for x in range(2, 9)),
    'B8A', 'B11', 'B12',
    'TCI_R', 'TCI_G', 'TCI_B',
]

In [4]:
# Season index: selects which pre-processed files are loaded below and is
# also embedded in the saved model's file name.
season = 3

# Label column(s) are looked up in the catalog metadata; the original note
# lists 'multi' / 'trinary' as the available category keys.
target = source.metadata['categories']['multi']

# Alternative label encodings kept for reference:
# labels = np.ceil(df[target].to_numpy())
# labels = (df[target].to_numpy() > 0.3).astype(float)
labels = df[target].to_numpy()

# Load every matching .npy file in sorted (deterministic) order and stack
# the arrays along a new axis at position 3.
filepaths = sorted(Path('seasonal_median').glob(f'processed*{season}.npy'))
all_data = []
for filepath in filepaths:
    with filepath.open('rb') as f:
        all_data.append(np.load(f))

features = np.stack(all_data, axis=3)

In [5]:
class Conv3DModelCreator:
    """Build, train, checkpoint and evaluate a small Conv3D network.

    :meth:`run` is the entry point; the remaining methods are the pieces it
    is assembled from (splitting/scaling, the F1 metric, the architecture).
    """

    def _channel_stats(self, a, p=1):
        """Return ``(lower, upper, median)`` of ``a`` for percentile ``p``."""
        return np.percentile(a, p), np.percentile(a, 100 - p), np.median(a)

    def normalise(self, a, p=1, stats=None):
        """Scale ``a`` so the ``p``-th percentile maps to 0 and the
        ``(100 - p)``-th percentile maps to 1.

        Values outside the two percentiles are replaced by the median before
        scaling, so the result lies in ``[0, 1]`` when the statistics come
        from ``a`` itself.

        Args:
            a: Array-like of numeric values.
            p: Percentile (0-50) used for the lower bound; the upper bound is
                ``100 - p``. Ignored when ``stats`` is given.
            stats: Optional ``(lower, upper, median)`` tuple computed on
                another array (e.g. the training split). When omitted the
                statistics are computed from ``a``.

        Returns:
            Array with the same shape as ``a``. If the bounds coincide (for
            example a constant channel) an all-zero float array is returned
            instead of dividing by zero.
        """
        a = np.asarray(a)
        if stats is None:
            stats = self._channel_stats(a, p)
        lower, upper, median = stats

        span = upper - lower
        if span == 0:
            # Degenerate range: the original division produced NaN/inf here.
            return np.zeros_like(a, dtype=float)

        bounded_a = np.where((a > upper) | (a < lower), median, a)
        return (bounded_a - lower) / span

    def split_and_normalise(self, y, X, random_state):
        """Split into train/test (90/10) and percentile-scale the last axis.

        Each slice along the last axis of ``X`` is scaled independently with
        :meth:`normalise`. The percentiles and median are computed on the
        training split only and reused for the test split, so both splits
        share one scale and no test-set statistics enter preprocessing.

        Args:
            y: Labels, indexed along the first axis like ``X``.
            X: Feature array; the last axis is treated as the channel axis.
            random_state: Seed forwarded to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)``.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=random_state)

        # Scaled values are written back in place below; with an integer
        # dtype they would be truncated to 0, so promote to float first.
        if not np.issubdtype(X_train.dtype, np.floating):
            X_train = X_train.astype(np.float32)
        if not np.issubdtype(X_test.dtype, np.floating):
            X_test = X_test.astype(np.float32)

        for i in range(X_train.shape[-1]):
            stats = self._channel_stats(X_train[..., i])
            X_train[..., i] = self.normalise(X_train[..., i], stats=stats)
            X_test[..., i] = self.normalise(X_test[..., i], stats=stats)

        return X_train, X_test, y_train, y_test

    def f1_score(self, y_true, y_logit):
        """F1 metric computed from rounded targets and predictions.

        Args:
            y_true: Target tensor.
            y_logit: Model output. Despite the name, the network built by
                :meth:`build_model_3d` ends in a softmax, so these are
                probabilities rather than raw logits.

        Returns:
            Scalar tensor ``2 * P * R / (P + R)``; ``K.epsilon()`` is added
            to each denominator to avoid division by zero.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_logit, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_logit, 0, 1)))

        recall = true_positives / (possible_positives + K.epsilon())
        precision = true_positives / (predicted_positives + K.epsilon())
        return (2 * precision * recall) / (precision + recall + K.epsilon())

    def build_model_3d(self, y_train, input_size, metrics):
        """Create and compile the Conv3D network.

        Architecture: two blocks of Conv3D(64, 3x3x3, 'same') ->
        BatchNormalization -> ReLU -> Dropout(0.25), then Flatten,
        Dense(128) -> BatchNormalization -> ReLU -> Dropout(0.25), and a
        softmax output layer with ``y_train.shape[1]`` units.

        Args:
            y_train: Training labels; only ``shape[1]`` is used (output size).
            input_size: Shape of a single sample (without the batch axis).
            metrics: Metrics passed straight to ``compile``.

        Returns:
            The compiled ``tf.keras.Sequential`` model.
        """
        m = tf.keras.Sequential()
        m.add(Input(input_size))
        for i in range(2):
            m.add(Conv3D(
                64,
                strides=1,
                padding='same',
                kernel_size=(3, 3, 3),
                kernel_initializer='he_normal',
                kernel_regularizer=l2(1e-6)
            ))
            m.add(BatchNormalization(axis=-1))
            m.add(Activation('relu'))
            m.add(Dropout(0.25))

        m.add(Flatten())
        m.add(Dense(128,
                    kernel_initializer='he_normal',
                    kernel_regularizer=l2(1e-6)
                   )
             )
        m.add(BatchNormalization(axis=-1))
        m.add(Activation('relu'))
        m.add(Dropout(0.25))

        m.add(Dense(
            y_train.shape[1],
            activation='softmax',
            kernel_initializer='glorot_uniform',
            kernel_regularizer=l2(1e-6)
        ))

        # print(m.summary())

        opt = tf.keras.optimizers.Adam(
            learning_rate=0.001,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-07
        )
        # NOTE(review): softmax output trained with mean absolute error;
        # the commented alternatives are kept for experimentation.
        m.compile(
            optimizer=opt,
            # loss=tf.keras.losses.MeanSquaredError(),
            loss=tf.keras.losses.MeanAbsoluteError(),
            # loss=tf.keras.losses.CategoricalCrossentropy(),
            metrics=metrics
        )
        return m

    def run(self, y, X, model_name, epochs, overwrite=False):
        """Train (or resume) a model and evaluate it on the held-out split.

        Files used, all under ``models/``: the saved model ``model_name``,
        a CSV training log with the same stem and a ``.log`` suffix, and a
        backup directory named after the stem.

        Args:
            y: Labels.
            X: Features.
            model_name: File name of the saved model (e.g. ``'foo.keras'``).
            epochs: Total number of epochs passed to ``fit``.
            overwrite: When true, delete the saved model, its log and the
                files in the backup directory before training.

        Returns:
            The result of ``model.evaluate`` on the test split.
        """
        model_dir = Path('models')
        # The CSV logger and model.save both write here; make sure it exists.
        model_dir.mkdir(parents=True, exist_ok=True)

        model_path = model_dir.joinpath(model_name)
        model_savepoint = model_dir.joinpath(model_path.stem)
        log_file = model_path.with_suffix('.log')

        if overwrite:
            for f in [model_path, log_file] + list(model_savepoint.glob('*')):
                f.unlink(missing_ok=True)

        X_train, X_test, y_train, y_test = self.split_and_normalise(y, X, random_state=42)

        builtin_metrics = ['accuracy', 'binary_accuracy', 'mean_squared_error', 'mean_absolute_error']
        custom_metrics = [self.f1_score]

        if model_path.is_file():
            # The model was compiled with the custom f1_score metric, which
            # has to be supplied again for deserialisation to resolve it.
            model = tf.keras.models.load_model(
                model_path, custom_objects={'f1_score': self.f1_score})
        else:
            model = self.build_model_3d(y_train, X_train.shape[1:], builtin_metrics + custom_metrics)

        callbacks = [
            tf.keras.callbacks.BackupAndRestore(
                model_savepoint, save_freq='epoch', delete_checkpoint=False
            ),
            tf.keras.callbacks.CSVLogger(
                log_file, append=True
            )
        ]

        if log_file.is_file():
            # Show what has already been trained (epochs displayed 1-based).
            history = pd.read_csv(log_file)[['epoch', 'loss'] + builtin_metrics + ['f1_score']]
            history['epoch'] += 1
            print('Previous training:')
            display(HTML(history.to_html(index=False)))

        model.fit(
            X_train, y_train, epochs=epochs, verbose=1, batch_size=4, callbacks=callbacks,
            validation_data=(X_test, y_test))

        model.save(model_path)

        # preds = model.predict(X_test, verbose=0)
        return model.evaluate(X_test, y_test, verbose=0)

In [None]:
%%time
model_name = f'conv_all_{season}_multi_multi.keras'
# If not overwrite and there's an existing model, the model will 
# continue training if the given epoch is bigger than the previous,
# else just evaluate.
# Ensure train splits are the same across continuations / evaluations
# by not modifying the random_state in split_and_normalise.
Conv3DModelCreator().run(
    labels, features, model_name, epochs=100, overwrite=True)

Epoch 1/100
[1m11336/11336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m359s[0m 32ms/step - accuracy: 0.4343 - binary_accuracy: 0.8666 - f1_score: 0.4139 - loss: 0.0785 - mean_absolute_error: 0.0758 - mean_squared_error: 0.0531 - val_accuracy: 0.5289 - val_binary_accuracy: 0.8716 - val_f1_score: 0.5188 - val_loss: 0.0709 - val_mean_absolute_error: 0.0642 - val_mean_squared_error: 0.0473
Epoch 2/100
[1m11336/11336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m356s[0m 31ms/step - accuracy: 0.5103 - binary_accuracy: 0.8708 - f1_score: 0.4979 - loss: 0.0735 - mean_absolute_error: 0.0663 - mean_squared_error: 0.0487 - val_accuracy: 0.5813 - val_binary_accuracy: 0.8763 - val_f1_score: 0.5722 - val_loss: 0.0661 - val_mean_absolute_error: 0.0576 - val_mean_squared_error: 0.0428
Epoch 3/100
[1m11336/11336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m354s[0m 31ms/step - accuracy: 0.5453 - binary_accuracy: 0.8733 - f1_score: 0.5345 - loss: 0.0705 - mean_absolute_error: 0.0620 - mean_sq

In [None]:
# Per-column weights: n_rows / (count * n_columns), where `count` is the
# number of non-zero (positive) or zero (negative) entries in that column.
n_rows = labels.shape[0]
n_columns = labels.shape[1]
nonzero_counts = np.count_nonzero(labels, axis=0)
zero_counts = np.count_nonzero(labels == 0, axis=0)

positive_weights = n_rows / (nonzero_counts * n_columns)
negative_weights = n_rows / (zero_counts * n_columns)

In [None]:
# cm = tf.math.confusion_matrix(labels, predictions, num_classes=None)
# px.imshow(
#     np.array(cm),
#     # animation_frame=0,
#     labels=dict(color="Corr coef"),
#     x=cm[0].index,
#     y=cm[0].columns,
#     title='Confusion Matrix',
#     text_auto=True, aspect='auto', zmin=0, height=500
# )

In [None]:
# target = source.metadata['categories']['multi'] # multi / trinary
# labels = df[target].astype('category').cat.codes
# seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
# all_data = []
# for season in seasons:
#     filepath = sorted(list(Path('seasonal_median').glob(f'{season}.npy')))
#     with open(filepath[0], 'rb') as f:
#         all_data.append(np.load(f))
    
# model_name = f'conv_all_mean_seasons_multi.keras'

# features = np.stack(all_data, axis=3)
# score = ConvModelCreator().run(labels, features, model_name)
# score

In [None]:
# traces = [go.Scatter(x=seasons, y=seasonal_scores)]
# go.Figure(
#     data=traces,
#     layout={
#         "xaxis": {"title": "Season"},
#         "yaxis": {"title": "Accuracy"},
#         "title": "Conv2d accuracies"}
# )

In [None]:
# target = source.metadata['categories']['trinary'] # multi / trinary
# labels = df[target].astype('category').cat.codes
# seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
# all_data = []
# for season in tqdm(seasons):
#     filepath = sorted(list(Path('seasonal_median').glob(f'{season}.npy')))
#     with open(filepath[0], 'rb') as f:
#         all_data.append(np.load(f))
    
# model_name = f'conv_all_mean_seasons_trinary.keras'
# score = ConvModelCreator().run(labels, np.stack(all_data, axis=3), model_name)
# score

In [None]:
# target = source.metadata['categories']['trinary'] # multi / trinary

# mask = df[target] != 'cleared'

# labels = df[target][mask].astype('category').cat.codes

# seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
# all_data = []
# for season in tqdm(seasons):
#     filepath = sorted(list(Path('seasonal_median').glob(f'{season}.npy')))
#     with open(filepath[0], 'rb') as f:
#         data = np.load(f)
#         all_data.append(data[mask])
        
# model_name = f'conv_all_mean_seasons_binary.keras'
# score = ConvModelCreator().run(labels, np.stack(all_data, axis=3), model_name)
# score

In [None]:
# filepaths = sorted(list(Path('london').glob('*.npy')), key=lambda x: int(x.stem.split('_')[-1]))

# n_chunks = 50000
# chunks = [gdf[i: i + n_chunks] for i in range(0, gdf.shape[0], n_chunks)]
# scores = []

# chunk[target].cat.codes

# for chunk, filepath in tqdm(zip(chunks, filepaths), total=len(filepaths)
#                            ):
#     score = LightModelCreator().run_and_eval(chunk[target].cat.codes, [filepath])
#     scores.append(score)