In [1]:
from tqdm.notebook import tqdm
from IPython.display import display, HTML

import numpy as np
import pandas as pd

import plotly.graph_objects as go

import datetime
from pathlib import Path

import intake

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import shuffle
import itertools

from tensorflow.keras.regularizers import l2
import tensorflow as tf
from tensorflow.keras import backend as K

from tensorflow.keras.layers import *

2024-05-28 16:01:20.781231: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Open the intake catalog one directory up and read its 'treesat_multi'
# entry; `source` is kept because later cells read its metadata.
catalog = intake.open_catalog(Path('../catalog.yml'))
source = catalog.treesat_multi
df = source.read()

In [3]:
# Band names: B2 through B8, then B8A, B11, B12 and the three TCI channels.
selected_bands = [
    *(f'B{x}' for x in range(2, 9)),
    'B8A', 'B11', 'B12',
    'TCI_R', 'TCI_G', 'TCI_B',
]

In [4]:
# Label column(s) come from the catalog metadata; the original note lists
# 'multi' / 'trinary' as the category keys (presumably both exist - verify).
target = source.metadata['categories']['multi']
labels = df[target].to_numpy()

# Trained models are written below models/seasons (created if missing).
model_dir = Path('models') / 'seasons'
model_dir.mkdir(parents=True, exist_ok=True)

# Directory holding the processed_treesat_<year><season>.npy feature files.
data_dir = Path('seasonal_median')

In [5]:
# Year attached to every sample; anything earlier than 2017 is raised to 2017.
label_year = np.maximum(df['YEAR'].astype(int).to_numpy(), 2017)

n_samples = labels.shape[0]
shuffled_indices = shuffle(np.arange(n_samples), random_state=42)
# NOTE: the permutation above is immediately replaced by the identity order,
# so samples are effectively NOT shuffled.
shuffled_indices = np.arange(n_samples)

def get_features(years, seasons, data_dir, shuffled_indices, label_year,
                 patch_shape=(6, 6, 13)):
    '''
    Assemble a per-sample stack of seasonal feature patches.

    For every (season, year) pair the file
    ``processed_treesat_<year><season zero-padded to 2 digits>.npy`` is loaded
    from ``data_dir`` and the rows whose ``label_year`` equals that year are
    copied into the output. Samples whose year is not in ``years`` stay
    all-zero.

    Parameters
    ----------
    years : iterable of int
        Years whose files are read.
    seasons : sequence
        Season identifiers; they define the second axis of the result, in order.
    data_dir : pathlib.Path
        Directory containing the ``.npy`` files.
    shuffled_indices : array of int
        Row order applied to the assembled array before returning.
    label_year : numpy.ndarray
        One year per sample; its length fixes the number of rows.
    patch_shape : tuple of int, optional
        Trailing shape of one sample in the files. Defaults to ``(6, 6, 13)``,
        the value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(label_year), len(seasons), *patch_shape)``
        with rows reordered by ``shuffled_indices``.
    '''
    selected_data = np.zeros((label_year.shape[0], len(seasons), *patch_shape))
    for s, season in enumerate(seasons):
        for year in years:
            filepath = data_dir.joinpath(f'processed_treesat_{year}{str(season).zfill(2)}.npy')
            with open(filepath, 'rb') as f:
                data = np.load(f)

            # Flat row positions (not the tuple np.where returns), so the
            # assignment does not rely on an implicit extra axis + broadcasting.
            rows = np.flatnonzero(label_year == year)
            selected_data[rows, s, ...] = data[rows]

    # Reorder the samples once, after all seasons have been filled in.
    return selected_data[shuffled_indices]

In [6]:
class KerasModelCreator:
    '''
    Builds, trains and checkpoints a small ConvLSTM2D classifier.

    The instance holds no state; the methods are grouped here for convenience.
    '''

    def normalise_X(self, X, p=1):
        '''
        Rescale X with its p-th and (100-p)-th percentiles.

        Values above the upper or below the lower percentile are replaced by
        the median of X, then everything is mapped with
        ``(value - lower) / (upper - lower)``.

        Parameters
        ----------
        X : array-like
            Values to rescale; percentiles are taken over the whole array.
        p : float, optional
            Percentile used for the lower bound; the upper bound is 100 - p.

        Returns
        -------
        numpy.ndarray
            New array with the shape of X. If both percentiles coincide
            (e.g. constant input) an all-zero array is returned instead of
            dividing by zero.
        '''
        upper = np.percentile(X, 100 - p)
        lower = np.percentile(X, p)
        median = np.median(X)

        bounded_X = np.where(X > upper, median, X)
        bounded_X = np.where(X < lower, median, bounded_X)

        spread = upper - lower
        if spread == 0:
            # Degenerate range: avoid producing NaN/inf features.
            return np.zeros(np.shape(bounded_X), dtype=float)
        return (bounded_X - lower) / spread

    def normalise_channels(self, X, p=1):
        '''
        Apply normalise_X independently to every slice along the last axis.

        Returns a new float array; the input is left untouched.
        '''
        X = np.asarray(X)
        scaled = np.empty(X.shape, dtype=float)
        for i in range(X.shape[-1]):
            scaled[..., i] = self.normalise_X(X[..., i], p)
        return scaled

    def _previous_training_table(self, log_file, metrics):
        '''
        Read the CSV training log and return the columns worth displaying.

        Only columns that actually exist in the log are selected: fit() below
        is called without validation data, so the ``val_*`` columns are
        normally absent and selecting them unconditionally raises KeyError.
        Epoch numbers are shifted to start at 1.
        '''
        wanted = ['epoch', 'loss'] + list(metrics) + ['val_' + m for m in metrics]
        try:
            history = pd.read_csv(log_file)
        except pd.errors.EmptyDataError:
            # Log file was created but no epoch was ever written to it.
            return pd.DataFrame(columns=['epoch', 'loss'] + list(metrics))

        table = history[[c for c in wanted if c in history.columns]].copy()
        if 'epoch' in table.columns:
            table['epoch'] += 1
        return table

    def run(self, X, y, model_path, epochs=10, batch_size=10, overwrite=False):
        '''
        Train a model on (X, y), resuming from files on disk when present.

        With ``overwrite=True`` the saved model, its log and the files inside
        the backup directory are deleted first. Otherwise an existing model at
        ``model_path`` is loaded, the previous training log (if any) is
        displayed, and fit() is called with a BackupAndRestore callback
        pointing at the backup directory.

        X is normalised per slice of its last axis on a copy, so the caller's
        array is not modified and re-running the call does not normalise
        already-normalised data.

        Parameters
        ----------
        X : numpy.ndarray
            Features; ``X.shape[1:]`` is used as the model input shape.
        y : numpy.ndarray
            Targets; ``y.shape[1]`` is the number of output units.
        model_path : pathlib.Path
            Where the best model is saved. The log is written next to it with
            a ``.log`` suffix and backups go to a sibling directory named
            after the file stem.
        epochs, batch_size : int
            Passed to fit().
        overwrite : bool
            Delete previous artefacts before training.

        Returns
        -------
        The History object returned by ``model.fit``.
        '''
        model_savepoint = model_path.parent.joinpath(model_path.stem)
        log_file = model_path.with_suffix('.log')

        if overwrite:
            for f in [model_path, log_file] + list(model_savepoint.glob('*')):
                f.unlink(missing_ok=True)

        # Work on a normalised copy instead of mutating the caller's array.
        X = self.normalise_channels(X)

        default_metrics = ['accuracy', 'root_mean_squared_error', 'r2_score']

        if model_path.is_file():
            model = tf.keras.models.load_model(model_path)
        else:
            model = self.build_model(y.shape[1], X.shape[1:], default_metrics)

        callbacks = [
            # Per-epoch backup that is kept after training finishes.
            tf.keras.callbacks.BackupAndRestore(
                model_savepoint, save_freq='epoch', delete_checkpoint=False
            ),
            tf.keras.callbacks.CSVLogger(log_file, append=True),
            # Keep only the best model by training accuracy, and only once
            # it beats the 0.4 threshold.
            tf.keras.callbacks.ModelCheckpoint(
                model_path, monitor='accuracy', save_best_only=True,
                save_freq='epoch', initial_value_threshold=0.4
            )
        ]

        if log_file.is_file():
            previous = self._previous_training_table(log_file, default_metrics)
            print('Previous training:')
            display(HTML(previous.to_html(index=False)))

        # shuffle=False: batches follow the order of X as given.
        return model.fit(
            X, y, epochs=epochs, verbose=1, batch_size=batch_size,
            callbacks=callbacks, shuffle=False
        )

    def build_model(self, output_shape, input_shape, metrics):
        '''
        Create and compile the network.

        Layers: Input -> ConvLSTM2D (8 filters, 3x3 kernel, 'same' padding,
        tanh, L2 0.001, last time step only, unrolled) -> Flatten ->
        Dense(output_shape, softmax). Compiled with the rmsprop optimizer and
        MSE loss.

        Parameters
        ----------
        output_shape : int
            Number of units in the final Dense layer.
        input_shape : tuple
            Shape of one sample (without the batch axis).
        metrics : list
            Metrics passed to compile().
        '''
        m = tf.keras.Sequential()

        m.add(Input(input_shape))

        m.add(ConvLSTM2D(
            filters=8, kernel_size=3,
            padding='same', activation='tanh',
            kernel_regularizer=l2(0.001),
            return_sequences=False,
            unroll=True,
        ))
        m.add(Flatten())
        m.add(Dense(output_shape, activation='softmax'))
        m.compile(optimizer='rmsprop', loss='mse', metrics=metrics)

        return m

In [7]:
# Years and seasons that make up the training set.
train_years = [2017, 2018, 2019]
all_seasons = [3, 6, 9, 12]

# The file name encodes which seasons and years the model was trained on.
seasons_tag = "_".join(str(season) for season in all_seasons)
years_tag = "_".join(str(year) for year in train_years)
model_name = f'lstm_seasons_{seasons_tag}_years_{years_tag}.keras'
model_path = model_dir / model_name

train_features = get_features(train_years, all_seasons, data_dir, shuffled_indices, label_year)

In [8]:
# Train from scratch (overwrite=True removes any previous model/log first).
creator = KerasModelCreator()
result = creator.run(
    X=train_features,
    y=labels[shuffled_indices],
    model_path=model_path,
    epochs=5,
    batch_size=10,
    overwrite=True,
)

Epoch 1/5
[1m5039/5039[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - accuracy: 0.7315 - loss: 0.0223 - r2_score: 0.2333 - root_mean_squared_error: 0.1396
Epoch 2/5
[1m5039/5039[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 4ms/step - accuracy: 0.7556 - loss: 0.0186 - r2_score: 0.2815 - root_mean_squared_error: 0.1331
Epoch 3/5
[1m5039/5039[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 4ms/step - accuracy: 0.7634 - loss: 0.0181 - r2_score: 0.2667 - root_mean_squared_error: 0.1311
Epoch 4/5
[1m5039/5039[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step - accuracy: 0.7622 - loss: 0.0180 - r2_score: 0.2894 - root_mean_squared_error: 0.1309
Epoch 5/5
[1m5039/5039[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 4ms/step - accuracy: 0.7750 - loss: 0.0170 - r2_score: 0.3070 - root_mean_squared_error: 0.1271


In [9]:
# Evaluation years are the training years plus an offset; the offset is 0
# here, so the model is scored on the same years it was trained on.
test_years = [train_year + 0 for train_year in train_years]
test_features = get_features(test_years, all_seasons, data_dir, shuffled_indices, label_year)

# Normalise every slice of the last axis independently, writing the result
# back into test_features.
normaliser = KerasModelCreator()
for i in range(test_features.shape[-1]):
    test_features[..., i] = normaliser.normalise_X(test_features[..., i])

# Score the checkpoint saved during training.
model = tf.keras.models.load_model(model_path)

model.evaluate(test_features, labels[shuffled_indices], return_dict=True)

[1m1575/1575[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.0306 - loss: 0.0592 - r2_score: -23.0126 - root_mean_squared_error: 0.2430


{'accuracy': 0.03507274389266968,
 'loss': 0.05818498507142067,
 'r2_score': -2.5639801025390625,
 'root_mean_squared_error': 0.24083393812179565}

In [13]:
# Year attached to every sample. This stays a per-sample year array (it used
# to be overwritten with row indices), because get_features compares it
# against the requested year(s).
label_year = df['YEAR'].astype(int).to_numpy()

test_year = 2020
# Row positions of the samples from the test year, for subsetting features
# and labels later on.
test_indices = np.where(label_year == test_year)[0]

# Identity order: shuffling is disabled here, so rows of the feature array
# line up with rows of `labels` and with `test_indices`.
shuffled_indices = np.arange(labels.shape[0])

def get_features(year, seasons, data_dir, shuffled_indices, label_year):
    '''
    Assemble a per-sample stack of seasonal feature patches.

    ``year`` may be a single year or an iterable of years, so calls that pass
    a list of years keep working after this cell has been executed. For every
    (season, year) pair the file
    ``processed_treesat_<year><season zero-padded to 2 digits>.npy`` is read
    from ``data_dir`` and the rows whose ``label_year`` equals that year are
    copied into the output; all other rows stay zero.

    Returns a float array of shape ``(len(label_year), len(seasons), 6, 6, 13)``
    with rows reordered by ``shuffled_indices``.
    '''
    # The loop previously iterated over an undefined name `years`.
    years = [year] if np.isscalar(year) else list(year)

    selected_data = np.zeros((label_year.shape[0], len(seasons), 6, 6, 13))
    for s, season in enumerate(seasons):
        for current_year in years:
            filepath = data_dir.joinpath(f'processed_treesat_{current_year}{str(season).zfill(2)}.npy')
            with open(filepath, 'rb') as f:
                data = np.load(f)

            rows = np.flatnonzero(label_year == current_year)
            selected_data[rows, s, ...] = data[rows]

    return selected_data[shuffled_indices]

# Full-length array; only rows in `test_indices` carry data, the rest are zero.
# NOTE(review): use test_features[test_indices] / labels[test_indices] if only
# the test-year samples should be evaluated - confirm the intended protocol.
test_features = get_features(test_year, all_seasons, data_dir, shuffled_indices, label_year)


In [10]:
# Reload the saved checkpoint and print its layer summary.
saved_model = tf.keras.models.load_model(model_path)
saved_model.summary()

In [11]:
# import subprocess
# subprocess.run(['sudo', 'shutdown', 'now'])

In [12]:
# season_combinations = itertools.chain.from_iterable(
#     itertools.combinations(all_seasons, r) for r in range(1, len(all_seasons)+1))
# season_combinations = list(season_combinations)