In [1]:
from IPython.display import display
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import datetime
from pathlib import Path

import intake

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report, accuracy_score

import pickle

import tensorflow as tf
from tensorflow.keras import layers

2024-03-23 00:28:27.787499: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Open the project's intake catalog and select its `treesat` entry.
catalog = intake.open_catalog(Path('../catalog.yml'))
source = catalog.treesat

# Read the entry and keep only the columns listed under `usecols` in its metadata.
gdf = source.read()[source.metadata['usecols']]

# Declare the frame's CRS (the integer is presumably read as an EPSG code --
# TODO confirm), then reproject to EPSG:4326.
gdf.crs = 25832
gdf = gdf.to_crs(epsg=4326)

In [3]:
# Name of the label column, taken from the 'generic' entry of the source's
# category metadata (the inline note suggests 'general' is the other option).
target = source.metadata['categories']['generic'] # generic / general
# Store the label column as a pandas categorical so that integer class codes
# can later be read from `.cat.codes`.
gdf[target] = gdf[target].astype('category')

In [4]:
# Band names: B2..B8, then B8A, B11, B12 and the three TCI channels.
selected_bands = [
    *(f'B{band_number}' for band_number in range(2, 9)),
    'B8A', 'B11', 'B12',
    'TCI_R', 'TCI_G', 'TCI_B',
]

In [13]:
class KerasModelCreator:
    """Train and evaluate a small Keras CNN on arrays stored as ``.npy`` files.

    One model is built per ``run`` call and shared by every file, so training
    on a later file continues from the weights learned on the earlier ones.
    Input arrays are assumed to be shaped (samples, height, width, bands),
    which is what the ``Conv2D`` layers require -- TODO confirm against the
    files on disk.
    """

    def load_data(self, filepath):
        """Load a ``.npy`` file and return its content as a regular ndarray.

        Args:
            filepath: Path (or path string) of the file to read.

        Returns:
            numpy.ndarray rebuilt from the loaded array's ``tolist()``.
        """
        with open(filepath, 'rb') as f:
            a = np.load(f)
        # Array comes stacked, use tolist to unstack
        return np.array(a.tolist())

    @staticmethod
    def _fill_and_scale(X, fill_value, band_max_values):
        """Replace non-positive entries by ``fill_value`` and divide each band by its maximum."""
        return np.where(X > 0, X, fill_value) / band_max_values

    def split_and_preprocess(self, y, X, random_state=42):
        """Split into train/test, fill masked values with the mean, and max-scale.

        Entries that are not strictly positive are treated as masked and are
        replaced by the mean of the positive training values (one global mean,
        not one per band). Every band (last axis) is then divided by its
        training maximum. Both statistics come from the training split only,
        so the test split is transformed exactly like the data the model saw.

        Args:
            y: Labels, one per sample of ``X``.
            X: Feature array whose last axis indexes bands.
            random_state: Seed forwarded to ``train_test_split``.

        Returns:
            Tuple ``(X_train, X_test, y_train, y_test)``; 10% of the samples
            go to the test split.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=random_state)

        # Per-band maximum: reduce over every axis except the last one.
        leading_axes = tuple(range(X_train.ndim - 1))
        band_max_values = X_train.max(axis=leading_axes)
        # A band without any positive training value would divide by zero.
        band_max_values = np.where(band_max_values > 0, band_max_values, 1)

        positive_values = X_train[X_train > 0]
        # Avoid a NaN fill value when the training split is fully masked.
        fill_value = positive_values.mean() if positive_values.size else 0.0

        X_train = self._fill_and_scale(X_train, fill_value, band_max_values)
        X_test = self._fill_and_scale(X_test, fill_value, band_max_values)

        return X_train, X_test, y_train, y_test

    def build_model(self, num_classes):
        """Build and compile the CNN classifier.

        Three Conv2D + MaxPooling2D stages (16, 32, 64 filters) are followed
        by a 128-unit dense layer and a ``num_classes``-unit output layer. The
        output layer has no activation, so the model emits logits and the loss
        is configured with ``from_logits=True``.

        Args:
            num_classes: Number of units of the output layer.

        Returns:
            The compiled ``tf.keras.Sequential`` model.
        """
        padding = 'same'
        activation = 'relu'
        model = tf.keras.Sequential([
            layers.Conv2D(16, (3, 3), padding=padding, activation=activation),
            layers.MaxPooling2D(padding=padding),
            layers.Conv2D(32, (3, 3), padding=padding, activation=activation),
            layers.MaxPooling2D(padding=padding),
            layers.Conv2D(64, (3, 3), padding=padding, activation=activation),
            layers.MaxPooling2D(pool_size=2),
            layers.Flatten(),
            layers.Dense(128, activation=activation),
            layers.Dense(num_classes)
        ])

        model.compile(
            optimizer='adam',
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy']
        )

        return model

    def fit_eval(self, y, X, array_key, model, epochs=20):
        """Split and preprocess one array, fit ``model`` on it and evaluate it.

        Args:
            y: Labels, one per sample of ``X``.
            X: Feature array for a single file.
            array_key: Name printed in the progress message.
            model: Compiled Keras model; it is trained in place.
            epochs: Number of training epochs.

        Returns:
            Tuple ``(X_test, y_test)`` holding the preprocessed held-out split.
        """
        X_train, X_test, y_train, y_test = self.split_and_preprocess(y, X)

        print(f'Fitting {array_key}...', end=' ')
        model.fit(X_train, y_train, epochs=epochs, verbose=1)

        model.evaluate(X_test, y_test, verbose=2)

        return X_test, y_test

    def run(self, labels, filepaths, combined=False, epochs=20, num_bands=3):
        """Train one shared model on every file and report zero percentages.

        Args:
            labels: Integer class labels exposing ``.unique()`` (e.g. a pandas
                Series); the number of distinct values sizes the output layer.
            filepaths: Iterable of ``pathlib.Path`` objects pointing at ``.npy``
                files; each file's ``stem`` is used as its key.
            combined: When true, additionally evaluate the final model on the
                test splits of all files stacked together.
            epochs: Number of training epochs per file.
            num_bands: Number of leading bands (last axis) kept from each array.

        Returns:
            Dict mapping each file stem to the percentage of entries equal to
            zero in its band-sliced array.
        """
        zeros = {}

        y_test_combined = []
        X_test_combined = []
        model = self.build_model(labels.unique().size)

        for filepath in filepaths:
            X_key = filepath.stem

            X = self.load_data(filepath)
            X = X[..., :num_bands]

            X_test, y_test = self.fit_eval(labels, X, X_key, model, epochs=epochs)
            y_test_combined.append(y_test)
            X_test_combined.append(X_test)

            zeros[X_key] = 100*(X == 0).sum()/X.size

        if combined and X_test_combined:
            # Keras reads a list of arrays as several model inputs, so the
            # per-file splits are stacked along the sample axis first.
            model.evaluate(
                np.concatenate(X_test_combined),
                np.concatenate(y_test_combined),
                verbose=2)

        return zeros

In [14]:
%%time
# Only one array is processed here; to run over every treesat_* file use
# sorted(Path('data').glob('treesat_*.npy')) instead.
filepaths = [Path('data') / 'treesat_042019.npy']

# Labels are the integer codes of the categorical target column.
zeros = KerasModelCreator().run(
    labels=gdf[target].cat.codes,
    filepaths=filepaths,
    combined=True,
)

Fitting treesat_042019... Epoch 1/20
1417/1417 ━━━━━━━━━━━━━━━━━━━━ 6s 3ms/step - accuracy: 0.2463 - loss: 1.9955
Epoch 2/20
1417/1417 ━━━━━━━━━━━━━━━━━━━━ 4s 3ms/step - accuracy: 0.3825 - loss: 1.6810
Epoch 3/20
1417/1417 ━━━━━━━━━━━━━━━━━━━━ 4s 3ms/step - accuracy: 0.4258 - loss: 1.5877
Epoch 4/20
1417/1417 ━━━━━━━━━━━━━━━━━━━━ 4s 3ms/step - accuracy: 0.4339 - loss: 1.5533
Epoch 5/20
1417/1417 ━━━━━━━━━━━━━━━━━━━━ 4s 3ms/step - accuracy: 0.4421 - loss: 1.5332
Epoch 6/20
1417/1417 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.4455 - loss: 1.5196
Epoch 7/20
1417/1417 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.4524 - loss: 1.5104
Epoch 8/20
[1m1417/1417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4499 - loss: 1.503