In [1]:
# import warnings
# warnings.simplefilter(action='ignore', category=FutureWarning)

from IPython.display import display
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import datetime
from pathlib import Path

import intake

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report, accuracy_score

import pickle

import tensorflow as tf

import lightgbm

In [2]:
# Open the intake catalog that lives one directory up and pick its 'treesat' entry.
catalog = intake.open_catalog(Path('../catalog.yml'))
source = catalog.treesat

# Read the entry and keep only the columns listed under its 'usecols' metadata key.
gdf = source.read()[source.metadata['usecols']]

# Declare the coordinates as EPSG:25832, then reproject them to EPSG:4326.
# NOTE(review): recent geopandas releases prefer
# `gdf.set_crs(epsg=25832, allow_override=True)` over assigning `.crs` directly —
# confirm against the installed version before switching.
gdf.crs = 25832
gdf = gdf.to_crs(epsg=4326)

In [3]:
# Column name stored under metadata['categories']['generic'] of the catalog entry;
# KerasModelCreator.run reads this column as the classification label.
target = source.metadata['categories']['generic']
# Convert that column to pandas' categorical dtype so `.cat.codes` is available later.
gdf[target] = gdf[target].astype('category')

In [4]:
# Band names of interest: B2 through B8, then B8A, B11, B12 and the three TCI channels.
selected_bands = [
    *(f'B{band}' for band in range(2, 9)),
    'B8A', 'B11', 'B12',
    'TCI_R', 'TCI_G', 'TCI_B',
]

In [5]:
class KerasModelCreator:
    """Fit and evaluate a small dense Keras classifier on per-file feature arrays.

    Each input file is loaded, flattened to one row per sample and gap-filled.
    A model can then be trained per file, on a selected subset of files
    (``chunk_keys``) and/or on all files together (``combined``).
    """

    def __init__(self):
        # Maps a file's stem to the percentage of its values that are exactly 0.
        self.zeros = {}

    def load_and_fill_data(self, filepath):
        """Load a numpy file and replace masked values with the mean.

        Parameters
        ----------
        filepath : pathlib.Path
            File readable by ``np.load``; its ``stem`` is used as the key in
            ``self.zeros``.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(len(a), -1)`` in which every value that is not
            strictly positive has been replaced by the mean of the strictly
            positive values.
        """
        print(f'Loading {filepath}')
        with open(filepath, 'rb') as f:
            a = np.load(f)

        # Round trip through Python lists kept from the original implementation
        # (presumably to normalise the dtype — TODO confirm), then flatten every
        # sample into a single feature row.
        data_array = np.array(a.tolist()).reshape(len(a), -1)

        # The reported percentage counts exact zeros only ...
        self.zeros[filepath.stem] = 100*(data_array == 0).sum()/data_array.size
        print(f'{self.zeros[filepath.stem]:.1f}% of dataset is masked.')

        # ... while the fill below replaces every non-positive value.
        valid = data_array > 0
        nonzero_mean = data_array[valid].mean()
        return np.where(valid, data_array, nonzero_mean)

    def build_model(self, n_classes=10):
        """Build and compile the classifier.

        Parameters
        ----------
        n_classes : int, default 10
            Number of output logits of the final dense layer.

        Returns
        -------
        tf.keras.Sequential
            One hidden ReLU layer with 128 units followed by a logits layer,
            compiled with Adam, sparse categorical cross-entropy (from logits)
            and the accuracy metric.
        """
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(128, activation='relu'),
            tf.keras.layers.Dense(n_classes)
        ])

        model.compile(
            optimizer='adam',
            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
            metrics=['accuracy'])

        return model

    def fit_eval(self, y, X, array_key, random_state=42):
        """Train a fresh model on a 90/10 split of ``X``/``y`` and evaluate it.

        Parameters
        ----------
        y : array-like
            Integer class labels, one per row of ``X``.
        X : array-like
            Feature matrix.
        array_key : str
            Name used in the progress messages.
        random_state : int, default 42
            Seed forwarded to ``train_test_split``.

        Returns
        -------
        The value returned by ``model.evaluate`` on the held-out 10 %
        (previously discarded).
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=random_state)

        # Fit the scaler on the training part only, then apply it to the test part.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model = self.build_model()

        print(f'Fitting {array_key}...')
        model.fit(X_train, y_train, epochs=10, verbose=0)
        print(f'Evaluating {array_key}...')
        return model.evaluate(X_test, y_test, verbose=2)

    @staticmethod
    def _stack_with_labels(arrays, labels):
        """Concatenate ``arrays`` row-wise and build the matching label vector.

        ``np.concatenate`` places the arrays one after another, so the labels
        must be repeated as whole blocks (``np.tile``), not element by element
        (``np.repeat``). Assumes every array has one row per entry of ``labels``.
        """
        return np.concatenate(arrays), np.tile(labels, len(arrays))

    def _fit_eval_group(self, labels, arrays, name):
        """Evaluate a group of arrays twice: averaged and stacked row-wise."""
        self.fit_eval(labels, np.mean(arrays, axis=0), f'{name}_mean')

        stacked_X, stacked_y = self._stack_with_labels(arrays, labels)
        self.fit_eval(stacked_y, stacked_X, f'{name}_stacked')

    def run(self, gdf, filepaths, individual=True, combined=False, chunk_keys=None,
            target_col=None):
        """Load every file and run the requested evaluations.

        Parameters
        ----------
        gdf : DataFrame
            Frame holding the categorical label column.
        filepaths : iterable of pathlib.Path
            Files to load; each file's stem becomes its key.
        individual : bool, default True
            Fit and evaluate one model per file.
        combined : bool, default False
            Fit and evaluate on all loaded files (averaged, then stacked).
        chunk_keys : list of str, optional
            File stems to fit and evaluate together (averaged, then stacked).
        target_col : str, optional
            Name of the label column. Falls back to the module-level ``target``
            when omitted, which is what the original implementation always used.

        Returns
        -------
        dict
            ``self.zeros``: percentage of exact zeros per loaded file.
        """
        label_col = target if target_col is None else target_col
        labels = gdf[label_col].cat.codes.to_numpy()

        data_dict = {}
        for filepath in filepaths:
            array_key = filepath.stem
            data_dict[array_key] = self.load_and_fill_data(filepath)

            if individual:
                self.fit_eval(labels, data_dict[array_key], array_key)

        if chunk_keys:
            self._fit_eval_group(
                labels, [data_dict[k] for k in chunk_keys], 'chunk')

        # Skip when nothing was loaded: there is nothing to average or stack.
        if combined and data_dict:
            self._fit_eval_group(labels, list(data_dict.values()), 'combined')

        return self.zeros

In [None]:
%%time
# filepaths = [Path('data').joinpath('treesat_042019.npy')]
filepaths = Path('data').glob('treesat_*.npy')
report_dfs = KerasModelCreator().run(
    gdf, filepaths, individual=True, combined=False, chunk_keys=None)

Loading data\treesat_012018.npy
31.5% of dataset is masked.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Fitting treesat_012018...
Evaluating treesat_012018...
5039/5039 - 0s - loss: 1.8063 - acc: 0.3312
Loading data\treesat_012019.npy
5.8% of dataset is masked.
Fitting treesat_012019...
Evaluating treesat_012019...
5039/5039 - 0s - loss: 1.5603 - acc: 0.4372
Loading data\treesat_012020.npy
2.8% of dataset is masked.
Fitting treesat_012020...
Evaluating treesat_012020...
5039/5039 - 0s - loss: 1.5709 - acc: 0.4487
Loading data\treesat_012021.npy
23.5% of dataset is masked.
Fitting treesat_012021...
Evaluating treesat_012021...
5039/5039 - 0s - loss: 1.7323 - acc: 0.3663
Loading data\treesat_012022.npy
0.0% of dataset is masked.
Fitting treesat_012022...
Evaluating treesat_012022...
5039/5039 - 0s - loss: 1.8602 - acc: 0.3362
Loading data\treesat_012023.npy
0.0% of dataset is masked.
Fitting treesat_012023...
Evalu