In [16]:
from IPython.display import display
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import datetime
from pathlib import Path

import intake

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report, accuracy_score

import pickle

import tensorflow as tf

import lightgbm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Open the intake catalog stored one directory above the notebook and fetch
# its 'treesat' entry.
catalog = intake.open_catalog(Path('../catalog.yml'))
source = getattr(catalog, 'treesat')
# Read the entry and keep only the columns listed under 'usecols' in the
# entry's metadata.
gdf = source.read()[source.metadata['usecols']]
# Declare the coordinate reference system before reprojecting.
# NOTE(review): 25832 is presumably an EPSG code, and the assignment presumably
# exists because the data is read without CRS information - confirm.
gdf.crs = 25832
# Reproject the geometries to EPSG:4326.
gdf = gdf.to_crs(epsg=4326)

In [3]:
# Column key used as the classification label, taken from the catalog entry's
# metadata ('categories' -> 'generic').
# NOTE(review): presumably a single column name - confirm against catalog.yml.
target = source.metadata['categories']['generic']
# Store the label column(s) with pandas' categorical dtype.
gdf[target] = gdf[target].astype('category')

In [4]:
# Band names to use: B2 through B8, followed by B8A, B11, B12 and the three
# TCI channels.
selected_bands = [
    *(f'B{band_no}' for band_no in range(2, 9)),
    'B8A', 'B11', 'B12',
    'TCI_R', 'TCI_G', 'TCI_B',
]

In [5]:
class LightModelCreator:
    """Train and score one-vs-rest LightGBM classifiers on saved feature arrays.

    The label column is looked up through the notebook-level ``target``
    variable, which must be defined before ``run_and_eval`` is called.

    Attributes:
        zeros: Maps each loaded file's stem to the percentage of its values
            that are exactly zero (reported as "masked").
        scores: Maps each experiment name (a file stem, ``'chunk_mean'``,
            ``'chunk_concat'``, ``'all_mean'`` or ``'all_concat'``) to the
            hold-out accuracy from the most recent ``run_and_eval`` call.
    """

    def __init__(self):
        self.zeros = {}
        self.scores = {}

    def load_and_fill_data(self, filepath):
        """Load a ``.npy`` file, flatten it per sample and fill masked values.

        Args:
            filepath: ``pathlib.Path`` of the array file; its ``stem`` is used
                as the key under which the masked percentage is recorded.

        Returns:
            A 2-D array with one row per entry along the first axis of the
            stored array. Every value that is not strictly positive is
            replaced by the mean of the strictly positive values.
        """
        print(f'Loading {filepath}')
        with open(filepath, 'rb') as f:
            a = np.load(f)

        # Round-trip through a Python list, then collapse everything except the
        # first axis into a single feature axis.
        data_array = np.array(a.tolist()).reshape(len(a), -1)

        self.zeros[filepath.stem] = 100*(data_array == 0).sum()/data_array.size
        print(f'{self.zeros[filepath.stem]:.1f}% of dataset is masked.')
        nonzero_mean = data_array[data_array > 0].mean()
        return np.where(data_array > 0, data_array, nonzero_mean)

    def train(self, X_train, y_train):
        """Fit a one-vs-rest wrapper around a GPU LightGBM classifier.

        Args:
            X_train: Training feature matrix.
            y_train: Training labels aligned with the rows of ``X_train``.

        Returns:
            The fitted ``OneVsRestClassifier``.
        """
        params = dict(
            verbose = 0,
            device_type = 'gpu'
        )

        lgb = lightgbm.LGBMClassifier(**params)

        stacked_model = OneVsRestClassifier(lgb)

        stacked_model.fit(X_train, y_train)

        return stacked_model

    def predict(self, y, X, array_key, random_state=42):
        """Train on 90% of the samples and report accuracy on the other 10%.

        Note the argument order: labels first, features second.

        Args:
            y: Labels aligned with the rows of ``X``.
            X: Feature matrix.
            array_key: Name of the experiment, used only in progress messages.
            random_state: Seed forwarded to ``train_test_split``.

        Returns:
            Accuracy of the fitted model on the held-out split.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=random_state)

        print(f'Training {array_key}...')
        stacked_model = self.train(X_train, y_train)

        print(f'Evaluating {array_key}...')
        y_pred = stacked_model.predict(X_test)

        # Score once and reuse the value for both the message and the return.
        score = accuracy_score(y_test, y_pred)
        print(f'accuracy_score {score:.2f}')

        print()

        return score

    def _mean_and_concat(self, labels, arrays, prefix):
        """Score the element-wise mean and the row-wise stack of ``arrays``."""
        scores = {}

        mean_key = f'{prefix}_mean'
        scores[mean_key] = self.predict(labels, np.mean(arrays, axis=0), mean_key)

        concat_key = f'{prefix}_concat'
        stacked = np.concatenate(arrays)
        # np.concatenate stacks whole arrays along axis 0, so the labels have
        # to be stacked the same way (one full copy per array). np.repeat
        # would duplicate each label in place and misalign labels and rows.
        stacked_labels = np.concatenate([labels] * len(arrays))
        # NOTE(review): the same sample can now land in both the train and the
        # test split (once per array); confirm this is acceptable.
        scores[concat_key] = self.predict(stacked_labels, stacked, concat_key)

        return scores

    def run_and_eval(self, gdf, filepaths, individual=True, all=False, chunk_keys=None):
        """Load every array file and run the requested experiments.

        Args:
            gdf: Frame whose ``target`` column(s) hold the labels, row-aligned
                with the first axis of every array file.
            filepaths: Iterable of ``pathlib.Path`` objects pointing at ``.npy``
                files.
            individual: If true, score each file on its own under its stem.
            all: If true, score the mean and the row-wise stack of all files
                (``'all_mean'`` and ``'all_concat'``).
            chunk_keys: Optional list of file stems; if given, score the mean
                and the row-wise stack of just those files (``'chunk_mean'``
                and ``'chunk_concat'``).

        Returns:
            ``self.zeros``, the per-file percentage of zero values. The
            accuracies are available afterwards as ``self.scores``.
        """
        labels = gdf[target].to_numpy()
        report_dfs = {}
        data_dict = {}
        for filepath in filepaths:
            array_key = filepath.stem
            data_array = self.load_and_fill_data(filepath)

            data_dict[array_key] = data_array

            if individual:
                report_dfs[array_key] = self.predict(labels, data_array, array_key)

        if chunk_keys:
            chunk_arrays = [data_dict[k] for k in chunk_keys]
            report_dfs.update(self._mean_and_concat(labels, chunk_arrays, 'chunk'))

        if all:
            report_dfs.update(
                self._mean_and_concat(labels, list(data_dict.values()), 'all'))

        # Keep the accuracies reachable; the return value stays unchanged.
        self.scores = report_dfs

        return self.zeros

In [6]:
# %%time
# filepaths = sorted(list(Path('data').glob('*.npy')))
# report_dfs = LightModelCreator().run_and_eval(gdf, filepaths, individual=True, all=True, chunk_keys=None)

Loading data\treesat_042019.npy
0.0% of dataset is masked.
Training treesat_042019...
Evaluating treesat_042019...
accuracy_score 0.60

Wall time: 43.3 s
