In [1]:
from IPython.display import display
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import datetime
from pathlib import Path

import intake

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, accuracy_score

import pickle

import lightgbm
from flaml.default import LGBMClassifier

import matplotlib.pyplot as plt

from tqdm import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Open the project catalog and read the 'treesat' source, keeping only the
# columns listed under `usecols` in the source metadata.
catalog = intake.open_catalog(Path('../catalog.yml'))
source = catalog.treesat
gdf = source.read()[source.metadata['usecols']]
# Declare the coordinates as EPSG:25832 (replacing any CRS already attached,
# without transforming coordinates), then reproject to EPSG:4326.
# `set_crs(..., allow_override=True)` is the supported way to override a CRS;
# assigning to `.crs` directly is deprecated when a CRS is already set.
gdf = gdf.set_crs(epsg=25832, allow_override=True).to_crs(epsg=4326)

In [3]:
# Name of the label column, as declared in the source metadata.
target = source.metadata['categories']['generic']
# Store the labels using pandas' categorical dtype.
gdf[target] = gdf.loc[:, target].astype('category')

In [4]:
# Band names: 'B2' .. 'B8' generated from their index, followed by the
# explicitly named bands and the three TCI channels.
selected_bands = [
    *(f'B{band_idx}' for band_idx in range(2, 9)),
    'B8A', 'B11', 'B12',
    'TCI_R', 'TCI_G', 'TCI_B',
]

In [5]:
class ModelCreator:
    """Load per-file feature arrays, fit a one-vs-rest SVC and report metrics.

    The class keeps no state; the methods are grouped here so a whole
    experiment can be run with ``ModelCreator().run_and_eval(...)``.
    """

    def load_and_fill_data(self, filepath):
        """Load a ``.npy`` file and replace non-positive cells with the mean.

        Args:
            filepath: Path of a file readable by ``np.load``.

        Returns:
            A 2-D array with one row per entry along the first axis of the
            loaded array (all remaining axes flattened), in which every value
            ``<= 0`` has been replaced by the mean of the whole array.
        """
        print(f'Loading {filepath}')
        with open(filepath, 'rb') as f:
            raw = np.load(f)

        # The list round-trip is kept from the original implementation;
        # presumably it normalises the stored dtype -- TODO confirm.
        data_array = np.array(raw.tolist()).reshape(len(raw), -1)
        # NOTE: the fill value is the mean over *all* cells, including the
        # non-positive ones that are being replaced.
        return np.where(data_array > 0, data_array, data_array.mean())

    def train(self, X_train, y_train):
        """Fit and return a ``OneVsRestClassifier`` wrapping a default ``SVC``.

        Args:
            X_train: Feature matrix, one row per sample.
            y_train: Labels aligned with the rows of ``X_train``.

        Returns:
            The fitted ``OneVsRestClassifier``.
        """
        # Base estimators tried previously during development:
        #   lightgbm.LGBMClassifier(boosting_type='rf', verbose=0)
        #   RandomForestClassifier()
        stacked_model = OneVsRestClassifier(SVC())
        stacked_model.fit(X_train, y_train)
        return stacked_model

    def predict(self, y, X, array_key, random_state=42, test_size=0.1):
        """Split the data, train on one part and evaluate on the held-out part.

        Args:
            y: Labels aligned with the rows of ``X``.
            X: Feature matrix, one row per sample.
            array_key: Name used only in the progress messages.
            random_state: Seed forwarded to ``train_test_split``.
            test_size: Share of rows held out for evaluation (default 0.1).

        Returns:
            The ``classification_report`` for the held-out rows as a
            DataFrame with one row per class/aggregate entry.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state)

        print(f'Training {array_key}...')
        stacked_model = self.train(X_train, y_train)

        print(f'Evaluating {array_key}...')
        y_pred = stacked_model.predict(X_test)

        print(f'accuracy_score {accuracy_score(y_test, y_pred)}')
        print()

        report = classification_report(y_test, y_pred, output_dict=True)
        return pd.DataFrame(report).T

    def _evaluate_combined(self, labels, arrays, prefix):
        """Evaluate the stacked and the averaged combination of ``arrays``.

        Args:
            labels: Labels for the rows of a single array.
            arrays: List of equally shaped 2-D feature arrays.
            prefix: Key prefix; results are stored as ``<prefix>_concat``
                and ``<prefix>_mean``.

        Returns:
            Dict mapping the two result keys to their report DataFrames.

        Raises:
            ValueError: If ``arrays`` is empty.
        """
        if not arrays:
            raise ValueError(f'No arrays available for {prefix!r} evaluation')

        stacked = np.concatenate(arrays)
        # np.concatenate puts whole arrays one after another, so the label
        # vector has to be repeated as a block (tile). np.repeat would
        # duplicate each label in place and misalign labels and rows.
        stacked_labels = np.tile(labels, len(arrays))
        # NOTE(review): after stacking, rows that share a label index can end
        # up on both sides of the random train/test split in `predict`.
        concat_key = f'{prefix}_concat'
        results = {concat_key: self.predict(stacked_labels, stacked, concat_key)}

        averaged = np.mean(arrays, axis=0)
        mean_key = f'{prefix}_mean'
        results[mean_key] = self.predict(labels, averaged, mean_key)
        return results

    def run_and_eval(self, gdf, filepaths, individual=True, all=False, chunk_keys=None):
        """Load every file and evaluate models on the requested combinations.

        Args:
            gdf: Frame holding the labels in the column named by the
                module-level ``target``.
            filepaths: Iterable of ``Path`` objects; each file's stem becomes
                its key.
            individual: If true, evaluate each file on its own.
            all: If true, evaluate the concatenation and the mean of all
                loaded arrays (keys ``all_concat`` / ``all_mean``).
            chunk_keys: Optional list of file stems; if given, evaluate the
                concatenation and the mean of just those arrays (keys
                ``chunk_concat`` / ``chunk_mean``).

        Returns:
            Dict mapping each evaluated key to its report DataFrame.

        Raises:
            KeyError: If ``chunk_keys`` names a stem that was not loaded.
        """
        labels = gdf[target].to_numpy()
        report_dfs = {}
        data_dict = {}
        for filepath in filepaths:
            array_key = filepath.stem
            data_array = self.load_and_fill_data(filepath)
            data_dict[array_key] = data_array

            if individual:
                report_dfs[array_key] = self.predict(labels, data_array, array_key)

        if chunk_keys:
            chunk_arrays = [data_dict[key] for key in chunk_keys]
            report_dfs.update(self._evaluate_combined(labels, chunk_arrays, 'chunk'))

        if all:
            every_array = list(data_dict.values())
            report_dfs.update(self._evaluate_combined(labels, every_array, 'all'))

        return report_dfs

In [None]:
%%time
filepaths = Path('data').glob('*2019.npy')
# filepaths = Path('data').glob('*04*.npy')
summer_keys = [f'treesat_0{month}2019' for month in [2, 4, 6]]
report_dfs = ModelCreator().run_and_eval(gdf, filepaths, individual=True, all=False, chunk_keys=None)

Loading data\treesat_012019.npy
Training treesat_012019...


In [7]:
# For every evaluated key, print its name, the column-wise mean of its
# report frame, and a blank separator line.
for array_key, report in report_dfs.items():
    print(array_key, report.mean(), '', sep='\n')

NameError: name 'report_dfs' is not defined