##### **Run LGBM data exploration**

In [1]:
from tqdm.notebook import tqdm

import numpy as np

import datetime
from pathlib import Path

import intake

import joblib

from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import accuracy_score

import lightgbm

import plotly.graph_objects as go

Read in data and ensure correct CRS. Alternatively, use Geopandas directly.

In [2]:
# Open the intake catalog that sits one directory above this notebook and
# select its 'treesat' entry.
catalog = intake.open_catalog(Path('..') / 'catalog.yml')
source = catalog.treesat

# Keep only the columns listed under the entry's `usecols` metadata.
gdf = source.read()[source.metadata['usecols']]

# Tag the frame with EPSG:25832, then reproject it to EPSG:4326.
# NOTE(review): presumably a GeoDataFrame whose coordinates are already in
# EPSG:25832 -- confirm against the catalog entry.
gdf.crs = 25832
gdf = gdf.to_crs(epsg=4326)

In [3]:
# multi (e.g. oak, spruce...) or trinary (broadleaf, needleleaf, cleared)
target = source.metadata['categories']['multi'] 
gdf[target] = gdf[target].astype('category')

In [4]:
# Band names to use: B2..B8 are generated, the remaining band names and the
# three TCI channels are listed explicitly.
selected_bands = [
    *(f'B{band}' for band in range(2, 9)),
    'B8A', 'B11', 'B12',
    'TCI_R', 'TCI_G', 'TCI_B',
]
print(f'{len(selected_bands)} bands will be used: {selected_bands}')

13 bands will be used: ['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12', 'TCI_R', 'TCI_G', 'TCI_B']


LightGBM is a gradient boosting estimator that uses tree based learning algorithms. It is designed for better performance while maintaining or improving accuracy. It is ideal for exploring the large amounts of data generated by Sentinel-2. 

For more information: https://lightgbm.readthedocs.io/en/stable/

In [5]:
class LightModelCreator:
    '''
    Train, cache and score LightGBM classifiers on saved NumPy arrays.

    Scikit-learn's OneVsRestClassifier is used to fit one LGBM classifier
    per class. Fitted models are persisted with joblib below the relative
    `models` directory and re-used on later runs; results are reported as
    accuracy scores.
    '''
    def train(self, X_train, y_train):
        '''
        Fit a one-vs-rest LightGBM classifier.

        Parameters
        ----------
        X_train : training features, passed unchanged to `fit`.
        y_train : training labels, passed unchanged to `fit`.

        Returns
        -------
        The fitted OneVsRestClassifier wrapping a LGBMClassifier.
        '''
        params = dict(
            verbose=0,
            # device_type='gpu' # does not work within WSL
        )
        lgb = lightgbm.LGBMClassifier(**params)

        stacked_model = OneVsRestClassifier(lgb)
        stacked_model.fit(X_train, y_train)
        return stacked_model

    def predict(self, X, y, model_name, random_state=42):
        '''
        Return the hold-out accuracy of the model stored as `model_name`.

        `X` is reshaped to one row per entry of `y` and split 90/10 into
        train and test parts. If `models/<model_name>` exists the model is
        loaded from it; otherwise a new model is trained on the training
        part and saved to that path.

        Parameters
        ----------
        X : array with a `reshape` method whose size is divisible by len(y).
        y : labels, one per sample.
        model_name : file name (relative to `models`) of the cached model.
        random_state : seed forwarded to `train_test_split`.

        Returns
        -------
        Accuracy of the model's predictions on the test part.
        '''
        X = X.reshape(len(y), -1)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.1, random_state=random_state)

        model_path = Path('models').joinpath(model_name)
        if model_path.is_file():
            # joblib.load is pickle-based: only load model files you trust.
            # NOTE(review): the cache key does not include random_state, so
            # a model cached under a different seed may have been trained
            # on samples that are in this call's test part.
            stacked_model = joblib.load(model_path)
        else:
            # Create the output directory before training so a missing
            # folder fails fast instead of after the slow fit.
            model_path.parent.mkdir(parents=True, exist_ok=True)
            stacked_model = self.train(X_train, y_train)
            joblib.dump(stacked_model, model_path)

        y_pred = stacked_model.predict(X_test)

        return accuracy_score(y_test, y_pred)

    def run_and_eval(self, labels, filepaths):
        '''
        Score one model per `.npy` file.

        Parameters
        ----------
        labels : labels shared by every file, one per sample.
        filepaths : iterable of paths (Path or str) to arrays readable
            by `np.load`.

        Returns
        -------
        List of accuracy scores, in the order of `filepaths`.
        '''
        scores = []
        for filepath in tqdm(filepaths, leave=False):
            # Accept plain strings too; `.parent` and `.stem` need a Path.
            filepath = Path(filepath)
            with open(filepath, 'rb') as f:
                data = np.load(f)

            # The cache name combines the containing folder and the file
            # stem, so equal stems in different folders do not collide.
            model_name = f'lgbm_{filepath.parent}_{filepath.stem}.joblib'
            scores.append(
                self.predict(data, labels, model_name)
            )
        return scores

The final image is a mosaic built from a temporal collection of images. A month usually contains about 6 images due to the revisit frequency of the Sentinel-2 satellites. In this case, the images of a given month are reduced to a single image by taking either the temporal mean or the temporal median of each pixel, per band. This is done in order to reduce atmospheric variability, such as cloud cover over the area of interest. 

The below is slow when first training, but the models are saved for re-use when the notebook is re-run.

In [None]:
# One accuracy curve per temporal reduction method, plotted against months 1-12.
months = [*range(1, 13)]
traces = []
monthly_scores = []
for method in tqdm(['mean', 'median']):
    # Monthly 2019 arrays for this method, in sorted file-name order.
    filepaths = sorted(Path(f'data_{method}').glob('processed_treesat_2019*.npy'))
    scores = LightModelCreator().run_and_eval(gdf[target].cat.codes, filepaths)
    # Pool the scores of both methods for the summary statistics.
    monthly_scores += scores
    traces.append(go.Scatter(x=months, y=scores, name=method))

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
# Left as the cell's final expression so the notebook displays the figure.
go.Figure(
    data=traces,
    layout=dict(
        xaxis=dict(title="Month"),
        yaxis=dict(title="Accuracy"),
        title="LGBM accuracies for 2019",
    ),
)

In [None]:
# Arithmetic mean over every collected monthly score.
mean = sum(monthly_scores) / len(monthly_scores)
print(
    f'Monthly accuracy scores range from {min(monthly_scores):.2f} '
    f'to {max(monthly_scores):.2f}, with an average accuracy of {mean:.2f}.'
)

The median appears to produce better results. As such, it is used in subsequent analyses.

Next, perform a similar exploration for the processed seasonal data.

The seasons are taken as:
- winter: December, January, February
- spring: March, April, May
- summer: June, July, August
- autumn: September, October, November

The seasonal data and the median monthly data were processed similarly, with the exception that the seasonal data covers the 3 months of the corresponding season instead of 1 month like the monthly data.

As before, the processing below is slow when first training, but the models are saved for re-use when the notebook is re-run, which should be substantially faster.

In [None]:
# One accuracy curve per year from 2017 to 2023, plotted against the seasons.
seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
traces = []
seasonal_scores = []
for year in tqdm(range(2017, 2024)):
    # NOTE(review): files are taken in sorted file-name order and plotted
    # against `seasons` positionally -- confirm that the naming scheme sorts
    # as Spring, Summer, Autumn, Winter.
    filepaths = sorted(Path('seasonal_median').glob(f'processed*{year}*.npy'))
    scores = LightModelCreator().run_and_eval(gdf[target].cat.codes, filepaths)
    # Pool the scores of all years for the summary statistics.
    seasonal_scores += scores
    traces.append(go.Scatter(x=seasons, y=scores, name=year))

In [None]:
# Left as the cell's final expression so the notebook displays the figure.
go.Figure(
    data=traces,
    layout=dict(
        xaxis=dict(title="Season"),
        yaxis=dict(title="Accuracy"),
        title="LGBM accuracies",
    ),
)

In [None]:
# Arithmetic mean over every collected seasonal score.
mean = sum(seasonal_scores) / len(seasonal_scores)
print(
    f'Seasonal accuracy scores range from {min(seasonal_scores):.2f} '
    f'to {max(seasonal_scores):.2f}, with an average accuracy of {mean:.2f}.'
)

This indicates a significant improvement when compared to the monthly approach.

And using combined seasons across all years available (2017 to 2023 inclusive):

In [None]:
# Score the arrays named after each season, e.g. seasonal_median/Spring.npy.
seasons = ['Spring', 'Summer', 'Autumn', 'Winter']
mean_seasonal_scores = []
for season in tqdm(seasons):
    # The pattern has no wildcard, so the glob yields the season's file if
    # it exists and nothing otherwise.
    filepath = sorted(Path('seasonal_median').glob(f'{season}.npy'))
    score = LightModelCreator().run_and_eval(gdf[target].cat.codes, filepath)
    mean_seasonal_scores += score

In [None]:
# Single curve: accuracy of the combined-years model for each season.
traces = [go.Scatter(x=seasons, y=mean_seasonal_scores)]
# Left as the cell's final expression so the notebook displays the figure.
go.Figure(
    data=traces,
    layout=dict(
        xaxis=dict(title="Season"),
        yaxis=dict(title="Accuracy"),
        title="LGBM accuracies",
    ),
)

In [None]:
# Arithmetic mean over the per-season scores of the combined-years models.
mean = sum(mean_seasonal_scores) / len(mean_seasonal_scores)
print(
    f'Mean seasonal accuracy scores range from {min(mean_seasonal_scores):.2f} '
    f'to {max(mean_seasonal_scores):.2f}, with an average accuracy of {mean:.2f}.'
)

A seasonal mean across all years available indicates a further improvement when compared to using individual seasons.