In [None]:
# new imports
from pathlib import Path
import plotly.express as px
from IPython.display import display
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import multilabel_confusion_matrix

from sklearn.svm import SVC

import rasterio
import json
import pickle
# import pca

# from sklearn.multioutput import MultiOutputClassifier

# original imports (edited)
import shelve
# import lightgbm as ltb

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, average_precision_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

from flaml import AutoML

# from skmultilearn.model_selection import IterativeStratification

In [None]:
labels_json = Path('TreeSatBA_v9_60m_multi_labels.json')

# Read the raw label file.
with labels_json.open() as f:
    multilabels_dict = json.load(f)

# Every value in the JSON is a list of (key, value) pairs; turn each list
# into a mapping so that pandas can spread the keys over columns.
new_values = [dict(pairs) for pairs in multilabels_dict.values()]

# One row per JSON key; entries that come out as NaN are replaced by 0.
labels_df = pd.DataFrame.from_records(
    new_values, index=multilabels_dict.keys()
).fillna(0)

In [None]:
# Per column of labels_df: number of rows holding a non-zero entry.
counts = labels_df.astype(bool).sum()

# Draw the counts as bars, ordered from the largest total to the smallest.
fig = px.histogram(x=counts.index, y=counts, text_auto=True).update_xaxes(
    categoryorder="total descending"
)
fig.update_layout(xaxis_title="Species", yaxis_title="Occurrences")

In [None]:
%%time
# Opening a large number of tif files one by one is slow, so they are read
# only once and then cached on disk as a pickled dict written through numpy.
tif_paths = Path('s2') / '60m'
save_path = Path('s2') / 's2_60m.npy'


def _read_tif(path):
    """Open the raster at ``path`` and return the result of its ``read()``."""
    with rasterio.open(path) as img:
        return img.read()


if not save_path.is_file():
    tif_data = [_read_tif(tif_paths / tif_name) for tif_name in tqdm(labels_df.index)]
    np.save(save_path, dict(zip(labels_df.index, tif_data)))

# The dict is always taken from the cache file, whether or not it was just
# written; .item() unwraps it from the 0-d object array numpy stores.
tif_dict = np.load(save_path, allow_pickle=True).item()

In [None]:
def train_and_eval(X_train, X_test, y_train, y_test, classes, save_path):
    """Fit (or reload) a one-vs-rest classifier and display its test metrics.

    When ``save_path`` is not an existing file, a FLAML ``AutoML`` estimator
    limited to LightGBM is wrapped in ``OneVsRestClassifier``, fitted on the
    training data and pickled to ``save_path``. When the file exists, the
    pickled classifier is loaded and no training takes place.

    Parameters
    ----------
    X_train, X_test
        Feature matrices handed to ``fit`` and ``predict`` / ``predict_proba``.
    y_train, y_test
        Label matrices. Assumed to be multilabel indicator arrays of shape
        (samples, classes): the table layout below relies on
        ``classification_report`` ending with four averaged rows, which is
        its multilabel output format (TODO confirm for other target types).
    classes
        Class names, forwarded as ``target_names`` to ``classification_report``.
    save_path : pathlib.Path
        Pickle file used as the model cache.

    Returns
    -------
    The predictions of the classifier for ``X_test``.
    """
    if not save_path.is_file():
        # Make sure the target directory exists before the long fit starts;
        # otherwise a missing folder would only surface when opening the
        # file afterwards and the fitted model would be lost.
        save_path.parent.mkdir(parents=True, exist_ok=True)

        automl = AutoML(
            time_budget=60*60,
            estimator_list=['lgbm'],
            n_jobs=2
        )
        clf = OneVsRestClassifier(automl, n_jobs=2)
        clf.fit(X_train, y_train)

        with save_path.open(mode='wb') as f:
            pickle.dump(clf, f)
    else:
        # NOTE: unpickling can execute arbitrary code. Only point save_path
        # at files written by this notebook.
        with save_path.open(mode='rb') as f:
            clf = pickle.load(f)

    y_pred = clf.predict(X_test)
    y_score = clf.predict_proba(X_test)

    # Ground truth first, matching the other metric calls below.
    accuracy = accuracy_score(y_test, y_pred)

    cr = classification_report(y_test, y_pred, target_names=classes, zero_division=0, output_dict=True)
    df = pd.DataFrame(cr).transpose()

    # Average precision per class, followed by the four averaged variants in
    # the same order as the trailing rows of the report
    # (micro, macro, weighted, samples).
    per_class_ap = list(average_precision_score(y_test, y_score, average=None))
    averaged_ap = [
        average_precision_score(y_test, y_score, average=n)
        for n in ['micro', 'macro', 'weighted', 'samples']
    ]
    df.insert(3, 'mAP', per_class_ap + averaged_ap)

    # Per-class rows sorted by name, then the four averaged rows.
    display(df.iloc[:-4].sort_index())
    display(df.iloc[-4:])
    print(f'Accuracy: {accuracy:.4f}')

    return y_pred

In [None]:
y = labels_df.astype(bool).to_numpy()

# Fetch every raster by the name of its label row instead of relying on the
# insertion order of tif_dict. A cache file written for a different label
# table then fails with a KeyError rather than silently pairing features
# with the wrong labels.
X = np.array([tif_dict[name] for name in labels_df.index])
print(X.shape)

# Flatten the bands and xy,
# i.e. shape (samples, band, x, y) -> (samples, band*x*y)
X = X.reshape(len(y), -1)

classes = labels_df.columns

# Predefined split: one file name per line, matched against the label index.
test_filenames = pd.read_csv('test_filenames.lst', header=None)
train_filenames = pd.read_csv('train_filenames.lst', header=None)
test_mask = labels_df.index.isin(test_filenames[0])
train_mask = labels_df.index.isin(train_filenames[0])

# Fail early on a broken split instead of deep inside the model fit.
assert train_mask.any() and test_mask.any(), \
    'no label rows match the names in train_filenames.lst / test_filenames.lst'
assert not (train_mask & test_mask).any(), \
    'some samples are listed in both the train and the test split'

X_train, X_test, y_train, y_test = X[train_mask], X[test_mask], y[train_mask], y[test_mask]

In [None]:
%%time
# Pickle file for the fitted model; train_and_eval loads it instead of
# training again when the file already exists.
save_path = Path('models') / 'lgbm_90split_fixed.pkl'
y_pred = train_and_eval(
    X_train, X_test, y_train, y_test, classes=classes, save_path=save_path
)

In [None]:
# Band order? B02, B03, B04, B08, B05, B06, B07, B8A, B11, B12, B01, B09
# blue, green, red, NIR, Red Edge 1, Red Edge 2, Red Edge 3, Red Edge 4,
# SWIR 1, SWIR 2, Aerosols, Water vapor
select_bands = [0, 1, 2, 3]

# Assumed layout, as in the training cell: (samples, band, x, y).
X = np.array(list(tif_dict.values()))

# Build a (pixels, bands) matrix: one row per pixel, one column per band.
# The band axis has to be moved to the end before flattening. Reshaping the
# (samples, band, x, y) array directly to (-1, n_bands) would only re-chunk
# memory, so each row would hold consecutive pixels of a single band.
n_bands = X.shape[1]
X = np.moveaxis(X, 1, -1).reshape(-1, n_bands)

# Standardise every band (column) before the decomposition.
X = StandardScaler().fit_transform(X)

# Optional: restrict the analysis to a subset of bands (columns).
# X = X[:, select_bands]

# n_components=None keeps every component.
pca_model = PCA(n_components=None)

pca_transform = pca_model.fit_transform(X)


In [None]:
# Share of the total variance captured by each principal component of the fitted PCA.
pca_model.explained_variance_ratio_

In [None]:
# Shape of the projected data returned by pca_model.fit_transform.
pca_transform.shape