This notebook explores the use synthetic data and Random Forest algorithm for forest stand classification.

In [1]:
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn import metrics
from sklearn.model_selection import KFold
import lightgbm as lgb

# Datasets
This notebook has random sampling of reference points.
The lightgbm and rf model is trained on all synthetic data and tested on all raw cloud free dates.

In [3]:
name_map = {'Larix decidua': 0,
 'Pinus sylvestris': 1,
 'Broadleaved trees': 2,
 'Picea abies': 3,
 'Pinus mugo': 4,
 'Pinus cembra': 5,
 'Abies alba': 6}
 # reverse the dictionary
value_map = {v: k for k, v in name_map.items()}

In [4]:
# load all features
bands = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09','B11', 'B12', 'ndvi']
years = ["2018", "2019", "2020", "2021", "2022", "2023", "2024"]
DATA_PATH = "data/"

# Each TS different model

In [5]:
raw_data = {}
syn_data = {}

for year in years:
    raw_df = pd.read_csv(DATA_PATH + year + "_raw.csv", index_col=0, low_memory=False)
    raw_df.columns = ["band", "date", *[int(x) for x in raw_df.columns[2:]]]
    raw_df = raw_df.drop("class")
    raw_df.date = pd.to_datetime(raw_df.date)
    raw_df = raw_df.groupby(["band", "date"]).mean()
    raw_data[year] = raw_df

    syn_df = pd.read_csv(DATA_PATH + year + "_syn_raw.csv", index_col=0, low_memory=False)
    syn_df.columns = ["band", "date", *[int(x) for x in syn_df.columns[2:]]]
    y = syn_df.T.pop("class")[2:].astype(int)
    syn_df = syn_df.drop("class")
    syn_df.date = pd.to_datetime(syn_df.date)
    syn_df = syn_df.groupby(["band", "date"]).mean()
    syn_data[year] = (syn_df, y)

results = {}
for year in tqdm(years):
    kf = KFold(n_splits=5, shuffle=True)

    org_df = raw_data[years[0]]  # Use a single raw_df to get shape
    df, y = syn_data[year]
    test_df_base = raw_data[year]

    for j, (train_index, test_index) in enumerate(kf.split(org_df.T[2:])):
        test_df = test_df_base.loc[:, test_index]

        year_nan_flag = [False]  # a mutable list with one boolean

        def train_and_predict(row):
            row = row.dropna()  # Remove NaN values in the row
            train_df = df.loc[row.index, train_index].fillna(0) #Fills the possible nan values with 0
            y_train = y[train_index]

            if train_df.isna().any().any():
                if not year_nan_flag[0]:
                    print(year, "has", train_df.isna().any().sum(), "nan values")
                    year_nan_flag[0] = True
                train_df = train_df.dropna(axis=1) #Drops the columns that have nan
                y_train = y[train_df.columns] #Updates train_df columns

            model_lgbm = lgb.LGBMClassifier(
                objective='multiclass',
                num_class=len(np.unique(y)),
                metric='multi_logloss',
                verbose=-1
            )
            model_lgbm.fit(train_df.T.values, y_train.values) #Fit LightGBM
            return model_lgbm.predict(row.T.values.reshape(1, -1))[0] #Return the prediction for each row


        y_test = y[test_index]
        y_pred_lgbm = Parallel(n_jobs=-1)( # Use all available cores
            delayed(train_and_predict)(row[1]) for row in test_df.T.iterrows()
        )


        acc_lgbm = 100 * metrics.accuracy_score(y_test, y_pred_lgbm) #Calculate the accuracy
        f1_w_lgbm = 100 * metrics.f1_score(y_test, y_pred_lgbm, average='weighted') #Calculate the F1 Score
        results[f"{year}_syn_raw_lgbm_{j}"] = [f1_w_lgbm, acc_lgbm]

# Convert results to DataFrame
df_results = pd.DataFrame.from_dict(results, orient='index')

# Name the Columns
df_results.columns = ['F1', 'OA']

# Save results to CSV
df_results.to_csv("results/syn_test_raw_lgbm_results.csv")

print(df_results.mean())

  0%|          | 0/7 [00:00<?, ?it/s]

F1    89.052106
OA    89.227928
dtype: float64
