This notebook explores the use of synthetic data and the Random Forest algorithm for forest stand classification.

In [1]:
from pathlib import Path

import pandas as pd
from joblib import Parallel, delayed
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

In [2]:
# Class name -> integer code. The code is simply the position of the name in
# this list, so the order below must not change.
name_map = {
    class_name: code
    for code, class_name in enumerate([
        'Larix decidua',
        'Pinus sylvestris',
        'Broadleaved trees',
        'Picea abies',
        'Pinus mugo',
        'Pinus cembra',
        'Abies alba',
    ])
}

# Inverse lookup: integer code -> class name.
value_map = dict(zip(name_map.values(), name_map.keys()))

In [3]:
# Configuration: feature (band) names, the years to process, and the folder
# that holds the per-year CSV files.
bands = [f"B{number:02d}" for number in range(1, 9)] + ['B8A', 'B09', 'B11', 'B12', 'ndvi']
years = [str(year_number) for year_number in range(2018, 2025)]
DATA_PATH = "data/"

In [4]:
# Read and aggregate every year's raw and synthetic table once, up front, so
# later steps only need dictionary look-ups.
def _read_band_table(csv_path, want_labels):
    """Load one CSV and aggregate it to one row per (band, date).

    The first two columns are renamed to "band" and "date" and the remaining
    column headers are converted to integers. The row labelled "class" is
    removed before the values are averaged per (band, date) group.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file; its first column is used as the row index.
    want_labels : bool
        When true, the "class" row (without its band/date entries) is also
        returned, cast to int.

    Returns
    -------
    tuple
        ``(table, labels)`` where ``labels`` is ``None`` if ``want_labels``
        is false.
    """
    table = pd.read_csv(csv_path, index_col=0, low_memory=False)
    table.columns = ["band", "date"] + [int(name) for name in table.columns[2:]]
    # The labels must be taken before the "class" row is dropped; [2:] skips
    # the "band" and "date" entries of that row.
    labels = table.T.pop("class")[2:].astype(int) if want_labels else None
    table = table.drop("class")
    table["date"] = pd.to_datetime(table["date"])
    return table.groupby(["band", "date"]).mean(), labels


raw_data = {}   # year -> aggregated raw table
syn_data = {}   # year -> (aggregated synthetic table, class labels)

for year in years:
    raw_df = _read_band_table(DATA_PATH + year + "_raw.csv", want_labels=False)[0]
    raw_data[year] = raw_df

    syn_df, y = _read_band_table(DATA_PATH + year + "_syn_raw.csv", want_labels=True)
    syn_data[year] = (syn_df, y)

# Fixed seed so the fold assignment and every forest are identical on each
# Restart & Run All; without it the reported scores change from run to run.
RANDOM_STATE = 42

results = {}

# The first year's raw table is only used to size the K-fold split. It does not
# depend on the loop variable, so it is looked up once together with the
# (seeded, hence repeatable) splitter.
org_df = raw_data[years[0]]
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

for year in tqdm(years):
    df, y = syn_data[year]         # synthetic table + labels: training data
    test_df_base = raw_data[year]  # raw table: test data

    # NOTE(review): `.T[2:]` skips the first two sample columns. "band" and
    # "date" are already index levels after the groupby in the loading step,
    # so this looks like a leftover that leaves two samples out of every
    # split -- confirm before changing, since it affects the reported scores.
    # NOTE(review): the fold indices are positions but are used below as
    # column labels; this presumably relies on the sample columns being
    # labelled 0..n-1 -- confirm.
    for j, (train_index, test_index) in enumerate(kf.split(org_df.T[2:])):
        test_df = test_df_base.loc[:, test_index]

        def train_and_predict(row):
            """Fit a forest for one test sample and return its predicted class.

            ``row`` is one column of the raw table (one value per
            (band, date) entry). Entries that are NaN in ``row`` are dropped
            and the forest is trained only on the matching entries of the
            synthetic training columns, so every test sample gets a model
            fitted to exactly the features it has.
            """
            row = row.dropna()
            # fillna(0) leaves no NaN in the training matrix, so no further
            # NaN filtering of the training columns is needed.
            train_df = df.loc[row.index, train_index].fillna(0)
            y_train = y[train_index]

            model_rf = RandomForestClassifier(n_estimators=1000, random_state=RANDOM_STATE)
            # Rows of the table are features, columns are samples; sklearn
            # expects samples as rows, hence the transpose.
            model_rf.fit(train_df.T.values, y_train.values)
            return model_rf.predict(row.values.reshape(1, -1))[0]

        y_test = y[test_index]
        # One independent model per test sample, run in parallel on all cores.
        y_pred_rf = Parallel(n_jobs=-1)(
            delayed(train_and_predict)(sample) for _, sample in test_df.T.iterrows()
        )

        acc_rf = 100 * metrics.accuracy_score(y_test, y_pred_rf)
        f1_w_rf = 100 * metrics.f1_score(y_test, y_pred_rf, average='weighted')
        # Stored as [weighted F1, overall accuracy], both in percent.
        results[f"{year}_syn_raw_rf_{j}"] = [f1_w_rf, acc_rf]

# One row per "<year>_syn_raw_rf_<fold>" key; the two stored scores are the
# weighted F1 and the overall accuracy, in that order.
df_results = pd.DataFrame.from_dict(results, orient='index', columns=['F1', 'OA'])

# Create the output folder if needed, so the scores are not lost to a missing
# directory after the evaluation has finished.
results_csv = Path("results/syn_test_raw_rf_results.csv")
results_csv.parent.mkdir(parents=True, exist_ok=True)
df_results.to_csv(results_csv)

# Mean F1 / OA over all years and folds (shown as the cell output).
df_results.mean()

  0%|          | 0/7 [00:00<?, ?it/s]

F1    86.95883
OA    87.32757
dtype: float64