This notebook explores the use of synthetic data for training and validating the LightGBM and Random Forest (RF) algorithms.

In [5]:
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

In [6]:
# Class label -> integer code. The position of each label in the tuple below
# is its code, so the order must not be changed.
name_map = {
    label: code
    for code, label in enumerate((
        'Larix decidua',
        'Pinus sylvestris',
        'Broadleaved trees',
        'Picea abies',
        'Pinus mugo',
        'Pinus cembra',
        'Abies alba',
    ))
}
# Inverse lookup: integer code -> class label.
value_map = dict(zip(name_map.values(), name_map.keys()))

In [7]:
# Experiment configuration: the years to evaluate, the band rows kept as
# features (in this order), and the folder holding the yearly CSV files.
years = [str(year_number) for year_number in range(2018, 2025)]
bands = [
    'B01', 'B02', 'B03', 'B04',
    'B05', 'B06', 'B07', 'B08',
    'B8A', 'B09', 'B11', 'B12',
    'ndvi',
]
DATA_PATH = "data/"

In [8]:
SEED = 42                      # one seed for the folds and for both models
N_SPLITS = 5                   # number of cross-validation folds
RESULTS_DIR = Path("results")  # output folder for the summary CSV


def load_band_table(csv_path, band_order):
    """Read one yearly CSV and reshape it into a (band, date) x sample table.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file. Its first column is used as the row index and
        must contain a row labelled ``"class"``.
    band_order : list of str
        Band labels to select from the first level of the grouped index.

    Returns
    -------
    features : pandas.DataFrame
        Mean value per ``(band, date)`` row, restricted to ``band_order``.
        Missing values are left untouched.
    class_row : pandas.Series
        The ``"class"`` row without its first two entries (the band and date
        columns), i.e. one entry per remaining column. Not cast to any dtype.

    Raises
    ------
    KeyError
        If the file has no ``"class"`` row or lacks one of the requested bands.
    """
    table = pd.read_csv(csv_path, index_col=0, low_memory=False)
    # The first two columns are renamed to band/date; every other column
    # label is cast from its CSV string form to int.
    table.columns = ["band", "date", *[int(col) for col in table.columns[2:]]]
    class_row = table.loc["class"].iloc[2:]
    table = table.drop("class")
    table["date"] = pd.to_datetime(table["date"])
    features = table.groupby(["band", "date"]).mean().loc[band_order]
    return features, class_row


# KFold with an integer random_state yields the same shuffling on every
# split() call, so a single instance can be shared by all years.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

results = {}

for year in tqdm(years):
    # Synthetic data: features (NaN -> 0) and integer class labels.
    train_df, class_row = load_band_table(DATA_PATH + year + "_syn_raw.csv", bands)
    train_df = train_df.fillna(0)
    y = class_row.astype(int)

    # Raw data is only used to decide which (band, date) rows become features.
    raw_df = load_band_table(DATA_PATH + year + "_raw.csv", bands)[0]

    # Loop-invariant feature matrix: one row per sample, one column per
    # (band, date) pair that is present in the raw data.
    X = train_df.loc[raw_df.index].T.to_numpy()

    # Store metrics for each fold
    lgbm_syn_metrics = []
    rf_syn_metrics = []

    # Perform k-fold cross validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        X_train, X_val = X[train_idx], X[val_idx]
        # Fold indices are positions, so the labels must be selected with
        # .iloc: plain y[...] would look the integers up as index labels and
        # only matches when the sample columns are named exactly 0..n-1.
        y_train = y.iloc[train_idx]
        y_val = y.iloc[val_idx]

        # Train LightGBM
        model_lgbm = lgb.LGBMClassifier(
            objective='multiclass',
            num_class=len(np.unique(y_train)),
            metric='multi_logloss',
            random_state=SEED,
            verbose=-1
        )
        model_lgbm.fit(X_train, y_train)

        # Train Random Forest (seeded so the reported scores are reproducible;
        # values may differ slightly from earlier unseeded runs).
        model_rf = RandomForestClassifier(n_estimators=1000, random_state=SEED)
        model_rf.fit(X_train, y_train)

        # Test on synthetic validation data
        y_pred_lgbm_syn = model_lgbm.predict(X_val)
        y_pred_rf_syn = model_rf.predict(X_val)

        # Overall accuracy and weighted F1, both in percent
        lgbm_syn_metrics.append([
            100 * metrics.accuracy_score(y_val, y_pred_lgbm_syn),
            100 * metrics.f1_score(y_val, y_pred_lgbm_syn, average='weighted'),
        ])
        rf_syn_metrics.append([
            100 * metrics.accuracy_score(y_val, y_pred_rf_syn),
            100 * metrics.f1_score(y_val, y_pred_rf_syn, average='weighted'),
        ])

    # Fold-averaged [accuracy, F1] per model
    results[year] = {
        'lgbm_syn': np.mean(lgbm_syn_metrics, axis=0).tolist(),
        'rf_syn': np.mean(rf_syn_metrics, axis=0).tolist()
    }

# One row per year, one column per (model, metric)
df_results = pd.DataFrame.from_dict(
    {
        year: {
            'lgbm_OA': scores['lgbm_syn'][0],
            'lgbm_F1': scores['lgbm_syn'][1],
            'rf_OA': scores['rf_syn'][0],
            'rf_F1': scores['rf_syn'][1],
        }
        for year, scores in results.items()
    },
    orient='index',
)

# Save results to CSV; create the folder first so a long run cannot fail
# at the very last step because the directory is missing.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
df_results.to_csv(RESULTS_DIR / "syn_metrics.csv")
df_results.mean()



lgbm_OA    88.703806
lgbm_F1    88.513925
rf_OA      83.949255
rf_F1      83.443898
dtype: float64