This notebook explores the use of synthetic data for training and validating LightGBM and RF algorithm.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from sklearn import metrics
import lightgbm as lgb

In [4]:
name_map = {'Larix decidua': 0,
 'Pinus sylvestris': 1,
 'Broadleaved trees': 2,
 'Picea abies': 3,
 'Pinus mugo': 4,
 'Pinus cembra': 5,
 'Abies alba': 6}
 # reverse the dictionary
value_map = {v: k for k, v in name_map.items()}

In [6]:
# load all features
years = ["2018", "2019", "2020", "2021", "2022", "2023", "2024"]
bands = ['B01', 'B02', 'B03', 'B04', 'B05', 'B06', 'B07', 'B08', 'B8A', 'B09', 'B11', 'B12', 'ndvi']
DATA_PATH = "data/"

Model training and prediction using K-Fold validation

In [14]:
results = {}

for year in years:
    yearly_results = {}
    
    # Load synthetic data for training
    train_df = pd.read_csv(DATA_PATH + year + "_syn_raw.csv", index_col=0, low_memory=False)
    train_df.columns = ["band", "date", *[int(x) for x in train_df.columns[2:]]]
    y = train_df.T.pop("class")[2:].astype(int)
    train_df = train_df.drop("class")
    train_df.date = pd.to_datetime(train_df.date)
    train_df = train_df.groupby(["band", "date"]).mean()
    train_df = train_df.loc[bands].fillna(0)    

    # Load raw data for testing
    raw_df = pd.read_csv(DATA_PATH + year + "_raw.csv", index_col=0, low_memory=False)
    raw_df.columns = ["band", "date", *[int(x) for x in raw_df.columns[2:]]]
    raw_df = raw_df.drop("class")
    raw_df.date = pd.to_datetime(raw_df.date)
    raw_df = raw_df.groupby(["band", "date"]).mean()
    raw_df = raw_df.loc[bands]

    # Initialize K-fold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Store metrics for each fold
    lgbm_syn_acc = []
    lgbm_syn_f1 = []
    rf_syn_acc = []
    rf_syn_f1 = []

    # Perform k-fold cross validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df.T)):
        # Prepare training data
        X_train = train_df.loc[raw_df.index].T.iloc[train_idx].values
        y_train = y[train_idx]
        X_val = train_df.loc[raw_df.index].T.iloc[val_idx].values
        y_val = y[val_idx]

        # Get unique classes for LightGBM
        y_train_unique = np.unique(y_train)

        # Train LightGBM
        model_lgbm = lgb.LGBMClassifier(
            objective='multiclass',
            num_class=len(y_train_unique),
            metric='multi_logloss',
            verbose=-1
        )
        model_lgbm.fit(X_train, y_train)

        # Train Random Forest
        model_rf = RandomForestClassifier(n_estimators=1000)
        model_rf.fit(X_train, y_train)

        # Test on synthetic validation data
        y_pred_lgbm_syn = model_lgbm.predict(X_val)
        y_pred_rf_syn = model_rf.predict(X_val)

        # Calculate metrics for LightGBM
        lgbm_syn_acc.append(100 * metrics.accuracy_score(y_val, y_pred_lgbm_syn))
        lgbm_syn_f1.append(100 * metrics.f1_score(y_val, y_pred_lgbm_syn, average='weighted'))

        # Calculate metrics for Random Forest
        rf_syn_acc.append(100 * metrics.accuracy_score(y_val, y_pred_rf_syn))
        rf_syn_f1.append(100 * metrics.f1_score(y_val, y_pred_rf_syn, average='weighted'))

    # Calculate mean metrics for the year
    yearly_results = {
        'lgbm_syn': [
            np.mean(lgbm_syn_acc),
            np.mean(lgbm_syn_f1)
        ],
        'rf_syn': [
            np.mean(rf_syn_acc),
            np.mean(rf_syn_f1)
        ]
    }
    
    results[year] = yearly_results

In [16]:
# Create DataFrame with metrics for each model and data type
metrics_data = {
    'Model': [],
    'Data_Type': [],
    'Year': [],
    'Accuracy': [],
    'F1': []
}

for year in years:
    for model in ['lgbm', 'rf']:
        for data_type in ['syn', 'raw']:
            key = f'{model}_{data_type}'
            metrics_data['Model'].append('LightGBM' if model == 'lgbm' else 'RandomForest')
            metrics_data['Data_Type'].append('Synthetic' if data_type == 'syn' else 'Raw')
            metrics_data['Year'].append(int(year))
            metrics_data['Accuracy'].append(results[year][key][0])
            metrics_data['F1'].append(results[year][key][1])

df_metrics = pd.DataFrame(metrics_data)
df_metrics.columns = ['Model', 'Test_Data_Type', 'Year', 'Accuracy', 'F1']
# Save to CSV
df_metrics.to_csv('syn_raw_metrics.csv')

# Display mean metrics across years
print("\nMean metrics across all years:")
print(df_metrics.groupby(['Model', 'Test_Data_Type']).mean())


Mean metrics across all years:
                               Year   Accuracy         F1
Model        Test_Data_Type                              
LightGBM     Raw             2021.0  58.201875  51.915017
             Synthetic       2021.0  88.703806  88.513925
RandomForest Raw             2021.0  41.577496  26.890437
             Synthetic       2021.0  83.706564  83.208715


In [None]:
Mean metrics across all years:
                               Year   Accuracy         F1
Model        Test_Data_Type                              
LightGBM     Raw             2020.5  54.504505  46.342181
             Synthetic       2020.5  88.339768  88.127918
RandomForest Raw             2020.5  42.458172  28.462128
             Synthetic       2020.5  83.951094  83.294282

In [17]:
# Load raw data for all years

all_results = {}
for year in years:
    # Load raw data
    raw_df = pd.read_csv(DATA_PATH + year + "_raw.csv", index_col=0, low_memory=False)
    raw_df.columns = ["band", "date", *[int(x) for x in raw_df.columns[2:]]]
    y = raw_df.T.pop("class")[2:].astype(int)
    raw_df = raw_df.drop("class")
    raw_df.date = pd.to_datetime(raw_df.date)
    raw_df = raw_df.groupby(["band", "date"]).mean()
    raw_df = raw_df.loc[bands]

    # Initialize K-fold
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    # Store metrics for each fold
    lgbm_acc = []
    lgbm_f1 = []

    # Perform k-fold cross validation
    for fold, (train_idx, val_idx) in enumerate(kf.split(raw_df.T)):    
        # Prepare training data
        X_train = raw_df.T.iloc[train_idx].values
        X_val = raw_df.T.iloc[val_idx].values
        y_train = y[train_idx]
        y_val = y[val_idx]
        
        # Set up the LightGBM model
        model_lgbm = lgb.LGBMClassifier(
            objective='multiclass',
            num_class=len(np.unique(y)),
            metric='multi_logloss',
            verbose=-1
        )
        
        # Train the model
        model_lgbm.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model_lgbm.predict(X_val)
        
        # Calculate metrics
        acc = 100 * metrics.accuracy_score(y_val, y_pred)
        f1 = 100 * metrics.f1_score(y_val, y_pred, average='weighted')
        
        lgbm_acc.append(acc)
        lgbm_f1.append(f1)
    
    # Store results for this year
    all_results[year] = {'F1': np.mean(lgbm_f1), 'OA': np.mean(lgbm_acc)}

In [18]:
# Create DataFrame with F1 and OA for all years
df_metrics = pd.DataFrame(all_results).T
df_metrics.columns = ['F1', 'OA']
df_metrics.index.name = 'Year'
df_metrics.to_csv('full_ts_lightGBM_metrics.csv')
print(df_metrics.mean())

F1    90.613855
OA    90.667402
dtype: float64


In [None]:
F1    91.132073
OA    91.196911
dtype: float64