In [14]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix

In [2]:
# Seed NumPy's legacy global RNG so anything drawing from np.random is repeatable.
# The RandomForestClassifier built in this notebook is given its own random_state.
np.random.seed(0)

In [3]:
# Raw CSV header -> descriptive column name used throughout the notebook.
# "fid" and "year" already carry their final names, so they need no entry.
RAW_COLUMN_RENAMES = {
    "class": "is_disturbance",
    "numerical_id": "forest_id",
    "BLU": "blue",
    "GRN": "green",
    "RED": "red",
    "NIR": "near_infrared",
    "SW1": "shortwave_infrared_1",
    "SW2": "shortwave_infrared_2",
}

# Every split is renamed as it is loaded so that train, validation and test
# share one schema. rename() ignores mapping keys that are absent from a frame,
# so this is also safe if a file already uses the descriptive names.
raw_df_train: pd.DataFrame = pd.read_csv(
    filepath_or_buffer='../../data/train.csv'
).rename(columns=RAW_COLUMN_RENAMES)
raw_df_val: pd.DataFrame = pd.read_csv(
    filepath_or_buffer='../../data/validation.csv'
).rename(columns=RAW_COLUMN_RENAMES)
raw_df_test: pd.DataFrame = pd.read_csv(
    filepath_or_buffer='../../data/test.csv'
).rename(columns=RAW_COLUMN_RENAMES)

In [4]:
def sliding_window_delta(forest_df: pd.DataFrame, features: list, years_in_window: int = 1) -> pd.DataFrame:
    """
    Add per-forest lagged-difference columns for each requested feature.

    The rows are sorted by ``forest_id`` and ``year``. For every feature and every
    lag ``i`` in ``1..years_in_window`` a column named ``{feature}_delata_{i}`` is
    added, holding the row's value minus the value found ``i`` rows earlier within
    the same ``forest_id``. The lag counts rows, not calendar years, so it only
    means "i years ago" when each forest has one row per consecutive year.
    Rows with fewer than ``i`` earlier rows in their forest get NaN.

    Args:
        forest_df: Frame containing ``forest_id``, ``year`` and the feature
            columns. It is not modified.
        features: Names of the columns to difference.
        years_in_window: Largest lag to compute; values below 1 add no columns.

    Returns:
        A new DataFrame, sorted by ``forest_id`` and ``year``, with the original
        columns followed by the delta columns.

    Raises:
        ValueError: If a name in ``features`` is not a column of the frame.
    """
    # sort_values returns a new frame, so no explicit copy is needed to protect the input.
    df_sliding_window = forest_df.sort_values(['forest_id', 'year'])

    for feature in features:
        if feature not in df_sliding_window.columns:
            raise ValueError(f"Feature '{feature}' not found in the DataFrame columns.")

        current_values = df_sliding_window[feature]
        values_by_forest = df_sliding_window.groupby('forest_id')[feature]

        for i in range(1, years_in_window + 1):
            # The group-wise shift keeps the frame's index, so the subtraction is
            # row-for-row and never reaches across a forest boundary.
            df_sliding_window[f'{feature}_delata_{i}'] = current_values - values_by_forest.shift(i)

    return df_sliding_window

   

In [7]:
# Lag-1 deltas of the six spectral bands for the test and validation splits.
# NOTE(review): df_test_sliding_window is later passed as df_train to
# random_forest_classifier, and raw_df_train is never transformed - confirm
# that fitting on the test split is intended.
df_test_sliding_window = sliding_window_delta(
    forest_df=raw_df_test,
    features=[
        'blue',
        'green',
        'red',
        'near_infrared',
        'shortwave_infrared_1',
        'shortwave_infrared_2',
    ],
    years_in_window=1,
)
df_val_sliding_window = sliding_window_delta(
    forest_df=raw_df_val,
    features=[
        'blue',
        'green',
        'red',
        'near_infrared',
        'shortwave_infrared_1',
        'shortwave_infrared_2',
    ],
    years_in_window=1,
)

In [15]:
def random_forest_classifier(features, df_train, df_val, n_estimators=100, max_depth=None, random_state=42):
    """
    Fit a RandomForestClassifier on one frame and evaluate it on another.

    The target column is ``is_disturbance`` in both frames. The feature list,
    the classification report, the F1 score and the confusion matrix of the
    validation predictions are printed; the F1 score and the confusion matrix
    are also returned so that runs can be compared programmatically.

    Args:
        features (list): Feature column names; each must exist in both frames.
        df_train (pd.DataFrame): Rows used to fit the model.
        df_val (pd.DataFrame): Rows used to evaluate the fitted model.
        n_estimators (int): Number of trees in the forest.
        max_depth (int | None): Maximum depth of the trees; None leaves it unlimited.
        random_state (int): Random seed for reproducibility.

    Returns:
        dict: ``{"f1_score": ..., "confusion_matrix": ...}`` computed on ``df_val``
        with scikit-learn's default settings for both metrics.
    """
    print(f"Features: {features}")
    target = 'is_disturbance'

    # Select the model inputs and labels; the train/validation split itself is
    # decided by the caller through df_train and df_val.
    X_train = df_train[features]
    y_train = df_train[target]
    X_val = df_val[features]
    y_val = df_val[target]

    # Initialize and train the Random Forest model
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state
    )
    rf.fit(X_train, y_train)

    # Score the validation rows once and reuse the results for printing and returning.
    y_pred = rf.predict(X_val)
    f1 = f1_score(y_true=y_val, y_pred=y_pred)
    matrix = confusion_matrix(y_true=y_val, y_pred=y_pred)

    print(classification_report(y_true=y_val, y_pred=y_pred))
    print(f"F1 Score: {f1}")
    print("Confusion Matrix:")
    print(matrix)

    return {"f1_score": f1, "confusion_matrix": matrix}
    

In [16]:
# Experiment 1: raw band values plus their lag-1 deltas, 250 trees.
# NOTE(review): df_train is df_test_sliding_window, which is built from
# raw_df_test - confirm that fitting on the test split is intended.
random_forest_classifier(
    # All six bands unsuffixed first, then the same six with the '_delata_1' suffix.
    features=[
        f'{band}{suffix}'
        for suffix in ('', '_delata_1')
        for band in (
            'blue',
            'green',
            'red',
            'near_infrared',
            'shortwave_infrared_1',
            'shortwave_infrared_2',
        )
    ],
    df_train=df_test_sliding_window,
    df_val=df_val_sliding_window,
    n_estimators=250,
    max_depth=None,
    random_state=42,
)

Features: ['blue', 'green', 'red', 'near_infrared', 'shortwave_infrared_1', 'shortwave_infrared_2', 'blue_delata_1', 'green_delata_1', 'red_delata_1', 'near_infrared_delata_1', 'shortwave_infrared_1_delata_1', 'shortwave_infrared_2_delata_1']
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     90382
           1       0.61      0.24      0.35       352

    accuracy                           1.00     90734
   macro avg       0.80      0.62      0.67     90734
weighted avg       1.00      1.00      1.00     90734

F1 Score: 0.34552845528455284
Confusion Matrix:
[[90327    55]
 [  267    85]]


In [17]:
# Lag-1 and lag-2 deltas of the six spectral bands for the test and validation splits.
df_test_sliding_window_delta_2 = sliding_window_delta(
    forest_df=raw_df_test,
    features=[
        'blue',
        'green',
        'red',
        'near_infrared',
        'shortwave_infrared_1',
        'shortwave_infrared_2',
    ],
    years_in_window=2,
)
df_val_sliding_window_delta_2 = sliding_window_delta(
    forest_df=raw_df_val,
    features=[
        'blue',
        'green',
        'red',
        'near_infrared',
        'shortwave_infrared_1',
        'shortwave_infrared_2',
    ],
    years_in_window=2,
)

In [20]:
# Experiment 2: raw band values plus their lag-1 and lag-2 deltas, 500 trees.
# NOTE(review): df_train is df_test_sliding_window_delta_2, which is built from
# raw_df_test - confirm that fitting on the test split is intended.
random_forest_classifier(
    # All six bands unsuffixed first, then with '_delata_1', then with '_delata_2'.
    features=[
        f'{band}{suffix}'
        for suffix in ('', '_delata_1', '_delata_2')
        for band in (
            'blue',
            'green',
            'red',
            'near_infrared',
            'shortwave_infrared_1',
            'shortwave_infrared_2',
        )
    ],
    df_train=df_test_sliding_window_delta_2,
    df_val=df_val_sliding_window_delta_2,
    n_estimators=500,
    max_depth=None,
    random_state=42,
)

Features: ['blue', 'green', 'red', 'near_infrared', 'shortwave_infrared_1', 'shortwave_infrared_2', 'blue_delata_1', 'green_delata_1', 'red_delata_1', 'near_infrared_delata_1', 'shortwave_infrared_1_delata_1', 'shortwave_infrared_2_delata_1', 'blue_delata_2', 'green_delata_2', 'red_delata_2', 'near_infrared_delata_2', 'shortwave_infrared_1_delata_2', 'shortwave_infrared_2_delata_2']
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     90382
           1       0.67      0.30      0.42       352

    accuracy                           1.00     90734
   macro avg       0.83      0.65      0.71     90734
weighted avg       1.00      1.00      1.00     90734

F1 Score: 0.41796875
Confusion Matrix:
[[90329    53]
 [  245   107]]
