# Tree Mortality Predictions


## Setup


In [None]:
import sys

sys.path.insert(0, "../../src")
from imports import *

init_notebook()

from IPython.display import clear_output  # For clearing the output of a cell
import json
import shutil


# Show the available predictor datasets (the returned value is not used further)
tmp = list_predictor_datasets(return_list=False)
display("--------")
print("\nList of available species and their percentages")

# Restrict the NFI data to trees that either stayed alive or died
tmp = get_final_nfi_data_for_analysis(verbose=False)
tmp = tmp.query("tree_state_change in ['alive_alive', 'alive_dead']")

# Absolute and relative number of trees per species (most frequent first)
all_species = tmp["species_lat2"].value_counts()
all_species_norm = tmp["species_lat2"].value_counts(normalize=True)
top9_species = all_species.index[:9].tolist()

# One line per species: name, tree count, share of all trees in percent
for species_name in all_species.index:
    n_trees = all_species[species_name]
    share = all_species_norm[species_name] * 100
    print(f"{species_name:25} {n_trees:<30} {share:.2f}%")

## Function Definition


In [None]:
# Species with fewer dead trees than this are skipped. The threshold comes from
# the observation that the algorithm usually broke with fewer than 35 dead trees.
_MIN_DEAD_TREES = 35

# Labels that replace the numeric period suffixes 1..13 in SPEI column names.
_SPEI_PERIOD_LABELS = [
    "jan",
    "feb",
    "mar",
    "apr",
    "may",
    "jun",
    "jul",
    "aug",
    "sep",
    "oct",
    "nov",
    "dec",
    "ann",
]


def _write_user_input(user_input, file_path):
    """Write the run settings to a text file, one `key:` block per setting."""
    with open(file_path, "w") as file:
        for key, value in user_input.items():
            if isinstance(value, list):
                file.write(f"{key}:")
                for v in value:
                    file.write(f"\n - {v}")
                file.write("\n\n")
            else:
                file.write(f"{key}:\n - {value}\n\n")


def _rename_spei_periods(column_name):
    """Replace numeric period markers ("-1_" ... "-13_") by their labels."""
    # Applied in ascending order, exactly like a chain of str.replace calls
    for number, label in enumerate(_SPEI_PERIOD_LABELS, start=1):
        column_name = column_name.replace(f"-{number}_", f"-{label}_")
    return column_name


def _build_predictor_table(df_subset):
    """Attach all predictor datasets to the trees in `df_subset`.

    Returns a tuple `(df_preds, dict_preds)`: the predictor dataframe keyed by
    `idp`/`tree_id` and the category -> variables dictionary filled by
    `add_vars_to_dict`.
    """
    dict_preds = {}
    df_preds = df_subset.copy()[["idp", "tree_id"]]

    #! Tree Properties (taken directly from the NFI subset)
    voi = ["htot_final", "c13_rel", "c13_1"]
    df_tree = df_subset[["idp", "tree_id"] + voi]
    df_preds = df_preds.merge(df_tree, on=["idp", "tree_id"], how="left")
    dict_preds = add_vars_to_dict("Tree", df_tree, dict_preds)

    #! Stand Properties (NFI variable plus separately calculated metrics)
    df_stand = df_subset[["idp", "tree_id", "social_status"]]
    df_stand = (
        df_stand.merge(
            attach_or_load_predictor_dataset("forest_competition"),
            on=["idp", "tree_id"],
            how="left",
        )
        .merge(
            attach_or_load_predictor_dataset("forest_biodiversity"),
            on=["idp"],
            how="left",
        )
        .merge(
            attach_or_load_predictor_dataset("forest_gini"),
            on=["idp"],
            how="left",
        )
    )
    df_preds = df_preds.merge(df_stand, on=["idp", "tree_id"], how="left")
    dict_preds = add_vars_to_dict("Stand", df_stand, dict_preds)

    #! Carrying Capacity
    df_cc = attach_or_load_predictor_dataset("forest_carrying_capacity")
    df_preds = df_preds.merge(df_cc, on="idp", how="left")
    dict_preds = add_vars_to_dict("Carrying Capacity", df_cc, dict_preds)

    #! Topography
    df_topo = attach_or_load_predictor_dataset("topography")
    # Keep only variables at 1000m resolution (used as the main resolution)
    df_topo = df_topo[["idp"] + [var for var in df_topo.columns if "1000" in var]]
    # Remove dem1000_ and _mean from variable names
    df_topo.columns = ["idp"] + [
        var.replace("dem1000_", "").replace("_mean", "") for var in df_topo.columns[1:]
    ]
    df_preds = df_preds.merge(df_topo, on="idp", how="left")
    dict_preds = add_vars_to_dict("Topography", df_topo, dict_preds)

    #! Soil Conditions
    df_soil = attach_or_load_predictor_dataset("soil")
    # Clean variable names
    df_soil.columns = [var.replace("soil_", "") for var in df_soil.columns]
    df_soil = df_soil.drop(columns=["first_year"])
    df_preds = df_preds.merge(df_soil, on="idp", how="left")
    dict_preds = add_vars_to_dict("Soil", df_soil, dict_preds)

    #! Temperature
    # NOTE(review): the column-wise concat assumes that all three temperature
    # datasets have identically ordered rows - confirm.
    drop_cols = ["idp", "first_year", "yrs_before_second_visit"]
    df_temp = pd.concat(
        [
            attach_or_load_predictor_dataset("digitalis_tmoy"),
            attach_or_load_predictor_dataset("digitalis_tmin").drop(columns=drop_cols),
            attach_or_load_predictor_dataset("digitalis_tmax").drop(columns=drop_cols),
        ],
        axis=1,
    )
    df_preds = df_preds.merge(df_temp, on="idp", how="left")
    dict_preds = add_vars_to_dict("Temperature", df_temp, dict_preds)

    #! SPEI
    df_spei = attach_or_load_predictor_dataset("spei_anom")
    # Rename columns from numbers to months
    df_spei.columns = [_rename_spei_periods(var) for var in df_spei.columns]

    # Keep features describing seasonal anomalies
    spei_durations = [f"spei{i}-" for i in [1, 3, 6, 9, 12, 15, 18, 21, 24]]
    spei_months = [f"*-{i}_*" for i in ["feb", "may", "aug", "nov"]]
    spei_subset = match_variables(df_spei, spei_durations)
    spei_subset = match_variables(df_spei[spei_subset], spei_months)
    df_spei = df_spei[["idp"] + spei_subset]

    df_preds = df_preds.merge(df_spei, on="idp", how="left")
    dict_preds = add_vars_to_dict("SPEI", df_spei, dict_preds)

    #! Management
    df_human = attach_or_load_predictor_dataset("management")
    df_preds = df_preds.merge(df_human, on="idp", how="left")
    dict_preds = add_vars_to_dict("Management", df_human, dict_preds)

    #! NDVI
    df_ndvi = attach_or_load_predictor_dataset("ndvi")
    df_preds = df_preds.merge(df_ndvi, on="idp", how="left")
    dict_preds = add_vars_to_dict("NDVI", df_ndvi, dict_preds)

    # ! Align direction of variables
    # Reverse the dist_road classes (0<->4, 1<->3; class 2 is left unchanged)
    df_preds["dist_road"] = df_preds["dist_road"].replace({0: 4, 1: 3, 3: 1, 4: 0})

    return df_preds, dict_preds


def _regroup_feature_categories(dict_preds):
    """Return a copy of `dict_preds` with tree/stand/soil split into finer groups."""
    regrouped = dict(dict_preds)
    for replaced_category in ("Tree", "Stand", "Soil", "Carrying Capacity"):
        regrouped.pop(replaced_category, None)

    regrouped["Tree Size"] = [
        "htot_final",
        "c13_1",
    ]
    regrouped["Light Competition"] = [
        "c13_rel",
        "social_status",
        "competition_larger",
        "competition_larger_rel",
    ]
    regrouped["Species Competition"] = [
        "competition_same_species",
        "competition_same_species_rel",
        "competition_other_species",
        "competition_other_species_rel",
        "belongs_to_dom_spec",
        "num_species",
        "simpson_species",
        "shannon_species",
    ]
    regrouped["Stand Structure"] = [
        "num_trees",
        "gini_ba_1",
        "mean_dbh",
        "carrying_capacity",
        "competition_total",
    ]
    regrouped["Soil Fertility"] = [
        "CN",
        "pH",
    ]
    regrouped["Soil Water Conditions"] = [
        "waterlogging_temp",
        "waterlogging_perm",
        "swhc",
    ]
    return regrouped


def _map_vars_to_ohe_columns(names_before, names_after):
    """Map every pre-encoding variable to its column name(s) after encoding."""
    columns_after = set(names_after)
    mapping = {}
    for var in names_before:
        if var in columns_after:
            # The variable was not one-hot encoded, it stays the same
            mapping[var] = [var]
        else:
            # Encoded variables are found via the "<var>_" prefix. A literal
            # prefix test is used so that special characters in a variable
            # name cannot be misread as regular-expression syntax.
            prefix = var + "_"
            mapping[var] = [col for col in names_after if col.startswith(prefix)]
    return mapping


def _select_best_model(df_cvmetrics, decision, metric, n_categories):
    """Pick the final feature set from the feature-elimination results.

    Returns `(ohe_vars, non_ohe_vars, score)` of the selected model.
    Raises ValueError for an unknown `decision`.
    """
    if decision == "best_metric":
        # Model with the highest score overall
        candidates = df_cvmetrics
    elif decision == "best_per_category":
        # Model whose number of features equals the number of categories
        candidates = df_cvmetrics.query("n_features == @n_categories")
    elif decision == "best_metric_max1":
        # Highest score among models with at most one feature per category
        candidates = df_cvmetrics.query("n_features <= @n_categories")
    else:
        raise ValueError(f"Invalid selection for final model decision!: {decision}")

    if decision != "best_per_category":
        candidates = candidates.sort_values(by=metric, ascending=False)

    best = candidates.head(1)
    return (
        best["ohe_vars_in_model"].values[0],
        best["non_ohe_vars_in_model"].values[0],
        best[metric].values[0],
    )


def run_all(species, user_input, base_dir=None):
    """Fit and evaluate the tree-mortality model for one species.

    The pipeline filters the NFI data to `species`, attaches the predictor
    datasets, splits into train/test, runs feature elimination, removes
    correlated features, tunes a random forest by grid search, fits the final
    model on SMOTE-oversampled training data and writes all results into the
    run directory.

    Parameters
    ----------
    species
        Value that the column(s) listed in `user_input["subset"]` must match.
    user_input : dict
        Run settings. Keys read here: "subset", "smote_k", "test_split",
        "seed_nr", "method_validation", "method_importance",
        "best_model_decision", "best_model_metric", "correlation_threshold"
        and "gsc_metric". The dictionary is modified in place
        ("subset_group", "current_dir", "predictor_datasets",
        "do_smote_test_validation", "do_smote_test_final" and, for oob
        validation, "best_model_metric").
    base_dir : str, optional
        Parent directory for the run; results go to `<base_dir>/<species>/`.
        If None, `create_new_run_folder_treemort` provides the directory.

    Returns
    -------
    None
        Also returned early when the run directory already holds
        `final_model_performance.csv` or when there are too few dead trees
        (a marker text file is written in that case).

    Raises
    ------
    KeyError
        If a column listed in `user_input["subset"]` is missing from the data.
    ValueError
        On mismatching row counts, missing values after imputation or invalid
        "method_validation" / "best_model_decision" settings.
    """
    # ! Set and save user input settings ---------------------------------
    user_input["subset_group"] = [species]

    # ! Get current directory (create if non-existent)
    if base_dir is None:
        user_input["current_dir"] = create_new_run_folder_treemort(
            user_input["subset_group"][0]
        )
    else:
        if base_dir[-1] != "/":
            base_dir += "/"
        user_input["current_dir"] = base_dir + species + "/"
        os.makedirs(user_input["current_dir"], exist_ok=True)

    current_dir = user_input["current_dir"]

    # ! Skip run if already done
    if os.path.exists(f"{current_dir}/final_model_performance.csv"):
        print(f"Skipping {species} as it already exists")
        return None

    # ! Write settings to file
    _write_user_input(user_input, f"{current_dir}/__user_input.txt")

    # ! Get Target Data -------------------------------------------------------
    # ! Load NFI Dataset
    df_raw = pd.read_feather(here("data/final/nfi/nfi_ready_for_analysis.feather"))

    # ! Filter Target Data
    df_subset = df_raw.copy()

    # Check that all subsetting columns are present before filtering
    for subset in user_input["subset"]:
        if subset not in df_subset.columns:
            raise KeyError(f"{subset} not in columns")

    # Filter out trees that do not belong to the desired species
    for subset in user_input["subset"]:
        df_subset = df_subset[df_subset[subset].isin(user_input["subset_group"])].copy()

    # Keep only trees that survived or died
    df_subset = df_subset.query(
        "tree_state_change == 'alive_alive' or tree_state_change == 'alive_dead'"
    ).copy()

    # Encode target (1 = dead, 0 = alive)
    df_subset["target"] = (
        df_subset["tree_state_change"]
        .copy()
        .apply(lambda x: 1 if x == "alive_dead" else 0)
    )

    # Clean df
    df_subset = move_vars_to_front(df_subset, ["idp", "tree_id", "target"])

    # Keep target dataset separately
    df_target = df_subset[["idp", "tree_id", "target"]].copy()

    # ! Skip species without enough dead trees
    # `.get(1, 0)` also covers subsets without any dead tree (or without any
    # tree at all), for which a plain `[1]` lookup would raise a KeyError.
    target_counts = df_target["target"].value_counts()
    n_dead = int(target_counts.get(1, 0))

    skip_markers = None
    if target_counts.shape[0] <= 1 or n_dead < _MIN_DEAD_TREES:
        # Only one class present, or fewer dead trees than the model needs
        skip_markers = ["⚠️ too few dead trees.txt"]
    elif n_dead <= user_input["smote_k"] * 1.25:
        # Fewer dead trees than the SMOTE neighbourhood size requires
        skip_markers = [
            "⚠️ too few dead trees.txt",
            "⚠️ N dead less than smote k requires.txt",
        ]

    if skip_markers is not None:
        display(target_counts)
        print(" - Skipping because too few dead trees")
        for marker in skip_markers:
            write_txt(f"{current_dir}/{marker}")
        return None

    # ! Attach Feature Data -------------------------------------------------------
    # Select predictor data
    user_input["predictor_datasets"] = [""]  # Not needed anymore

    df_preds, dict_preds = _build_predictor_table(df_subset)

    # ! Update dictionary --------------------------------------------------------
    dict_preds = _regroup_feature_categories(dict_preds)

    # Save dictionary to file
    with open(f"{current_dir}/feature_category_dictionary.json", "w") as f:
        json.dump(dict_preds, f)

    # ! DATA PREPARATION ---------------------------------------------------------

    ## ! One-Hot-Encoding
    # Note: Technically not needed because no categorical features but keeping it for future use
    df_ohe = df_preds.copy()

    # Get all variables names before one-hot encoding
    all_var_names_before_ohe = sorted(df_ohe.columns.to_list())

    # Set variables to not ohe:
    my_vars_not_to_ohe = ["test_train_strata", "target", "idp", "tree_id"]

    # Do the OHE
    df_ohe = do_ohe(df_ohe, my_vars_not_to_ohe, verbose=False)

    # Get all variables names after one-hot encoding
    all_var_names_after_ohe = sorted(df_ohe.columns.to_list())

    # Get variable dictionary (original variable -> columns after encoding)
    var_ohe_dict = _map_vars_to_ohe_columns(
        all_var_names_before_ohe, all_var_names_after_ohe
    )

    ## ! Final Dataset
    df_predictors_final = df_ohe.copy()

    # Raise error if target and predictor df have not same number of rows
    if df_target.shape[0] != df_predictors_final.shape[0]:
        raise ValueError(
            f"Target and predictor datasets have different number of rows: {df_target.shape[0]} vs {df_predictors_final.shape[0]}"
        )

    # Merge to get correct order
    df_target_pred_final = pd.merge(
        df_target, df_predictors_final, on=["idp", "tree_id"], how="left"
    )

    df_target_pred_final = df_target_pred_final.drop(
        columns=["idp", "tree_id", "first_year"], errors="ignore"
    )

    ## ! Test/Train Split
    df_for_splitting = df_target_pred_final.copy()
    print(f" - Shape of df before splitting: \t {df_for_splitting.shape}")

    X = df_for_splitting.drop("target", axis=1)
    y = df_for_splitting["target"]

    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=user_input["test_split"],
        random_state=user_input["seed_nr"],
        stratify=y,
    )

    Xy_train = pd.concat([y_train, X_train], axis=1).reset_index(drop=True)
    Xy_test = pd.concat([y_test, X_test], axis=1).reset_index(drop=True)

    # Impute missing data using the training data mean values
    for col in Xy_train.columns:
        if Xy_train[col].dtype == "float64":
            # Use training mean to avoid data leakage
            # Generally less than 1% of data is missing
            train_mean = Xy_train[col].mean()
            Xy_train[col] = Xy_train[col].fillna(train_mean)
            Xy_test[col] = Xy_test[col].fillna(train_mean)

    # Check for any missing values
    if Xy_train.isnull().sum().sum() > 0:
        raise ValueError("Missing values in train dataset!")

    if Xy_test.isnull().sum().sum() > 0:
        raise ValueError("Missing values in test dataset!")

    print(f" - Shape of Xy_train:\t\t\t {Xy_train.shape}")
    print(f" - Shape of Xy_test:\t\t\t {Xy_test.shape}")

    # Keep original dfs for saving tree ID further below
    df_target_for_treeid = df_target.copy()
    df_predictors_final_for_treeid = df_predictors_final.copy()

    # ! RFE ------------------------------------------------------------------------------
    # Ensure SMOTE is only applied to training data
    user_input["do_smote_test_validation"] = False
    user_input["do_smote_test_final"] = False

    display(" --- FEATURE ELIMINATION ---")
    rfecv_params = {
        "n_estimators": 100,
        "max_depth": 8,
        "max_features": 0.01,
        "bootstrap": True,
        "criterion": "gini",
    }

    df_cvmetrics_per_nfeatures = run_rfecv_treemort(
        dict_categories=dict_preds.copy(),
        var_ohe_dict=var_ohe_dict.copy(),
        Xy_train_for_rfe=Xy_train.copy(),
        user_input=user_input,
        rfecv_params=rfecv_params,
        debug_stop=False,
        debug_stop_after_n_iterations=10,
        verbose=False,
    )

    #! Report best variables ----------------------------------------------------------------
    display(" --- BEST FEATURES ---")
    # If rfe based on best oob, set best_model_metric to it too
    if user_input["method_validation"] == "oob":
        user_input["best_model_metric"] = "oob"

    # Select best-performing model based on user input. All three decision
    # modes return the same triple, so every mode defines the variables that
    # are used below.
    (
        ohed_variables_in_final_model,
        non_ohed_variables_in_final_model,
        best_score,
    ) = _select_best_model(
        df_cvmetrics_per_nfeatures,
        decision=user_input["best_model_decision"],
        metric=user_input["best_model_metric"],
        n_categories=len(dict_preds),
    )

    txt_best_var = f"""
    - Best score: {user_input['best_model_metric']} = {round(best_score,3)} based on model selecting by '{user_input['best_model_decision']}
    
    - Variables in best model (ohe):\t{ohed_variables_in_final_model}
    
    - Variables in best model (non-ohe):\t{sorted(non_ohed_variables_in_final_model)}
        """

    with open(f"{current_dir}/final_model_variables.txt", "w") as f:
        f.write(txt_best_var)

    # ! Select variables of best model
    Xy_train_best_preds = Xy_train.copy()[["target"] + ohed_variables_in_final_model]

    # ! Correlation Removal ----------------------------------------------------------------
    # First get feature importance of the best model
    importance_kwargs = dict(
        Xy_all=Xy_train_best_preds,
        var_ohe_dict=var_ohe_dict,
        rf_params=rfecv_params,
        method_importance=user_input["method_importance"],
        smote_on_test=user_input["do_smote_test_validation"],
        rnd_seed=user_input["seed_nr"],
        verbose=False,
        save_directory=None,
    )
    if user_input["method_validation"] == "cv":
        _, _, rf_vi = SMOTE_cv(**importance_kwargs)
    elif user_input["method_validation"] == "oob":
        _, _, rf_vi = SMOTE_oob(**importance_kwargs)
    else:
        raise ValueError(
            f"Failed during RFE - Invalid method_validation! Got: {user_input['method_validation']}"
        )

    final_vars = remove_correlation_based_on_vi(
        Xy_train_best_preds,
        var_ohe_dict,
        rf_vi,
        threshold=user_input["correlation_threshold"],
        make_heatmaps=False,
        return_only_top_n=15,
        save_directory=current_dir,
    )

    # ! SET FINAL FEATURES ----------------------------------------------------------------
    Xy_train_final = Xy_train_best_preds.copy()[["target"] + final_vars]
    Xy_test_final = Xy_test.copy()[["target"] + final_vars]

    # ! TUNING -----------------------------------------------------------------------
    # ! Prescribed Gridsearch
    display(" --- GRID SEARCH ---")
    Xy_train_for_tuning = Xy_train_final.copy()

    # Split into response and predictors
    Xy = Xy_train_for_tuning.copy()
    X = Xy.drop(
        columns=["target", "test_train_strata", "tree_id", "idp"], errors="ignore"
    )
    y = Xy["target"]

    # Build model
    model = RandomForestClassifier(random_state=user_input["seed_nr"], n_jobs=-1)

    # NOTE(review): the oversampled data below is never used - the grid search
    # is fitted on the original X, y. The call is kept unchanged so that
    # results stay reproducible; confirm whether it can be removed.
    X_train_over, y_train_over = apply_smote(
        X=X,
        y=y,
        seed=user_input["seed_nr"],
        k=user_input["smote_k"],
    )

    # Create Stratified K-fold cross validation
    cv = RepeatedStratifiedKFold(
        n_splits=3, n_repeats=1, random_state=user_input["seed_nr"]
    )

    # Get parameter grid
    param_grid = {
        "n_estimators": [100, 300],  # Higher than 300 has minor influence
        "max_depth": [1, 3, 12, 18],  # Higher than 18 has minor influence
        "max_features": [0.01, 0.1, "sqrt"],  # Minor influence
    }

    # Set the grid search model
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        verbose=0,
        return_train_score=True,
        scoring=user_input["gsc_metric"],
    )

    # Fit the grid search to the data
    grid_search.fit(
        X,
        y,
    )

    # Print results
    display("")
    print("--- FINAL RESULTS ---")
    print("Parameter grid:")
    for key, value in param_grid.items():
        print(f" - {key}: {value}")

    print("\nBest parameters:")
    for key, value in grid_search.best_params_.items():
        print(f" - {key}: {value}")
    # The grid search is scored with "gsc_metric", so that is the label to show
    print(f"\nBest {user_input['gsc_metric']}: {round(grid_search.best_score_, 2)}")

    # Get best parameters
    best_params = grid_search.best_params_

    # ! Final Model ------------------------------------------------------------------------
    display(" --- FINAL MODEL RUN ---")

    # Setup model
    rf_model = RandomForestClassifier(
        random_state=user_input["seed_nr"],
        n_jobs=-1,
        **best_params,
    )

    # Split response and predictors
    X_train_final = Xy_train_final.drop(columns=["target"], errors="ignore")
    y_train_final = Xy_train_final["target"]

    X_test_final = Xy_test_final.drop(columns=["target"], errors="ignore")
    y_test_final = Xy_test_final["target"]

    # Apply SMOTE to train data
    X_train_final, y_train_final = apply_smote(
        X=X_train_final,
        y=y_train_final,
        seed=user_input["seed_nr"],
        k=user_input["smote_k"],
    )

    # Fit model
    rf_model.fit(X_train_final, y_train_final)

    # Feature importance
    rf_vi = assessing_top_predictors(
        vi_method="impurity",
        rf_in=rf_model,
        X_train_in=X_train_final,
        X_test_in=X_test_final,
        y_test_in=y_test_final,
        dict_ohe_in=var_ohe_dict,
        with_aggregation=True,
        n_predictors=20,
        random_state=user_input["seed_nr"],
        verbose=False,
        save_directory=None,
    )

    # Evaluate model
    model_evaluation_classification(
        rf_model=rf_model,
        X_train=X_train_final,
        y_train=y_train_final,
        X_test=X_test_final,
        y_test=y_test_final,
        prob_threshold=0.4,  # Irrelevant when calculating full AUC
        save_directory=user_input["current_dir"],
        metric="f1-score",
        verbose=False,
        save_only_predictions=False,
    )

    # ! Save tree_id information separately, needed for merging SHAP and features during analysis
    df_targted_treeid = pd.merge(
        df_target_for_treeid,
        df_predictors_final_for_treeid,
        on=["idp", "tree_id"],
        how="left",
    )

    print(f" - Shape of df_targted_treeid: {df_targted_treeid.shape}")
    print(f" - Shape of df_targted_treeid target: {df_targted_treeid['target'].shape}")

    # Repeat same splitting as done before model fitting
    X_train_treeid, X_test_treeid, y_train_treeid, y_test_treeid = train_test_split(
        df_targted_treeid,
        df_targted_treeid["target"],
        test_size=user_input["test_split"],
        random_state=user_input["seed_nr"],
        stratify=df_targted_treeid["target"],
    )

    dir_treeid = f"{current_dir}/treeid"
    os.makedirs(dir_treeid, exist_ok=True)

    # Only the tree_id column is stored next to the row index. A list of
    # predictor columns used to be read from final_model/X_test.csv but was
    # always reset to an empty list before use, so that read was dropped.
    X_train_treeid[["tree_id"]].to_csv(f"{dir_treeid}/X_train_treeid.csv", index=True)
    X_test_treeid[["tree_id"]].to_csv(f"{dir_treeid}/X_test_treeid.csv", index=True)

    y_train_treeid.to_csv(f"{dir_treeid}/y_train_treeid.csv", index=True)
    y_test_treeid.to_csv(f"{dir_treeid}/y_test_treeid.csv", index=True)

    # ! Save data --------------------------------------------------------
    # General information
    df_save = pd.DataFrame(
        {
            "subset": [user_input["subset"][0]],
            "subset_group": [user_input["subset_group"][0]],
            "created": [datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")],
            "best_model_decision": [user_input["best_model_decision"]],
            "N_died": [df_target.target.sum()],
            "N_surv": [df_target.shape[0] - df_target.target.sum()],
            "dir": [user_input["current_dir"]],
            "oversampled_cv": [user_input["do_smote_test_validation"]],
            "oversampled_test": [user_input["do_smote_test_final"]],
        }
    )

    # Final model metrics (classification_metrics.csv is expected in the run
    # directory at this point - presumably written during model evaluation)
    df_save = pd.concat(
        [df_save, pd.read_csv(f"{current_dir}/classification_metrics.csv")], axis=1
    )

    # This file doubles as the "run is done" marker checked at the top
    df_save.to_csv(f"{current_dir}/final_model_performance.csv", index=False)

## Loop over seeds


In [None]:
# user_input = {}

# # ! General -----------------------------------------------------------------------
# runs_dir = "./model_runs/all_runs"
# user_input["dir_suffix"] = None  # None or string
# user_input["description_file"] = None
# user_input["subset"] = ["species_lat2"]
# # ! TRAINING -----------------------------------------------------------------------
# # Data splitting
# user_input["test_split"] = 0.2
# # Feature Elimination
# user_input["do_ref"] = True
# user_input["method_validation"] = "oob"  # none | cv | oob
# user_input["method_importance"] = "impurity"  # permutation | impurity
# user_input["cv_folds"] = 5  # Number of folds for if cv is selected for validation
# user_input["do_tuning"] = False  # Tune during rfe validation?
# user_input["correlation_threshold"] = 0.8  # Threshold for r correlation removal
# # Tuning
# user_input["do_prescribed_search"] = True
# user_input["do_random_search"] = False
# user_input["gsc_metric"] = "roc_auc"  # grid search metric
# # ! Final model ---------------------------------------------------------------------
# # best_per_category | best_metric | best_metric_max1
# user_input["best_model_decision"] = "best_per_category"
# user_input["best_model_metric"] = "roc_auc"

# # ! Extra analyses ------------------------------------------------
# # Number of nearest neighbors for SMOTE (default setting is 5)
# user_input["smote_k"] = 5
# # Minimum number of features per category to keep (default is 2)
# user_input["min_features_per_category"] = 2

In [None]:
# # Get all seeds
# all_seeds = pd.read_csv("all_seeds.csv").seed.tolist()
# all_species = all_species_norm.index.tolist()
# all_runs = pd.DataFrame(
#     list(itertools.product(all_seeds, all_species)),
#     columns=["seed", "species"],
# )
# all_runs["dir"] = ""
# all_runs["done"] = False

# # Loop over all runs and check if that run has been completed
# for i, row in all_runs.iterrows():
#     # Get folder matching the seed
#     seed = row.seed
#     base_dir = glob.glob(f"{runs_dir}/run_{seed}")

#     if len(base_dir) == 0:
#         # print(f" - No folder found for seed: {seed}")
#         continue
#     else:
#         base_dir = base_dir[0]
#         all_runs.loc[i, "dir"] = base_dir

#     if os.path.isfile(f"./{base_dir}/{row.species}/final_model_performance.csv"):
#         all_runs.loc[i, "done"] = True
#     elif os.path.isfile(f"{base_dir}/{row.species}/⚠️ too few dead trees.txt"):
#         all_runs.loc[i, "done"] = True
#     else:
#         all_runs.loc[i, "done"] = False

# # Get missing runs
# runs_to_run = (
#     all_runs.query("done == False")
#     # .sort_values(["species", "seed"])
#     .reset_index(drop=True)
# )

# # ! Option to only run top9 species
# # final_species = get_species_with_models("list")
# # top9 = all_species_norm.index.tolist()
# # runs_to_run = runs_to_run.query("species in @top9")

# # Reset index to start from 0
# runs_to_run = runs_to_run.reset_index(drop=True)

# # ! Option for running on multiple notebooks
# # Create multiple notebooks with name 01_model_fitting 1.ipynb, 01_model_fitting 2.ipynb, etc.
# # import IPython

# # # Number of notebooks
# # n_splits = 10
# # # Current notebook index
# # nb_name = IPython.extract_module_locals()[1]["__vsc_ipynb_file__"]
# # nb_id = int(nb_name.split("01_model_fitting ")[-1].split(".")[0])
# # print(f"Running notebook {nb_id} with {n_splits} splits")
# # # Select according nested list
# # l_runs_to_run = split_df_into_list_of_group_or_ns(runs_to_run, n_splits, "seed")
# # # l_runs_to_run = split_df_into_list_of_group_or_ns(runs_to_run, n_splits)
# # runs_to_run = l_runs_to_run[nb_id].reset_index(drop=True)

# # # Sort runs by increasing number of trees
# # sort_order = runs_to_run.species.value_counts().index.tolist()

# # # Display runs to run
# # runs_to_run

# # # Get sort order by increasing number of trees
# # species_order = all_species_norm.sort_values(ascending=True).index.tolist()

# # # Sort all_runs by species_order
# # runs_to_run["species"] = runs_to_run["species"].astype("category")
# # runs_to_run["species"] = runs_to_run["species"].cat.set_categories(species_order)
# # runs_to_run = runs_to_run.sort_values(["species"]).reset_index(drop=True)


# # ! Loop over all runs
# for i, row in runs_to_run.iterrows():

#     iseed = row.seed
#     ispecies = row.species
#     idir = row.dir

#     if idir == "":
#         # Create folder for run
#         idir = f"model_runs/all_runs/run_{iseed}"
#         os.makedirs(idir, exist_ok=True)
#         all_runs.loc[i, "dir"] = idir

#     # Start run
#     user_input["seed_nr"] = iseed
#     display("")
#     print(
#         f"""
#         --------------------------------------------------------------------------------
#         Run {i}/{runs_to_run.shape[0]}
#         Seed: {iseed}
#         Species: {ispecies}
#         Dir: {idir}
#         Started: {datetime.datetime.now().strftime('%Y-%m-%d @ %H:%M:%S')}
#         --------------------------------------------------------------------------------
#         """
#     )
#     ist = start_time(False)
#     run_all(ispecies, user_input, base_dir=idir)
#     clear_output(wait=True)
#     end_time(ist, None, ring=False)

In [None]:
# ! osascript -e 'tell app "System Events" to shut down'

---


## Sensitivity Analysis


### SMOTE and RFE (Top 9 Species)


In [None]:
# ! Select which extra (sensitivity) analysis to run <<<<<
# Valid values are "smotek" and "rfe_nkeep"; any other value raises a
# ValueError in the settings cell that follows.
extra_analyses = "smotek"  # smotek | rfe_nkeep
# Species to run the extra analysis for. Use a single-species list for a quick test:
# species_to_test = ["Fagus sylvatica"]  # Species to test for extra analyses
species_to_test = top9_species

In [None]:
# ! Settings for extra analyses ------------------------------------------------
# todo: Adjust directory
runs_dir = "./extra_runs/"

# Keys are added in a fixed order because the settings file written for each
# run lists them in insertion order.
user_input = {
    # Number of nearest neighbors for SMOTE (default setting is 5)
    "smote_k": 5,
    # Minimum number of features per category to keep (default is 2)
    "min_features_per_category": 2,
}

# Values to iterate over for each kind of extra analysis
extras_by_analysis = {
    "smotek": [0, 1, 5, 15],
    "rfe_nkeep": [4, 6],
}
if extra_analyses not in extras_by_analysis:
    raise ValueError(f"Invalid extra analysis: {extra_analyses}")
all_extras = extras_by_analysis[extra_analyses]

user_input.update(
    {
        # ! General ---------------------------------------------------------------
        "dir_suffix": None,  # None or string
        "description_file": None,
        "subset": ["species_lat2"],
        # ! TRAINING --------------------------------------------------------------
        # Data splitting
        "test_split": 0.2,
        # Feature Elimination
        "do_ref": True,
        "method_validation": "oob",  # none | cv | oob
        "method_importance": "impurity",  # permutation | impurity
        "cv_folds": 5,  # Number of folds if cv is selected for validation
        "do_tuning": False,  # Tune during rfe validation?
        "correlation_threshold": 0.8,  # Threshold for r correlation removal
        "seed_nr": 51,  # Set seed for randomization
        # Tuning
        "do_prescribed_search": True,
        "do_random_search": False,
        "gsc_metric": "roc_auc",  # grid search metric
        # ! Final model -----------------------------------------------------------
        # best_per_category | best_metric | best_metric_max1
        "best_model_decision": "best_per_category",
        "best_model_metric": "roc_auc",
    }
)

In [None]:
# # ! ------------------------------------------------------------------------------------------------
# # ! Note that this is for smote k runs and not for seeds but I am simply reusing the same code
# # ! ------------------------------------------------------------------------------------------------

# Get all run configurations
all_seeds = pd.read_csv("all_seeds.csv").seed.tolist()

# Create all combinations of seeds, extras, and species (one row per run)
all_runs = pd.DataFrame(
    list(itertools.product(all_seeds, all_extras, species_to_test)),
    columns=["seed", "extra", "species"],
)
all_runs["dir"] = ""

# ! Set runs
runs_to_run = all_runs.copy()
if runs_to_run.shape[0] == 0:
    # The message reports the requested species; it previously referenced an
    # undefined name, so raising this error would itself fail with a NameError.
    raise ValueError(
        f"No runs to run for subset: {species_to_test}. Please check if the species are available in the dataset."
    )

# Expected number of runs
expected_runs = len(all_seeds) * len(all_extras) * len(species_to_test)

# Sanity check that the grid holds every seed x extra x species combination
if runs_to_run.shape[0] != expected_runs:
    raise ValueError(
        f"Expected {expected_runs} runs but got {runs_to_run.shape[0]} runs. Please check the configuration."
    )

# ! Filter for completed runs
from pathlib import Path


# Filter out completed or failed runs before splitting
def is_run_incomplete(row):
    """Tell whether the run described by `row` has already finished.

    NOTE: despite the function name, the return value is True when the run
    is *done* and False when it still has to be run.

    A run counts as done when its species folder below
    `./extra_runs/<extra_analyses>_<extra>/run_<seed>/` contains either the
    final performance file or the "too few dead trees" marker file.

    Args:
        row: Mapping-like object with the keys "seed", "species" and "extra".

    Returns:
        bool: True if one of the two marker files exists, else False.

    Raises:
        ValueError: If the global `extra_analyses` is neither "smotek" nor
            "rfe_nkeep".
    """
    seed, species, extra = row["seed"], row["species"], row["extra"]

    # The folder prefix is the analysis name itself, so validate it first
    if extra_analyses not in ("smotek", "rfe_nkeep"):
        raise ValueError(
            f"Invalid 'extra_analyses'! Got: {extra_analyses}. Expected 'smotek' or 'rfe_nkeep'."
        )

    species_dir = f"./extra_runs/{extra_analyses}_{extra}/run_{seed}/{species}"

    # Either file marks the run as finished (successfully or as a known failure)
    marker_files = ("final_model_performance.csv", "⚠️ too few dead trees.txt")
    return any(os.path.isfile(f"{species_dir}/{name}") for name in marker_files)


# Flag finished runs (the helper returns True for finished runs), then keep
# only the rows that are still pending
runs_to_run["done"] = runs_to_run.apply(is_run_incomplete, axis=1)
runs_to_run = runs_to_run[runs_to_run["done"].eq(False)].reset_index(drop=True)
runs_to_run

In [None]:
# ! Option for running on multiple notebooks
## Before running:
## 1. Create `n_splits` copies of this notebook (one per parallel worker),
##    named with a zero-based index:
##   -`01_model_fitting copy 0.ipynb`,
##   -`01_model_fitting copy 1.ipynb`,
##   ...
##   -`01_model_fitting copy <n_splits - 1>.ipynb`,
## 2. Set `n_splits` below to the number of copies you created. The copy index
##    must be smaller than the number of splits, otherwise this cell raises.
## 3. Optionally add a `raise` stopper at the end of this cell to stop
##    execution after this point.
## 4. Run the code below in each notebook.
## For a single notebook, skip the part between "For using multiple notebooks"
## and "Skip to here for only one notebook".


# Imports
import IPython

# ! For using multiple notebooks ---
# Number of notebooks
n_splits = 5
# Current notebook index, parsed from the notebook file name
# (`__vsc_ipynb_file__` is presumably set by the VS Code notebook kernel — TODO confirm)
nb_name = IPython.extract_module_locals()[1]["__vsc_ipynb_file__"]
nb_id = int(nb_name.split("01_model_fitting copy ")[-1].split(".")[0])
# nb_id = 0  # todo: REMOVE THIS

print(f" - Running notebook {nb_id} with {n_splits} splits")
# Select according nested list
l_runs_to_run = split_df_into_list_of_group_or_ns(runs_to_run, n_splits, "seed")
# l_runs_to_run = split_df_into_list_of_group_or_ns(runs_to_run, n_splits)

# Fail with an explicit message instead of a bare "list index out of range"
# when more notebook copies exist than there are splits (or the id is negative,
# which would otherwise silently pick a split from the end of the list).
if not 0 <= nb_id < len(l_runs_to_run):
    raise IndexError(
        f"Notebook index {nb_id} has no split to run: only {len(l_runs_to_run)} "
        f"split(s) available (n_splits = {n_splits})."
    )
runs_to_run = l_runs_to_run[nb_id].reset_index(drop=True)

# ! Skip to here for only one notebook ---

# Get sort order by increasing share of trees (rarest species first)
species_order = all_species_norm.sort_values(ascending=True).index.tolist()
# Sort runs_to_run by species_order via an ordered categorical
runs_to_run["species"] = runs_to_run["species"].astype("category")
runs_to_run["species"] = runs_to_run["species"].cat.set_categories(species_order)
runs_to_run = runs_to_run.sort_values(["species"]).reset_index(drop=True)

print(f" - Seeds to run: \t\t{runs_to_run.seed.unique().tolist()}")
print(f" - Extra {extra_analyses} to run: \t{runs_to_run.extra.unique()}")
print(f" - Species to run: \t\t{runs_to_run.species.unique().tolist()}")
print(f" - Number of runs to run: \t{runs_to_run.shape[0]}")

In [None]:
# ! Loop over all runs
# Each row of `runs_to_run` is one (seed, extra, species) combination. The loop
# sets the matching `user_input` values, creates the run folder, skips runs
# that already have a result/failure marker, and calls `run_all` otherwise.
for i, row in runs_to_run.iterrows():

    iseed = row.seed
    ispecies = row.species
    iextra = row.extra

    # Set correct user input
    user_input["seed_nr"] = iseed
    if extra_analyses == "smotek":
        # The extra value is the SMOTE k; the RFE setting must stay at 2
        dir_prefix = f"smotek_{iextra}"
        user_input["smote_k"] = iextra
        if user_input["min_features_per_category"] != 2:
            raise ValueError(
                f"Minimum number of features per category is set to {user_input['min_features_per_category']} but should be 2 for smotek runs!"
            )
    elif extra_analyses == "rfe_nkeep":
        # The extra value is the minimum features per category; SMOTE k must stay at 5
        dir_prefix = f"rfe_nkeep_{iextra}"
        user_input["min_features_per_category"] = iextra
        if user_input["smote_k"] != 5:
            raise ValueError(
                f"SMOTE k is set to {user_input['smote_k']} but should be 5 for rfe_nkeep runs!"
            )
    else:
        raise ValueError(
            f"Invalid 'extra_analyses'! Got: {extra_analyses}. Expected 'smotek' or 'rfe_nkeep' in the path."
        )

    # Create folder for run
    idir = f"{runs_dir}/{dir_prefix}/run_{iseed}"
    os.makedirs(idir, exist_ok=True)

    # Record the run folder in `all_runs`. The row is matched by its values
    # because `i` is the index of the filtered, re-sorted `runs_to_run` and
    # does not point at the same row in `all_runs`. (Previously the stale,
    # empty `row.dir` was written here before `idir` had been computed.)
    is_this_run = (
        (all_runs["seed"] == iseed)
        & (all_runs["extra"] == iextra)
        & (all_runs["species"] == ispecies)
    )
    all_runs.loc[is_this_run, "dir"] = idir

    # Check if run is already done (finished or flagged as failed)
    if os.path.isfile(f"{idir}/{ispecies}/final_model_performance.csv"):
        continue
    elif os.path.isfile(f"{idir}/{ispecies}/⚠️ too few dead trees.txt"):
        continue

    # Run it
    display("")
    print(
        f"""
        --------------------------------------------------------------------------------
        Progress: {i + 1}/{runs_to_run.shape[0]}
        Seed: {user_input['seed_nr']}
        k: {user_input['smote_k']}
        rfe_nkeep: {user_input['min_features_per_category']}
        Species: {ispecies}
        Dir: {idir}
        Started: {datetime.datetime.now().strftime('%Y-%m-%d @ %H:%M:%S')}
        --------------------------------------------------------------------------------
        """
    )
    ist = start_time(False)
    run_all(ispecies, user_input, base_dir=idir)
    clear_output(wait=True)
    end_time(ist, None, ring=False)

---


## Post-Fitting Calculations


### Get Final Models


In [None]:
# List the run directories that can be analysed (numbered from 1)
available_dirs = ["./model_runs", *sorted(glob.glob("./extra_runs/*"))]
print("Available runs directories:")
for position, run_dir in enumerate(available_dirs, start=1):
    print(f"{position}: {run_dir}")

# ! SET RUNS DIRECTORY
# `runs_dir` is used as a glob pattern by the next cell; alternatives:
# runs_dir = "./model_runs/all_runs"
# runs_dir = "./extra_runs/*"
# runs_dir = "./extra_runs/smote*"
runs_dir = "./extra_runs/rfe*"

# ! SET SPECIES TO ANALYZE
# Alternatives:
# species_subset = get_species_with_models("list")
# species_subset = ["Abies alba"]
species_subset = top9_species

In [None]:
# Collect every finished model: one row per final_model_performance.csv found
df_available = pd.DataFrame(
    glob.glob(f"{runs_dir}/run_*/*/final_model_performance.csv"),
    columns=["file"],
)

# Path layout: <base_dir>/run_<seed>/<species>/final_model_performance.csv
df_available["species"] = df_available["file"].str.split("/").str.get(-2)
df_available = df_available[
    df_available["species"].isin(species_subset)
].reset_index(drop=True)
df_available["model"] = df_available["file"].str.split("/").str.get(-3)
df_available["base_dir"] = df_available["file"].str.split("/run").str.get(0) + "/"

# Check for missing runs
print(" --- The following species do not have their 50 seed runs yet: ---")
print(f" - Species found: {df_available['species'].nunique()}")
print(f" - Seeds found: {df_available['model'].nunique()}")

# For extra analyses the id is the text after the last "_" of the folder that
# holds the run_* directories, and runs are counted per (extraid, species)
if "extra" in runs_dir:
    df_available["extraid"] = (
        df_available["file"].str.split("/run_").str.get(0).str.split("_").str.get(-1)
    )
    print(f" - Extra IDs found: {df_available['extraid'].nunique()}")
    count_keys = ["extraid", "species"]
else:
    count_keys = ["species"]

# Show only the groups with fewer than 50 finished runs
run_counts = df_available[count_keys].value_counts().sort_values()
display(run_counts[run_counts < 50])

# df_available

### Calculate Model Performance


In [None]:
# Keep only the runs whose fixed-threshold classification metrics are missing
metrics_rel_path = "rf_performance/classification_metrics_fixed_threshold.csv"
df_todo = pd.DataFrame(
    [
        row
        for _, row in df_available.iterrows()
        if not os.path.isfile(
            row.file.replace("final_model_performance.csv", metrics_rel_path)
        )
    ]
)

# The raise deliberately stops the notebook when there is nothing left to do
if len(df_todo) == 0:
    raise ValueError("✅ All model have been run!")
display(df_todo)

In [None]:
# Calculate model performance for all pending runs on `ncores` workers
from random_forest_utils import calculate_rf_performance

ncores = 5
# Split the pending runs by the "model" column into `ncores` chunks
model_chunks = split_df_into_list_of_group_or_ns(df_todo, ncores, "model")
run_mp(
    calculate_rf_performance,
    model_chunks,
    skip_if_csv_exists=False,
    progress_bar=True,
    num_cores=ncores,
)

### Calculate SHAP Values


In [None]:
# Keep only the runs whose approximated test-set SHAP values are missing
shap_rel_path = "shap/approximated/shap_values_test.pkl"
df_todo = pd.DataFrame(
    [
        row
        for _, row in df_available.iterrows()
        if not os.path.isfile(
            row.file.replace("final_model_performance.csv", shap_rel_path)
        )
    ]
)

# The raise deliberately stops the notebook when there is nothing left to do
if len(df_todo) == 0:
    raise ValueError("✅ All model have been run!")
display(df_todo)

In [None]:
# Calculate SHAP values for all pending runs

# Settings passed to the SHAP runner as keyword arguments
shap_options = dict(
    run_interaction=False,
    approximate=True,
    test_or_train="test",
    force_run=False,
    verbose=False,
    num_cores=9,
)
shap_run_new_loop_mp(df_todo, **shap_options)

### Calculate SHAP Importance


In [None]:
# Keep only the runs without a backup of the original performance file
# (the backup is written by the SHAP importance loop below)
backup_name = "final_model_performance_org.csv"
df_todo = pd.DataFrame(
    [
        row
        for _, row in df_available.iterrows()
        if not os.path.isfile(
            row.file.replace("final_model_performance.csv", backup_name)
        )
    ]
)

# The raise deliberately stops the notebook when there is nothing left to do
if len(df_todo) == 0:
    raise ValueError("✅ All model have been run!")
display(df_todo)

In [None]:
# Calculate SHAP Variable Importance
#
# For every pending run this loop
#   1. loads the test predictors and the approximated SHAP values,
#   2. computes the mean absolute SHAP value per feature, as a percentage,
#   3. sums these per predictor dataset (feature category),
#   4. saves the table as shap_variable_importance.csv, and
#   5. appends the dataset-level results to final_model_performance.csv
#      (after backing up the original as final_model_performance_org.csv).

# Load the feature-category dictionary once: it is the same for every run.
# The original re-opened it on every iteration and never closed the handle.
with open("./model_runs/feature_category_dictionary.json", encoding="utf-8") as dict_file:
    dict_preds = json.load(dict_file)

# Loop over runs and species and calculate mean absolute SHAP values
# for i, row in tqdm(df_todo.head().iterrows(), total=df_todo.head().shape[0]): # ! Only for subset!
for i, row in tqdm(df_todo.iterrows(), total=df_todo.shape[0]):  # ! For all runs!
    # NOTE(review): this overwrites the notebook-level `runs_dir` (a glob
    # pattern in the cells above) with the base directory of the current row.
    # Kept as is because later cells may rely on it — confirm.
    runs_dir = row.base_dir
    irun_dir = f"{runs_dir}/{row.model}/{row.species}"

    # Get predictor data
    ipreds = pd.read_csv(f"{irun_dir}/final_model/X_test.csv", index_col=[0])

    # Get SHAP data
    ishap = f"{irun_dir}/shap/approximated/shap_values_test.pkl"
    if not os.path.exists(ishap):
        raise ValueError(
            f" 🚨 Skipping {row.model}/{row.species} because no SHAP values calculated yet!"
        )
    ishap = load_shap(ishap)

    # Extract SHAP values per prediction: index 1 of the third dimension
    # (presumably the positive class — TODO confirm)
    ishap = ishap.values[:, :, 1]

    # Build the (n_predictions x n_features) frame in a single step. The
    # original appended one row at a time with pd.concat, which is quadratic
    # in the number of predictions. The cast mirrors the float64 values the
    # row-wise construction produced.
    ishapAll = pd.DataFrame(ishap, columns=ipreds.columns).astype("float64")

    # Safety check: Shape of predictors should be the same as for SHAP values
    if ipreds.shape != ishapAll.shape:
        print(
            " - Issue: The shape of the predictor data should equal the shape of the concatenated SHAP values!"
        )

    # Mean absolute SHAP value per feature, normalised to percentages
    ishapMean_org = ishapAll.abs().mean().sort_values(ascending=False)
    ishapMean = ishapMean_org / ishapMean_org.sum()
    ishapMean = pd.DataFrame(ishapMean)
    ishapMean.columns = ["Importance"]
    ishapMean.Importance = ishapMean.Importance * 100
    ishapMean["Feature"] = ishapMean.index
    ishapMean.reset_index(drop=True, inplace=True)

    # Link each feature to its predictor dataset in a new column
    # (if a feature is listed under several keys, the last key wins)
    for f in ishapMean.Feature:
        for key, value in dict_preds.items():
            if f in value:
                ishapMean.loc[ishapMean.Feature == f, "dataset"] = key

    # Sum up the VI for each dataset and re-normalise to percentages
    ishapMean_of_dataset = (
        ishapMean[["Importance", "dataset"]]
        .groupby("dataset")
        .sum()
        .reset_index()
        .rename({"Importance": "dataset_imp"}, axis=1)
    )

    ishapMean_of_dataset.dataset_imp = (
        ishapMean_of_dataset.dataset_imp / ishapMean_of_dataset.dataset_imp.sum() * 100
    )

    # Keep the un-normalised values; both objects share the same sort order
    ishapMean["mean_abs_shap_org"] = ishapMean_org.values

    # Attach dataset label with percentages, e.g. "<rounded %>%: <dataset>"
    for j in ishapMean_of_dataset.index:
        ishapMean_of_dataset.loc[j, "dataset_label"] = (
            str(round(ishapMean_of_dataset.loc[j, "dataset_imp"]))
            + "%: "
            + ishapMean_of_dataset.loc[j, "dataset"]
        )

    ishapMean = ishapMean.merge(ishapMean_of_dataset, on="dataset", how="left")

    # Save SHAP data
    ishapMean.to_csv(f"{irun_dir}/shap_variable_importance.csv")

    # Load final model performance
    ifinalOrg = f"{irun_dir}/final_model_performance_org.csv"
    ifinalNew = f"{irun_dir}/final_model_performance.csv"

    # If the original file has not been backed up yet, save it!
    if not os.path.exists(ifinalOrg):
        shutil.copy2(ifinalNew, ifinalOrg)

    # Load model performance file, attach SHAP information and save it again
    ifinalNewDf = pd.read_csv(ifinalNew)

    for dataset in ishapMean.dataset.unique():
        in_dataset = ishapMean.dataset == dataset
        ifinalNewDf[f"{dataset} - Importance"] = ishapMean.loc[
            in_dataset, "dataset_imp"
        ].values[0]
        # A one-element list stores the whole array in a single cell; this
        # assumes the performance file has exactly one row — TODO confirm
        ifinalNewDf[f"{dataset} - Metrics"] = [
            ishapMean.loc[in_dataset, "Feature"].values
        ]
        ifinalNewDf[f"{dataset} - Values"] = [
            ishapMean.loc[in_dataset, "Importance"].values
        ]

    ifinalNewDf.to_csv(ifinalNew, index=False)

In [None]:
# ! osascript -e 'tell app "System Events" to shut down'

---
