# Extract Forest Structure


In [None]:
import sys

# Prepend the project's source directory to the module search path so the
# local `imports` module below can be resolved (path is relative to the
# notebook's working directory).
sys.path.insert(0, "../../src")
# NOTE(review): star import — the names used throughout this notebook
# (pd, np, plt, sns, math, tqdm, warnings, here, display, the get_*/calculate_*
# helpers, ...) are presumably all provided by this module; confirm there.
from imports import *

# Project helper; presumably configures the notebook session — TODO confirm.
init_notebook()

# Information from Alive Trees


## Load all alive trees


In [None]:
# Load the NFI data, keep only trees that were alive at the first visit, and
# build `subset`: the tree-level working frame used by every section below.
nfi_raw = get_final_nfi_data_for_analysis()

# Focus only on trees that were alive at first visit
nfi_raw = nfi_raw[nfi_raw["tree_state_1"] == "alive"]

# Number of trees and sites
print("\nSubsetting trees that were alive at first visit:")
print("\tNumber of trees:", nfi_raw.tree_id.nunique())
print("\tNumber of sites:", nfi_raw.idp.nunique())

# Keep only relevant columns.
# The explicit .copy() makes `subset` an independent frame: it is written to
# below (and by later cells), and writing to a plain slice of `nfi_raw`
# triggers pandas' SettingWithCopyWarning and leaves it ambiguous which
# object is actually modified.
subset = nfi_raw[
    [
        "campagne_1",
        "idp",
        "tree_id",
        "genus_lat",
        "species_lat",
        "species_lat2",
        "tree_height_class",
        "tree_circumference_class",
        "espar",
        "espar_red",
        "tree_state_1",
        "tree_state_2",
        "tree_state_change",
        "lib",
        "c13_1",
        "c13_rel",
        "ba_1",
        "dbh_1",
        "ir5",
        "htot_final",  # Using mix of measured and RF-predicted values!
        "simplif",
    ]
].copy()

# Fix formatting of columns
# NOTE(review): `.loc[:, col] = ...` tries to keep the column's existing dtype;
# if these columns are categorical the cast to str may not change the stored
# dtype — confirm this is the intended result.
subset.loc[:, "genus_lat"] = subset["genus_lat"].astype(str)
subset.loc[:, "species_lat"] = subset["species_lat"].astype(str)

# Missing values in these two columns are replaced by 0
subset.loc[:, "lib"] = subset["lib"].fillna(0)
subset.loc[:, "htot_final"] = subset["htot_final"].fillna(0)

print(subset.shape)

## Competition


### Site-Level


In [None]:
# Dominant class per site, computed separately for each grouping variable.
# ("espar" and "espar_red" are intentionally not included.)
dom_genu = get_dominant_species(subset, "genus_lat")
dom_spec = get_dominant_species(subset, "species_lat")
dom_spec2 = get_dominant_species(subset, "species_lat2")
dom_htot = get_dominant_species(subset, "tree_height_class")
dom_circ = get_dominant_species(subset, "tree_circumference_class")

# Left-join everything onto the genus table, one site ("idp") per key
df_dom = dom_genu
for dominance_table in (dom_spec, dom_spec2, dom_htot, dom_circ):
    df_dom = df_dom.merge(dominance_table, on="idp", how="left")

df_dom

### Tree-Level


In [None]:
# Tree-level competition metrics, sized by total height (slow: ~20 minutes)
df_comp = calculate_competition_metrics_mp(subset, tree_size_var="htot_final")

# Flag every tree whose species equals the dominant species of its site
dom_spec2 = get_dominant_species(subset, "species_lat2")
tree_species = subset[["idp", "tree_id", "species_lat2"]]
df_dom_tree = tree_species.merge(dom_spec2, how="left", on="idp")

is_dominant = df_dom_tree["species_lat2"] == df_dom_tree["dom_species_lat2"]
df_dom_tree["belongs_to_dom_spec"] = is_dominant.astype(int)

# Only the tree key and the 0/1 flag are needed downstream
df_dom_tree = df_dom_tree[["tree_id", "belongs_to_dom_spec"]]

In [None]:
# Mean dbh per site, then attached back onto every tree of that site
site_mean_dbh = subset.groupby("idp")["dbh_1"].mean().reset_index(name="mean_dbh")
df_dbh = subset[["idp", "tree_id"]].merge(site_mean_dbh, how="left", on="idp")

# Number of tree records per site
df_trees = subset.groupby("idp")["tree_id"].count().reset_index(name="num_trees")

# One row per tree again: site-level count plus the tree-level dbh table
df_trees = df_trees.merge(df_dbh, how="left", on="idp")
df_trees

In [None]:
# Combine site-level counts, competition metrics and the dominant-species flag
df_tree_tmp = df_trees.merge(df_comp, how="left", on="tree_id")
df_tree_tmp = df_tree_tmp.merge(df_dom_tree, how="left", on="tree_id")

# Put the key columns first
df_tree_tmp = move_vars_to_front(df_tree_tmp, ["tree_id", "idp"])

# Persist the predictor dataset
competition_file = here("data/final/predictor_datasets/forest_competition.feather")
df_tree_tmp.to_feather(competition_file)

df_tree_tmp

## Gini Inequality


In [None]:
# Per-site Gini coefficient computed from "ba_1", stored sorted ascending by
# the resulting "gini_ba_1" column.
df_gini = calculate_gini_coefficient(subset, "idp", "ba_1").sort_values("gini_ba_1")

# Show the sites from highest to lowest coefficient. The original statement
# computed this sorted view but never assigned it, and it was not the cell's
# last expression, so under default notebook settings the work was discarded.
# An explicit display() makes the inspection actually happen.
display(df_gini.sort_values("gini_ba_1", ascending=False))

# NOTE(review): after sort_values the frame no longer has a default RangeIndex;
# older pandas releases refuse to write such a frame to feather. Consider
# `.reset_index(drop=True)` if this has to run on those versions.
df_gini.to_feather("../../data/final/predictor_datasets/forest_gini.feather")

## Biodiversity Indices


In [None]:
# ! Compare the two diversity indices
pick_metric = "species_lat2"

simpson = calculate_simpson_diversity(subset, "idp", pick_metric)
shannon = calculate_shannon_diversity(subset, "idp", "tree_id", pick_metric)

# Merge the two diversity indices (one row per site)
diversity = simpson.merge(shannon, on="idp", how="left")
diversity

# Min-max scale each index to the range 0-1 so they share one axis scale
simpson_col = f"biodiv_simpson_score_{pick_metric}"
shannon_col = f"biodiv_shan_{pick_metric}"
for index_col in (simpson_col, shannon_col):
    lowest = diversity[index_col].min()
    highest = diversity[index_col].max()
    diversity[index_col] = (diversity[index_col] - lowest) / (highest - lowest)

# Scatter the scaled indices against each other, with a dashed 1:1 reference line
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot([0, 1], [0, 1], color="black", linestyle="--")
sns.scatterplot(data=diversity, x=simpson_col, y=shannon_col, ax=ax)

In [None]:
# Number of distinct species ("species_lat2") recorded per site
df_spec_per_site = (
    subset.groupby("idp")["species_lat2"].nunique().reset_index(name="num_species")
)

# Shorten the index column names and keep only the key plus the index value
simpson = simpson.rename(
    columns={"biodiv_simpson_score_species_lat2": "simpson_species"}
)
simpson = simpson[["idp", "simpson_species"]]

shannon = shannon.rename(columns={"biodiv_shan_species_lat2": "shannon_species"})
shannon = shannon[["idp", "shannon_species"]]

# Site-level table: Simpson, Shannon and species richness
df_diversity = simpson.merge(shannon, on="idp", how="left")
df_diversity = df_diversity.merge(df_spec_per_site, on="idp", how="left")
df_diversity

df_diversity.to_feather(
    "../../data/final/predictor_datasets/forest_biodiversity.feather"
)

## Self-Thinning Line


In [None]:
# LOAD SUBSET FROM ABOVE!
# ! Number of trees per hectare


# Functions
def trees_per_hectare(count_trees, radius):
    """Convert a tree count on a circular plot into a density per hectare.

    Parameters
    ----------
    count_trees : number or array-like (e.g. a pandas Series)
        Number of trees counted on the plot; arithmetic is applied elementwise.
    radius : number
        Plot radius. The formula ``(radius / 100) ** 2 * pi`` yields hectares
        only when the radius is given in metres.

    Returns
    -------
    Same kind as ``count_trees``: the count divided by the plot area in hectares.
    """
    import math  # local import keeps the helper self-contained

    # pi * r^2 in m^2, divided by 10_000 m^2/ha == pi * (r / 100)^2.
    # math.pi replaces the previously hard-coded, truncated 3.14159.
    area_ha = (radius / 100) ** 2 * math.pi
    trees_per_ha = count_trees / area_ha
    return trees_per_ha


# Build `df_stl`, the site-level input table for the self-thinning analysis:
# mean dbh, quadratic-mean dbh, trees per hectare, and the dominant species
# of each site together with its share.

# Inputs: plot radius used for each circumference class
# (presumably metres, as required by trees_per_hectare — TODO confirm)
radius_small = 6
radius_medium = 9
radius_large = 15

# Group by 'idp' and 'tree_circumference_class', then count the occurrences
counts = (
    subset.groupby(["idp", "tree_circumference_class"], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Calculate trees per hectare for each circumference class
trees_per_hectare_small = trees_per_hectare(counts["small"], radius_small)
trees_per_hectare_medium = trees_per_hectare(counts["medium"], radius_medium)
trees_per_hectare_large = trees_per_hectare(counts["large"], radius_large)

# Calculate total trees per hectare for each site
total_trees_per_hectare = (
    trees_per_hectare_small + trees_per_hectare_medium + trees_per_hectare_large
)

# Create the output DataFrame
df_out = pd.DataFrame(
    {"idp": counts.index, "num_trees_per_ha": total_trees_per_hectare}
).reset_index(drop=True)

# ! DBH per plot
# Arithmetic mean DBH per site, scaled by 100
# (presumably metres -> centimetres — TODO confirm the unit of dbh_1)
df_meandbh = subset.groupby("idp")["dbh_1"].mean().reset_index(name="dbh_1")
df_meandbh["dbh_1"] = df_meandbh["dbh_1"] * 100

# Quadratic mean DBH per site: square root of the mean of the squared DBH.
# The squares are computed on a small derived frame rather than written into
# `subset`, so the shared working frame is not mutated by this cell (the
# original added a "dbh_1_sq" column to `subset` in place).
dbh_squared = subset[["idp"]].assign(dbh_1_sq=(subset["dbh_1"] * 100) ** 2)
df_meandbh2 = dbh_squared.groupby("idp")["dbh_1_sq"].mean().reset_index(name="dbh_1_sq")
df_meandbh2["dbh_1_sq"] = df_meandbh2["dbh_1_sq"] ** 0.5

df_meandbh = pd.merge(df_meandbh, df_meandbh2, on="idp", how="left")

# ! Merge them
df_stl = pd.merge(df_meandbh, df_out, on="idp", how="left")

# ! Attach dominant species and its "purity"
df_stl = df_stl.merge(
    get_dominant_species(subset, "species_lat2"), how="left", on="idp"
)

df_stl

In [None]:
# Share of sites whose dominant species holds at least 90 % (the "purity")
purity_pct = df_stl["ba_1_perc_of_species_lat2"]
n_pure_sites = int((purity_pct >= 90).sum())
above_90percent = n_pure_sites / df_stl.shape[0] * 100

print(f"{round(above_90percent)}% of all sites have a purity of at least 90%")

# Summary statistics and distribution of the purity values
display(purity_pct.describe())
purity_pct.hist(bins=100)

In [None]:
# Empirical self-thinning-line parameters, read from two sheets of one workbook
stl_workbook = "../../docs/charru2012significant_self-thinning-lines.xlsx"

# Linear models
stl_lin = pd.read_excel(stl_workbook, sheet_name="Table 2 - Linear Models")

# Curvilinear models; rows containing any missing value are dropped
stl_cur = pd.read_excel(stl_workbook, sheet_name="Table 5 - Curvilinear Models")
stl_cur = stl_cur.dropna().reset_index(drop=True)

for model_table in (stl_lin, stl_cur):
    display(model_table)

In [None]:
# For every site, estimate ln(max N) from its quadratic-mean dbh ("dbh_1_sq")
# using, in order of preference:
#   1. the averaged linear model when the dominant species' share is below `min_purity`,
#   2. the species' curvilinear model,
#   3. the species' linear model,
#   4. the averaged linear model when no species-specific model exists.
# The chosen model and the evaluated formula are recorded next to the result.
min_purity = 50  # Minimum percentage of ba to be occupied by one species
df_stl["max_N_ln"] = np.nan
df_stl["formula"] = ""
df_stl["model"] = ""

# Parameters of the species-averaged linear model (fallback)
avg_int = stl_lin.query("species_lat2 == 'Average'")["intercept"].values[0]
avg_slo = stl_lin.query("species_lat2 == 'Average'")["ln_D"].values[0]


def _params_by_species(table, columns):
    """Map each species in ``table`` to a tuple of its values in ``columns``.

    Rows without a species name are skipped, and when a species occurs more
    than once its first row wins (matching ``query(...).values[0]``).
    """
    rows = table.dropna(subset=["species_lat2"]).drop_duplicates("species_lat2")
    values = zip(*(rows[col].to_numpy() for col in columns))
    return dict(zip(rows["species_lat2"], values))


# Build the per-species lookups once instead of running several DataFrame
# queries (plus a .unique()) for every single site inside the loop.
cur_params = _params_by_species(stl_cur, ["intercept", "ln_D", "ln_D2"])
lin_params = _params_by_species(stl_lin, ["intercept", "ln_D"])

# The loop addresses rows by position via .at, which relies on df_stl having
# a default 0..n-1 index.
for i in tqdm(range(df_stl.shape[0])):
    # Get dominant species of plot
    s = df_stl.at[i, "dom_species_lat2"]
    dbh_sq = df_stl.at[i, "dbh_1_sq"]
    purity = df_stl.at[i, "ba_1_perc_of_species_lat2"]

    if purity < min_purity:
        # Purity is not achieved: use the averaged linear model
        max_n_ln = avg_int + avg_slo * math.log(dbh_sq)
        formula = f"{avg_int} + {avg_slo} * math.log({dbh_sq})"
        model_name = "avg_low_purity"
    elif s in cur_params:
        # Purity is achieved and a curvilinear model exists for the species
        inter, slope_1, slope_2 = cur_params[s]
        max_n_ln = (
            inter + slope_1 * math.log(dbh_sq) + slope_2 * math.log(dbh_sq) ** 2
        )
        formula = (
            f"{inter} + {slope_1} * math.log({dbh_sq}) + {slope_2} * math.log({dbh_sq}) ** 2"
        )
        model_name = "curvilinear"
    elif s in lin_params:
        # No curvilinear model, but a species-specific linear model exists
        inter, slope_1 = lin_params[s]
        max_n_ln = inter + slope_1 * math.log(dbh_sq)
        formula = f"{inter} + {slope_1} * math.log({dbh_sq})"
        model_name = "linear"
    else:
        # No model available for the species: take the averaged linear model
        max_n_ln = avg_int + avg_slo * math.log(dbh_sq)
        formula = f"{avg_int} + {avg_slo} * math.log({dbh_sq})"
        model_name = "avg_no_species"

    df_stl.at[i, "max_N_ln"] = max_n_ln
    df_stl.at[i, "formula"] = formula
    df_stl.at[i, "model"] = model_name

# When done, take exponential of ln(max N) to get the number of trees
df_stl["max_N"] = df_stl["max_N_ln"].apply(math.exp)
# Ratio of current to maximum number of trees (a fraction, not a percentage)
df_stl["carrying_capacity"] = df_stl["num_trees_per_ha"] / df_stl["max_N"]
# Natural log of the quadratic-mean dbh (x-axis of the self-thinning line)
df_stl["dbh_1_sq_log"] = np.log(df_stl["dbh_1_sq"])
# Show df
df_stl

In [None]:
# Quick numeric overview of the self-thinning table
# 1) summary statistics of all numeric columns
display(df_stl.describe())

# 2) distribution of the estimated maximum tree number per model type
max_n_by_model = df_stl.groupby("model")["max_N"]
display(max_n_by_model.describe())

# 3) share of sites handled by each model type
model_shares = df_stl.value_counts("model", normalize=True)
display(model_shares)

In [None]:
# Quality-control figure for the self-thinning estimates (3 x 2 panels)

# Silence FutureWarning messages only (they concern upcoming library changes)
warnings.simplefilter(action="ignore", category=FutureWarning)

sns.set_style("whitegrid")
fig, axes = plt.subplots(3, 2, figsize=(10, 10))

# Top four panels: one density line per model type for a given column
density_panels = [
    (axes[0, 0], "max_N", "Calculated Max Number of Trees per Model Setup"),
    (axes[0, 1], "ba_1_perc_of_species_lat2", "Purity per Model Setup"),
    (axes[1, 0], "dbh_1_sq", "Mean of DBH^2 per Model Setup"),
    (axes[1, 1], "carrying_capacity", "Carrying Capacity per Model Setup"),
]
for panel, column, panel_title in density_panels:
    for model, group in df_stl.groupby("model"):
        sns.kdeplot(group[column], ax=panel, label=model)
    panel.set_title(panel_title)
    panel.legend()

# Bottom left: the self-thinning lines themselves, ln(max N) against ln(dbh)
stl_panel = axes[2, 0]
sns.scatterplot(
    data=df_stl,
    x="dbh_1_sq_log",
    y="max_N_ln",
    hue="model",
    palette="Set1",
    ax=stl_panel,
)
stl_panel.legend(title="Model", loc="upper right")

# Bottom right: estimated maximum versus current stem density, with a dashed
# 1:1 reference line; both axes are clipped to 0-4000
density_panel = axes[2, 1]
sns.kdeplot(
    data=df_stl,
    x="num_trees_per_ha",
    y="max_N",
    fill=True,
    thresh=0.05,
    levels=10,
    cmap="mako",
    legend=True,
    ax=density_panel,
)
density_panel.plot([0, 4000], [0, 4000], color="red", linestyle="--")
density_panel.set_xlim([0, 4000])
density_panel.set_ylim([0, 4000])

plt.tight_layout()
plt.show()

In [None]:
# Persist only the site key and its carrying-capacity ratio as a predictor dataset
carrying_capacity_by_site = df_stl[["idp", "carrying_capacity"]]
carrying_capacity_by_site.to_feather(
    "../../data/final/predictor_datasets/forest_carrying_capacity.feather"
)