# Run Random Forest

In [1]:
# Data wrangling
import pandas as pd
import numpy as np

# Data visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Machine learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import missingno as msno # For visualizing missing values

# My functions
import sys
sys.path.insert(0, "../../src")
from run_mp import *
from nfi_wrangling import *

## Load NFI Dataset

In [2]:
# Load NFI Data
nfi_final_data = pd.read_csv(
    "../00_process_nfi_data/20231123-191200_nfi_dataset_for_analysis copy.csv",
    index_col=[0],
)

  nfi_final_data = pd.read_csv(


## Check for documented / undocumented variables

In [3]:
# Original NFI Variables
nfi_org = pd.read_excel(
    "ifna_predictor_database.xlsx",
    sheet_name="NFI Original Variables",
)[["var", "type", "level", "remove"]]

# NFI Derivative Variables
nfi_derivatives = (
    pd.read_excel(
        "20231124_ifna_predictor_database.xlsx",
        sheet_name="NFI Derivatives",
    )
    .reset_index()[["var", "type", "level", "remove"]]
    .dropna(subset=["type"])
)

# Add suffixes _1 and _2 to the original variables
suffix_1 = nfi_org.copy()
suffix_2 = nfi_org.copy()

suffix_1["var"] = suffix_1["var"].apply(lambda x: x + "_1")
suffix_2["var"] = suffix_2["var"].apply(lambda x: x + "_2")

# Concatenate them
vars_described_in_sheet = pd.concat([nfi_org, suffix_1, suffix_2, nfi_derivatives])
# vars_described_in_sheet

# Extract variables that are in the final wrangled nfi dataset
final_vars = nfi_final_data.columns.to_frame(index=False, name="var").sort_values("var")

# Remove all variables that can be found in the excel file:
mask = final_vars["var"].isin(vars_described_in_sheet["var"])
not_described_vars = final_vars[~mask]

# Print output
print("Variables that are not registered in the excel (should show empty dataframe):")
display(not_described_vars)

# Check if variables are described in the excel file that are not in the final wrangled nfi dataset
# Extract all derivatives
all_ders = nfi_derivatives["var"]

# Find entries in var that are in all_ders but not in final_vars
mask = nfi_derivatives["var"].isin(final_vars["var"])
documented_but_not_in_final_df = nfi_derivatives[~mask]

# Print the output
print(
    "Variables that are documented in the excel but not in the final wrangled nfi dataframe (should show only variables with level 'grouping'):"
)
display(documented_but_not_in_final_df)

Variables that are not registered in the excel (should show empty dataframe):


Unnamed: 0,var


Variables that are documented in the excel but not in the final wrangled nfi dataframe (should show only variables with level 'grouping'):


Unnamed: 0,var,type,level,remove
53,n_plots,num,grouping,
54,n_ini,num,grouping,
55,n_sur,num,grouping,
56,n_fin,num,grouping,
57,n_rec,num,grouping,
58,ba_at_v1_of_alive_trees,num,grouping,
59,ba_at_v2_of_alive_trees,num,grouping,
60,ba_at_v1_of_survivors,num,grouping,
61,ba_at_v2_of_survivors,num,grouping,
62,ba_at_v1_of_dead,num,grouping,


## Encode variables as specified in the excel file

In [4]:
# CATEGORICAL VARIABLES
# Encode variables that are cat in column type in vars_described_in_sheet as categorical in nfi_final_data
# Filter vars_described_in_sheet to hold only variables contained in nfi_final_data
mask = vars_described_in_sheet["var"].isin(nfi_final_data.columns)

# Further filter to hold only categorical variables
cat_vars = vars_described_in_sheet.loc[
    (vars_described_in_sheet["type"] == "cat") & mask, "var"
].to_list()

# Encode them as categorical in nfi_final_data
nfi_final_data[cat_vars] = nfi_final_data[cat_vars].astype("category")

# DATE VARIABLES (by hand, routine caused errors...)
nfi_final_data["dateeco"] = nfi_final_data["dateeco"].astype("datetime64[ns]")
nfi_final_data["datemort"] = nfi_final_data["datemort"].astype("datetime64[ns]")

## Calculate plot-level mortality

In [5]:
# Calculate growth and mortality at the plot-level
run_cell = False

if run_cell:
    # ⚠️  This takes about 30 minutes to run, so loading the data from the feather file instead
    #     Plus, for some reason, after running this cell, the notebook slows down massively.

    grouped = nfi_final_data.groupby("idp", as_index=False)
    df_list = [group for name, group in grouped]

    # df_list = df_list[:100] # For debug, reduce number to 100 sites only

    out = run_mp(
        calculate_growth_mortality,
        df_list,
        combine_func=pd.concat,
        progress_bar=True,
        num_cores=10,
    )

    out.to_parquet("nfi-idp_level_aggregated_growth_mortality.parquet")

out = pd.read_parquet("nfi-idp_level_aggregated_growth_mortality.parquet")
# out.shape
# out.head(100)

## Merge location-data back in

In [6]:
# Get subset
nfi_subset = nfi_final_data.copy()

# TODO: This is just a quick fix until correction has been made on R side
# TODO: There are duplicated sites because of mutli-to-multi matches for idp-tca-tcl.
nfi_subset = nfi_subset.drop(columns=["tca", "tcl"])

# Get all variables on location-level
location_vars = vars_described_in_sheet.query("level == 'location'")

# Get all columns in nfi_final_data that are also in location_vars
matching_columns = [var for var in location_vars["var"] if var in nfi_subset.columns]

# Reduce nfi_final_data to only contain location-level variables
nfi_subset = nfi_subset[matching_columns].drop_duplicates()
nfi_subset.shape

(40975, 136)

In [7]:
# Attach only variables of interest from out to nfi_subset
target_variable = "mort_ba_prc_yr_v1"
out_subset = out[["idp", target_variable]]
combined_out_nfi = out_subset.merge(nfi_subset, on="idp", how="left")
combined_out_nfi.shape

(40975, 137)

## Add GEE Data

In [8]:
# TODO: Omitted for now
# gee_data = pd.read_feather("../02_process_gee_data/final_gee_predictor_dataset.feather")
# df_tmp = out.merge(gee_data, on="idp", how="left")

# Random Forest Setup

### Settings

In [9]:
# Define the target variable

# Target metric
target = target_variable  # from above

# Data filtering
na_drop_threshold = 0.05

# Randomness

# Test/train splitting
seed_nr = 42
test_split = 0.2
test_train_strata = ["ser"]

### Filter data

#### Rows

In [10]:
# Get dataset from above
df_for_rf = combined_out_nfi.copy()

In [11]:
# Drop rows where target is zero
df_dropped_rows_with_0s = df_for_rf[df_for_rf[target] != 0]

#### Columns

In [12]:
# Get temporary df
df_tmp = df_dropped_rows_with_0s.copy()

# vars_described_in_sheet holds all possible suffix combinations but those are not needed.
# So, reduce vars_described_in_sheet to match variables in df_tmp
vars_described_in_sheet_and_in_df = vars_described_in_sheet[
    vars_described_in_sheet["var"].isin(df_tmp.columns)
]

# Get the columns to remove
columns_to_remove = vars_described_in_sheet_and_in_df.query("remove == 'x'")["var"]

# Remove the columns from df_tmp
df_tmp = df_tmp.drop(columns=columns_to_remove)

# Print the removed columns
print(f"{len(columns_to_remove)} columns removed as specified in excel file:")
for column in columns_to_remove:
    print(f" - {column}")

print(f"\nFinal shape of df_tmp: {df_tmp.shape}")

# Save dataframe for next cell
df_dropped_cell_from_excel = df_tmp.copy()

7 columns removed as specified in excel file:
 - idp
 - peupnr_1
 - visite_1
 - peupnr_2
 - visite_2
 - census_interval
 - human_activity

Final shape of df_tmp: (7725, 130)


In [13]:
# Get temporary df for cell
df_tmp = df_dropped_cell_from_excel.copy()

# Unify encoding of missing data (NA into NaN)
df_tmp = df_tmp.fillna(value=pd.NA)

# Get number of rows of dataframe to calculate percentages
n_rows = df_tmp.shape[0]

for my_col in sorted(df_tmp.columns):
    n_na = df_tmp[my_col].isna().sum()
    na_perc = n_na / n_rows

    if na_perc > na_drop_threshold:
        df_tmp = df_tmp.drop(my_col, axis=1)
        print(f"Dropping: {my_col} because it has {n_na} NAs ({round(na_perc*100)}%).")

print("")
print(f"👉 New shape of df: {df_tmp.shape}.")
print(
    f"{df_dropped_cell_from_excel.shape[1] - df_tmp.shape[1]} variables were dropped."
)


# Get remaining columns with NA values for imputation later
vars_with_na = df_tmp.columns[df_tmp.isna().any()].tolist()
vars_with_na = pd.Series(vars_with_na)
display("---")
print("Variables still containing NAs:")
for var in vars_with_na:
    print(f" - {var}")

# Save df for later
df_dropped_na_columns = df_tmp.copy()

Dropping: acces because it has 7725 NAs (100%).
Dropping: anpyr because it has 7725 NAs (100%).
Dropping: asperite because it has 501 NAs (6%).
Dropping: autut_1 because it has 6574 NAs (85%).
Dropping: autut_2 because it has 1275 NAs (17%).
Dropping: bois_1 because it has 5496 NAs (71%).
Dropping: bois_2 because it has 1269 NAs (16%).
Dropping: bplant_1 because it has 7327 NAs (95%).
Dropping: bplant_2 because it has 7721 NAs (100%).
Dropping: cam because it has 7725 NAs (100%).
Dropping: cslisi because it has 5703 NAs (74%).
Dropping: dcespar1 because it has 7688 NAs (100%).
Dropping: dcespar2 because it has 7712 NAs (100%).
Dropping: def5 because it has 2463 NAs (32%).
Dropping: denivriv because it has 7020 NAs (91%).
Dropping: dist because it has 540 NAs (7%).
Dropping: distriv because it has 7018 NAs (91%).
Dropping: dpyr because it has 7725 NAs (100%).
Dropping: elag because it has 7248 NAs (94%).
Dropping: elisi because it has 6544 NAs (85%).
Dropping: entp because it has 7542 N

'---'

Variables still containing NAs:
 - mort_ba_prc_yr_v1
 - strate
 - humus
 - obschemin
 - obsriv
 - roche
 - text1
 - text2
 - topo
 - tsol
 - instp5
 - dateeco
 - affroc
 - afpla
 - cai40
 - cailloux
 - herb
 - masque
 - prof2
 - lign1
 - lign2
 - mousse
 - gest
 - pentexp
 - land_use
 - site_ba_prc_dead_at_v1


#### Visualize

In [14]:
from ydata_profiling import ProfileReport

df_tmp = df_dropped_na_columns.copy()

ProfileReport(df_tmp.sort_index(axis=1), minimal=True, dark_mode=True)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



## Last cleaning steps

In [122]:
# Get temporary df
df_tmp = df_dropped_na_columns.copy()
df_tmp.shape

(7725, 54)

In [123]:
# TODO: Remove variables that have mostly the same values.
# E.g. strate is causing error later on, so removing it for now
df_tmp = df_tmp.drop(columns=["strate"])
df_tmp.shape

(7725, 53)

In [124]:
# IMPUTE DATE VARIABLES BASED ON CAMPAIGN YEAR
date_cols = [var for var in df_tmp.columns if df_tmp[var].dtype == "datetime64[ns]"]

for var in date_cols:
    df_tmp[var] = df_tmp[var].fillna(df_tmp["campagne_1"].astype(str) + "-07-01")

# TODO: dropping datetime variable for now because it is of little importance and
# encoding takes significant extra work for now...
# To keep circular nature of data structure, encode using trigonometric functions
# https://scikit-learn.org/stable/auto_examples/applications/plot_cyclical_feature_engineering.html
df_tmp = df_tmp.drop(columns=date_cols)
df_tmp.shape

(7725, 52)

In [125]:
# ENCODE CATEGORICAL VARIABLES
# vars_described_in_sheet holds all possible suffix combinations but those are not needed.
# So, reduce vars_described_in_sheet to match variables in df_tmp
vars_described_in_sheet_and_in_df = vars_described_in_sheet[
    vars_described_in_sheet["var"].isin(df_tmp.columns)
]

# Get list of variables to encode, based on excel file
vars_to_ohe = vars_described_in_sheet_and_in_df.query("type == 'cat'")["var"]

# Reduce to have only variables in df_tmp
vars_to_ohe = [var for var in vars_to_ohe if var in df_tmp.columns]

# Make sure all categorical variables are of dtype object
for var in vars_to_ohe:
    df_tmp[var] = df_tmp[var].astype(str)

sorted(vars_to_ohe)

['andain',
 'bord',
 'cover_change',
 'csa_1',
 'csa_2',
 'dep',
 'gre',
 'human_activity_var',
 'humus',
 'instp5',
 'integr',
 'iti',
 'land_use',
 'nincid_2',
 'obschemin',
 'obsriv',
 'roche',
 'ser',
 'text1',
 'text2',
 'topo',
 'tplant',
 'tsol']

In [127]:
# GET CATEGORICAL VARIABLES WITH NA VALUES
# Get all categorical variables that have NAs in them
cats_with_na = vars_with_na[vars_with_na.isin(vars_to_ohe)].tolist()
display(cats_with_na)

# Replace NA values with "missing"
for var in cats_with_na:
    df_tmp[var] = df_tmp[var].fillna("missing")

['humus',
 'obschemin',
 'obsriv',
 'roche',
 'text1',
 'text2',
 'topo',
 'tsol',
 'instp5',
 'land_use']

In [128]:
# GET NUMERICAL VARIABLES WITH NA VALUES

# No categorical variables
numerics_with_na = [var for var in df_tmp.columns if df_tmp[var].dtype != "O"]

# No date time variables
numerics_with_na = [var for var in numerics_with_na if var not in date_cols]

# No target variable
numerics_with_na = [var for var in numerics_with_na if var != target]
numerics_with_na

['affroc',
 'afpla',
 'cai40',
 'cailloux',
 'herb',
 'masque',
 'prof2',
 'lign1',
 'lign2',
 'mousse',
 'gest',
 'nlisi5',
 'pentexp',
 'campagne_1',
 'dc_1',
 'campagne_2',
 'dc_2',
 'lat',
 'lat_fr',
 'lon',
 'lon_fr',
 'n_species_per_plot',
 'site_ba_prc_cut_at_v2',
 'site_ba_prc_dead_at_v1',
 'site_ba_prc_dead_at_v2',
 'site_ba_prc_rec_at_v2',
 'site_total_ba_at_v1',
 'site_total_ba_at_v2']

In [129]:
df_final_before_traintest = df_tmp.copy()

# Prepare Train / Test

## Split Data

In [130]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df_final_before_traintest.drop(target, axis=1),
    df_final_before_traintest[target],
    test_size=test_split,
    random_state=seed_nr,
    stratify=df_final_before_traintest[test_train_strata],
)

# TODO: If stratify is more than one column, the new stratification variable
# must be removed again. This is not implemented yet at all.
# Eg ser + campagne_1

# Remove stratification column
# X_train = X_train.drop(test_train_strata, axis=1)
# X_test  = X_test.drop(test_train_strata, axis=1)

## Preprocessing

In [152]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    df_final_before_traintest.drop(target, axis=1),
    df_final_before_traintest[target],
    test_size=test_split,
    random_state=seed_nr,
    stratify=df_final_before_traintest[test_train_strata],
)

# One-hot encoding
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_train_encoded = pd.DataFrame(
    ohe.fit_transform(X_train[vars_to_ohe]),
    columns=ohe.get_feature_names_out(vars_to_ohe),
)

X_test_encoded = pd.DataFrame(
    ohe.transform(X_test[vars_to_ohe]), columns=ohe.get_feature_names_out(vars_to_ohe)
)

# Concatenate the encoded variables with the rest of the dataset
# A bit hacky because the index is reset, but it works
df1 = X_train.drop(vars_to_ohe, axis=1).reset_index(drop=True)
df2 = X_train_encoded.reset_index(drop=True)
X_train = pd.concat([df1, df2], axis=1)

df1 = X_test.drop(vars_to_ohe, axis=1).reset_index(drop=True)
df2 = X_test_encoded.reset_index(drop=True)
X_test = pd.concat([df1, df2], axis=1)

# KNN imputation
imputer = KNNImputer(n_neighbors=5)
X_train[numerics_with_na] = imputer.fit_transform(X_train[numerics_with_na])
X_test[numerics_with_na] = imputer.transform(X_test[numerics_with_na])

In [114]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

# Impute categorical variables with NAs with "missing"
mis = SimpleImputer(missing_values=None, strategy="constant", fill_value="missing")

# Define KNN imputer
imp = KNNImputer(n_neighbors=5, add_indicator=True)

# Initiate one-hot-encoder
# sparse_output=False to get a numpy array instead of a sparse matrix
# handle_unknown="ignore" to ignore new categories in test set (if the category is not present in the training set)
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# # Test which variables are NOT in the dataframe
# all_vars_to_transform = vars_to_ohe + numerics_with_na + cats_with_na
# print("Check if all variables are in the dataframe:")
# print("\nVariables to OHE that are NOT in the dataframe:")
# [print(var) for var in vars_to_ohe if var not in X_train.columns]

# Define column transformer
tr1 = make_column_transformer(
    (mis, cats_with_na), n_jobs=-1, verbose=True, verbose_feature_names_out=True
)
tr2 = make_column_transformer(
    (ohe, vars_to_ohe), n_jobs=-1, verbose=True, verbose_feature_names_out=True
)
tr3 = make_column_transformer(
    (imp, numerics_with_na), n_jobs=-1, verbose=True, verbose_feature_names_out=True
)
# preprocessor = make_column_transformer(
#     (mis, cats_with_na),
#     (ohe, vars_to_ohe),
#     (imp, numerics_with_na),
#     n_jobs=-1,
#     verbose=True,
#     verbose_feature_names_out=True,
# )

pl = Pipeline(
    [
        ("tr1", tr1),
        ("tr2", tr2),
        ("tr3", tr3),
        # ("regressor", RandomForestRegressor(random_state=seed_nr)),
    ]
)

preprocessor.fit(X_train)

X_train_pp = preprocessor.transform(X_train)
X_test_pp = preprocessor.transform(X_test)

pd.DataFrame(X_train_pp)

[ColumnTransformer] . (2 of 3) Processing onehotencoder, total=   0.0s
[ColumnTransformer] .... (3 of 3) Processing knnimputer, total=   0.2s
[ColumnTransformer] . (1 of 3) Processing simpleimputer, total=   0.0s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,458,459,460,461,462,463,464,465,466,467
0,15.0,0.0,0.0,314.0,0.0,8.0,2.0,27.0,0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,40.0,0.0,0.0,230.0,5.0,7.0,0.0,81.0,0.0,No Use,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,31.0,1.0,1.0,233.0,0.0,1.0,0.0,11.0,0.0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,,,,,,,,0.0,No Use,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
4,31.0,0.0,0.0,310.0,0.0,7.0,2.0,81.0,0.0,No Use,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6175,20.0,0.0,0.0,230.0,0.0,6.0,4.0,34.0,0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6176,31.0,0.0,0.0,230.0,0.0,6.0,4.0,34.0,0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6177,22.0,1.0,0.0,230.0,0.0,4.0,3.0,34.0,0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6178,31.0,0.0,0.0,810.0,0.0,4.0,3.0,34.0,0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Impute missing values

## Fit Model

In [110]:
# Create a Random Forest Regressor object with the best parameters
# rf_best = RandomForestRegressor(**grid_search.best_params_, random_state=42, n_jobs=-1)

# Simple one run RF
rf_best = RandomForestRegressor(n_estimators=100, random_state=seed_nr, n_jobs=-1)
xxx = pd.DataFrame(X_train_pp)
xxx

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,458,459,460,461,462,463,464,465,466,467
0,15.0,0.0,0.0,314.0,0.0,8.0,2.0,27.0,0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,40.0,0.0,0.0,230.0,5.0,7.0,0.0,81.0,0.0,No Use,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,31.0,1.0,1.0,233.0,0.0,1.0,0.0,11.0,0.0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,,,,,,,,,0.0,No Use,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
4,31.0,0.0,0.0,310.0,0.0,7.0,2.0,81.0,0.0,No Use,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6175,20.0,0.0,0.0,230.0,0.0,6.0,4.0,34.0,0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6176,31.0,0.0,0.0,230.0,0.0,6.0,4.0,34.0,0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6177,22.0,1.0,0.0,230.0,0.0,4.0,3.0,34.0,0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6178,31.0,0.0,0.0,810.0,0.0,4.0,3.0,34.0,0,Wood Production,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
# Fit the model on the training data
rf_best.fit(X_train_pp, y_train)

ValueError: could not convert string to float: 'Wood Production'

In [359]:
# Predict on the test data
y_pred = rf_best.predict(X_test)

# Calculate the evaluation metrics
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

# Print the evaluation metrics
print("R2 score: ", r2)
print("RMSE: ", rmse)
print("MAE: ", mae)

NotFittedError: This RandomForestRegressor instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
# Plot the variable importance
importances = rf_best.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf_best.estimators_], axis=0)
indices = np.argsort(importances)[::-1][:20]  # only top 20 features

plt.figure()
plt.title("Top 20 Most Influential Features")
plt.bar(
    range(20),  # only top 20 features
    importances[indices],
    color="r",
    yerr=std[indices],
    align="center",
)

# Convert X_train to a pandas dataframe
X_train_df = pd.DataFrame(X_train)

plt.xticks(
    range(20), X_train_df.columns[indices], rotation=315, ha="left"
)  # only top 20 features
plt.xlim([-1, 20])  # only top 20 features


top_20 = pd.DataFrame(
    {
        "Feature": X_train_df.columns[indices][:20],
        "Importance": importances[indices][:20],
    }
)
top_20

In [None]:
import seaborn as sns

# Plot the predicted against true values
sns.regplot(
    x=y_pred,
    y=y_test,
    scatter_kws=dict(color="gray", s=10, alpha=0.05),
    line_kws=dict(color="blue"),
)
plt.plot([0, 1], [0, 1], transform=plt.gca().transAxes, ls="--", c="red")
plt.ylabel("Observations")
plt.xlabel("Predictions")
plt.title(
    f"Predicted versus observed values of {target}\nR2 score: {round(r2, 2)}, RMSE: {round(rmse, 2)}, MAE: {round(mae, 2)}"
)
plt.show()