In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from save_and_compare_results import *
import shap

In [None]:
# Load the feature matrix (X) and the target matrix (Y) from pickle files that
# are resolved relative to the current working directory.
# NOTE(review): unpickling can execute arbitrary code, so these files should
# only ever come from a trusted source.
X = pd.read_pickle("X_matrix_proteins_nb.pkl")
Y = pd.read_pickle("Y_matrix_proteins_nb.pkl")

In [None]:
# Preprocess the data.
# X and Y are overwritten in place, so on a second run of this cell the
# "Yeast_ID" column is already gone; errors="ignore" keeps the cell re-runnable
# instead of raising KeyError.
X = X.drop(columns=["Yeast_ID"], errors="ignore").fillna(0)  # missing feature values -> 0

# Drop the "Yeast_ID" column once, then fill missing targets with the mean of
# their own column.
Y = Y.drop(columns=["Yeast_ID"], errors="ignore")
Y = Y.fillna(Y.mean())

In [None]:
# Base estimator; the fixed random_state makes forest construction repeatable.
rf = RandomForestRegressor(random_state=42)

# Search space: 3 options for each of 5 hyperparameters -> 3**5 = 243 candidates.
param_grid = dict(
    n_estimators=[100, 200, 500],
    max_depth=[10, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2", None],
)

# Exhaustive search over the grid, scored by R² with 5-fold cross-validation.
# n_jobs=-1 uses all available cores; verbose=2 prints per-fit progress.
grid_search = GridSearchCV(rf, param_grid, scoring="r2", cv=5, n_jobs=-1, verbose=2)

In [None]:
print("Training of Random Forest with hyperparameters tuning")
# Flatten Y into a 1-D array before handing it to the search as the target.
grid_search.fit(X, y=Y.to_numpy().ravel())

In [None]:
# Pull the winning model and its hyperparameters out of the finished search.
best_rf, best_params = grid_search.best_estimator_, grid_search.best_params_
print("\nBest hyperparameters :", best_params)

In [None]:
# best_rf was fitted on this same X/Y, so the two metrics below are in-sample
# (training-set) scores and overstate how well the model generalizes.
y_pred = best_rf.predict(X)
r2 = r2_score(Y, y_pred)
mse = mean_squared_error(Y, y_pred)
print(f"\nTraining-set R² Score: {r2:.4f}, Mean Squared Error: {mse:.4f}")

# Held-out estimate: mean cross-validated score of the best candidate, in the
# metric the search was configured with (scoring="r2").
cv_r2 = grid_search.best_score_
print(f"Cross-validated R² Score (best parameters): {cv_r2:.4f}")

# Model feature importance

In [None]:
# Save feature importance from RF.
# NOTE(review): save_feature_importance is not defined in this notebook; it is
# presumably provided by the `save_and_compare_results` star import — confirm.
print("Saving RF feature importances...")
save_feature_importance(
    features=X.columns,
    importance_scores=best_rf.feature_importances_,
    method="model",
    model_name="RandomForest"
)

# Rank every feature by importance (highest first) and keep the top 10 for display.
print("Displaying top 10 features...")
feature_importances_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": best_rf.feature_importances_
}).sort_values(by="Importance", ascending=False)

top_features = feature_importances_df.head(10)

print("\nTop 10 Features Impacting YPD Doubling Time:")
print(top_features)

# Horizontal bar chart of the top 10 features.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features["Feature"], top_features["Importance"], color="skyblue")
# The plotted values are random forest feature importances, not model
# coefficients, so the axis is labelled accordingly.
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title("Top 10 Features (RF)")
ax.invert_yaxis()  # barh draws the first row at the bottom; flip so the largest is on top
plt.show()

# SHAP feature importance

In [None]:
print("Calcul of SHAP values...")
# Build an explainer for the tuned forest, passing X as the second argument
# (the masker / background data in the shap API), then explain every row of X.
explainer = shap.Explainer(best_rf, X)
shap_values = explainer(X)
# One global score per feature: mean absolute SHAP value along axis 0
# (presumably the sample axis — confirm against shap_values.values.shape).
shap_mean_importance = np.mean(np.abs(shap_values.values), axis=0)

# Save the SHAP importances
print("Saving SHAP feature importances for Random Forest...")
save_feature_importance(
    features=X.columns,
    importance_scores=shap_mean_importance,
    method="SHAP",
    model_name="RandomForest",
)

# %%
# SHAP plots: a bar summary first, then the default summary plot.
print("Generating SHAP summary plots for RF...")
shap.summary_plot(shap_values, X, plot_type="bar")
shap.summary_plot(shap_values, X)