In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
import shap
import matplotlib.pyplot as plt

# A notebook has no __file__, so paths are anchored on the kernel's working directory.
current_dir = os.getcwd()
# Absolute path of the directory that contains the working directory.
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

from save_and_compare_results import *

ModuleNotFoundError: No module named 'shap'

In [None]:
# Resolve the directory one level above the kernel's working directory.
data_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Load the preprocessed matrices (X first, then Y) from the data/ folder under data_dir.
# NOTE(review): read_pickle executes pickle payloads; only load files from a trusted source.
X, Y = (
    pd.read_pickle(os.path.join(data_dir, relative_path))
    for relative_path in (
        "data/X_matrix_proteins_nb.pkl",
        "data/Y_matrix_proteins_nb.pkl",
    )
)

In [None]:
# Preprocess the data.
# X and Y are overwritten in place, so this cell may be executed more than once
# on the same kernel. errors="ignore" makes the drop a no-op when "Yeast_ID" has
# already been removed, instead of raising KeyError on a re-run.
# Missing feature values are replaced by 0.
X = X.drop(columns=["Yeast_ID"], errors="ignore").fillna(0)
# Drop the identifier once, then fill missing targets with the per-column mean
# of the remaining columns.
Y = Y.drop(columns=["Yeast_ID"], errors="ignore")
Y = Y.fillna(Y.mean())

Prétraitement des données...


In [None]:
# Candidate hyperparameter values explored by the grid search.
param_grid = dict(
    # Regularization strength
    alpha=[0.05, 0.1, 0.2],
    # Mix between L1 (LASSO) and L2 (Ridge)
    l1_ratio=[0.7, 0.9, 1.0],
)

In [None]:
# Base ElasticNet estimator; alpha and l1_ratio are supplied by the grid below.
elastic_net = ElasticNet(random_state=42, max_iter=10000)

# Configure the exhaustive search over param_grid:
# 5-fold cross-validation, scored by R², using all available cores.
# The search is only constructed here; it is fitted in a later cell.
print("Running hyperparameter tuning...")
grid_search = GridSearchCV(
    elastic_net,
    param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Run the search on the full dataset; the target frame is flattened to a 1-D array.
grid_search.fit(X, np.ravel(Y.values))

# Keep the selected parameter combination and the corresponding estimator.
best_params = grid_search.best_params_
best_elastic_net = grid_search.best_estimator_
print("\nBest hyperparameters:", best_params)

In [None]:
# Evaluate the best model.
# NOTE: X / Y are the same data the grid search was fitted on, so r2 and mse
# below are in-sample (training) metrics and will be optimistic. The mean
# cross-validated R² of the selected configuration (grid_search.best_score_,
# scored with "r2") is printed alongside as the held-out estimate.
y_pred = best_elastic_net.predict(X)
r2 = r2_score(Y, y_pred)
mse = mean_squared_error(Y, y_pred)
print(f"\nR² Score: {r2:.4f}, Mean Squared Error: {mse:.4f}")
print(
    "Note: the scores above are computed on the training data; "
    f"mean cross-validated R²: {grid_search.best_score_:.4f}"
)

# Model feature importance

In [None]:
# Save feature importance from ElasticNet (the signed coefficients, unchanged)
print("Saving ElasticNet feature importances...")
save_feature_importance(
    features=X.columns,
    importance_scores=best_elastic_net.coef_,
    method="model",
    model_name="ElasticNet"
)

# Display and plot the top 10 features.
# Rank by coefficient magnitude: sorting the signed values in descending order
# would only ever surface the largest positive coefficients and hide features
# with a strong negative effect. The "Importance" column keeps the sign so the
# direction of each effect is still visible in the table and the plot.
print("Displaying top 10 features...")
coefficients = best_elastic_net.coef_
# Negate so argsort (ascending) yields largest magnitude first; a stable sort
# keeps the original column order among ties.
magnitude_order = np.argsort(-np.abs(coefficients), kind="stable")
feature_importances_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": coefficients
}).iloc[magnitude_order]

top_features = feature_importances_df.head(10)

print("\nTop 10 Features Impacting YPD Doubling Time:")
print(top_features)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features["Feature"], top_features["Importance"], color="skyblue")
ax.set_xlabel("Coefficient Value")
ax.set_ylabel("Feature")
ax.set_title("Top 10 Features (ElasticNet)")
# Put the highest-ranked feature at the top of the chart.
ax.invert_yaxis()
plt.show()

# SHAP feature importance

In [None]:
# Build a SHAP explainer around the tuned ElasticNet, using X as the background data.
explainer = shap.Explainer(best_elastic_net, X)

# Explain every row of X.
print("Calculating SHAP values for ElasticNet...")
shap_values = explainer(X)

# Collapse to one score per feature: the mean absolute SHAP value across rows.
shap_mean_importance = np.mean(np.abs(shap_values.values), axis=0)

# Persist the SHAP-based scores through the shared helper.
print("Saving SHAP feature importances for ElasticNet...")
save_feature_importance(
    model_name="ElasticNet",
    method="SHAP",
    features=X.columns,
    importance_scores=shap_mean_importance,
)

# Two summary views: a bar chart first, then the default summary plot.
print("Generating SHAP summary plots for ElasticNet...")
shap.summary_plot(shap_values, X, plot_type="bar")
shap.summary_plot(shap_values, X)