In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import uniform, randint
import matplotlib.pyplot as plt
from preprocessing import *
from save_and_compare_results import *
from sklearn.linear_model import LassoCV
import shap

In [None]:
# Read the pre-built feature table (X) and target table (Y) from disk.
# NOTE(review): unpickling can execute arbitrary code, so only load these
# files when they come from a trusted source.
features_path = "X_matrix_proteins.pkl"
targets_path = "Y_matrix_proteins.pkl"
X = pd.read_pickle(features_path)
Y = pd.read_pickle(targets_path)

Chargement des données...


In [None]:
# Preprocess the data.
# Remove the "Yeast_ID" column from both tables. errors="ignore" makes the cell
# idempotent: re-running it after the column is already gone no longer raises
# a KeyError.
# Missing feature values are replaced by 0.
X = X.drop(columns=["Yeast_ID"], errors="ignore").fillna(0)

# Missing target values are replaced by the mean of their own column.
Y = Y.drop(columns=["Yeast_ID"], errors="ignore")
Y = Y.fillna(Y.mean())

In [None]:
# Define the hyperparameter search space sampled by RandomizedSearchCV.
# Lists are sampled uniformly; scipy distributions are sampled through .rvs().
# Reminder: scipy's uniform(loc, scale) spans [loc, loc + scale] and
# randint(low, high) spans the integers low .. high - 1.
GBM_distributions = dict(
    # "auto" is no longer accepted by recent scikit-learn releases for
    # GradientBoostingRegressor; None is the equivalent setting (use all features).
    max_features=[None, "log2", "sqrt"],
    learning_rate=uniform(1e-3, 1),        # [0.001, 1.001]
    # NOTE(review): draws very close to 0 give almost empty subsamples;
    # consider a lower bound such as uniform(0.1, 0.9).
    subsample=uniform(0, 1),               # [0, 1]
    min_samples_split=randint(2, 100),     # 2 .. 99
    min_samples_leaf=randint(2, 100),      # 2 .. 99
    n_estimators=randint(10, 200),         # 10 .. 199
    criterion=['friedman_mse', 'squared_error'],
    max_depth=randint(2, 10)               # 2 .. 9
)

Définition des paramètres pour la recherche aléatoire...


In [None]:
# Search settings.
n_iterations = 100   # number of sampled hyperparameter candidates per target
cross_val = 3        # number of cross-validation folds
num_jobs = -1        # use every available CPU core for the search
random_seed = 42     # fixed seed so a fresh Restart & Run All gives the same result

# Randomized hyperparameter search around a gradient-boosting regressor.
# The seed is passed both to the search (candidate sampling) and to the
# booster (row subsampling and the internal early-stopping validation split
# enabled by n_iter_no_change).
search = RandomizedSearchCV(
    GradientBoostingRegressor(
        loss="squared_error",
        n_iter_no_change=5,
        random_state=random_seed,
    ),
    GBM_distributions,
    n_iter=n_iterations,
    verbose=10,
    cv=cross_val,
    n_jobs=num_jobs,
    random_state=random_seed,
)

# MultiOutputRegressor clones the search and fits one independent copy per
# column of Y; the fitted copies are available as multireg.estimators_.
multireg = MultiOutputRegressor(search).fit(X, Y)

Lancement de l'entraînement multitâche avec Gradient Boosting...
Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV 1/3; 1/100] START criterion=friedman_mse, learning_rate=0.5159515873187837, max_depth=8, max_features=sqrt, min_samples_leaf=43, min_samples_split=44, n_estimators=188, subsample=0.9753324755102544
[CV 3/3; 1/100] START criterion=friedman_mse, learning_rate=0.5159515873187837, max_depth=8, max_features=sqrt, min_samples_leaf=43, min_samples_split=44, n_estimators=188, subsample=0.9753324755102544
[CV 3/3; 2/100] START criterion=squared_error, learning_rate=0.8113053828369198, max_depth=7, max_features=sqrt, min_samples_leaf=49, min_samples_split=12, n_estimators=131, subsample=0.4838369945829888
[CV 2/3; 1/100] START criterion=friedman_mse, learning_rate=0.5159515873187837, max_depth=8, max_features=sqrt, min_samples_leaf=43, min_samples_split=44, n_estimators=188, subsample=0.9753324755102544
[CV 2/3; 3/100] START criterion=friedman_mse, learning_rate=0.6

In [None]:
# Get the best model and parameters.
# MultiOutputRegressor itself has no best_estimator_ / best_params_: those
# attributes belong to each per-target RandomizedSearchCV stored in
# multireg.estimators_ (same order as the columns of Y).
# Each fitted search predicts with its own refitted best estimator, so the
# wrapper as a whole is the "best model".
best_model = multireg
best_params = {
    target: search.best_params_
    for target, search in zip(Y.columns, multireg.estimators_)
}
print("\nBest hyperparameters:", best_params)

In [None]:
# Evaluate the best model.
# Predict through the fitted multi-output wrapper: each per-target search
# delegates predict() to its refitted best estimator.
# NOTE: these two metrics are computed on the same data the models were
# trained on, so they are optimistic in-sample figures.
y_pred = multireg.predict(X)
r2 = r2_score(Y, y_pred)
mse = mean_squared_error(Y, y_pred)
print(f"\nR² Score: {r2:.4f}, Mean Squared Error: {mse:.4f}")

# Held-out estimate: best_score_ is the mean cross-validated score of the
# selected candidate for each target (R² is the default scorer for regressors).
cv_r2 = np.mean([search.best_score_ for search in multireg.estimators_])
print(f"Mean cross-validated R² of the selected models: {cv_r2:.4f}")

# Model feature importances

In [None]:
# Save feature importance.
# A MultiOutputRegressor has no feature_importances_ of its own, so collect
# the importances of the best booster found for each target and average them
# into one score per feature.
print("Saving GBM feature importances...")
gbm_importances = np.mean(
    [search.best_estimator_.feature_importances_ for search in multireg.estimators_],
    axis=0,
)
save_feature_importance(
    features=X.columns,
    importance_scores=gbm_importances,
    method="model",
    model_name="GBM"
)

In [None]:
# Show and plot the 10 most important features.
print("Affichage des 10 caractéristiques les plus importantes...")

# The original referenced an undefined `best_lgbm`. Use the fitted GBM models
# instead: average the importances of the best booster of every target.
mean_importances = np.mean(
    [search.best_estimator_.feature_importances_ for search in multireg.estimators_],
    axis=0,
)
feature_importances_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": mean_importances
}).sort_values(by="Importance", ascending=False)

# Keep the 10 highest-ranked features.
top_mutations = feature_importances_df.head(10)

# Print them as text.
print("\nMutations ayant le plus d'impact sur le YPD doubling time :")
print(top_mutations)

# Horizontal bar chart, most important feature at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_mutations["Feature"], top_mutations["Importance"], color="skyblue")
ax.set_xlabel("Importance")
ax.set_ylabel("Mutation")
ax.set_title("Top 10 Mutations Impacting YPD Doubling Time")
ax.invert_yaxis()
plt.show()

# SHAP importances

In [None]:
# SHAP analysis.
# shap cannot explain the MultiOutputRegressor wrapper directly, so explain the
# best booster of each target separately and aggregate with running sums
# (avoids holding a targets x samples x features array in memory).
print("Calculating SHAP values...")
shap_sum = np.zeros(X.shape)          # sum over targets of signed SHAP values
shap_abs_sum = np.zeros(X.shape[1])   # sum over targets of mean |SHAP| per feature
for search in multireg.estimators_:
    explainer = shap.Explainer(search.best_estimator_, X)
    target_shap = explainer(X).values
    shap_sum += target_shap
    shap_abs_sum += np.abs(target_shap).mean(axis=0)

n_targets = len(multireg.estimators_)
# (n_samples, n_features) array: SHAP values averaged over targets. It is a
# plain array, which shap.summary_plot accepts together with X.
shap_values = shap_sum / n_targets
# One score per feature: mean |SHAP| per target, averaged over targets, so
# opposite-signed effects on different targets do not cancel out.
shap_mean_importance = shap_abs_sum / n_targets

# Save SHAP feature importance
print("Saving SHAP feature importances...")
save_feature_importance(
    features=X.columns,
    importance_scores=shap_mean_importance,
    method="SHAP",
    model_name="GBM"
)

In [None]:
# Draw the two SHAP summary figures: first the bar chart, then the default
# summary plot (plot_type=None is summary_plot's default).
print("Generating SHAP plots...")
for plot_kind in ("bar", None):
    shap.summary_plot(shap_values, X, plot_type=plot_kind)