In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Load the two input tables from CSV.
# Mutation table (one of its columns is Yeast_ID, see the cells below).
mutations_file = "data/extend_mutations_dataset.csv"
mutations_df = pd.read_csv(mutations_file)

# Phenotype table holding the YPD doubling times.
phenotype_file = "data/filtered_phenotype_dataset.csv"
phenotype_df = pd.read_csv(phenotype_file)

In [None]:
# Make sure both tables list the same Yeast_ID values in the same row order,
# because features and targets are paired purely by position further down.
# The raw values are compared (not the Series themselves) so that tables with a
# different number of rows or a different index hit the explicit error below
# instead of pandas' "Can only compare identically-labeled Series" error.
if not np.array_equal(
    mutations_df["Yeast_ID"].to_numpy(), phenotype_df["Yeast_ID"].to_numpy()
):
    raise ValueError("L'ordre des Yeast_ID ne correspond pas entre les deux fichiers.")

In [None]:
# Feature matrix: every column of the mutation table except the identifier.
X = mutations_df.drop(columns=["Yeast_ID"])

# Target vector: doubling times, with missing values replaced by the column mean.
# fillna() is used without inplace=True so a new Series is produced; an in-place
# fill on a column pulled out of phenotype_df may or may not write back into the
# frame depending on the pandas version / Copy-on-Write mode, which makes the
# cell's side effects unpredictable on re-run.
# NOTE(review): the mean is taken over all rows before the train/test split, so
# held-out targets influence the imputed values — consider dropping rows with a
# missing target instead.
doubling_time_raw = phenotype_df["YPD_doublingtime"]
y = doubling_time_raw.fillna(doubling_time_raw.mean())

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:


# Fit a 100-tree random forest regressor on the training split (fit() returns the
# estimator itself, so construction and fitting are chained).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out targets and score the predictions.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

# Pair each feature column with its fitted importance, then rank them from the
# most to the least important.
feature_importances = pd.DataFrame(
    {"Feature": X.columns, "Importance": model.feature_importances_}
)
feature_importances = feature_importances.sort_values(by="Importance", ascending=False)


In [None]:
# Keep the ten highest-ranked features and print them as a table.
top_mutations = feature_importances.head(10)
print("\nMutations ayant le plus d'impact sur le YPD doubling time :")
print(top_mutations)

# Horizontal bar chart of the same ten features. The y-axis is inverted so the
# first (highest-importance) row is drawn at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_mutations["Feature"], top_mutations["Importance"], color="skyblue")
ax.set_xlabel("Importance")
ax.set_ylabel("Mutation")
ax.set_title("Top 10 Mutations Impacting YPD Doubling Time")
ax.invert_yaxis()
plt.show()