In [None]:
import json
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# A notebook has no __file__, so the working directory is used as the anchor instead.
current_dir = os.getcwd()
# Absolute path of the directory one level above the working directory.
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))

# Put the parent directory first on the module search path.
# NOTE(review): presumably this is where preprocessing.py and
# save_and_compare_results.py live — confirm.
sys.path.insert(0, parent_dir)

# Star imports: the names they bring in (e.g. preprocessed_data, used further down)
# are not visible from this notebook alone.
from preprocessing import *
from save_and_compare_results import *



In [None]:
# Resolve the directory one level above the working directory; the CSVs sit in its data/ folder.
data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Build both input paths in one pass.
X_file, Y_file = (
    os.path.join(data_dir, relative_path)
    for relative_path in ("data/X_matrix.csv", "data/Y_matrix.csv")
)

# Load the two matrices.
print("Chargement des données...")
X, Y = pd.read_csv(X_file), pd.read_csv(Y_file)

In [None]:
# Remove the identifier column so only model inputs / targets remain.
# errors="ignore" makes the cell safe to re-run: X and Y are reassigned here, so on a
# second execution "Yeast_ID" is already gone and a plain drop would raise KeyError.
X = X.drop(columns=["Yeast_ID"], errors="ignore").fillna(0)  # missing feature values -> 0
Y = Y.drop(columns=["Yeast_ID"], errors="ignore")
# Missing target values -> per-column mean. The mean is computed over all rows,
# i.e. before the train/test split below.
Y = Y.fillna(Y.mean())

# preprocessed_data is not defined in this notebook.
# NOTE(review): presumably it comes from the star import of `preprocessing` — confirm
# what transformation it applies.
X_pre, Y_pre = preprocessed_data(X, Y)

In [None]:
# Split the preprocessed data into training and test sets.
# train_test_split is imported explicitly from sklearn.model_selection in the import
# cell, so this cell no longer depends on a star import happening to export it.
print("Division des données en ensembles d'entraînement et de test...")
X_train, X_test, y_train, y_test = train_test_split(
    X_pre,
    Y_pre,
    test_size=0.2,    # hold out 20 % of the rows for testing
    random_state=42,  # fixed seed so the split is reproducible
)

# Report the shapes of the resulting splits.
print(f"Dimensions de X_train : {X_train.shape}, X_test : {X_test.shape}")
print(f"Dimensions de y_train : {y_train.shape}, y_test : {y_test.shape}")

In [None]:
# Candidate regularisation strengths: 50 values log-spaced between 1e-4 and 1e1.
alpha_grid = np.logspace(-4, 1, 50)

# LassoCV picks the alpha from the grid using 5-fold cross-validation.
lasso_cv = LassoCV(alphas=alpha_grid, cv=5, random_state=42)

In [None]:
# Fit LassoCV to the training data.
# LassoCV expects a 1-D target. A single-column 2-D target is accepted but triggers a
# DataConversionWarning, so that case is flattened explicitly. Any other shape is passed
# through untouched so scikit-learn's own validation (and error message) still applies.
# NOTE(review): assumes y_train has exactly one target column — confirm.
y_train_1d = np.asarray(y_train)
if y_train_1d.ndim == 2 and y_train_1d.shape[1] == 1:
    y_train_1d = y_train_1d.ravel()
lasso_cv.fit(X_train, y_train_1d)

In [None]:
# Regularisation strength selected by the cross-validated fit above.
# Stored in its own variable because the metadata cell at the end of the notebook reuses it.
best_alpha = lasso_cv.alpha_
print(f"Optimal Alpha: {best_alpha}")

In [None]:
# Predict on the held-out test split (X_test comes from train_test_split above).
# These predictions feed the saved CSVs and the MSE / R² metrics in the following cells.
y_pred = lasso_cv.predict(X_test)

In [None]:
# Save the test-set predictions and the matching true values.
# to_csv does not create missing directories, so make sure results/ exists first.
os.makedirs('results', exist_ok=True)

# Both files share the test-split index and the target column names, so rows line up.
pd.DataFrame(y_pred, index=y_test.index, columns=Y_pre.columns).to_csv('results/y_test_predicted_LASSO.csv')
pd.DataFrame(y_test, index=y_test.index, columns=Y_pre.columns).to_csv('results/y_test_true_LASSO.csv')

In [None]:
# Compute the evaluation metrics on the test split: mean squared error and R².
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both with six decimal places.
print(f"Mean Squared Error (MSE): {mse:.6f}")
print(f"R² Score: {r2:.6f}")

In [None]:
# Save the fitted Lasso coefficients as a one-row table, one column per feature.
coefficients_df = pd.DataFrame(
    lasso_cv.coef_.reshape(1, -1),  # force a single row of coefficients
    columns=X_train.columns  # feature names taken from the training frame
)

# to_csv does not create missing directories, so make sure results/ exists first.
os.makedirs('results', exist_ok=True)
coefficients_df.to_csv('results/LASSO_feature_importances.csv', index=False)

In [None]:
# Visualise the results: rank features by absolute coefficient and plot the ten largest.
print("Affichage des résultats...")
mean_abs_coefficients = coefficients_df.abs().mean(axis=0)
top_features = mean_abs_coefficients.sort_values(ascending=False).head(10)
print(top_features)

# Horizontal bar chart drawn through the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features.index, top_features.values, color="skyblue")
ax.set_xlabel("Importance moyenne des coefficients absolus")
ax.set_ylabel("Mutation / Variation")
ax.set_title("Top 10 Mutations et CNVs Impactant le YPD Doubling Time")
ax.invert_yaxis()  # put the largest coefficient at the top of the chart
plt.show()

In [None]:
# Persist the run metadata (selected alpha, test metrics, split sizes) as JSON.
# float() turns NumPy scalars into plain Python floats, which json can always serialise
# regardless of the scalar's precision.
metadata = {
    'best_alpha': float(best_alpha),
    'mse': float(mse),
    'r2_score': float(r2),
    'training_samples': len(X_train),
    'testing_samples': len(X_test)
}

# open() does not create missing directories, so make sure results/ exists first.
os.makedirs('results', exist_ok=True)
# Plain write mode is enough: the file is never read back here.
with open('results/LASSO_model_metadata.json', 'w') as f:
    json.dump(metadata, f)
print("Les métadonnées du modèle ont été sauvegardées.")