In [None]:
import os
import re
from collections import Counter

import dask.dataframe as dd
import numpy as np
import pandas as pd
import polars as pl
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# File paths (all relative to the notebook's working directory)
proteome_dir_path = "data/Proteome_1011"  # directory scanned for *.fasta files
phenotype_data_path = "data/Finalset_223phenotypes_1011.csv"  # phenotype table, read as CSV
copy_number_variation_path = "data/genesMatrix_CopyNumber.tab"  # CNV table, read as tab-separated text

In [None]:
# Mutation extraction into a Polars DataFrame
def extract_mutations(proteome_dir_path):
    """Build a long-format table of per-position amino acids for every strain.

    Every ``*.fasta`` file in ``proteome_dir_path`` is treated as one protein:
    the protein ID is the file name up to its first ``.``, and each record
    header is expected to look like ``>{strain}_{protein_id} ...`` (the strain
    ID is whatever precedes ``_{protein_id}`` in the first header token).

    For each position (1-based) the most frequent residue across strains is
    used as the reference; a strain carrying the reference residue gets ``"/"``
    as ``Mutated_AA``, otherwise its own residue.

    Args:
        proteome_dir_path: Directory containing the FASTA files.

    Returns:
        pl.DataFrame with columns ``Protein_ID``, ``Yeast_ID``, ``Position``,
        ``Reference_AA`` and ``Mutated_AA`` -- one row per
        (protein, strain, position).

    Note:
        Positions are compared column-wise with ``zip``, so sequences are
        assumed to be aligned; positions beyond the shortest sequence of a
        file are silently ignored.
    """
    # Columnar accumulation: far lighter than one dict per (strain, position).
    columns = {
        "Protein_ID": [],
        "Yeast_ID": [],
        "Position": [],
        "Reference_AA": [],
        "Mutated_AA": [],
    }

    # os.listdir order is arbitrary; sorting makes the row order reproducible.
    for filename in sorted(os.listdir(proteome_dir_path)):
        if not filename.endswith(".fasta"):
            continue
        filepath = os.path.join(proteome_dir_path, filename)
        protein_id = filename.split(".")[0]  # protein name

        # strain ID -> list of sequence chunks (joined once at the end)
        sequence_chunks = {}
        current_yeast = None
        with open(filepath, "r") as file:
            for line in file:
                if line.startswith(">"):  # strain identifier line
                    header = line.strip().split()[0][1:]  # drop the ">"
                    current_yeast = header.split(f"_{protein_id}", 1)[0]
                    sequence_chunks[current_yeast] = []
                elif current_yeast is not None:
                    # Lines before the first header have no owner; skipping
                    # them avoids a KeyError on sequences[None].
                    sequence_chunks[current_yeast].append(line.strip())

        yeast_ids = list(sequence_chunks)
        strain_sequences = ["".join(chunks) for chunks in sequence_chunks.values()]
        n_strains = len(yeast_ids)

        # zip(*...) yields one tuple of residues per position across strains.
        for idx, amino_acids in enumerate(zip(*strain_sequences), start=1):
            most_common_aa = Counter(amino_acids).most_common(1)[0][0]
            columns["Protein_ID"].extend([protein_id] * n_strains)
            columns["Yeast_ID"].extend(yeast_ids)
            columns["Position"].extend([idx] * n_strains)
            columns["Reference_AA"].extend([most_common_aa] * n_strains)
            columns["Mutated_AA"].extend(
                aa if aa != most_common_aa else "/" for aa in amino_acids
            )

    return pl.DataFrame(columns)

print("Extraction des mutations...")
mutations_df = extract_mutations(proteome_dir_path)


Extraction des mutations...


In [None]:
# Load the phenotype and CNV tables with Dask
print("Chargement des données phénotypiques et CNV...")
phenotype_df = dd.read_csv(phenotype_data_path).compute()
# Dask DataFrames do not support transposition, so materialise to pandas
# first and transpose the in-memory frame.
cnv_df = dd.read_csv(copy_number_variation_path, sep="\t", dtype=str).compute().T

# CNV preprocessing: derive a Protein_ID from each original column label
_PROTEIN_ID_PATTERN = re.compile(r".+\.(Y[A-Z0-9]+)")


def _protein_id_from_label(label):
    """Return the ``Y...`` identifier following the last matching dot, or None."""
    match = _PROTEIN_ID_PATTERN.match(label)
    return match.group(1) if match else None


# After the transpose the index holds the original column labels.
cnv_df.index.name = "Standard_name"
cnv_df = cnv_df.reset_index()
cnv_df["Protein_ID"] = cnv_df["Standard_name"].apply(_protein_id_from_label)
# Missing values (including labels with no recognisable ID) become 0.
cnv_df = cnv_df.drop(columns=["Standard_name"]).fillna(0)

In [None]:
# Build the wide strain x mutation matrix
def create_mutation_matrix(mutations_df):
    """Pivot the long mutation table into a strain-by-mutation count matrix.

    A label ``{Protein_ID}_{Position}_{Reference_AA}->{Mutated_AA}`` is built
    for every row, then rows are counted per (``Yeast_ID``, label).

    Args:
        mutations_df: pl.DataFrame with columns ``Protein_ID``, ``Yeast_ID``,
            ``Position``, ``Reference_AA`` and ``Mutated_AA``.

    Returns:
        pandas.DataFrame indexed by ``Yeast_ID`` with one column per
        ``Mutation_Label``; each cell is the number of matching rows
        (0 when the strain has none).
    """
    mutation_label = (
        mutations_df["Protein_ID"] + "_" + mutations_df["Position"].cast(str) +
        "_" + mutations_df["Reference_AA"] + "->" + mutations_df["Mutated_AA"]
    ).alias("Mutation_Label")

    # `with_column` (singular) is deprecated/removed in polars; `with_columns`
    # is the supported spelling and is what the rest of the notebook uses.
    labelled_df = mutations_df.with_columns([mutation_label])

    mutation_matrix = labelled_df.to_pandas().pivot_table(
        index="Yeast_ID",
        columns="Mutation_Label",
        aggfunc="size",
        fill_value=0
    )
    return mutation_matrix

print("Création de la matrice de mutations...")
mutation_matrix = create_mutation_matrix(mutations_df)


In [None]:
# Dimensionality reduction with truncated SVD
def reduce_dimensionality(matrix, n_components=50, variance_threshold=0.01):
    """Drop near-constant columns, then project onto truncated-SVD components.

    Args:
        matrix: pandas.DataFrame of numeric features (rows = samples).
        n_components: Maximum number of SVD components to keep. It is capped
            at the number of columns that survive the variance filter, since
            TruncatedSVD cannot produce more components than input features.
        variance_threshold: Columns whose variance is not above this value
            are removed before the SVD.

    Returns:
        pandas.DataFrame with columns ``PC_0 .. PC_{k-1}`` and a default
        integer index; row order matches ``matrix``.
    """
    sparse_matrix = csr_matrix(matrix.values)
    filtered_matrix = VarianceThreshold(threshold=variance_threshold).fit_transform(
        sparse_matrix
    )

    # Guard against small inputs where fewer than n_components features remain.
    effective_components = min(n_components, filtered_matrix.shape[1])
    svd = TruncatedSVD(n_components=effective_components, random_state=42)
    reduced_matrix = svd.fit_transform(filtered_matrix)

    component_names = [f"PC_{i}" for i in range(reduced_matrix.shape[1])]
    return pd.DataFrame(reduced_matrix, columns=component_names)

print("Réduction de dimension...")
reduced_mutation_matrix = reduce_dimensionality(mutation_matrix)


In [None]:
# Prepare the modelling table
# pl.from_pandas(..., nan_to_null=True) guarantees pandas NaN become polars
# nulls, which the fill_null imputation below relies on.
phenotype_df = pl.from_pandas(phenotype_df, nan_to_null=True).rename(
    {"Standard_name": "Yeast_ID"}
)
# Missing YPD doubling times are replaced by the column mean.
phenotype_df = phenotype_df.with_columns(
    [phenotype_df["YPD_doublingtime"].fill_null(phenotype_df["YPD_doublingtime"].mean())]
)
phenotype_df = phenotype_df.sort("Yeast_ID")

merged_df = pl.from_pandas(reduced_mutation_matrix)
# The reduced matrix keeps the row order of mutation_matrix, so its index
# supplies the strain IDs. polars needs a pl.Series here (not a pandas one),
# and `with_column` (singular) is no longer available.
merged_df = merged_df.with_columns(
    [pl.Series("Yeast_ID", mutation_matrix.index.to_list())]
)
# Inner join: only strains present in both tables are kept.
merged_df = merged_df.join(phenotype_df, on="Yeast_ID")


In [None]:
# Model training
# merged_df also carries Yeast_ID (a string) and every other phenotype column
# from the join; only the SVD components are valid predictors.
feature_columns = list(reduced_mutation_matrix.columns)
X = merged_df.select(feature_columns).to_numpy()
y = merged_df["YPD_doublingtime"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Entraînement du modèle...")
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Evaluation on the held-out 20 %
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Feature importance. The model was trained on SVD components, so the
# importances map to those components -- not to individual mutation labels.
feature_importances = model.feature_importances_
important_features = sorted(
    zip(feature_columns, feature_importances), key=lambda x: x[1], reverse=True
)
print("Top 10 composantes SVD affectant le YPD doubling time :")
for feature, importance in important_features[:10]:
    print(f"{feature}: {importance}")