In [3]:
import pandas as pd
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

idées de coco:
- on pourrait supprimer les yeasts qui ont exactement les mêmes mutations (mais je ne sais pas si ça existe)
- si on supprime les features corrélées, c'est bien pour prédire "y", mais pour comprendre à quoi "y" est dû, ce n'est pas idéal, non ? Ex : si deux mutations sont corrélées (très liées), l'explication correcte serait « on arrive à ce phénotype y si la yeast porte les deux mutations » ; mais si on en supprime une, on obtiendra seulement « on arrive à ce phénotype y si la yeast exprime la mutation qu'on a conservée ».

In [None]:
# Build a feature matrix from copy-number variations (CNV) and protein
# sequences, attach the YPD doubling-time target, and split train/test.

# Load the three raw datasets (paths are relative to the working directory).
cnv_data = pd.read_csv("copy_number_variation_dataset.csv")
proteome_data = pd.read_csv("proteome_dataset.csv")
phenotype_data = pd.read_csv("phenotype_dataset.csv")

# Join CNV and proteome on their explicit key columns. Merging on named
# columns avoids depending on what `reset_index()` happens to call the
# joined index.
merged_data = cnv_data.merge(
    proteome_data,
    left_on="Standard_name",
    right_on="Protein_ID",
    how="inner",
)

# Attach the regression target (YPD_doublingtime) from the phenotype table.
merged_data = merged_data.merge(
    phenotype_data[["Standard_name", "YPD_doublingtime"]],
    on="Standard_name",
    how="inner",
)

# The join keys are identifiers, not features.
merged_data = merged_data.drop(columns=["Standard_name", "Protein_ID"]).reset_index(drop=True)

if merged_data.empty:
    raise ValueError(
        "Inner joins produced no rows: check that Standard_name / Protein_ID "
        "values match across the three datasets."
    )

# Preprocessing
# NOTE(review): the scaler, TF-IDF vocabulary and PCA below are fitted on all
# rows before the train/test split, so test-set statistics leak into the
# features. Consider fitting them on the training split only.

# 1. CNV - standardisation.
# Select CNV features by name rather than by position, so the selection does
# not depend on column order.
# Assumes every remaining column is a numeric CNV feature - TODO confirm.
cnv_features = merged_data.drop(columns=["Sequence", "YPD_doublingtime"])
scaler = StandardScaler()
cnv_normalized = scaler.fit_transform(cnv_features)

# 2. Proteome - character tri-gram TF-IDF to encode local sequence motifs.
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3, 3))
proteome_encoded = vectorizer.fit_transform(merged_data["Sequence"])

# Dimensionality reduction with PCA to keep the TF-IDF block small.
# PCA needs a dense array here, hence `.toarray()`.
pca = PCA(n_components=50)
proteome_reduced = pca.fit_transform(proteome_encoded.toarray())

# Combine both blocks. Each block gets distinct column labels: two frames with
# default integer labels overlap and cannot be joined without a suffix.
cnv_frame = pd.DataFrame(cnv_normalized, columns=cnv_features.columns)
proteome_frame = pd.DataFrame(
    proteome_reduced,
    columns=[f"proteome_pc_{i}" for i in range(proteome_reduced.shape[1])],
)
final_data = pd.concat([cnv_frame, proteome_frame], axis=1)

# Add the target; `.to_numpy()` assigns by position, independent of index labels.
final_data["YPD_doublingtime"] = merged_data["YPD_doublingtime"].to_numpy()

# Train/test split.
X = final_data.drop(columns=["YPD_doublingtime"])
y = final_data["YPD_doublingtime"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


NameError: name 'pd' is not defined

In [None]:
# Preprocessing of the mutation matrix:
# low-variance filter -> truncated SVD -> z-score -> NaN fill -> train/test split.
from scipy.sparse import csr_matrix
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import TruncatedSVD
import dask.dataframe as dd

X = pd.read_csv("data/X_matrix.csv")
ordered_phenotype = pd.read_csv("data/ordered_phenotype_dataset.csv")
y = ordered_phenotype["YPD_doublingtime"]

# Features and target are paired by row position, so they must have the same length.
# NOTE(review): this only checks the length; it presumes the phenotype file is
# in the same row order as X_matrix.csv - confirm.
if len(X) != len(y):
    raise ValueError(
        f"X has {len(X)} rows but y has {len(y)}: the mutation matrix and the "
        "phenotype table are not aligned."
    )

# 1. Convert the mutation matrix to a sparse representation.
# The identifier column is dropped by name, consistent with how it is read
# back further down.
print("Converting to sparse matrix...")
sparse_mutation_matrix = csr_matrix(X.drop(columns=["Yeast_ID"]).values)

# 2. Dimensionality reduction
# a. Low-variance filter
print("Applying low-variance filter...")
variance_threshold = 0.01
selector = VarianceThreshold(threshold=variance_threshold)
reduced_sparse_matrix = selector.fit_transform(sparse_mutation_matrix)
selected_features = selector.get_support(indices=True)

# b. Truncated SVD (works directly on the sparse matrix)
print("Applying PCA using Truncated SVD...")
n_components = 9  # Adjust based on the desired dimensionality
svd = TruncatedSVD(n_components=n_components, random_state=42)
reduced_matrix_pca = svd.fit_transform(reduced_sparse_matrix)

# 3. Convert the reduced matrix back to a DataFrame, keeping the strain IDs.
print("Converting reduced matrix back to DataFrame...")
reduced_df = pd.DataFrame(
    reduced_matrix_pca,
    columns=[f'PC_{i}' for i in range(reduced_matrix_pca.shape[1])]
)
reduced_df.insert(0, 'Yeast_ID', X['Yeast_ID'])

# 4. Z-score the component columns (everything except the leading ID column).
# NOTE(review): the SVD and this normalisation are fitted on all rows before
# the split below, so test-set statistics leak into the features.
print("Normalizing data...")
reduced_df.iloc[:, 1:] = (reduced_df.iloc[:, 1:] - reduced_df.iloc[:, 1:].mean()) / reduced_df.iloc[:, 1:].std()

# 5. Save intermediate results.
print("Saving to file...")
reduced_df.to_parquet('reduced_mutation_matrix.parquet')

# 6. Reload through Dask for further processing (if needed).
print("Processing in batches using Dask...")
dask_df = dd.read_parquet('reduced_mutation_matrix.parquet')

# Handle missing data: fill NaN with 0 (a zero-variance component yields NaN
# in the z-score above).
processed_dask_df = dask_df.fillna(0)  # Adjust fill value as needed

# Compute and save the final dataset.
final_df = processed_dask_df.compute()
final_df.to_parquet('final_mutation_matrix.parquet')

print("Preprocessing complete.")

# 'Yeast_ID' is a string identifier, not a feature: it must be excluded from
# the feature matrix, otherwise the regressor cannot convert it to float.
feature_columns = [col for col in final_df.columns if col != 'Yeast_ID']
X_train, X_test, y_train, y_test = train_test_split(
    final_df[feature_columns], y, test_size=0.2, random_state=42
)


Converting to sparse matrix...
Applying low-variance filter...
Applying PCA using Truncated SVD...
Converting reduced matrix back to DataFrame...
Normalizing data...
Saving to file...
Processing in batches using Dask...
Dask DataFrame Structure:
              Yeast_ID     PC_0     PC_1     PC_2
npartitions=1                                    
                string  float64  float64  float64
                   ...      ...      ...      ...
Dask Name: read_parquet, 1 expression
Expr=ReadParquetFSSpec(a1a3beb)
Preprocessing complete.


In [2]:
# Model: random forest regressor fitted on the current training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluation: mean squared error on the held-out test split.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Feature importances, highest first; only the ten strongest are printed.
# The printed number is the positional index of the feature in X_train.
feature_importances = model.feature_importances_
sorted_indices = feature_importances.argsort()[::-1]
top_indices = sorted_indices[:10]
print("Top 10 Feature Importances:")
for position, weight in zip(top_indices, feature_importances[top_indices]):
    print(f"Feature {position}: {weight}")

NameError: name 'RandomForestRegressor' is not defined