In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def create_mutation_matrix(df):
    # Create mutation labels
    df['Mutation_Label'] = df['Protein_ID'] + '_' + df['Position'].astype(str) + '_' + df['Reference_AA'] + '->' + df['Mutated_AA']
    
    # Pivot the data to create a matrix
    mutation_matrix = df.pivot_table(index='Yeast_ID', 
                                     columns='Mutation_Label', 
                                     aggfunc=lambda x: 1, 
                                     fill_value=0)
    
    
    # Reset index to include Yeast_ID as a column
    mutation_matrix.reset_index(inplace=True)
    
    return mutation_matrix


dtype_dict = {
    'Yeast_ID': str,          # Yeast_ID might be a string
    'Protein_ID': str,        # Protein_ID might be a string
    'Position': str,          # Position might be an int
    'Reference_AA': str,      # Reference_AA might be a string
    'Mutated_AA': str,        # Mutated_AA might be a string
    'CNV': float,             # CNV might be a float (can also be NaN)
}

mutations_data = pd.read_csv("data/merged_mutations_dataset.csv", dtype=dtype_dict)
mutation_matrix = create_mutation_matrix(mutations_data)


In [None]:
extend_mutations_output_path = "data/extend_mutations_dataset.csv"
mutation_matrix.to_csv(extend_mutations_output_path, index=False)
print(f"Extend mutations dataset saved to {extend_mutations_output_path}")

In [1]:
cnv_data = pd.read_csv("data/copy_number_variation_dataset.csv")

# Step 1: Reset index of 'cnv_data' so that 'Standard_name' becomes a column
data_reset = cnv_data.reset_index(drop=True)

# Step 2: Transpose 'data_reset' to align it by Yeast_IDs and reset the index
data_transposed = data_reset.set_index('Standard_name').transpose()

# Step 3: Reset index in mutation_matrix so Yeast_ID is a column
mutation_matrix = mutation_matrix.reset_index()

# Step 4: Merge the mutation matrix with the transposed data on Yeast_ID
combined_matrix = pd.merge(mutation_matrix, data_transposed, on='Yeast_ID')


NameError: name 'pd' is not defined

In [None]:
phenotype_data = pd.read_csv("data/phenotype_dataset.csv")
columns_of_interest = ['Standard_name', 'YBL_doublingtime']
filtered_phenotype = phenotype_data[columns_of_interest]
ordered_phenotype = filtered_phenotype.set_index('Standard_name').reindex(X['Yeast_ID']).reset_index()

KeyError: 'Yeast_ID'

In [None]:

# Charger les datasets
cnv_data = pd.read_csv("copy_number_variation_dataset.csv")
proteome_data = pd.read_csv("proteome_dataset.csv")
phenotype_data = pd.read_csv("phenotype_dataset.csv")

# Joindre les datasets : Copy Number Variations et Proteome
# Fusion basée sur le `Standard_name` ou une autre clé partagée
merged_data = cnv_data.set_index("Standard_name").join(proteome_data.set_index("Protein_ID"), how="inner")
merged_data = merged_data.reset_index()

# Ajouter la cible (YPD_doublingtime) à partir du fichier de phénotypes
merged_data = merged_data.merge(
    phenotype_data[["Standard_name", "YPD_doublingtime"]], 
    left_on="index", 
    right_on="Standard_name", 
    how="inner"
)

# Supprimer les colonnes inutiles
merged_data = merged_data.drop(columns=["Standard_name", "index"])

# Prétraitement des données
# 1. CNV - Normalisation
cnv_features = merged_data.iloc[:, :-2]  # Toutes les colonnes sauf la séquence et YPD_doublingtime
scaler = StandardScaler()
cnv_normalized = scaler.fit_transform(cnv_features)

# 2. Proteome - TF-IDF pour encoder les séquences protéiques
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3, 3))  # Tri-grammes pour motifs locaux
proteome_encoded = vectorizer.fit_transform(merged_data["Sequence"])

# Réduction de dimension avec PCA (pour limiter la taille des données après TF-IDF)
pca = PCA(n_components=50)
proteome_reduced = pca.fit_transform(proteome_encoded.toarray())

# Combiner CNV normalisé et protéome réduit
final_data = pd.DataFrame(cnv_normalized).join(pd.DataFrame(proteome_reduced))

# Ajouter la cible (YPD_doublingtime)
final_data["YPD_doublingtime"] = merged_data["YPD_doublingtime"]

# Séparation en ensembles d'entraînement et de test
X = final_data.drop(columns=["YPD_doublingtime"])
y = final_data["YPD_doublingtime"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


NameError: name 'pd' is not defined

In [2]:
# Modèle : Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Évaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Importance des caractéristiques pour les variations causatives
feature_importances = model.feature_importances_
sorted_indices = feature_importances.argsort()[::-1]
print("Top 10 Feature Importances:")
for idx in sorted_indices[:10]:
    print(f"Feature {idx}: {feature_importances[idx]}")

NameError: name 'RandomForestRegressor' is not defined