In [14]:
import pandas as pd
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
def create_mutation_matrix(df):
    """Build a binary presence matrix of mutations per yeast strain.

    Every row of ``df`` describes one mutation seen in one strain. A label of the
    form ``<Protein_ID>_<Position>_<Reference_AA>-><Mutated_AA>`` is derived for
    each row, and the rows are pivoted so that every distinct label becomes a
    column.

    The input frame is not modified: the label column is added to a copy, so
    calling this function twice on the same frame (e.g. when re-running a cell)
    always starts from the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Yeast_ID``, ``Protein_ID``, ``Position``,
        ``Reference_AA`` and ``Mutated_AA``.

    Returns
    -------
    pandas.DataFrame
        One row per ``Yeast_ID`` (kept as a regular column) and one integer
        column per mutation label, holding 1 where the strain carries the
        mutation and 0 otherwise.
    """
    # Position is cast to str so that it can be concatenated with the text columns.
    mutation_labels = (
        df['Protein_ID'] + '_' + df['Position'].astype(str) + '_'
        + df['Reference_AA'] + '->' + df['Mutated_AA']
    )

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    labelled = df.assign(Mutation_Label=mutation_labels)

    # pivot() refuses duplicated (index, column) pairs, so keep each
    # strain/mutation combination only once.
    unique_pairs = labelled.drop_duplicates(subset=['Yeast_ID', 'Mutation_Label'])

    # Cells hold the label itself where the mutation exists and NaN elsewhere.
    presence = unique_pairs.pivot(index='Yeast_ID',
                                  columns='Mutation_Label',
                                  values='Mutation_Label')

    # Non-null -> 1 (mutation present), null -> 0 (absent).
    mutation_matrix = presence.notnull().astype(int)

    # Expose Yeast_ID as a regular column instead of the index.
    return mutation_matrix.reset_index()


# Column dtypes intended for loading the mutations CSV: every identifier-like
# column (including Position) is requested as text, CNV as float.
dtype_dict = {
    **dict.fromkeys(
        ['Yeast_ID', 'Protein_ID', 'Position', 'Reference_AA', 'Mutated_AA'],
        str,
    ),
    'CNV': float,
}

# The CSV load is disabled; a small hand-written example is used instead.
# mutations_data = pd.read_csv("data/merged_mutations_dataset.csv", dtype=dtype_dict)

# One tuple per observed mutation:
# (Yeast_ID, Protein_ID, Position, Reference_AA, Mutated_AA)
example_mutations = [
    ('APR', 'YIL109C', 10, 'A', 'G'),
    ('APR', 'YKL096W-A', 12, 'T', 'A'),
    ('APL', 'YIL109C', 5, 'G', 'C'),
    ('APL', 'YKL096W-A', 6, 'C', 'A'),
    ('BAH', 'YDR343C', 20, 'A', 'G'),
    ('BAH', 'YDR343C', 21, 'T', 'C'),
]
mutations_data = pd.DataFrame(
    example_mutations,
    columns=['Yeast_ID', 'Protein_ID', 'Position', 'Reference_AA', 'Mutated_AA'],
)

# Turn the long list of mutations into the strain x mutation matrix.
mutation_matrix = create_mutation_matrix(mutations_data)


In [24]:
# Write the mutation matrix to CSV; the row index is not written.
extend_mutations_output_path = "data/extend_mutations_dataset.csv"
mutation_matrix.to_csv(
    extend_mutations_output_path,
    index=False,
)
print(f"Extend mutations dataset saved to {extend_mutations_output_path}")

Extend mutations dataset saved to data/extend_mutations_dataset.csv


In [None]:
# Inputs of this cell: the copy-number-variation table and the mutation matrix
# CSV stored under data/.
cnv_data = pd.read_csv(
    "data/copy_number_variation_dataset.csv"
)
mutation_matrix = pd.read_csv(
    "data/extend_mutations_dataset.csv"
)

# Keep only the protein name from the copy_number_variation_dataset names
def extract_protein_id(standard_name):
    """Return the protein identifier embedded in a CNV ``Standard_name`` value.

    The identifier is the token that follows a ``.`` in the name: it starts
    with ``Y``, continues with upper-case letters/digits and may carry one
    hyphenated suffix (e.g. ``YKL096W-A``). Anything after the identifier,
    such as a ``_...`` tail, is ignored.

    Parameters
    ----------
    standard_name : object
        Raw value of the ``Standard_name`` column. Non-string values (for
        example NaN for an empty CSV cell) are accepted.

    Returns
    -------
    str or None
        The identifier, or ``None`` when the value is not a string or does not
        contain such an identifier.
    """
    # re.match() raises TypeError on non-strings such as NaN.
    if not isinstance(standard_name, str):
        return None
    # The optional "-SUFFIX" part keeps hyphenated identifiers whole; without it
    # "YKL096W-A" would be cut down to "YKL096W".
    match = re.match(r".+\.(Y[A-Z0-9]+(?:-[A-Z0-9]+)?)(?:_.*)?", standard_name)
    return match.group(1) if match else None

# Replace the raw CNV names by the bare protein identifier
# (None when no identifier can be extracted).
cnv_data['Standard_name'] = cnv_data['Standard_name'].apply(extract_protein_id)

# Step 1: Extract the protein names from the mutation_matrix columns.
# Mutation columns are named "<protein>_<position>_<ref>-><alt>", so the protein
# is the part before the first underscore.
protein_columns = [col.split('_')[0] for col in mutation_matrix.columns if col != 'Yeast_ID']
unique_proteins = set(protein_columns)

# Step 2: Keep only the CNV rows of proteins that appear in the mutation_matrix
filtered_data = cnv_data[cnv_data['Standard_name'].isin(unique_proteins)]

# Step 3: Identify proteins without CNV data and compute, for every remaining
# column, the median over the proteins that do have CNV data
missing_proteins = unique_proteins - set(filtered_data['Standard_name'])
median_values = filtered_data.set_index('Standard_name').median(axis=0)

# Step 4: Add one row of median values per missing protein.
# All rows are built in a single frame (no concat inside a loop), and the
# proteins are sorted so that the row order - and therefore the column order
# after the transpose below - is reproducible from run to run.
if missing_proteins:
    imputed_rows = (
        pd.DataFrame({protein: median_values for protein in sorted(missing_proteins)})
        .transpose()
        .rename_axis('Standard_name')
        .reset_index()
    )
    filtered_data = pd.concat([filtered_data, imputed_rows], ignore_index=True)

# Step 5: Transpose so that the former CNV columns become the rows and each
# protein becomes a column. The row labels live in the index (there is no
# 'index' column to rename), which is why the merge below uses right_index=True.
filtered_data_transposed = filtered_data.set_index('Standard_name').transpose()

# Step 6: Make sure the mutation_matrix has a plain positional index
mutation_matrix = mutation_matrix.reset_index(drop=True)

# Step 7: Merge the mutation_matrix with the transposed CNV data on Yeast_ID.
# Inner join: rows whose Yeast_ID has no match in the transposed CNV index are
# dropped.
combined_matrix = pd.merge(
    mutation_matrix,
    filtered_data_transposed,
    how='inner',
    left_on='Yeast_ID',
    right_index=True
)

In [34]:
# Write the combined feature matrix to CSV; the row index is not written.
combined_matrix_output_path = "data/combined_matrix.csv"
combined_matrix.to_csv(
    combined_matrix_output_path,
    index=False,
)
print(f"Combined_matrix X saved to {combined_matrix_output_path}")

Combined_matrix X saved to data/combined_matrix.csv


In [36]:
# Load the phenotypes and call the identifier column Yeast_ID, the name used
# in combined_matrix.
phenotype_data = pd.read_csv("data/phenotype_dataset.csv").rename(
    columns={'Standard_name': 'Yeast_ID'}
)

# Only the identifier and the YPD doubling time are kept.
columns_of_interest = ['Yeast_ID', 'YPD_doublingtime']
filtered_phenotype = phenotype_data[columns_of_interest]

# Put the phenotype rows in the same order as the rows of combined_matrix;
# identifiers absent from the phenotype table get a missing value.
phenotype_by_strain = filtered_phenotype.set_index('Yeast_ID')
ordered_phenotype = phenotype_by_strain.reindex(combined_matrix['Yeast_ID']).reset_index()

In [37]:
# Write the re-ordered phenotype table to CSV; the row index is not written.
ordered_phenotype_output_path = "data/ordered_phenotype_dataset.csv"
ordered_phenotype.to_csv(
    ordered_phenotype_output_path,
    index=False,
)
print(f"Ordered phenotype saved to {ordered_phenotype_output_path}")

Ordered phenotype saved to data/ordered_phenotype_dataset.csv


In [None]:

# Load the datasets.
# Paths point into data/, the folder every other cell of this notebook reads
# from and writes to.
# NOTE(review): proteome_dataset.csv is assumed to live there too - confirm.
cnv_data = pd.read_csv("data/copy_number_variation_dataset.csv")
proteome_data = pd.read_csv("data/proteome_dataset.csv")
phenotype_data = pd.read_csv("data/phenotype_dataset.csv")

# Join the datasets: copy number variations and proteome.
# NOTE(review): another cell reduces the CNV `Standard_name` values to bare
# protein IDs with a regex before comparing them with protein IDs; here the raw
# values are used as the join key - confirm they really match `Protein_ID`.
merged_data = cnv_data.set_index("Standard_name").join(proteome_data.set_index("Protein_ID"), how="inner")
# Name the key column explicitly instead of relying on the joined index being
# unnamed, so that the merge below always finds a column called "index".
merged_data = merged_data.rename_axis("index").reset_index()

# Add the target (YPD_doublingtime) from the phenotype file.
# NOTE(review): another cell renames the phenotype `Standard_name` column to
# Yeast_ID, while the left key here comes from the CNV/proteome join - confirm
# that this inner merge keeps the rows you expect.
merged_data = merged_data.merge(
    phenotype_data[["Standard_name", "YPD_doublingtime"]],
    left_on="index",
    right_on="Standard_name",
    how="inner"
)

# Drop the key columns, which are not features
merged_data = merged_data.drop(columns=["Standard_name", "index"])

# Pre-processing
# NOTE(review): the scaler, TF-IDF vocabulary and PCA below are fitted on all
# rows before the train/test split.

# 1. CNV - standardisation.
# Takes every column except the last two - assumes those are Sequence and
# YPD_doublingtime, i.e. that the proteome file only contributes Sequence -
# TODO confirm.
cnv_features = merged_data.iloc[:, :-2]
scaler = StandardScaler()
cnv_normalized = scaler.fit_transform(cnv_features)

# 2. Proteome - TF-IDF encoding of the protein sequences
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(3, 3))  # character tri-grams for local motifs
proteome_encoded = vectorizer.fit_transform(merged_data["Sequence"])

# Dimensionality reduction with PCA (limits the size of the TF-IDF output).
# NOTE(review): 50 components need at least 50 rows and 50 TF-IDF features.
pca = PCA(n_components=50)
proteome_reduced = pca.fit_transform(proteome_encoded.toarray())

# Combine the standardised CNV features and the reduced proteome.
# Both arrays would otherwise get the default integer column labels 0, 1, ...,
# which DataFrame.join() rejects as overlapping columns; explicit feature names
# avoid the clash and keep every feature identifiable.
cnv_frame = pd.DataFrame(
    cnv_normalized,
    columns=[str(column) for column in cnv_features.columns],
)
proteome_frame = pd.DataFrame(
    proteome_reduced,
    columns=[f"proteome_pc{i}" for i in range(proteome_reduced.shape[1])],
)
final_data = pd.concat([cnv_frame, proteome_frame], axis=1)

# Add the target (YPD_doublingtime); assigned by position so it cannot be
# misaligned by index labels.
final_data["YPD_doublingtime"] = merged_data["YPD_doublingtime"].to_numpy()

# Split into training and test sets
X = final_data.drop(columns=["YPD_doublingtime"])
y = final_data["YPD_doublingtime"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


NameError: name 'pd' is not defined

In [2]:
# Model: random forest regressor (fit() returns the fitted estimator itself)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluation on the held-out test set
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Feature importances, used to look for potentially causative variations.
# argsort() is ascending, so the reversed view ranks features from most to
# least important.
feature_importances = model.feature_importances_
sorted_indices = feature_importances.argsort()[::-1]

print("Top 10 Feature Importances:")
top_ten = sorted_indices[:10]
for idx in top_ten:
    print(f"Feature {idx}: {feature_importances[idx]}")

NameError: name 'RandomForestRegressor' is not defined