In [3]:
import os
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter


In [4]:
# Input file and folder locations (relative paths)
copy_number_variation_path = "data/genesMatrix_CopyNumber.tab"  # copy-number matrix, read with a tab separator
phenotype_data_path = "data/Finalset_223phenotypes_1011.csv"  # phenotype table (CSV)
proteome_dir_path = "data/Proteome_1011"  # folder containing the protein sequence files

# Pour les phénotypes et les variations du nombre de copies

In [5]:
# Phenotype table, read unchanged from its CSV file
phenotype_df = pd.read_csv(phenotype_data_path)

# Copy-number table: read the tab-separated file using its first column as the
# index, transpose it, then
#   - name the resulting row index "Standard_name",
#   - replace missing values with 0,
#   - turn the index back into an ordinary column for later processing.
cnv_df = (
    pd.read_csv(copy_number_variation_path, sep="\t", index_col=0)
    .T
    .rename_axis(index="Standard_name")
    .fillna(0)
    .reset_index()
)

# Pour les mutations de chaque protéine chez chaque levure

In [None]:
def _read_fasta_sequences(filepath, protein_id):
    """Read one FASTA file into a ``{yeast_id: sequence}`` dict.

    A line starting with ">" opens a new record. Its yeast identifier is the
    first whitespace-separated token of that line (without the ">"), cut at
    the first occurrence of ``_<protein_id>``. The following lines, stripped
    of surrounding whitespace, are concatenated into the record's sequence.
    Blank lines are skipped. If an identifier occurs more than once, the later
    record replaces the earlier one.

    Raises:
        ValueError: if sequence data appears before the first ">" header.
    """
    chunks = {}
    current_yeast = None
    with open(filepath, "r") as file:
        for line in file:
            if line.startswith(">"):
                header = line.strip().split()[0][1:]  # drop the leading ">"
                current_yeast = header.split(f"_{protein_id}", 1)[0]
                chunks[current_yeast] = []
                continue
            residues = line.strip()
            if not residues:
                continue
            if current_yeast is None:
                # Fail with the file name instead of an opaque KeyError(None)
                raise ValueError(f"{filepath}: sequence data found before any '>' header")
            chunks[current_yeast].append(residues)
    return {yeast_id: "".join(parts) for yeast_id, parts in chunks.items()}


def _find_mutations(protein_id, sequences):
    """Return one record per (yeast, position) whose residue differs from the majority.

    Positions are 1-based. The reference residue at a position is the one
    ``Counter.most_common`` reports first for that column of residues.
    """
    yeast_ids = list(sequences)
    records = []
    # zip() stops at the shortest sequence, so positions beyond it are never compared.
    # NOTE(review): this presumes the sequences of one protein are aligned and of
    # equal length — confirm against the proteome files.
    for position, amino_acids in enumerate(zip(*sequences.values()), start=1):
        aa_counts = Counter(amino_acids)
        if len(aa_counts) == 1:
            continue  # every yeast carries the same residue: nothing to record
        most_common_aa, _ = aa_counts.most_common(1)[0]
        for yeast_id, aa in zip(yeast_ids, amino_acids):
            if aa != most_common_aa:
                records.append({
                    "Protein_ID": protein_id,
                    "Yeast_ID": yeast_id,
                    "Position": position,
                    "Reference_AA": most_common_aa,
                    "Mutated_AA": aa
                })
    return records


# Collect the mutation records of every ".fasta" file in the proteome folder
mutations = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the resulting table identical from one run to the next.
for filename in sorted(os.listdir(proteome_dir_path)):
    if not filename.endswith(".fasta"):
        continue
    filepath = os.path.join(proteome_dir_path, filename)
    protein_id = filename.split(".")[0]  # protein name = file name up to the first "."
    sequences = _read_fasta_sequences(filepath, protein_id)
    mutations.extend(_find_mutations(protein_id, sequences))

# Explicit columns keep the table schema (and the CSV header) even when no
# mutation was found.
mutations_df = pd.DataFrame(
    mutations,
    columns=["Protein_ID", "Yeast_ID", "Position", "Reference_AA", "Mutated_AA"],
)

In [None]:
# 4. Save the datasets as CSV files
phenotype_output_path = "data/phenotype_dataset.csv"
cnv_output_path = "data/copy_number_variation_dataset.csv"
mutations_output_path = "data/mutations_proteins_dataset.csv"

# Write the three tables first (without the row index), then report each destination
for frame, destination in (
    (phenotype_df, phenotype_output_path),
    (cnv_df, cnv_output_path),
    (mutations_df, mutations_output_path),
):
    frame.to_csv(destination, index=False)

print(f"Phenotype dataset saved to {phenotype_output_path}")
print(f"Copy number variation dataset saved to {cnv_output_path}")
print(f"Mutations dataset saved to {mutations_output_path}")

# Fusionner les nombres de copies avec les mutations

In [None]:
# NOTE: everything in this cell is wrapped in a triple-quoted string literal, so
# none of it is executed. The text is disabled code that reloads the mutation
# and copy-number CSV files, melts the copy-number table to one row per
# (Protein_ID, Yeast_ID) and left-merges it onto the mutation table, writing
# "data/merged_mutations_dataset.csv".
# NOTE(review): appears to be superseded by the wide-format merge that builds
# X_matrix further down — confirm before deleting.
'''
mutations_proteins_path = "data/mutations_proteins_dataset.csv"
copy_number_variation_path = "data/copy_number_variation_dataset.csv"

mutations_df = pd.read_csv(mutations_proteins_path)
cnv_df = pd.read_csv(copy_number_variation_path)

# Extraire uniquement le nom de la protéine dans copy_number_variation_dataset
def extract_protein_id(standard_name):
    match = re.match(r".+\.(Y[A-Z0-9]+)(?:_.*)?", standard_name)
    return match.group(1) if match else None

cnv_df['Protein_ID'] = cnv_df['Standard_name'].apply(extract_protein_id)
cnv_df = cnv_df.drop(columns=['Standard_name'])

# Réorganiser copy_number_variation_dataset pour un merge par Protein_ID et Yeast_ID
cnv_melted = cnv_df.melt(id_vars=["Protein_ID"], var_name="Yeast_ID", value_name="Copy_Number")

# Effectuer un merge avec mutations_proteins_dataset
merged_df = mutations_df.merge(
    cnv_melted,
    how="left",
    on=["Protein_ID", "Yeast_ID"]
)

# Sauvegarder le résultat dans un fichier CSV
output_path = "data/merged_mutations_dataset.csv"
merged_df.to_csv(output_path, index=False)

# Résultat
print(f"Merged dataset saved to {output_path}")
'''

Merged dataset saved to data/merged_mutations_dataset.csv


# Create the (extended) mutation matrix X

In [None]:
def create_mutation_matrix(df):
    """Build a binary yeast-by-mutation presence matrix.

    Every row of ``df`` is turned into a label of the form
    ``<Protein_ID>_<Position>_<Reference_AA>-><Mutated_AA>``. The result has
    one row per distinct ``Yeast_ID`` and one column per distinct label,
    holding 1 where the yeast has a row with that label and 0 otherwise.

    Args:
        df: DataFrame with columns ``Yeast_ID``, ``Protein_ID``, ``Position``,
            ``Reference_AA`` and ``Mutated_AA``. ``Position`` is converted to
            text; the other label parts are concatenated as they are, so they
            must already be strings. ``df`` itself is left unmodified.

    Returns:
        DataFrame whose first column is ``Yeast_ID``, followed by one integer
        0/1 column per mutation label.
    """
    # Build the labels as a separate Series so the caller's frame does not
    # gain a "Mutation_Label" column as a side effect.
    labels = (
        df['Protein_ID'] + '_' + df['Position'].astype(str) + '_'
        + df['Reference_AA'] + '->' + df['Mutated_AA']
    )

    # Keep a single row per (yeast, mutation) pair: pivot() rejects duplicates
    labelled = (
        df[['Yeast_ID']]
        .assign(Mutation_Label=labels)
        .drop_duplicates(subset=['Yeast_ID', 'Mutation_Label'])
    )

    # Yeasts as rows, mutation labels as columns; a cell is non-null only where
    # the pair exists in the data.
    mutation_matrix = labelled.pivot(index='Yeast_ID',
                                     columns='Mutation_Label',
                                     values='Mutation_Label')

    # Presence -> 1, absence -> 0
    mutation_matrix = mutation_matrix.notnull().astype(int)

    # Expose Yeast_ID as a regular column
    return mutation_matrix.reset_index()


# Read every column of the mutation table as text
dtype_dict = {
    column: str
    for column in ('Yeast_ID', 'Protein_ID', 'Position', 'Reference_AA', 'Mutated_AA')
}

mutations_data = pd.read_csv("data/mutations_proteins_dataset.csv", dtype=dtype_dict)

# The string below is a small hand-written sample frame; being a bare string
# literal it is never executed.
'''
mutations_data = pd.DataFrame({
    'Yeast_ID': ['APR', 'APR', 'APL', 'APL', 'BAH', 'BAH'],
    'Protein_ID': ['YIL109C', 'YKL096W-A', 'YIL109C', 'YKL096W-A', 'YDR343C', 'YDR343C'],
    'Position': [10, 12, 5, 6, 20, 21],
    'Reference_AA': ['A', 'T', 'G', 'C', 'A', 'T'],
    'Mutated_AA': ['G', 'A', 'C', 'A', 'G', 'C'],
})
'''
# Binary yeast-by-mutation matrix built from the table read above
mutation_matrix = create_mutation_matrix(mutations_data)

In [9]:
# Write the mutation matrix under data/, next to the other intermediate files:
# the step that builds X_matrix reads it back from
# "data/extend_mutations_dataset.csv", so writing it to the working directory
# would leave that read without its input.
extend_mutations_output_path = "data/extend_mutations_dataset.csv"
mutation_matrix.to_csv(extend_mutations_output_path, index=False)
print(f"Extend mutations dataset saved to {extend_mutations_output_path}")

Extend mutations dataset saved to extend_mutations_dataset.csv


In [None]:
# Reload the intermediate files written by the earlier steps
cnv_data = pd.read_csv("data/copy_number_variation_dataset.csv")
mutation_matrix = pd.read_csv("data/extend_mutations_dataset.csv")


def extract_protein_id(standard_name):
    """Return the protein identifier embedded in a copy-number ``Standard_name``.

    The identifier is the run of upper-case letters and digits starting with
    "Y" that follows a "." in the name. Returns ``None`` when the name does
    not match that pattern.
    """
    # NOTE(review): the character class contains no "-", so an identifier such
    # as "YKL096W-A" would be cut to "YKL096W" — confirm against the real
    # Standard_name values.
    match = re.match(r".+\.(Y[A-Z0-9]+)(?:_.*)?", standard_name)
    return match.group(1) if match else None


# Reduce each Standard_name to its bare protein identifier
cnv_data['Standard_name'] = cnv_data['Standard_name'].apply(extract_protein_id)

# Step 1: proteins present in the mutation matrix (label prefix before the first "_")
protein_columns = [col.split('_')[0] for col in mutation_matrix.columns if col != 'Yeast_ID']
unique_proteins = set(protein_columns)

# Step 2: keep only the copy-number rows of those proteins
filtered_data = cnv_data[cnv_data['Standard_name'].isin(unique_proteins)]

# Step 3: proteins without a copy-number row, and the per-yeast median over the
# rows that were kept
missing_proteins = unique_proteins - set(filtered_data['Standard_name'])
median_values = filtered_data.set_index('Standard_name').median(axis=0)

# Step 4: give every missing protein a row holding those medians. The rows are
# built in one go and in sorted order: set iteration order can differ between
# Python processes, which would otherwise reshuffle the columns of X_matrix
# from run to run.
if missing_proteins:
    imputed_rows = (
        pd.DataFrame({protein: median_values for protein in sorted(missing_proteins)})
        .T
        .rename_axis(index='Standard_name')
        .reset_index()
    )
    filtered_data = pd.concat([filtered_data, imputed_rows], ignore_index=True)

# Step 5: transpose so that yeasts are rows (kept in the index) and proteins are columns
filtered_data_transposed = filtered_data.set_index('Standard_name').transpose()

# Step 6: join on the yeast identifier — a column on the mutation side, the
# index on the copy-number side. pd.merge defaults to an inner join, so yeasts
# present on only one side are dropped.
X_matrix = pd.merge(
    mutation_matrix,
    filtered_data_transposed,
    left_on='Yeast_ID',
    right_index=True
)

In [None]:
# Persist the combined feature matrix (row index omitted) and report where it went
X_matrix_output_path = "data/X_matrix.csv"
X_matrix.to_csv(path_or_buf=X_matrix_output_path, index=False)
print(f"Combined_matrix X saved to {X_matrix_output_path}")

# Create the phenotype matrix 

In [None]:
# Reload the phenotype table and use the same key name as X_matrix
phenotype_data = (
    pd.read_csv("data/phenotype_dataset.csv")
    .rename(columns={'Standard_name': 'Yeast_ID'})
)

# Keep only the yeast identifier and the phenotype of interest
columns_of_interest = ['Yeast_ID', 'YPD_doublingtime']
filtered_phenotype = phenotype_data[columns_of_interest]

# Put the rows in the same yeast order as X_matrix
ordered_phenotype = (
    filtered_phenotype
    .set_index('Yeast_ID')
    .reindex(X_matrix['Yeast_ID'])
    .reset_index()
)

# To get the final target vector y, take only the `YPD_doublingtime` column (the second column) of `ordered_phenotype`.

In [None]:
# Persist the ordered phenotype table (row index omitted) and report where it went
ordered_phenotype_output_path = "data/ordered_phenotype_dataset.csv"
ordered_phenotype.to_csv(path_or_buf=ordered_phenotype_output_path, index=False)
print(f"Ordered phenotype saved to {ordered_phenotype_output_path}")