In [None]:
import os
import re
import pandas as pd
import dask.dataframe as dd

# 1. Define the path to the parent directory

In [None]:
# Resolve the directory one level above the current working directory;
# every input and output path in this notebook is built relative to it.
working_dir = os.getcwd()
data_dir = os.path.abspath(os.path.join(working_dir, '..'))

# 2. Choose the phenotype

In [None]:
# Phenotype to work on. This string is interpolated into the file names of the
# X matrix that is written and of the y file that is read further down.
phenotype = "YPD_doublingtime"
# Alternative phenotype (uncomment to use it instead of the one above):
#phenotype = "YPDCUSO410MM_40h"

# 3. Create X_matrix (proteins+CNV)

In [None]:
# Read the mutation table
mutations_path = os.path.join(data_dir, "data/mutations_dataset.csv")
mutations_df = pd.read_csv(mutations_path)

# Count the non-null 'Position' entries of every (Yeast_ID, Protein_ID) pair,
# then pivot Protein_ID into columns; pairs that never occur get a count of 0
position_counts = mutations_df.groupby(['Yeast_ID', 'Protein_ID'])['Position'].count()
mutation_matrix = position_counts.unstack(fill_value=0)

# Read the copy-number table, transpose it, and expose the former column labels
# as a regular column named 'Yeast_ID' so they can be rewritten afterwards
cnv_path = os.path.join(data_dir, "data/genesMatrix_CopyNumber.tab")
cnv_df = (
    pd.read_csv(cnv_path, sep="\t", index_col=0)
    .T
    .rename_axis("Yeast_ID")
    .reset_index()
)

# Patterns used to recognise the IDs embedded in the CNV dataset labels
pattern_case1 = r'Y[A-Z]{2}\d{3}[A-Z]'          # Case 1: e.g. "YAA001A"
pattern_case2 = r'Y[A-Z]{2}\d{3}[A-Z]\.[A-Z]'   # Case 2: Case 1 followed by ".<LETTER>", e.g. "YAA001A.X"

# Function to extract the real name
def extract_real_name(name):
    """
    Extract the ID embedded in a CNV label, based on two specific patterns.

    The patterns are searched anywhere inside ``name`` (not only at its start).
    Case 2 (ID followed by ``.<LETTER>``) takes precedence over Case 1.

    Parameters:
        name (str): The label to be processed (e.g., "YAA001A.X").

    Returns:
        str or None: For a Case 2 match, the matched text with '.' replaced by '-'
        (e.g., "YAA001A-X"); for a Case 1 match, the matched text unchanged;
        None if neither pattern is found or if ``name`` is not a string
        (e.g., None or NaN coming from a missing label).
    """
    # re.search raises TypeError on non-string input; treat it as "no match"
    if not isinstance(name, str):
        return None

    # Try the more specific pattern first so the ".<LETTER>" part is not lost
    match_case2 = re.search(pattern_case2, name)
    if match_case2:
        # Normalise the separator: "YAA001A.X" -> "YAA001A-X"
        return match_case2.group().replace('.', '-')

    # Fall back to the plain ID (less specific pattern)
    match_case1 = re.search(pattern_case1, name)
    if match_case1:
        return match_case1.group()

    # Neither pattern is present in the label
    return None

# Replace every raw CNV label stored in the 'Yeast_ID' column with the ID
# returned by extract_real_name (None when no ID can be extracted)
cnv_df['Yeast_ID'] = cnv_df['Yeast_ID'].apply(extract_real_name)

# Drop the rows for which the ID extraction failed
cnv_df = cnv_df.dropna(subset=['Yeast_ID']).reset_index(drop=True)

# Transpose back so that the extracted IDs become the column labels
cnv_matrix = cnv_df.set_index('Yeast_ID').T
cnv_matrix.index.name = 'Yeast_ID'

# Left-join the CNV matrix onto the mutation count matrix (row index = Yeast_ID)
final_matrix = mutation_matrix.merge(
    cnv_matrix,
    left_index=True,
    right_index=True,
    how='left',
    suffixes=('', '_CNV')  # Only applied to column names present on BOTH sides
)

# Locate the CNV columns in the merged frame. merge() appends '_CNV' only to the
# labels that also exist as mutation-count columns; every other CNV column keeps
# its original label, so a prefix/suffix test alone cannot find them.
shared_ids = set(mutation_matrix.columns) & set(cnv_matrix.columns)
cnv_labels = {f"{col}_CNV" if col in shared_ids else col for col in cnv_matrix.columns}
is_cnv = final_matrix.columns.isin(list(cnv_labels))
cnv_columns = final_matrix.columns[is_cnv].tolist()

# Fill missing values in the CNV columns (rows without CNV data after the left
# join) with the per-column median. Working column by column keeps this valid
# even if several raw labels were mapped to the same ID.
cnv_filled = final_matrix.loc[:, is_cnv].apply(lambda col: col.fillna(col.median()))
final_matrix = pd.concat([final_matrix.loc[:, ~is_cnv], cnv_filled], axis=1)

# Save the final matrix to a CSV file
output_path = os.path.join(data_dir, f"data/X_matrix_proteins_and_cnvs_count_{phenotype}.csv")
final_matrix.to_csv(output_path, index=True)

print(f"Final matrix with mutation counts saved at {output_path}. Dimensions: {final_matrix.shape}")

Final matrix with mutation counts saved at X_matrix_proteins_and_cnvs_count.csv. Dimensions: (1011, 12542)


# 4. Verify that y_phenotype and X_matrix (proteins+CNV) are ready for shuffling

In [None]:
# Read the feature matrix and the phenotype table for the selected phenotype
X_path = os.path.join(data_dir, f"data/X_matrix_proteins_and_cnvs_count_{phenotype}.csv")
Y_path = os.path.join(data_dir, f"data/y_{phenotype}.csv")
X, Y = pd.read_csv(X_path), pd.read_csv(Y_path)

# Collect the distinct Yeast_IDs found on each side
X_ids, Y_ids = set(X['Yeast_ID']), set(Y['Yeast_ID'])

# IDs that appear on one side only
missing_in_X = Y_ids.difference(X_ids)
missing_in_Y = X_ids.difference(Y_ids)

# Report the one-sided IDs, Y-vs-X first and X-vs-Y second
print(
    f"IDs present in Y but missing in X: {missing_in_X}"
    if missing_in_X
    else "All IDs from Y are present in X."
)
print(
    f"IDs present in X but missing in Y: {missing_in_Y}"
    if missing_in_Y
    else "All IDs from X are present in Y."
)

# Keep only the rows whose Yeast_ID exists on both sides, ordered by Yeast_ID
common_ids = X_ids.intersection(Y_ids)
in_common_X = X['Yeast_ID'].isin(common_ids)
in_common_Y = Y['Yeast_ID'].isin(common_ids)
X_aligned = X.loc[in_common_X].sort_values(by="Yeast_ID")
Y_aligned = Y.loc[in_common_Y].sort_values(by="Yeast_ID")

# Compare the two ID sequences element by element (same IDs, same order)
same_order = list(X_aligned['Yeast_ID']) == list(Y_aligned['Yeast_ID'])
print(
    "Yeast_IDs are aligned between X and Y."
    if same_order
    else "Yeast_IDs are not in the same order. Please check."
)

# Write both filtered, sorted tables next to each other (row labels are not written)
for aligned_name, aligned_frame in (
    ("X_matrix_aligned.csv", X_aligned),
    ("Y_matrix_aligned.csv", Y_aligned),
):
    aligned_frame.to_csv(os.path.join(data_dir, aligned_name), index=False)

Tous les IDs de Y sont présents dans X.
IDs présents dans X mais absents dans Y : {'AVH', 'SACE_YBP', 'CLS', 'SACE_YBL', 'SACE_YCL', 'SACE_YBS', 'CHP', 'SACE_YDO', 'ARI', 'SACE_YBG', 'CGE', 'CKN', 'SACE_YDC', 'BFL', 'SACE_YCT', 'CLD', 'SACE_GAS', 'BTG', 'SACE_YCC', 'CHI', 'BKS', 'BEF', 'ABH', 'AIG', 'SACE_YBR', 'CLI', 'ABK', 'CHL', 'SACE_YBT', 'SACE_YBQ', 'SACE_YAC', 'CII', 'AGB', 'SACE_YDK', 'AII', 'SACE_YCI', 'SACE_YCB', 'AIR', 'SACE_MAB', 'CLL', 'BHK', 'BGA', 'CGG', 'CHG', 'BRA', 'BHF', 'BRG', 'SACE_YBO', 'SACE_YAY', 'CGV', 'AHQ', 'SACE_YBW', 'CGR', 'SACE_YDL', 'SACE_YAQ', 'BQT', 'AER', 'ARK', 'CGQ', 'CGT', 'AMS', 'CGL', 'SACE_YCF', 'SACE_YBU', 'SACE_YBB', 'AHF', 'SACE_YDB', 'CHD', 'BKR', 'SACE_YCQ', 'SACE_YCM', 'BDC', 'BRE', 'CHN', 'SACE_YAT', 'BKD', 'CQB', 'ABG', 'ALV', 'SACE_YCG', 'SACE_YAN', 'CFN', 'SACE_YCY', 'APV', 'CGS', 'SACE_GAL', 'AIK', 'CKE', 'BFA', 'CGP', 'SACE_YBZ', 'CHM', 'SACE_MAA', 'SACE_YAX', 'CLT', 'SACE_YBE', 'SACE_YCP', 'ASL', 'CKA', 'AFA', 'BRF', 'SACE_YBF', 'BI

# 5. Shuffle both datasets

In [None]:
# Load the aligned matrices written by the alignment step.
# NOTE: Y is read from "Y_matrix_aligned.csv" (saved together with
# "X_matrix_aligned.csv") so that it follows the selected phenotype; the
# previous hard-coded "data/Y_matrix.csv" is not produced by this notebook.
X_path = os.path.join(data_dir, "X_matrix_aligned.csv")  # Path to the filtered X matrix
Y_path = os.path.join(data_dir, "Y_matrix_aligned.csv")  # Path to the filtered Y matrix

X = pd.read_csv(X_path)
Y = pd.read_csv(Y_path)

# Filter Y to retain only the common Yeast_IDs present in both X and Y.
# The index is reset so that row label i of Y_filtered is its i-th row, exactly
# like X (read_csv gives X the labels 0..n-1). Without the reset, the label-based
# selection below would pick rows by Y's ORIGINAL labels and pair X rows with
# the wrong Y rows (or raise KeyError).
common_ids = set(X['Yeast_ID']) & set(Y['Yeast_ID'])
Y_filtered = (
    Y[Y['Yeast_ID'].isin(common_ids)]
    .sort_values(by="Yeast_ID")
    .reset_index(drop=True)
)

# Verify that the Yeast_IDs are aligned between X and Y (same IDs, same order)
if list(X['Yeast_ID']) != list(Y_filtered['Yeast_ID']):
    raise ValueError("Yeast_IDs are not aligned between X and Y. Please check your data.")

# Draw one reproducible random permutation of the row labels
shuffled_indices = X.sample(frac=1, random_state=42).index

# Apply the same permutation to both X and Y to maintain alignment
X_shuffled = X.loc[shuffled_indices].reset_index(drop=True)
Y_shuffled = Y_filtered.loc[shuffled_indices].reset_index(drop=True)

# Safety net: every shuffled X row must still face the Y row of the same strain
if list(X_shuffled['Yeast_ID']) != list(Y_shuffled['Yeast_ID']):
    raise ValueError("Yeast_IDs are no longer aligned between X and Y after shuffling.")

# Save the shuffled matrices to new CSV files
X_shuffled.to_csv(os.path.join(data_dir, "X_matrix_proteins_nb.csv"), index=False)
Y_shuffled.to_csv(os.path.join(data_dir, "Y_matrix_proteins_nb.csv"), index=False)

print("The X and Y matrices have been shuffled and saved successfully.")
print(f"Dimensions of X_shuffled : {X_shuffled.shape}")
print(f"Dimensions of Y_shuffled : {Y_shuffled.shape}")

Les matrices X et Y ont été mélangées et sauvegardées avec succès.
Dimensions de X_shuffled : (792, 12543)
Dimensions de Y_shuffled : (792, 2)


In [None]:
# Also persist both shuffled matrices as pickle files under data_dir
pickle_targets = {
    "X_matrix_proteins_nb.pkl": X_shuffled,
    "Y_matrix_proteins_nb.pkl": Y_shuffled,
}
for pickle_name, shuffled_frame in pickle_targets.items():
    shuffled_frame.to_pickle(os.path.join(data_dir, pickle_name))