In [1]:
import os
import re
import pandas as pd
import dask.dataframe as dd

In [2]:
# Absolute path of the parent of the current working directory; every data
# path in this notebook is built relative to it.
data_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# 1. Choose the phenotype

In [3]:
# Name of the phenotype to process. It is interpolated into the file names
# read and written by the cells below. To switch phenotype, comment out the
# active assignment and uncomment the alternative.
#phenotype = "YPD_doublingtime"
phenotype = "YPDCUSO410MM_40h"

# 2. Create X_matrix (proteins+CNV)

In [4]:
# --- Mutation count matrix -------------------------------------------------
mutations_path = os.path.join(data_dir, "data/mutations_dataset.csv")
mutations_df = pd.read_csv(mutations_path)

# Count the non-null 'Position' entries of every (Yeast_ID, Protein_ID) pair,
# then pivot Protein_ID into columns. Pairs absent from the data become 0.
position_counts = mutations_df.groupby(['Yeast_ID', 'Protein_ID'])['Position'].count()
mutation_matrix = position_counts.unstack(fill_value=0)

# --- Copy-number table -----------------------------------------------------
cnv_path = os.path.join(data_dir, "data/genesMatrix_CopyNumber.tab")
cnv_raw = pd.read_csv(cnv_path, sep="\t", index_col=0)

# Transpose the table and move the resulting row labels into a regular column
# called 'Yeast_ID' so they can be rewritten with plain column operations.
cnv_df = cnv_raw.T.rename_axis("Yeast_ID").reset_index()

# Regular expressions for the two name layouts that are recognised.
pattern_case1 = r'Y[A-Z]{2}\d{3}[A-Z]'          # 'Y', 2 capitals, 3 digits, 1 capital
pattern_case2 = r'Y[A-Z]{2}\d{3}[A-Z]\.[A-Z]'   # same, followed by '.' and 1 capital


def extract_real_name(name):
    """
    Pull a normalised identifier out of ``name``.

    The dotted layout (``pattern_case2``) is searched for first, anywhere in
    the string; when found, its '.' is replaced by '-'. Only if no dotted
    match exists is the plain layout (``pattern_case1``) searched for.

    Parameters:
        name (str): The raw label to inspect.

    Returns:
        str or None: The matched identifier, or None when neither pattern
        occurs in ``name``.
    """
    # The dotted form must be tried first: every dotted identifier also
    # contains a plain one, which would otherwise win and lose the suffix.
    dotted = re.search(pattern_case2, name)
    if dotted is not None:
        return dotted.group().replace('.', '-')

    plain = re.search(pattern_case1, name)
    return plain.group() if plain is not None else None

# Rewrite the labels held in the 'Yeast_ID' column with extract_real_name and
# drop the rows whose label matched neither pattern (the function returns None
# for those, which dropna removes).
cnv_df['Yeast_ID'] = cnv_df['Yeast_ID'].apply(extract_real_name)
cnv_df = cnv_df.dropna(subset=['Yeast_ID']).reset_index(drop=True)

# Transpose back: the cleaned labels become the column names and the remaining
# axis becomes the row index, which is what the merge below joins on.
cnv_matrix = cnv_df.set_index('Yeast_ID').T
cnv_matrix.index.name = 'Yeast_ID'

# Merge mutation count matrix and CNV matrix on their row index.
# how='left' keeps every row of mutation_matrix; rows with no counterpart in
# cnv_matrix get NaN in all CNV columns.
final_matrix = mutation_matrix.merge(
    cnv_matrix,
    left_index=True,
    right_index=True,
    how='left',
    suffixes=('', '_CNV')  # '_CNV' is appended only to CNV columns whose name clashes with a mutation column
)

# Impute missing CNV values with the per-column median.
# The previous selection (`col.startswith("CNV_")`) could never match, because
# the merge adds '_CNV' as a *suffix* and only to clashing names; the fill was
# therefore silently skipped. Mutation columns keep their names (empty suffix),
# so every other column of the merged frame comes from cnv_matrix.
mutation_columns = set(mutation_matrix.columns)
cnv_columns = [col for col in final_matrix.columns if col not in mutation_columns]
final_matrix[cnv_columns] = final_matrix[cnv_columns].fillna(final_matrix[cnv_columns].median())

output_path = os.path.join(data_dir, f"data/X_matrix_proteins_and_cnvs_count_{phenotype}.csv")
final_matrix.to_csv(output_path, index=True)

print(f"Final matrix with mutation counts saved at {output_path}. Dimensions: {final_matrix.shape}")

Final matrix with mutation counts saved at /Users/colineleteurtre/Library/CloudStorage/OneDrive-Personnel/Documents/cours/EPFL/machinelearning/project2/Yeast_Project/data/X_matrix_proteins_and_cnvs_count_YPDCUSO410MM_40h.csv. Dimensions: (1011, 12542)


# 3. Verify that y_phenotype and X_matrix (proteins+CNV) are ready for shuffling

In [None]:
X_path = os.path.join(data_dir, f"data/X_matrix_proteins_and_cnvs_count_{phenotype}.csv")
Y_path = os.path.join(data_dir, f"data/y_{phenotype}.csv")

X = pd.read_csv(X_path)
Y = pd.read_csv(Y_path)

# Check if Yeast_IDs in X and Y match
X_ids = set(X['Yeast_ID'])
Y_ids = set(Y['Yeast_ID'])

# Identify missing IDs between the two datasets
missing_in_X = Y_ids - X_ids
missing_in_Y = X_ids - Y_ids

# Output missing IDs information
if missing_in_X:
    print(f"IDs present in Y but missing in X: {missing_in_X}")
else:
    print("All IDs from Y are present in X.")

if missing_in_Y:
    print(f"IDs present in X but missing in Y: {missing_in_Y}")
else:
    print("All IDs from X are present in Y.")

# Align the matrices based on the common Yeast_IDs
common_ids = X_ids & Y_ids

# Filter both X and Y to retain only the rows with the common Yeast_IDs,
# and sort both by Yeast_ID so that row i of X corresponds to row i of Y.
X_aligned = X[X['Yeast_ID'].isin(common_ids)].sort_values(by="Yeast_ID")
Y_aligned = Y[Y['Yeast_ID'].isin(common_ids)].sort_values(by="Yeast_ID")

# Verify that the Yeast_IDs in X and Y are aligned correctly (same order).
# The ID sets are equal by construction, so a mismatch here means one of the
# tables holds an ID more often than the other. Stop before writing, so that
# files named "aligned" are never produced from misaligned data.
if list(X_aligned['Yeast_ID']) != list(Y_aligned['Yeast_ID']):
    raise ValueError("Yeast_IDs are not in the same order. Please check.")
print("Yeast_IDs are aligned between X and Y.")

X_aligned.to_csv(os.path.join(data_dir, f"data/X_matrix_aligned_{phenotype}.csv"), index=False)
Y_aligned.to_csv(os.path.join(data_dir, f"data/y_aligned_{phenotype}.csv"), index=False)

# 4. Shuffle both datasets

In [None]:
X_path =  os.path.join(data_dir, f"data/X_matrix_aligned_{phenotype}.csv") 
Y_path = os.path.join(data_dir, f"data/y_aligned_{phenotype}.csv") 

X = pd.read_csv(X_path)
Y = pd.read_csv(Y_path)

# Filter Y to retain only the common Yeast_IDs present in both X and Y.
# reset_index(drop=True) is essential: filtering and sorting keep the original
# row labels, while the shuffle below selects rows by label. Without the
# reset, X and Y rows are paired correctly only if Y was already sorted and
# unfiltered on disk.
common_ids = set(X['Yeast_ID']) & set(Y['Yeast_ID'])
Y_filtered = (
    Y[Y['Yeast_ID'].isin(common_ids)]
    .sort_values(by="Yeast_ID")
    .reset_index(drop=True)
)

# Verify that the Yeast_IDs are aligned between X and Y
if list(X['Yeast_ID']) != list(Y_filtered['Yeast_ID']):
    raise ValueError("Yeast_IDs are not aligned between X and Y. Please check your data.")

# Shuffle the indices randomly while keeping the alignment intact.
# X comes straight from read_csv without index_col, so its labels are
# 0..n-1, the same labels Y_filtered has after the reset above.
shuffled_indices = X.sample(frac=1, random_state=42).index

# Apply the shuffled indices to both X and Y to maintain alignment
X_shuffled = X.loc[shuffled_indices].reset_index(drop=True)
Y_shuffled = Y_filtered.loc[shuffled_indices].reset_index(drop=True)

# Guard against any row mispairing introduced by the shuffle.
assert list(X_shuffled['Yeast_ID']) == list(Y_shuffled['Yeast_ID']), \
    "Shuffling broke the X/Y row pairing."

# NOTE(review): these two CSVs are written directly under data_dir, whereas
# every other file in this notebook (including the pickles of the same frames)
# lives under data_dir/data. Confirm whether the missing "data/" is intended.
X_shuffled.to_csv(os.path.join(data_dir, f"X_matrix_proteins_nb_{phenotype}.csv"), index=False)
Y_shuffled.to_csv(os.path.join(data_dir, f"y_proteins_nb_{phenotype}.csv"), index=False)

print("The X and Y matrices have been shuffled and saved successfully.")
print(f"Dimensions of X_shuffled : {X_shuffled.shape}")
print(f"Dimensions of Y_shuffled : {Y_shuffled.shape}")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/colineleteurtre/Library/CloudStorage/OneDrive-Personnel/Documents/cours/EPFL/machinelearning/project2/Yeast_Project/data/y_aligned_YPDCUSO410MM_40h.csv'

In [None]:
# Also store the shuffled frames as pickles next to the other data files.
x_pickle_path = os.path.join(data_dir, f"data/X_matrix_proteins_nb_{phenotype}.pkl")
y_pickle_path = os.path.join(data_dir, f"data/y_proteins_nb_{phenotype}.pkl")

X_shuffled.to_pickle(x_pickle_path)
Y_shuffled.to_pickle(y_pickle_path)