In [4]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:

# --- Input locations ---------------------------------------------------------
proteome_dir_path = "data/Proteome_1011"  # directory holding the protein-sequence FASTA files
phenotype_data_path = "data/Finalset_223phenotypes_1011.csv"
copy_number_variation_path = "data/genesMatrix_CopyNumber.tab"

# --- Output locations (CSV exports) -------------------------------------------
proteome_output_path = "data/proteome_dataset.csv"
phenotype_output_path = "data/phenotype_dataset.csv"
cnv_output_path = "data/copy_number_variation_dataset.csv"

# --- Proteome: one entry per .fasta file --------------------------------------
# Key   = file name up to its first dot.
# Value = every non-header line of the file (lines not starting with ">"),
#         stripped and joined into a single string.
proteome_data = {}
fasta_filenames = [name for name in os.listdir(proteome_dir_path) if name.endswith(".fasta")]
for filename in fasta_filenames:
    filepath = os.path.join(proteome_dir_path, filename)
    protein_id = filename.split(".")[0]
    with open(filepath, "r") as fasta_handle:
        sequence = "".join(
            row.strip() for row in fasta_handle if not row.startswith(">")
        )
    proteome_data[protein_id] = sequence

proteome_df = pd.DataFrame(list(proteome_data.items()), columns=["Protein_ID", "Sequence"])

# --- Phenotypes: loaded as-is ---------------------------------------------------
phenotype_df = pd.read_csv(phenotype_data_path)

# --- Copy-number variation ------------------------------------------------------
# The tab-separated matrix is read with its first column as index, then
# transposed; the resulting index is labelled "Standard_name".
cnv_df = (
    pd.read_csv(copy_number_variation_path, sep="\t", index_col=0)
    .transpose()
    .rename_axis("Standard_name")
)

# --- Export the three tables to CSV --------------------------------------------
# Only the CNV table keeps its index (it carries the "Standard_name" labels).
proteome_df.to_csv(proteome_output_path, index=False)
phenotype_df.to_csv(phenotype_output_path, index=False)
cnv_df.to_csv(cnv_output_path)

# --- Report ---------------------------------------------------------------------
print(f"Proteome dataset saved to {proteome_output_path}")
print(f"Phenotype dataset saved to {phenotype_output_path}")
print(f"Copy number variation dataset saved to {cnv_output_path}")

Proteome dataset saved to data/proteome_dataset.csv
Phenotype dataset saved to data/phenotype_dataset.csv
Copy number variation dataset saved to data/copy_number_variation_dataset.csv


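# NE PAS LANCER LA SUITE : la cellule suivante réécrit les mêmes fichiers CSV que ci-dessus et son exécution a été interrompue (KeyboardInterrupt).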

In [1]:
import os
import pandas as pd

# File and directory locations.
# NOTE: the output paths are the same as those used by the earlier export cell,
# so running this cell overwrites those CSV files with a different layout
# (the proteome table gains a "Yeast_ID" column and has one row per FASTA record).
proteome_dir_path = "data/Proteome_1011"  # directory holding the protein-sequence FASTA files
phenotype_data_path = "data/Finalset_223phenotypes_1011.csv"
copy_number_variation_path = "data/genesMatrix_CopyNumber.tab"

# 1. Extract the protein sequences, one row per FASTA record.
#    Protein_ID = file name up to its first dot.
#    Yeast_ID   = first whitespace-delimited token of the ">" header, without the ">".
#    Sequence   = the lines that follow THAT header, up to the next header.
# Each record keeps its own sequence: pairing every header with the
# concatenation of all sequences in the file would be incorrect and would make
# memory usage grow as (number of records) x (total file length).
proteome_data = []
for filename in os.listdir(proteome_dir_path):
    if not filename.endswith(".fasta"):
        continue
    filepath = os.path.join(proteome_dir_path, filename)
    protein_id = filename.split(".")[0]

    # (yeast_id, [sequence chunks]) pairs in file order.
    records = []
    with open(filepath, "r") as file:
        for line in file:
            if line.startswith(">"):
                records.append((line.strip().split()[0][1:], []))
            elif records:
                # Sequence lines that appear before any header have no
                # identifier to attach to and are skipped.
                records[-1][1].append(line.strip())

    for yeast_id, chunks in records:
        proteome_data.append(
            {"Protein_ID": protein_id, "Yeast_ID": yeast_id, "Sequence": "".join(chunks)}
        )

# Explicit columns so that the exported CSV still has a header row when no
# FASTA record was found.
proteome_df = pd.DataFrame(proteome_data, columns=["Protein_ID", "Yeast_ID", "Sequence"])

# 2. Load the phenotype data as-is.
phenotype_df = pd.read_csv(phenotype_data_path)

# 3. Load the copy-number-variation matrix: first column as index, transposed,
#    index labelled "Standard_name", missing values replaced by 0, and the index
#    turned back into a regular column for later use.
cnv_df = (
    pd.read_csv(copy_number_variation_path, sep="\t", index_col=0)
    .T
    .rename_axis("Standard_name")
    .fillna(0)
    .reset_index()
)

# 4. Save the datasets as CSV files.
proteome_output_path = "data/proteome_dataset.csv"
phenotype_output_path = "data/phenotype_dataset.csv"
cnv_output_path = "data/copy_number_variation_dataset.csv"

proteome_df.to_csv(proteome_output_path, index=False)
phenotype_df.to_csv(phenotype_output_path, index=False)
# "Standard_name" is now a regular column, so the positional index is dropped.
cnv_df.to_csv(cnv_output_path, index=False)

# Report
print(f"Proteome dataset saved to {proteome_output_path}")
print(f"Phenotype dataset saved to {phenotype_output_path}")
print(f"Copy number variation dataset saved to {cnv_output_path}")

KeyboardInterrupt: 