# 1. Filtrage

## 1.1. Base de donnees STRING

### 1.1.1. Levure

In [1]:
import pandas as pd
from pathlib import Path

# Chemins des fichiers
input_path = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Protein_Interactions\STRING_Interactions.txt")
output_path = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\STRING_levure.temp")

# 1. Chargement et filtrage initial
df = pd.read_csv(input_path, sep=" ")

# Créer un masque de filtrage
mask = (df["combined_score"] >= 700) & (
    df[["experimental", "coexpression", "database", "textmining"]].max(axis=1) > 90
)

# Appliquer le filtre et faire une copie explicite
filtered_df = df.loc[mask].copy()

# 2. Nettoyage des données
# Supprimer "4932." des noms de protéines
for col in ['protein1', 'protein2']:
    filtered_df.loc[:, col] = filtered_df[col].str.replace('4932.', '')

# 3. Gestion des interactions uniques
# Créer des identifiants d'interaction canoniques
filtered_df.loc[:, 'interaction_key'] = filtered_df.apply(
    lambda x: frozenset({x['protein1'], x['protein2']}), axis=1
)

# Supprimer les doublons en gardant la première occurrence
unique_interactions_df = filtered_df.drop_duplicates(subset='interaction_key')

# 4. Sauvegarde des résultats
unique_interactions_df[['protein1', 'protein2']].to_csv(
    output_path, sep="\t", index=False, header=False
)

# 5. Calcul des statistiques
unique_proteins = pd.unique(
    unique_interactions_df[['protein1', 'protein2']].values.ravel('K')
)
num_interactions = len(unique_interactions_df)

# Affichage des résultats
print("\n STRING ----- levure")
print("Traitement terminé avec succès.")
print(f"Nombre de protéines uniques : {len(unique_proteins)}")
print(f"Nombre d'interactions uniques : {num_interactions}")


 STRING ----- levure
Traitement terminé avec succès.
Nombre de protéines uniques : 5782
Nombre d'interactions uniques : 103986


#### 1.1.1.1. normalisation de STRING levure

In [3]:
import os
from collections import defaultdict

# Chemins des fichiers
interactions_file = r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\STRING_levure.temp"
mapping_file = r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\autres\YEAST_559292_idmapping.dat"
output_file = r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\STRING_levure.txt"

def create_mapping_dict(mapping_file):
    """Crée un dictionnaire de mapping STRING -> UniProt et inversement"""
    str_to_uniprot = defaultdict(list)
    uniprot_to_str = defaultdict(list)
    
    with open(mapping_file, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                uniprot_id = parts[0]
                db_type = parts[1]
                external_id = parts[2]

                if db_type == "STRING":
                    # Supprimer le préfixe '4932.' si présent
                    if external_id.startswith("4932."):
                        external_id = external_id.replace("4932.", "")
                    str_to_uniprot[external_id].append(uniprot_id)
                    uniprot_to_str[uniprot_id].append(external_id)
    
    return str_to_uniprot

def process_interactions(interactions_file, str_to_uniprot):
    """Traite le fichier d'interactions"""
    interactions = []
    missing_mappings = set()
    
    with open(interactions_file, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 2:
                prot1, prot2 = parts[0], parts[1]
                
                # Vérifier si les protéines existent dans le mapping
                mapped1 = str_to_uniprot.get(prot1, [None])[0]
                mapped2 = str_to_uniprot.get(prot2, [None])[0]
                
                if mapped1 and mapped2:
                    interactions.append((mapped1, mapped2))
                else:
                    if not mapped1:
                        missing_mappings.add(prot1)
                    if not mapped2:
                        missing_mappings.add(prot2)
    
    return interactions

def main():
    # 1. Créer le dictionnaire de mapping
    str_to_uniprot = create_mapping_dict(mapping_file)
    
    # 2. Traiter les interactions
    filtered_interactions = process_interactions(interactions_file, str_to_uniprot)
    
    # 3. Sauvegarder les résultats
    with open(output_file, 'w') as f:
        for prot1, prot2 in filtered_interactions:
            f.write(f"{prot1}\t{prot2}\n")

    # 4. Calcul des statistiques
    unique_proteins = set()
    for p1, p2 in filtered_interactions:
        unique_proteins.update([p1, p2])

    # 5. Affichage simplifié
    print("\nTraitement terminé avec succès.")
    print(f"Nombre total d'interactions conservées : {len(filtered_interactions)}")
    print(f"Nombre total de protéines uniques : {len(unique_proteins)}")


if __name__ == "__main__":
    main()



Traitement terminé avec succès.
Nombre total d'interactions conservées : 96810
Nombre total de protéines uniques : 5707


### 1.1.2. humain

In [4]:
import pandas as pd
from pathlib import Path

# Chemins des fichiers
input_path = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Protein_Interactions\9606.protein.links.detailed.v12.0.txt")
output_path = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\STRING_humain_filtered_interactions.txt")

# 1. Chargement et filtrage initial
df = pd.read_csv(input_path, sep=" ")

# Créer un masque de filtrage
mask = (df["combined_score"] >= 600) & (
    df[["experimental", "coexpression", "database", "textmining"]].max(axis=1) > 90
)

# Appliquer le filtre et faire une copie explicite
filtered_df = df.loc[mask].copy()

# 2. Nettoyage des données
# Supprimer "4932." des noms de protéines
for col in ['protein1', 'protein2']:
    filtered_df.loc[:, col] = filtered_df[col].str.replace('9606.', '')

# 3. Gestion des interactions uniques
# Créer des identifiants d'interaction canoniques
filtered_df.loc[:, 'interaction_key'] = filtered_df.apply(
    lambda x: frozenset({x['protein1'], x['protein2']}), axis=1
)

# Supprimer les doublons en gardant la première occurrence
unique_interactions_df = filtered_df.drop_duplicates(subset='interaction_key')

# 4. Sauvegarde des résultats
unique_interactions_df[['protein1', 'protein2']].to_csv(
    output_path, sep="\t", index=False, header=False
)

# 5. Calcul des statistiques
unique_proteins = pd.unique(
    unique_interactions_df[['protein1', 'protein2']].values.ravel('K')
)
num_interactions = len(unique_interactions_df)

# Affichage des résultats
print("\n STRING ----- humain")
print("Traitement terminé avec succès.")
print(f"Nombre de protéines uniques : {len(unique_proteins)}")
print(f"Nombre d'interactions uniques : {num_interactions}")


 STRING ----- humain
Traitement terminé avec succès.
Nombre de protéines uniques : 18000
Nombre d'interactions uniques : 359863


## 1.1.3. Normalisations des interactions STRING (humain)

In [5]:
import sys
from collections import defaultdict

def parse_mapping_file(mapping_file):
    """
    Parse the mapping file to create a dictionary from ENSP to Uniprot IDs.
    The mapping file is assumed to have lines with format like:
    P31946 STRING 9606.ENSP00000361930
    """
    ensembl_to_uniprot = defaultdict(list)
    
    with open(mapping_file, 'r') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue
            uniprot_id, db, db_id = parts[:3]
            if db == "STRING":
                # Extract ENSP ID from STRING format (e.g., 9606.ENSP00000361930)
                if '.' in db_id:
                    ensembl_id = db_id.split('.')[1]
                    ensembl_to_uniprot[ensembl_id].append(uniprot_id)
    
    # Remove duplicates and keep only the first Uniprot ID for each ENSP
    return {ensp: uniprots[0] for ensp, uniprots in ensembl_to_uniprot.items() if uniprots}

def convert_interactions(input_file, output_file, mapping_dict):
    """
    Convert interaction file from ENSP IDs to Uniprot IDs using the mapping dictionary.
    Only writes interactions where both proteins have corresponding Uniprot IDs.
    """
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                continue
            prot1, prot2 = parts
            if prot1 in mapping_dict and prot2 in mapping_dict:
                uniprot1 = mapping_dict[prot1]
                uniprot2 = mapping_dict[prot2]
                outfile.write(f"{uniprot1}\t{uniprot2}\n")

def main():
    interactions_file = r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\STRING_humain_filtered_interactions.txt"
    mapping_file = r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\autres\HUMAN_9606_idmapping.dat"
    output_file = r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\STRING_humain.txt"
    
    print("Parsing mapping file...")
    mapping_dict = parse_mapping_file(mapping_file)
    print(f"Found {len(mapping_dict)} ENSP to Uniprot mappings")
    
    print("Converting interactions...")
    convert_interactions(interactions_file, output_file, mapping_dict)
    print(f"Converted interactions written to {output_file}")

if __name__ == "__main__":
    main()


Parsing mapping file...
Found 19110 ENSP to Uniprot mappings
Converting interactions...
Converted interactions written to C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\STRING_humain.txt


## 1.2. Base de donnees DIP

In [60]:
import xml.etree.ElementTree as ET
from pathlib import Path

def process_dip_interactions():
    # Configuration des chemins
    input_file = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Protein_Interactions\DIP_Interactions.mif25")
    output_file = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\DIP_levure.txt")
    
    NS = {'mif': 'http://psi.hupo.org/mi/mif'}
    tree = ET.parse(input_file)
    root = tree.getroot()

    # 1. Extraction des protéines avec vérification
    id_to_protein = {}
    for interactor in root.findall(".//mif:interactor", NS):
        interactor_id = interactor.get("id")
        if not interactor_id:
            continue
            
        uniprot_ref = interactor.find(".//mif:xref/mif:secondaryRef[@db='uniprot knowledge base']", NS)
        refseq_ref = interactor.find(".//mif:xref/mif:secondaryRef[@db='refseq']", NS)
        
        protein_id = None
        if uniprot_ref is not None:
            protein_id = uniprot_ref.get("id")
        elif refseq_ref is not None:
            protein_id = refseq_ref.get("id")
        else:
            short_label = interactor.find(".//mif:names/mif:shortLabel", NS)
            protein_id = short_label.text if short_label is not None else None
        
        if protein_id:
            id_to_protein[interactor_id] = protein_id

    # 2. Extraction des interactions avec contrôle qualité
    unique_interactions = set()
    protein_set = set()  # Pour stocker les protéines uniques
    
    for interaction in root.findall(".//mif:interaction", NS):
        participants = interaction.findall(".//mif:participant/mif:interactorRef", NS)
        
        if len(participants) != 2:
            continue
            
        id1, id2 = participants[0].text, participants[1].text
        
        # Vérification que les deux protéines existent et sont différentes
        if (id1 not in id_to_protein or 
            id2 not in id_to_protein or 
            id_to_protein[id1] == id_to_protein[id2]):
            continue
            
        prot1, prot2 = id_to_protein[id1], id_to_protein[id2]
        
        # Ajout aux protéines uniques
        protein_set.add(prot1)
        protein_set.add(prot2)
        
        # Vérification du score
        score_element = interaction.find(".//mif:confidence/mif:value", NS)
        if score_element is not None:
            try:
                score = float(score_element.text)
                if score <= 0.8:
                    continue
            except (ValueError, TypeError):
                continue
                
        # Ajout sous forme triée pour éviter les doublons A-B vs B-A
        sorted_interaction = tuple(sorted((prot1, prot2)))
        unique_interactions.add(sorted_interaction)

    # 3. Sauvegarde
    with open(output_file, "w") as f:
        f.write("Protein1\tProtein2\n")
        for prot1, prot2 in unique_interactions:
            f.write(f"{prot1}\t{prot2}\n")

    # 4. Calcul et affichage des statistiques
    num_unique_proteins = len(protein_set)
    num_unique_interactions = len(unique_interactions)
    
    print("\nRésultats du traitement:")
    print(f"- Protéines uniques: {num_unique_proteins}")
    print(f"- Interactions uniques: {num_unique_interactions}")
    print(f"Fichier sauvegardé: {output_file}")

if __name__ == "__main__":
    process_dip_interactions()


Résultats du traitement:
- Protéines uniques: 5144
- Interactions uniques: 22614
Fichier sauvegardé: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\DIP_levure.txt


## 1.3. Base de donnes BIOGRID

### 1.3.1 Levure

In [6]:
import pandas as pd
import re
from pathlib import Path

def extract_uniprot(alt_ids):
    """Extract UniProt IDs from alternative IDs field"""
    if pd.isna(alt_ids): 
        return None
    alt_ids = str(alt_ids)
    patterns = [
        r'uniprot/swiss[\W-]?prot:([A-Z0-9]{6,10})',
        r'uniprot:([A-Z0-9]{6,10})',
        r'swiss[\W-]?prot:([A-Z0-9]{6,10})',
        r'([A-Z0-9]{6,10})\.\d'
    ]
    for pattern in patterns:
        match = re.search(pattern, alt_ids, re.IGNORECASE)
        if match:
            return match.group(1).upper()
    return None

def process_biogrid_high_confidence():
    """Process BioGRID data to get ~50,000 high-confidence interactions"""
    # File paths
    input_file = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Protein_Interactions\BIOGRID-ORGANISM-Saccharomyces_cerevisiae.txt")
    output_file = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\BIOGRID_levure.txt")
    
    # Load data
    try:
        df = pd.read_csv(input_file, sep='\t', comment='#', header=None, dtype=str)
        df.columns = [
            'ID_A', 'ID_B', 'Alt_IDs_A', 'Alt_IDs_B', 
            'Aliases_A', 'Aliases_B', 'Method', 'Author',
            'PubIDs', 'TaxID_A', 'TaxID_B', 'IntType',
            'SourceDB', 'IntIDs', 'Confidence'
        ]
    except Exception as e:
        print(f"Error loading file: {e}")
        return
    
    # 1. Filter for Saccharomyces cerevisiae
    yeast_df = df[(df['TaxID_A'] == 'taxid:559292') & (df['TaxID_B'] == 'taxid:559292')].copy()
    
    # 2. Extract UniProt IDs
    yeast_df['Protein1'] = yeast_df['Alt_IDs_A'].apply(extract_uniprot)
    yeast_df['Protein2'] = yeast_df['Alt_IDs_B'].apply(extract_uniprot)
    
    # 3. Clean data
    clean_df = yeast_df.dropna(subset=['Protein1', 'Protein2'])
    clean_df = clean_df[clean_df['Protein1'] != clean_df['Protein2']].copy()
    
    # 4. High-confidence scoring system (less strict than ultra-strict version)
    method_scores = {
        'x-ray crystallography': 10,
        'electron microscopy': 8,
        'affinity chromatography': 6,
        'coimmunoprecipitation': 4,
        'two hybrid': 2,
        'pull down': 3,
        'mass spectrometry': 2
    }
    
    type_scores = {
        'direct interaction': 6,
        'physical association': 3,
        'complex': 4
    }
    
    clean_df['Pub_Count'] = clean_df['PubIDs'].str.count(r'\|').add(1).fillna(1)
    
    clean_df['Method_Score'] = clean_df['Method'].apply(
        lambda x: max([method_scores.get(method.lower(), 1)
                      for method in str(x).split('|') 
                      if 'psi-mi' not in method.lower()], default=1)
    )
    
    clean_df['Type_Score'] = clean_df['IntType'].apply(
        lambda x: max([type_scores.get(typ.lower(), 1)
                      for typ in str(x).split('|') 
                      if 'psi-mi' not in typ.lower()], default=1)
    )
    
    clean_df['Total_Score'] = (
        clean_df['Method_Score'] * 3 +
        clean_df['Type_Score'] * 2 +
        clean_df['Pub_Count']
    )
    
    # 5. Remove duplicates keeping highest scores
    clean_df['SortedPair'] = clean_df.apply(
        lambda x: tuple(sorted([x['Protein1'], x['Protein2']])), axis=1
    )
    
    non_redundant = clean_df.sort_values('Total_Score', ascending=False).drop_duplicates(subset=['SortedPair'])
    
    # 6. Dynamic threshold to get ~50,000 interactions
    target_count = 300000
    if len(non_redundant) > target_count:
        # Find score threshold that gives us ~50,000 interactions
        thresholds = sorted(non_redundant['Total_Score'].unique(), reverse=True)
        for threshold in thresholds:
            filtered = non_redundant[non_redundant['Total_Score'] >= threshold]
            if len(filtered) <= target_count:
                break
        
        # If we're still too far from target, take top N
        if len(filtered) < target_count * 0.8 or len(filtered) > target_count * 1.2:
            filtered = non_redundant.head(target_count)
    else:
        filtered = non_redundant
        print(f"Warning: Only {len(non_redundant)} interactions available")
    
    # 7. Statistics
    unique_proteins = pd.unique(filtered[['Protein1', 'Protein2']].values.ravel('K'))
    
    print(f"\nRésultats finaux (haute confiance):")
    print(f"- Interactions levures totales : {len(yeast_df):,}")
    print(f"- Interactions avec UniProt valides : {len(clean_df):,}")
    print(f"- Interactions non-redondantes : {len(non_redundant):,}")
    print(f"- Interactions sélectionnées : {len(filtered):,}")
    print(f"- Protéines uniques : {len(unique_proteins):,}")
    print(f"- Score minimum retenu : {filtered['Total_Score'].min():.1f}")
    print(f"- Score maximum : {filtered['Total_Score'].max():.1f}")
    print(f"- Score moyen : {filtered['Total_Score'].mean():.1f}")
    
    # 8. Save
    try:
        filtered[['Protein1', 'Protein2']].to_csv(
            output_file, 
            sep='\t', 
            index=False, 
            header=False
        )
        print(f"\nDonnées sauvegardées dans : {output_file}")
    except Exception as e:
        print(f"Erreur lors de la sauvegarde : {e}")

if __name__ == "__main__":
    process_biogrid_high_confidence()


Résultats finaux (haute confiance):
- Interactions levures totales : 855,577
- Interactions avec UniProt valides : 848,870
- Interactions non-redondantes : 604,862
- Interactions sélectionnées : 300,000
- Protéines uniques : 5,805
- Score minimum retenu : 6.0
- Score maximum : 6.0
- Score moyen : 6.0

Données sauvegardées dans : C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\BIOGRID_levure.txt


### 1.3.2. humain

In [7]:
import pandas as pd
import re
from pathlib import Path

def process_biogrid_max_coverage():
    # Chemins des fichiers
    input_file = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Protein_Interactions\BIOGRID-MV-Physical.txt")
    output_file = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\BIOGRID_humain.txt")
    
    # Charger les données en forçant le type string
    df = pd.read_csv(input_file, sep='\t', comment='#', header=None, dtype=str)
    df.columns = [
        'ID_A', 'ID_B', 'Alt_IDs_A', 'Alt_IDs_B', 
        'Aliases_A', 'Aliases_B', 'Method', 'Author',
        'PubIDs', 'TaxID_A', 'TaxID_B', 'IntType',
        'SourceDB', 'IntIDs', 'Confidence'
    ]
    
    # 1. Filtrer pour Homo sapiens uniquement
    human_df = df[(df['TaxID_A'] == 'taxid:9606') & (df['TaxID_B'] == 'taxid:9606')].copy()
    
    # 2. Extraction robuste des UniProt IDs
    def extract_uniprot(alt_ids):
        if pd.isna(alt_ids): 
            return None
        patterns = [
            r'uniprot/swiss[\W-]?prot:([A-Z0-9]{6,8})',
            r'uniprot:([A-Z0-9]{6,8})',
            r'([A-Z0-9]{6,8})\.\d'
        ]
        for pattern in patterns:
            match = re.search(pattern, str(alt_ids), re.IGNORECASE)
            if match:
                return match.group(1)
        return None
    
    human_df['Protein1'] = human_df['Alt_IDs_A'].apply(extract_uniprot)
    human_df['Protein2'] = human_df['Alt_IDs_B'].apply(extract_uniprot)
    
    # 3. Nettoyage de base
    clean_df = human_df.dropna(subset=['Protein1', 'Protein2'])
    clean_df = clean_df[clean_df['Protein1'] != clean_df['Protein2']].copy()
    
    # 4. Système de scoring des interactions (conservé pour information)
    method_scores = {
        'x-ray crystallography': 4,
        'electron microscopy': 3,
        'affinity chromatography': 3,
        'coimmunoprecipitation': 2,
        'two hybrid': 2,
        'pull down': 2,
        'mass spectrometry': 1
    }
    
    type_scores = {
        'direct interaction': 3,
        'physical association': 1,
        'complex': 2
    }
    
    clean_df['Pub_Count'] = clean_df['PubIDs'].str.count(r'\|') + 1
    
    clean_df['Method_Score'] = clean_df['Method'].map(
        lambda x: max([method_scores.get(method.lower(), 1) 
                      for method in str(x).split('|')])
    )
    clean_df['Type_Score'] = clean_df['IntType'].map(
        lambda x: max([type_scores.get(typ.lower(), 1) 
                      for typ in str(x).split('|')])
    )
    clean_df['Total_Score'] = (
        clean_df['Method_Score'] * 3 + 
        clean_df['Type_Score'] * 2 + 
        clean_df['Pub_Count']
    )
    
    # 5. Supprimer les doublons (même paire de protéines dans un ordre différent)
    # Créer une colonne avec la paire triée pour identifier les doublons
    clean_df['SortedPair'] = clean_df.apply(lambda x: tuple(sorted([x['Protein1'], x['Protein2']])), axis=1)
    
    # Garder toutes les interactions uniques (sans filtrage par score)
    non_redundant = clean_df.drop_duplicates(subset=['SortedPair'])
    
    # 6. Statistiques finales
    unique_proteins = pd.unique(
        non_redundant[['Protein1', 'Protein2']].values.ravel('K')
    )
    
    print(f"\nRésultats finaux :")
    print(f"- Interactions humaines totales : {len(human_df)}")
    print(f"- Interactions avec UniProt valides : {len(clean_df)}")
    print(f"- Interactions non-redondantes : {len(non_redundant)}")
    print(f"- Protéines uniques : {len(unique_proteins)}")
    print(f"- Score moyen (pour information) : {non_redundant['Total_Score'].mean():.1f}")
    
    # 7. Sauvegarde (seulement les deux colonnes Protein1 et Protein2)
    non_redundant[['Protein1', 'Protein2']].to_csv(
        output_file, 
        sep='\t', 
        index=False, 
        header=False  # Pas d'en-tête dans le fichier de sortie
    )

if __name__ == "__main__":
    process_biogrid_max_coverage()


Résultats finaux :
- Interactions humaines totales : 316235
- Interactions avec UniProt valides : 309156
- Interactions non-redondantes : 88647
- Protéines uniques : 11266
- Score moyen (pour information) : 6.0


## 1.4. Base de donnees (Complex Portal)

In [8]:
import os
import zipfile
import shutil
from xml.etree import ElementTree as ET

# Chemins
zip_path = r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Complexes\yeast.zip"
extract_folder = r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Complexes\yeast_extracted"
output_file = r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\complexes\Portal_complexes_levure.txt"

# 1. Nettoyage et extraction
if os.path.exists(extract_folder):
    shutil.rmtree(extract_folder)
os.makedirs(extract_folder, exist_ok=True)

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

# 2. Traitement spécifique pour votre format XML
complexes = []
yeast_folder = os.path.join(extract_folder, "yeast")

for filename in os.listdir(yeast_folder):
    if filename.endswith(".xml"):
        filepath = os.path.join(yeast_folder, filename)
        
        try:
            tree = ET.parse(filepath)
            root = tree.getroot()
            
            # Namespace spécifique à vos fichiers
            ns = {'mif': 'http://psi.hupo.org/mi/mif300'}
            
            # Recherche des interactions complexes
            for interaction in root.findall(".//mif:abstractInteraction", ns):
                proteins = set()
                
                # Recherche des participants
                for participant in interaction.findall(".//mif:participant", ns):
                    # Référence à l'interacteur
                    interactor_ref = participant.find(".//mif:interactorRef", ns)
                    if interactor_ref is not None:
                        # Trouver l'interacteur correspondant
                        interactor = root.find(f".//mif:interactor[@id='{interactor_ref.text}']", ns)
                        if interactor is not None:
                            # Vérifier si c'est une protéine
                            interactor_type = interactor.find(".//mif:interactorType/mif:names/mif:shortLabel", ns)
                            if interactor_type is not None and interactor_type.text == "protein":
                                # Récupérer l'identifiant UniProt
                                uniprot = interactor.find(".//mif:xref/mif:primaryRef[@db='uniprotkb']", ns)
                                if uniprot is not None:
                                    proteins.add(uniprot.get("id"))
                
                if proteins:
                    complexes.append(sorted(proteins))
        
        except Exception as e:
            print(f"Erreur avec {filename}: {str(e)[:200]}")

# 3. Écriture du fichier final
with open(output_file, 'w', encoding='utf-8') as f_out:
    # Format: ID [tab] Liste_de_protéines (séparées par des espaces)
    for idx, proteins in enumerate(complexes, 1):
        f_out.write(f"{idx}\t{' '.join(proteins)}\n")

# 4. Rapport
print(f"Fichier généré: {output_file}")
print(f"Nombre total de complexes trouvés: {len(complexes)}")

Fichier généré: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\complexes\Portal_complexes_levure.txt
Nombre total de complexes trouvés: 643


## 1.5. filtrage de complexes

In [2]:
from collections import defaultdict, deque
from pathlib import Path

def load_complexes(file_path):
    """Charge les complexes depuis le fichier et retourne une liste de tuples (id_complexe, set de protéines)"""
    complexes = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) >= 2:
                complex_id = parts[0]
                proteins = set(p.strip() for p in parts[1].split() if p.strip())
                complexes.append((complex_id, proteins))
    return complexes

def load_ppi_network(file_path):
    """Charge le réseau PPI et retourne un set de protéines et le graphe PPI"""
    proteins = set()
    graph = defaultdict(set)
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#") or line.startswith("protein1"):
                continue
            parts = line.split('\t') if '\t' in line else line.split()
            if len(parts) >= 2:
                p1, p2 = parts[0].strip(), parts[1].strip()
                if p1 and p2:
                    proteins.add(p1)
                    proteins.add(p2)
                    graph[p1].add(p2)
                    graph[p2].add(p1)
    return proteins, graph

def is_single_connected_component(proteins, ppi_graph):
    """Vérifie si les protéines forment un seul composant connecté dans le réseau PPI"""
    if not proteins:
        return False
    
    visited = set()
    queue = deque()
    
    start_protein = next(iter(proteins))
    queue.append(start_protein)
    visited.add(start_protein)
    
    while queue:
        current = queue.popleft()
        for neighbor in ppi_graph[current]:
            if neighbor in proteins and neighbor not in visited:
                visited.add(neighbor)
                queue.append(neighbor)
    
    return visited == proteins

def filter_complexes(complexes, ppi_proteins, ppi_graph, output_file):
    """Filtre les complexes et sauvegarde ceux valides"""
    stats = {
        'total': 0,
        'kept': 0,
        'missing_proteins': 0,
        'disconnected': 0
    }
    
    with open(output_file, 'w', encoding='utf-8') as f_out:
        for complex_id, proteins in complexes:
            stats['total'] += 1
            
            # Vérifier que toutes les protéines sont dans le PPI
            if not all(p in ppi_proteins for p in proteins):
                stats['missing_proteins'] += 1
                continue
            
            # Vérifier la connectivité
            if not is_single_connected_component(proteins, ppi_graph):
                stats['disconnected'] += 1
                continue
            
            # Écrire le complexe valide
            f_out.write(f"{complex_id}\t{' '.join(proteins)}\n")
            stats['kept'] += 1
    
    return stats

def main():
    # Configuration des chemins
    base_dir = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data")
    
    # Fichiers d'entrée/sortie
    complexes_file = base_dir / "complexes" / "complexes_levure.txt"
    reseau_files = [
        base_dir / "weighted_networks" / "weighted_STRING_levure.txt",
        base_dir / "weighted_networks" / "weighted_DIP_levure.txt",
        base_dir / "weighted_networks" / "weighted_BIOGRID_levure.txt"
    ]
    output_files = [
        base_dir / "complexes" / "complexes_STRING_levure.txt",
        base_dir / "complexes" / "complexes_DIP_levure.txt",
        base_dir / "complexes" / "complexes_BIOGRID_levure.txt"
    ]

    # Charger les complexes
    complexes = load_complexes(complexes_file)
    print(f"Nombre total de complexes chargés: {len(complexes)}")

    # Traiter chaque réseau PPI
    for ppi_file, out_file in zip(reseau_files, output_files):
        print(f"\nTraitement de {ppi_file.name}...")
        
        try:
            # Charger le réseau PPI
            ppi_proteins, ppi_graph = load_ppi_network(ppi_file)
            print(f"- Protéines uniques dans PPI: {len(ppi_proteins):,}")
            print(f"- Interactions dans PPI: {sum(len(v) for v in ppi_graph.values())//2:,}")

            # Filtrer les complexes
            stats = filter_complexes(complexes, ppi_proteins, ppi_graph, out_file)
            
            # Afficher les statistiques
            print(f"- Complexes analysés: {stats['total']:,}")
            print(f"- Complexes conservés: {stats['kept']:,} ({stats['kept']/stats['total']*100:.1f}%)")
            print(f"  - Rejetés (protéines manquantes): {stats['missing_proteins']:,}")
            print(f"  - Rejetés (non connectés): {stats['disconnected']:,}")
            print(f"- Fichier généré: {out_file}")

        except Exception as e:
            print(f"Erreur avec {ppi_file}: {str(e)}")

    print("\nTerminé avec succès !")

if __name__ == "__main__":
    main()

Nombre total de complexes chargés: 643

Traitement de weighted_STRING_levure.txt...
- Protéines uniques dans PPI: 4,150
- Interactions dans PPI: 41,213
- Complexes analysés: 643
- Complexes conservés: 512 (79.6%)
  - Rejetés (protéines manquantes): 122
  - Rejetés (non connectés): 9
- Fichier généré: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\complexes\complexes_STRING_levure.txt

Traitement de weighted_DIP_levure.txt...
- Protéines uniques dans PPI: 3,973
- Interactions dans PPI: 15,376
- Complexes analysés: 643
- Complexes conservés: 358 (55.7%)
  - Rejetés (protéines manquantes): 188
  - Rejetés (non connectés): 97
- Fichier généré: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\complexes\complexes_DIP_levure.txt

Traitement de weighted_BIOGRID_levure.txt...
- Protéines uniques dans PPI: 5,241
- Interactions dans PPI: 200,897
- Complexes analysés: 643
- Complexes conservés: 511 (79.5%)
  - Rejetés (protéines manquantes): 110
  - Rejetés (non connectés): 22


## 1.6.1. Corum ( filtrage de complexes )

In [63]:
import csv

def process_complexes(input_file, output_file):
    # Dictionnaire pour stocker les complexes (id -> set de protéines)
    complexes = {}
    
    # Lire le fichier d'entrée
    with open(input_file, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f, delimiter='\t')
        
        for row in reader:
            complex_id = row['complex_id']
            proteins = row['subunits_uniprot_id']
            
            if proteins:
                # Séparer les protéines et supprimer les doublons
                protein_list = [p.strip() for p in proteins.split(';') if p.strip()]
                unique_proteins = list(set(protein_list))
                
                # Stocker dans le dictionnaire
                complexes[complex_id] = unique_proteins
    
    # Écrire le fichier de sortie
    with open(output_file, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(['complex_id', 'proteins'])
        
        for complex_id, proteins in complexes.items():
            # Joindre les protéines avec des points-virgules
            protein_str = ';'.join(proteins)
            writer.writerow([complex_id, protein_str])

if __name__ == '__main__':
    input_filename = r'C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Complexes\corum_humanComplexes.txt'  # Remplacez par votre fichier d'entrée
    output_filename = r'C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\complexes\CORUM_complexes_humain.txt'  # Fichier de sortie
    
    process_complexes(input_filename, output_filename)
    print(f"Le fichier de sortie a été généré : {output_filename}")

Le fichier de sortie a été généré : C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\complexes\CORUM_complexes_humain.txt


## 1.6.2. filtrage des complexes CORUM -> BIOGRID & CORUM -> STRING

In [1]:
import csv
from pathlib import Path
from collections import defaultdict, deque

def load_ppi_network(ppi_file):
    """Charge le réseau PPI et retourne un set de protéines uniques et le graphe PPI"""
    proteins = set()
    graph = defaultdict(set)
    try:
        with open(ppi_file, 'r', encoding='utf-8') as f:
            reader = csv.reader(f, delimiter='\t')
            for row in reader:
                if len(row) >= 2:
                    p1, p2 = row[0].strip(), row[1].strip()
                    if p1 and p2:
                        proteins.add(p1)
                        proteins.add(p2)
                        graph[p1].add(p2)
                        graph[p2].add(p1)
        return proteins, graph
    except Exception as e:
        print(f"Erreur lecture {ppi_file}: {str(e)}")
        return set(), defaultdict(set)

def is_single_connected_component(proteins, ppi_graph):
    """Vérifie si les protéines forment un seul composant connecté dans le réseau PPI"""
    if not proteins:
        return False
    
    visited = set()
    queue = deque()
    
    # Prendre une protéine quelconque comme point de départ
    start_protein = next(iter(proteins))
    queue.append(start_protein)
    visited.add(start_protein)
    
    while queue:
        current = queue.popleft()
        for neighbor in ppi_graph[current]:
            if neighbor in proteins and neighbor not in visited:
                visited.add(neighbor)
                queue.append(neighbor)
    
    return visited == proteins

def filter_complexes(complexes_file, ppi_proteins, ppi_graph, output_file):
    """Filtre les complexes conservant seulement ceux avec toutes les protéines dans le PPI et formant un seul composant connecté"""
    stats = {'total': 0, 'kept': 0, 'missing_proteins': 0, 'disconnected': 0}
    
    try:
        with open(complexes_file, 'r', encoding='utf-8') as f_in, \
             open(output_file, 'w', encoding='utf-8', newline='') as f_out:
            
            writer = csv.writer(f_out, delimiter='\t')
            writer.writerow(['complex_id', 'proteins'])
            
            for row in csv.reader(f_in, delimiter='\t'):
                if len(row) < 2:
                    continue
                
                stats['total'] += 1
                complex_id, proteins_str = row[0].strip(), row[1].strip()
                proteins = set(p.strip() for p in proteins_str.split(';') if p.strip())
                
                # Vérifier que toutes les protéines sont dans le PPI
                if not all(p in ppi_proteins for p in proteins):
                    stats['missing_proteins'] += 1
                    continue
                
                # Vérifier que les protéines forment un seul composant connecté
                if not is_single_connected_component(proteins, ppi_graph):
                    stats['disconnected'] += 1
                    continue
                
                # Écrire le complexe valide
                writer.writerow([complex_id, ';'.join(proteins)])
                stats['kept'] += 1
        
        return stats
    except Exception as e:
        print(f"Erreur traitement {complexes_file}: {str(e)}")
        return {'total': 0, 'kept': 0, 'missing_proteins': 0, 'disconnected': 0}

def process_ppi_network(ppi_file, complexes_file, output_file, network_name):
    """Processus complet pour un réseau PPI"""
    print(f"\nTraitement du réseau {network_name}...")
    
    # Charger les protéines PPI et le graphe
    ppi_proteins, ppi_graph = load_ppi_network(ppi_file)
    if not ppi_proteins:
        print(f"Échec: Aucune protéine chargée depuis {ppi_file}")
        return
    
    print(f"- Protéines uniques dans PPI: {len(ppi_proteins):,}")
    print(f"- Interactions dans PPI: {sum(len(v) for v in ppi_graph.values())//2:,}")

    # Filtrer les complexes
    stats = filter_complexes(complexes_file, ppi_proteins, ppi_graph, output_file)
    
    if stats['total'] == 0:
        print("Échec: Aucun complexe traité")
        return
    
    # Afficher les statistiques
    print(f"- Complexes analysés: {stats['total']:,}")
    print(f"- Complexes conservés: {stats['kept']:,} ({stats['kept']/stats['total']*100:.1f}%)")
    print(f"  - Rejetés (protéines manquantes): {stats['missing_proteins']:,}")
    print(f"  - Rejetés (non connectés): {stats['disconnected']:,}")
    print(f"- Fichier généré: {output_file}")

if __name__ == '__main__':
    # Configuration des chemins
    DATA_DIR = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data")
    
    # Fichier de complexes source
    CORUM_FILE = DATA_DIR / "complexes" / "CORUM_complexes_humain.txt"
    
    # Traitement BIOGRID
    process_ppi_network(
        ppi_file=DATA_DIR / "weighted_networks" / "weighted_BIOGRID_humain.txt",
        complexes_file=CORUM_FILE,
        output_file=DATA_DIR / "complexes" / "BIOGRID_complexes_humain.txt",
        network_name="BIOGRID"
    )
    
    # Traitement STRING
    process_ppi_network(
        ppi_file=DATA_DIR / "weighted_networks" / "weighted_STRING_humain.txt",
        complexes_file=CORUM_FILE,
        output_file=DATA_DIR / "complexes" / "STRING_complexes_humain.txt",
        network_name="STRING"
    )


Traitement du réseau BIOGRID...
- Protéines uniques dans PPI: 9,173
- Interactions dans PPI: 66,374
- Complexes analysés: 5,367
- Complexes conservés: 2,057 (38.3%)
  - Rejetés (protéines manquantes): 2,060
  - Rejetés (non connectés): 1,250
- Fichier généré: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\complexes\BIOGRID_complexes_humain.txt

Traitement du réseau STRING...
- Protéines uniques dans PPI: 14,219
- Interactions dans PPI: 246,924
- Complexes analysés: 5,367
- Complexes conservés: 2,957 (55.1%)
  - Rejetés (protéines manquantes): 1,352
  - Rejetés (non connectés): 1,058
- Fichier généré: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\complexes\STRING_complexes_humain.txt


## 2.1. Normalisation de subcellular localization data

In [5]:
import pandas as pd
import os
from collections import defaultdict
import re
import time
from pathlib import Path

def load_mapping_file(file_path):
    """Charge le fichier de mapping de manière optimisée"""
    mapping = defaultdict(dict)
    id_conversion = defaultdict(dict)
    
    # Pré-compiler les regex pour plus de performance
    yeast_pattern = re.compile(r'^Y[A-Z]{2}\d{3}[A-Z]?$')
    
    print(f"Chargement du fichier de mapping: {file_path}")
    with open(file_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f, 1):
            if i % 100000 == 0:
                print(f"Ligne {i} traitée...")
                
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue
                
            uniprot_id, id_type, id_value = parts[0], parts[1], parts[2]
            
            # Stockage optimisé des données
            if id_type not in mapping[uniprot_id]:
                mapping[uniprot_id][id_type] = []
            mapping[uniprot_id][id_type].append(id_value)
            
            # Conversion inversée optimisée
            if id_type == 'STRING' and 'ENSP' in id_value:
                ensembl_id = id_value.split('.')[-1]
                id_conversion['Ensembl'][ensembl_id] = uniprot_id
            elif id_type == 'Gene_OrderedLocusName' and yeast_pattern.match(id_value):
                id_conversion['Yeast_Locus'][id_value] = uniprot_id
            elif id_type == 'Ensembl_PRO':
                id_conversion['Ensembl_PRO'][id_value] = uniprot_id
    
    return dict(mapping), dict(id_conversion)

def map_compartment_data(compartment_file, mapping_data, id_conversion, output_file, species):
    """Mappe les données de compartiment de manière optimisée"""
    print(f"\nDébut du traitement du fichier: {compartment_file}")
    
    # Charger le fichier de compartiments
    print("Chargement des données de compartiments...")
    compartment_df = pd.read_csv(
        compartment_file, 
        sep='\t', 
        header=None,
        names=['Protein_ID', 'Gene_Name', 'GO_ID', 'GO_Term', 'Score'],
        dtype={'Protein_ID': 'string', 'Gene_Name': 'string', 
               'GO_ID': 'string', 'GO_Term': 'string'},
        on_bad_lines='warn'
    )
    
    print(f"Nombre d'entrées à traiter: {len(compartment_df)}")
    
    # Préparer les structures pour une recherche rapide
    print("Préparation des structures de recherche...")
    string_map = {}
    if species == 'human':
        string_map = {v.split('.')[-1]: k for k in mapping_data for v in mapping_data[k].get('STRING', [])}
    
    # Fonction de mapping optimisée
    def map_id(protein_id):
        # 1. Essayer le mapping direct
        if protein_id in mapping_data:
            return protein_id, mapping_data[protein_id]
        
        # 2. Essayer les conversions spécifiques
        if species == 'human' and protein_id.startswith('ENSP'):
            # Via STRING ID
            if protein_id in string_map:
                uniprot_id = string_map[protein_id]
                return uniprot_id, mapping_data.get(uniprot_id, {})
            # Via Ensembl_PRO direct
            if protein_id in id_conversion.get('Ensembl_PRO', {}):
                uniprot_id = id_conversion['Ensembl_PRO'][protein_id]
                return uniprot_id, mapping_data.get(uniprot_id, {})
                
        elif species == 'yeast':
            # Via Yeast Locus ID
            if protein_id in id_conversion.get('Yeast_Locus', {}):
                uniprot_id = id_conversion['Yeast_Locus'][protein_id]
                return uniprot_id, mapping_data.get(uniprot_id, {})
        
        return None, None
    
    # Appliquer le mapping
    print("Application du mapping...")
    compartment_df['Mapped_Info'] = compartment_df['Protein_ID'].apply(map_id)
    
    # Extraire les résultats
    print("Extraction des résultats...")
    compartment_df['Mapped_UniProtKB_ID'] = compartment_df['Mapped_Info'].apply(lambda x: x[0] if x and x[0] else '')
    compartment_df['Mapping_Status'] = compartment_df['Mapped_UniProtKB_ID'].apply(lambda x: 'Success' if x else 'Failed')
    
    # Remplir les colonnes mappées
    for field in ['GeneID', 'Ensembl', 'RefSeq', 'Entrez_Gene', 'STRING_ID']:
        compartment_df[field] = compartment_df['Mapped_Info'].apply(
            lambda x: x[1].get(field, [''])[0] if x and x[1] else ''
        )
    
    # Sélectionner les colonnes finales
    result_cols = {
        'Protein_ID': 'Original_Protein_ID',
        'Gene_Name': 'Gene_Name',
        'GO_ID': 'GO_ID',
        'GO_Term': 'GO_Term',
        'Score': 'Score',
        'Mapped_UniProtKB_ID': 'Mapped_UniProtKB_ID',
        'GeneID': 'GeneID',
        'Ensembl': 'Ensembl',
        'RefSeq': 'RefSeq',
        'Entrez_Gene': 'Entrez_Gene',
        'STRING_ID': 'STRING_ID',
        'Mapping_Status': 'Mapping_Status'
    }
    
    result_df = compartment_df[list(result_cols.keys())].rename(columns=result_cols)
    
    # Sauvegarder les résultats
    print(f"Sauvegarde des résultats dans {output_file}...")
    result_df.to_csv(output_file, sep='\t', index=False)
    
    # Calculer les statistiques
    total = len(result_df)
    mapped = (result_df['Mapping_Status'] == 'Success').sum()
    stats = {'total': total, 'mapped': mapped, 'unmapped': total - mapped}
    
    # Sauvegarder les stats
    stats_file = output_file.with_name(output_file.stem + "_stats.txt")
    with open(stats_file, 'w') as f:
        f.write(f"Total proteins: {stats['total']}\n")
        f.write(f"Mapped proteins: {stats['mapped']} ({stats['mapped']/stats['total']:.1%})\n")
        f.write(f"Unmapped proteins: {stats['unmapped']} ({stats['unmapped']/stats['total']:.1%})\n")
    
    # Sauvegarder les non-mappés si nécessaire
    if stats['unmapped'] > 0:
        unmapped_file = output_file.with_name(output_file.stem + "_unmapped.txt")
        unmapped_proteins = result_df[result_df['Mapping_Status'] == 'Failed']['Original_Protein_ID']
        unmapped_proteins.to_csv(unmapped_file, index=False, header=False)
        print(f"{stats['unmapped']} protéines non mappées sauvegardées dans {unmapped_file}")
    
    print("Traitement terminé avec succès!")
    return stats

# ==============================================
# CONFIGURATION PRINCIPALE
# ==============================================

# Chemins absolus des fichiers
BASE_DIR = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data")
RAW_DATA_DIR = BASE_DIR / "raw data" / "autres"
CLEAN_DATA_DIR = BASE_DIR / "clean data"

human_mapping_path = RAW_DATA_DIR / "HUMAN_9606_idmapping.dat"
yeast_mapping_path = RAW_DATA_DIR / "YEAST_559292_idmapping.dat"
human_compartment_path = RAW_DATA_DIR / "human_compartment_integrated_full.tsv"
yeast_compartment_path = RAW_DATA_DIR / "yeast_compartment_integrated_full.tsv"
output_dir = CLEAN_DATA_DIR / "autres"

# Vérification des fichiers
print("\nVérification des fichiers...")
required_files = {
    "Human mapping": human_mapping_path,
    "Yeast mapping": yeast_mapping_path,
    "Human compartment": human_compartment_path,
    "Yeast compartment": yeast_compartment_path
}

for name, path in required_files.items():
    if not path.exists():
        raise FileNotFoundError(f"Fichier {name} introuvable à l'emplacement: {path}")
    print(f"- {name}: {path} (OK)")

# Créer le dossier de sortie
os.makedirs(output_dir, exist_ok=True)
print(f"\nDossier de sortie: {output_dir}")

# ==============================================
# EXÉCUTION PRINCIPALE
# ==============================================

start_time = time.time()

try:
    # 1. Charger les fichiers de mapping
    print("\n" + "="*50)
    print("CHARGEMENT DES FICHIERS DE MAPPING")
    print("="*50)
    
    human_mapping, human_conversion = load_mapping_file(human_mapping_path)
    yeast_mapping, yeast_conversion = load_mapping_file(yeast_mapping_path)
    
    print(f"\nTemps de chargement: {time.time() - start_time:.2f} secondes")
    print(f"Protéines humaines chargées: {len(human_mapping)}")
    print(f"Protéines yeast chargées: {len(yeast_mapping)}")
    
    # 2. Traitement des données humaines
    print("\n" + "="*50)
    print("TRAITEMENT DES DONNÉES HUMAINES")
    print("="*50)
    
    human_stats = map_compartment_data(
        human_compartment_path,
        human_mapping,
        human_conversion,
        output_dir / "human_compartment_mapped.txt",
        'human'
    )
    
    # 3. Traitement des données yeast
    print("\n" + "="*50)
    print("TRAITEMENT DES DONNÉES YEAST")
    print("="*50)
    
    yeast_stats = map_compartment_data(
        yeast_compartment_path,
        yeast_mapping,
        yeast_conversion,
        output_dir / "yeast_compartment_mapped.txt",
        'yeast'
    )
    
    # 4. Affichage des résultats finaux
    print("\n" + "="*50)
    print("RÉSULTATS FINAUX")
    print("="*50)
    print(f"HUMAIN - Total: {human_stats['total']}, Mappés: {human_stats['mapped']} ({human_stats['mapped']/human_stats['total']:.1%})")
    print(f"YEAST  - Total: {yeast_stats['total']}, Mappés: {yeast_stats['mapped']} ({yeast_stats['mapped']/yeast_stats['total']:.1%})")
    print(f"\nTemps total d'exécution: {time.time() - start_time:.2f} secondes")

except Exception as e:
    print(f"\nERREUR: {str(e)}")
    raise


Vérification des fichiers...
- Human mapping: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\autres\HUMAN_9606_idmapping.dat (OK)
- Yeast mapping: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\autres\YEAST_559292_idmapping.dat (OK)
- Human compartment: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\autres\human_compartment_integrated_full.tsv (OK)
- Yeast compartment: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\autres\yeast_compartment_integrated_full.tsv (OK)

Dossier de sortie: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\autres

CHARGEMENT DES FICHIERS DE MAPPING
Chargement du fichier de mapping: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\autres\HUMAN_9606_idmapping.dat
Ligne 100000 traitée...
Ligne 200000 traitée...
Ligne 300000 traitée...
Ligne 400000 traitée...
Ligne 500000 traitée...
Ligne 600000 traitée...
Ligne 700000 traitée...
Ligne 800000 traitée...
Ligne 900000 traitée...
Ligne 1000000 traitée...
Ligne 1100