# 1. Filtrage

## 1.1. Base de donnees STRING

In [9]:
import pandas as pd
from pathlib import Path

# Chemins des fichiers
input_path = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Protein_Interactions\STRING_Interactions.txt")
output_path = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\STRING_filtered_interactions.txt")

# 1. Chargement et filtrage initial
df = pd.read_csv(input_path, sep=" ")

# Créer un masque de filtrage
mask = (df["combined_score"] >= 900) & (
    df[["experimental", "coexpression", "database", "textmining"]].max(axis=1) > 90
)

# Appliquer le filtre et faire une copie explicite
filtered_df = df.loc[mask].copy()

# 2. Nettoyage des données
# Supprimer "4932." des noms de protéines
for col in ['protein1', 'protein2']:
    filtered_df.loc[:, col] = filtered_df[col].str.replace('4932.', '')

# 3. Gestion des interactions uniques
# Créer des identifiants d'interaction canoniques
filtered_df.loc[:, 'interaction_key'] = filtered_df.apply(
    lambda x: frozenset({x['protein1'], x['protein2']}), axis=1
)

# Supprimer les doublons en gardant la première occurrence
unique_interactions_df = filtered_df.drop_duplicates(subset='interaction_key')

# 4. Sauvegarde des résultats
unique_interactions_df[['protein1', 'protein2']].to_csv(
    output_path, sep="\t", index=False, header=False
)

# 5. Calcul des statistiques
unique_proteins = pd.unique(
    unique_interactions_df[['protein1', 'protein2']].values.ravel('K')
)
num_interactions = len(unique_interactions_df)

# Affichage des résultats
print("\n Traitement (filtrage) terminé avec succès.")
print(f"  - Nombre de protéines uniques : {len(unique_proteins)}")
print(f"  - Nombre d'interactions uniques : {num_interactions}")


 Traitement (filtrage) terminé avec succès.
  - Nombre de protéines uniques : 4708
  - Nombre d'interactions uniques : 53150


## 1.2. Base de donnees DIP

In [18]:
import xml.etree.ElementTree as ET
from pathlib import Path

def process_dip_interactions():
    # Configuration des chemins
    input_file = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Protein_Interactions\DIP_Interactions.mif25")
    output_file = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\DIP_filtered_interactions.txt")
    
    NS = {'mif': 'http://psi.hupo.org/mi/mif'}
    TARGET_TAXID = "4932"  # TaxID de Saccharomyces cerevisiae
    
    tree = ET.parse(input_file)
    root = tree.getroot()

    # 1. Extraction des protéines avec vérification de l'organisme
    id_to_protein = {}
    for interactor in root.findall(".//mif:interactor", NS):
        interactor_id = interactor.get("id")
        if not interactor_id:
            continue
            
        # Vérification que la protéine appartient à la levure
        organism = interactor.find(".//mif:organism", NS)
        if organism is None or organism.get("ncbiTaxId") != TARGET_TAXID:
            continue
            
        uniprot_ref = interactor.find(".//mif:xref/mif:secondaryRef[@db='uniprot knowledge base']", NS)
        refseq_ref = interactor.find(".//mif:xref/mif:secondaryRef[@db='refseq']", NS)
        
        protein_id = None
        if uniprot_ref is not None:
            protein_id = uniprot_ref.get("id")
        elif refseq_ref is not None:
            protein_id = refseq_ref.get("id")
        else:
            short_label = interactor.find(".//mif:names/mif:shortLabel", NS)
            protein_id = short_label.text if short_label is not None else None
        
        if protein_id:
            id_to_protein[interactor_id] = protein_id

    # 2. Extraction des interactions avec contrôle qualité
    unique_interactions = set()
    protein_set = set()  # Pour stocker les protéines uniques
    
    for interaction in root.findall(".//mif:interaction", NS):
        participants = interaction.findall(".//mif:participant/mif:interactorRef", NS)
        
        if len(participants) != 2:
            continue
            
        id1, id2 = participants[0].text, participants[1].text
        
        # Vérification que les deux protéines existent, sont différentes et appartiennent à la levure
        if (id1 not in id_to_protein or 
            id2 not in id_to_protein or 
            id_to_protein[id1] == id_to_protein[id2]):
            continue
            
        prot1, prot2 = id_to_protein[id1], id_to_protein[id2]
        
        # Ajout aux protéines uniques
        protein_set.add(prot1)
        protein_set.add(prot2)
        
        # Vérification du score
        score_element = interaction.find(".//mif:confidence/mif:value", NS)
        if score_element is not None:
            try:
                score = float(score_element.text)
                if score <= 0.8:
                    continue
            except (ValueError, TypeError):
                continue
                
        # Ajout sous forme triée pour éviter les doublons A-B vs B-A
        sorted_interaction = tuple(sorted((prot1, prot2)))
        unique_interactions.add(sorted_interaction)

    # 3. Sauvegarde
    with open(output_file, "w") as f:
        f.write("Protein1\tProtein2\n")
        for prot1, prot2 in unique_interactions:
            f.write(f"{prot1}\t{prot2}\n")

    # 4. Calcul et affichage des statistiques
    num_unique_proteins = len(protein_set)
    num_unique_interactions = len(unique_interactions)
    
    print("\nRésultats du traitement (Saccharomyces cerevisiae uniquement):")
    print(f"- Protéines uniques: {num_unique_proteins}")
    print(f"- Interactions uniques: {num_unique_interactions}")
    print(f"Fichier sauvegardé: {output_file}")

if __name__ == "__main__":
    process_dip_interactions()


Résultats du traitement (Saccharomyces cerevisiae uniquement):
- Protéines uniques: 5012
- Interactions uniques: 22436
Fichier sauvegardé: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\DIP_filtered_interactions.txt


## 1.3. Base de donnes BIOGRID

In [28]:
import pandas as pd
import re
from pathlib import Path

def process_biogrid_high_confidence():
    # Chemins des fichiers
    input_file = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\raw data\Protein_Interactions\BIOGRID-ORGANISM-Saccharomyces_cerevisiae.txt")
    output_file = Path(r"C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\BIOGRID_top_40000.tsv")
    
    # Charger les données
    df = pd.read_csv(input_file, sep='\t', comment='#', header=None)
    df.columns = [
        'ID_A', 'ID_B', 'Alt_IDs_A', 'Alt_IDs_B', 
        'Aliases_A', 'Aliases_B', 'Method', 'Author',
        'PubIDs', 'TaxID_A', 'TaxID_B', 'IntType',
        'SourceDB', 'IntIDs', 'Confidence'
    ]
    
    # 1. Filtrer pour la levure S288c
    yeast_df = df[(df['TaxID_A'] == 'taxid:559292') & 
                 (df['TaxID_B'] == 'taxid:559292')].copy()
    
    # 2. Critères de haute confiance
    high_conf_methods = [
        'affinity chromatography',
        'two hybrid',
        'pull down',
        'coimmunoprecipitation'
    ]
    
    high_conf_types = [
        'direct interaction',
        'physical association'
    ]
    
    # 3. Filtrer avec critères combinés
    method_mask = yeast_df['Method'].str.contains('|'.join(high_conf_methods), case=False, na=False)
    type_mask = yeast_df['IntType'].str.contains('|'.join(high_conf_types), case=False, na=False)
    
    high_conf = yeast_df[method_mask & type_mask].copy()
    
    # 4. Extraire les UniProt IDs
    def extract_uniprot(alt_ids):
        if pd.isna(alt_ids): return None
        match = re.search(r'uniprot/swiss[\W-]?prot:([A-Z0-9]{6,10})', str(alt_ids))
        return match.group(1) if match else None
    
    high_conf.loc[:, 'Protein1'] = high_conf['Alt_IDs_A'].apply(extract_uniprot)
    high_conf.loc[:, 'Protein2'] = high_conf['Alt_IDs_B'].apply(extract_uniprot)
    
    # 5. Nettoyage final
    clean_df = high_conf.dropna(subset=['Protein1', 'Protein2'])
    clean_df = clean_df[clean_df['Protein1'] != clean_df['Protein2']]
    
    # 6. Sélection des 40 000 interactions les plus fréquentes
    # Compter les occurrences de chaque interaction
    interaction_counts = pd.concat([
        clean_df.groupby('Protein1').size().rename('Count'),
        clean_df.groupby('Protein2').size().rename('Count')
    ], axis=1).fillna(0)
    interaction_counts['Total'] = interaction_counts.sum(axis=1)
    
    # Ajouter les scores aux interactions
    clean_df = clean_df.merge(
        interaction_counts[['Total']].rename(columns={'Total': 'Score1'}),
        left_on='Protein1', right_index=True
    ).merge(
        interaction_counts[['Total']].rename(columns={'Total': 'Score2'}),
        left_on='Protein2', right_index=True
    )
    clean_df['InteractionScore'] = clean_df['Score1'] + clean_df['Score2']
    
    # Trier et sélectionner le top 40 000
    top_interactions = clean_df.sort_values('InteractionScore', ascending=False)\
                             .drop_duplicates(subset=['Protein1', 'Protein2'])\
                             .head(40000)
    
    # 7. Statistiques
    unique_proteins = pd.unique(top_interactions[['Protein1', 'Protein2']].values.ravel('K'))
    
    print("\nRésultats sélection:")
    print(f"- Interactions totales (levure): {len(yeast_df)}")
    print(f"- Interactions haute confiance: {len(high_conf)}")
    print(f"- Interactions sélectionnées: {len(top_interactions)}")
    print(f"- Protéines uniques: {len(unique_proteins)}")
    
    # 8. Sauvegarde
    top_interactions[['Protein1', 'Protein2']].to_csv(
        output_file, 
        sep='\t', 
        index=False, 
        header=False
    )
    print(f"\nTop 40k interactions sauvegardées dans: {output_file}")

if __name__ == "__main__":
    process_biogrid_high_confidence()


Résultats sélection:
- Interactions totales (levure): 855577
- Interactions haute confiance: 224490
- Interactions sélectionnées: 40000
- Protéines uniques: 5519

Top 40k interactions sauvegardées dans: C:\Users\PC\Documents\M2 HPC\PFE\PFE_CODE\Data\clean data\interactions\BIOGRID_top_40000.tsv
