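- Read in a STRING protein-links database dump (space-separated text file).
- Map the STRING IDs in `protein1` / `protein2` to UniProt IDs and add new columns: `p1_Uniprot`, `p2_Uniprot`, `pair_id`.
- Remove duplicate pairs (keep the first row for each `pair_id`).
- Save the result as CSV.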

In [14]:
import os
import pandas as pd

# File paths (relative, so they resolve against the notebook's working directory)
# Space-separated STRING links dump that is loaded with pandas below.
STRING_DB_DUMP = "../../data/STRING/9606.protein.links.full.v12.0.txt"
# Tab-separated file with two fields per line: STRING ID, UniProt ID.
MAPPING_FILE = "../../data/STRING/string_to_uniprot_mapping.txt"
# Destination of the processed, de-duplicated table (written as comma-separated CSV).
OUTPUT_PATH = "../../data/STRING/9606.protein.links.full.v12.0.txt_processed.csv"

In [15]:
def load_string_to_uniprot_mapping(mapping_file):
    """
    Load the mapping from STRING ID to UniProt ID from the mapping file.

    The file is read line by line. Each line is stripped of surrounding
    whitespace and split on tabs; only lines that yield exactly two fields
    (STRING ID, UniProt ID) are kept. If a STRING ID appears more than once,
    the last occurrence wins. Blank lines are ignored; any other line that
    does not have exactly two fields is counted and reported.

    Args:
        mapping_file (str): Path to the mapping file

    Returns:
        dict: Dictionary mapping STRING IDs to UniProt IDs

    Raises:
        FileNotFoundError: If mapping_file does not exist.
    """
    string_to_uniprot = {}
    skipped_lines = 0
    try:
        with open(mapping_file, 'r') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    # Blank lines carry no data and are not worth reporting.
                    continue
                parts = stripped.split('\t')
                if len(parts) == 2:
                    string_id, uniprot_id = parts
                    string_to_uniprot[string_id] = uniprot_id
                else:
                    skipped_lines += 1
    except FileNotFoundError as err:
        # Raise a regular exception instead of calling exit(): exit() is an
        # interactive-shell helper, and terminating the interpreter/kernel
        # from inside a helper function hides the traceback from the caller.
        raise FileNotFoundError(f"Mapping file {mapping_file} not found") from err

    print(f"Loaded {len(string_to_uniprot)} STRING to UniProt mappings")
    if skipped_lines:
        print(f"Warning: skipped {skipped_lines} malformed line(s) in {mapping_file}")

    return string_to_uniprot

In [16]:
# Read the space-delimited STRING links dump into a DataFrame.
string_txt = pd.read_csv(filepath_or_buffer=STRING_DB_DUMP, delimiter=' ')

In [17]:
# Build the STRING ID -> UniProt ID lookup dictionary from the mapping file.
string_to_uniprot = load_string_to_uniprot_mapping(mapping_file=MAPPING_FILE)

Loaded 19110 STRING to UniProt mappings


In [18]:
# Map STRING IDs to UniProt IDs.
# Series.map with a dict does the lookup in a single vectorized pass; IDs that
# are not keys of the mapping come back as NaN and are replaced by the sentinel.
string_txt['p1_Uniprot'] = string_txt['protein1'].map(string_to_uniprot).fillna('no_uniprot_result')
string_txt['p2_Uniprot'] = string_txt['protein2'].map(string_to_uniprot).fillna('no_uniprot_result')

# Order-independent key for each pair: (A, B) and (B, A) yield the same sorted tuple.
# Built by zipping the two columns instead of DataFrame.apply(axis=1), which
# constructs a Series object for every row.
# NOTE(review): every unmapped protein shares the 'no_uniprot_result' sentinel, so
# pairs involving different unmapped proteins receive the same pair_id — confirm
# this is intended before de-duplicating on pair_id.
string_txt['pair_id'] = pd.Series(
    [tuple(sorted(pair)) for pair in zip(string_txt['p1_Uniprot'], string_txt['p2_Uniprot'])],
    index=string_txt.index,
    dtype=object,
)

In [19]:
# Remove duplicates: keep only the first row encountered for each pair_id
first_seen = ~string_txt['pair_id'].duplicated(keep='first')
string_txt = string_txt.loc[first_seen]

# Place pair_id in the leading column position and renumber the rows from 0
other_columns = [col for col in string_txt.columns if col != 'pair_id']
string_txt = string_txt[['pair_id'] + other_columns].reset_index(drop=True)

In [20]:
# Write the processed table to disk as a comma-separated file.
# The row index is left out, so only the DataFrame's columns are written.
string_txt.to_csv(path_or_buf=OUTPUT_PATH, index=False, sep=',')
print(f"DataFrame saved to: {OUTPUT_PATH}")

DataFrame saved to: ../../data/STRING/9606.protein.links.full.v12.0.txt_processed.csv
