In [1]:
import pandas as pd
import copy
import re

In [2]:
hugo_translate_file = '/mbshome/nvelthuijs/Cofactors/20200402_Transcriptionregulationlist/Datafiles/hugo_ensg_uniprot_approved.txt'

# Build a dictionary for translating UniProt IDs to gene names from the
# tab-separated translation file: the first column is read as the gene name
# and the third column as one or more comma-separated UniProt IDs.
hugo_uniprot_dict = {}
with open(hugo_translate_file) as hugo_translate:
    # Skip the first line (assumed to be a header); the default keeps an
    # empty file from raising StopIteration.
    next(hugo_translate, None)
    for line in hugo_translate:
        fields = line.strip().split('\t')
        if len(fields) < 3:
            # Row has no UniProt column, nothing to translate.
            continue
        gene_name = fields[0]
        # Split on the bare comma and strip each piece, so 'P1, P2' and
        # 'P1,P2' both yield clean IDs.
        for single_id in fields[2].split(','):
            single_id = single_id.strip()
            # Empty IDs are skipped here, so no '' key ever has to be
            # popped afterwards (popping a missing key raised KeyError).
            if single_id:
                hugo_uniprot_dict[single_id] = gene_name

# Print the first six entries as a sanity check:
for n, item in enumerate(hugo_uniprot_dict.items()):
    print(item)
    if n >= 5:
        break

('P04217', 'A1BG')
('Q9NQ94', 'A1CF')
('P01023', 'A2M')
('A8K2U0', 'A2ML1')
('U3KPV4', 'A3GALT2')
('Q9NPC4', 'A4GALT')


In [3]:
# Read complexportal.txt into a dataframe with complex IDs as index, and the
# complex name and subunits as columns:
with open('/mbshome/nvelthuijs/Cofactors/20200402_Transcriptionregulationlist/Datafiles/complexportal.txt') as file:
    df = pd.read_table(file, usecols = ['#Complex ac', 'Recommended name', 'Identifiers (and stoichiometry) of molecules in complex'])
# Chain rename/set_index instead of mutating in place, so re-running this
# cell always starts from the freshly read table.
df = (
    df.rename(columns = {'#Complex ac':'ComplexID', 'Recommended name':'Complex name', 'Identifiers (and stoichiometry) of molecules in complex':'Subunits'})
      .set_index('ComplexID')
)

# Make dictionary where the keys are complex IDs and their values the subunits in that complex:
trans_dict = {complex_id : subunits.split('|') for complex_id, subunits in df['Subunits'].items()}

# As long as there are complexes listed as a subunit, remove the complex ID and
# replace it with the subunits of that complex. Several passes may be needed
# because a nested complex can itself contain complexes.
# NOTE(review): presumably the nesting is acyclic; a circular reference is not
# detected here.
while any(prot.startswith('CPX-') for value in trans_dict.values() for prot in value):
    trans_dict_copy = copy.deepcopy(trans_dict) # Deep copy to iterate over, while changing the original dict.

    for key, value in trans_dict_copy.items():
        for prot in value:
            if prot.startswith('CPX-'):
                # Strip the trailing stoichiometry annotation (presumably of
                # the form '(n)') to get the bare complex ID. Splitting on '('
                # works for any number of digits and for entries without a
                # suffix, unlike slicing off a fixed three characters.
                nested_id = prot.split('(', 1)[0]
                trans_dict[key].remove(prot)
                trans_dict[key] += trans_dict[nested_id]

# Make a new dictionary translating uniprotIDs to gene symbols and leave out anything other than proteins:
def regex_uniprot(some_str):
    """Return the first UniProt-accession-like substring of `some_str`.

    The string is searched (not fully matched), so surrounding text such as
    a trailing annotation is ignored. Returns None when nothing matches.
    """
    hit = re.search(r'[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}', some_str)
    return hit.group(0) if hit else None

# For every complex, look up the gene symbol of each subunit via its UniProt ID.
# Subunits without a recognisable UniProt ID, or whose ID has no (non-empty)
# entry in hugo_uniprot_dict, are dropped.
prot_only_dict = {
    cpx_id: [
        gene
        for gene in (hugo_uniprot_dict.get(regex_uniprot(member)) for member in members)
        if gene
    ]
    for cpx_id, members in trans_dict.items()
}

# Write one tab-separated line per complex: ID, complex name, then the gene symbols.
with open('/mbshome/nvelthuijs/Cofactors/20200402_Transcriptionregulationlist/Datafiles_corrected_names/complexportal_corrected_names.txt', 'w') as outfile:
    for complexid, subunit_list in prot_only_dict.items():
        complex_name = df.loc[complexid, 'Complex name']
        # The joined symbols are passed as a single field, so a complex
        # without symbols still gets the tab after its name.
        print(complexid, complex_name, '\t'.join(subunit_list), sep='\t', file=outfile)


In [4]:
# Test case: run the nested-complex expansion on a small hand-made set of
# complexes ('C-n') and proteins ('protn') and check the result.

testdf = pd.DataFrame([
    ['C-1', 'prot1|prot2|prot3|C-4'],
    ['C-2', 'prot4|prot5|prot6|C-3'],
    ['C-3', 'prot1|prot8'],
    ['C-4', 'prot6|prot7|C-5'],
    ['C-5', 'prot9|prot10|C-6'],
    ['C-6', 'prot11|prot12'],
    ['C-7', 'prot1|prot13|C-2'],
    ['C-8', 'C-3|C-6'],
    ['C-9', 'C-2|prot14'],
    ['C-10', 'C-3|prot14|C-9'],
    ['C-11', 'C-10|C-9|C-6'],
    ['C-12', 'C-6|C-3']
], columns = ['ComplexID', 'Subunits']).set_index('ComplexID')

# Test-specific names are used so that running this cell does not overwrite
# the `trans_dict` built from the real ComplexPortal data.
test_trans_dict = {complex_id : row['Subunits'].split('|') for complex_id, row in testdf.iterrows()}

# Repeat until no complex ID is left in any subunit list; nested complexes
# can themselves contain complexes, so several passes may be needed.
while any(prot.startswith('C-') for value in test_trans_dict.values() for prot in value):
    test_trans_dict_copy = copy.deepcopy(test_trans_dict) # Deep copy to iterate over, while changing the original dict.

    for key, value in test_trans_dict_copy.items():
        for prot in value:
            if prot.startswith('C-'):
                test_trans_dict[key].remove(prot)
                test_trans_dict[key] += test_trans_dict[prot]

# Checks: every complex reference is resolved, complexes without nested
# complexes are untouched, and multi-level nesting is fully expanded.
assert not any(prot.startswith('C-') for value in test_trans_dict.values() for prot in value)
assert test_trans_dict['C-6'] == ['prot11', 'prot12']
assert test_trans_dict['C-8'] == ['prot1', 'prot8', 'prot11', 'prot12']
assert test_trans_dict['C-1'] == ['prot1', 'prot2', 'prot3', 'prot6', 'prot7', 'prot9', 'prot10', 'prot11', 'prot12']

# Show the original subunit string next to its expanded list for inspection:
for index, row in testdf.iterrows():
    print(index)
    print(row['Subunits'])
    print(test_trans_dict[index])
    print()

C-1
prot1|prot2|prot3|C-4
['prot1', 'prot2', 'prot3', 'prot6', 'prot7', 'prot9', 'prot10', 'prot11', 'prot12']

C-2
prot4|prot5|prot6|C-3
['prot4', 'prot5', 'prot6', 'prot1', 'prot8']

C-3
prot1|prot8
['prot1', 'prot8']

C-4
prot6|prot7|C-5
['prot6', 'prot7', 'prot9', 'prot10', 'prot11', 'prot12']

C-5
prot9|prot10|C-6
['prot9', 'prot10', 'prot11', 'prot12']

C-6
prot11|prot12
['prot11', 'prot12']

C-7
prot1|prot13|C-2
['prot1', 'prot13', 'prot4', 'prot5', 'prot6', 'prot1', 'prot8']

C-8
C-3|C-6
['prot1', 'prot8', 'prot11', 'prot12']

C-9
C-2|prot14
['prot14', 'prot4', 'prot5', 'prot6', 'prot1', 'prot8']

C-10
C-3|prot14|C-9
['prot14', 'prot1', 'prot8', 'prot14', 'prot4', 'prot5', 'prot6', 'prot1', 'prot8']

C-11
C-10|C-9|C-6
['prot14', 'prot1', 'prot8', 'prot14', 'prot4', 'prot5', 'prot6', 'prot1', 'prot8', 'prot14', 'prot4', 'prot5', 'prot6', 'prot1', 'prot8', 'prot11', 'prot12']

C-12
C-6|C-3
['prot11', 'prot12', 'prot1', 'prot8']

