In [1]:
import pandas as pd

## Define Nodes

In [2]:
# Node-type registry: full name -> abbreviation, Wikidata Q-ID and optional
# qualifier triples ('s' subject, 'p' property, 'o' object, 'not' negation flag).
node_info = {
    # There seem to be many types of sequence variants, but the edges that
    # link them are fairly unique, so this type is defined by those edges
    # rather than by a Q-ID.
    'Sequence Variant': {'abbrev': 'V', 'id': None},
    'Binding Site': {'abbrev': 'BS', 'id': 'Q616005'},
    'Active Site': {'abbrev': 'AS', 'id': 'Q423026'},
    'Super-Secondary Structure': {'abbrev': 'SS', 'id': 'Q7644128'},
    'Chemical Hazard': {'abbrev': 'CH', 'id': 'Q21167512'},
    'Molecular Function': {'abbrev': 'MF', 'id': 'Q14860489'},
    'Pharmaceutical Product': {'abbrev': 'PP', 'id': 'Q28885102'},
    'Structural Motif': {'abbrev': 'SM', 'id': 'Q3273544'},
    'Protein Family': {'abbrev': 'PF', 'id': 'Q417841'},
    'Compound': {'abbrev': 'C', 'id': 'Q11173'},
    'Disease': {'abbrev': 'D', 'id': 'Q12136'},

    # All proteins are encoded by genes, so qualify with 'gene encodes protein'.
    'Protein': {
        'abbrev': 'P',
        'id': 'Q8054',
        'qualifiers': [
            {'s': 'Gene', 'p': 'P688', 'o': 'Protein', 'not': False},
            {'s': 'Gene', 'p': 'P703', 'o': 'Q15978631', 'not': False},
        ],
    },

    # Genes are defined as human genes that do not encode ANYTHING.
    # None as the object means only the presence of P688 matters
    # (specifying proteins as the object leads to memory errors).
    'Gene': {
        'abbrev': 'G',
        'id': 'Q7187',
        'qualifiers': [
            {'s': 'Gene', 'p': 'P688', 'o': None, 'not': True},
            {'s': 'Gene', 'p': 'P703', 'o': 'Q15978631', 'not': False},
        ],
    },

    'Protein Domain': {'abbrev': 'PD', 'id': 'Q898273'},
    'Cellular Component': {'abbrev': 'CC', 'id': 'Q5058355'},
    'Biological Pathway': {'abbrev': 'PW', 'id': 'Q4915012'},
    'Biological Process': {'abbrev': 'BP', 'id': 'Q2996394'},

    # Don't want symptoms that could also be diseases: a lot of overlapping
    # types show up, especially on chemical hazard edges.
    'Symptom': {
        'abbrev': 'S',
        'id': 'Q169872',
        'qualifiers': [
            {'s': 'Symptom', 'p': 'P31', 'o': 'Q12136', 'not': True},
            {'s': 'Symptom', 'p': 'P279', 'o': 'Q12136', 'not': True},
        ],
    },

    'Medical Specialty': {'abbrev': 'MS', 'id': 'Q930752'},

    # Chemical Role doesn't exist as an item, but is an important type;
    # again it is defined by the edge rather than a Q-ID.
    'Chemical Role': {'abbrev': 'CR', 'id': None},

    'Anatomical Structure': {'abbrev': 'A', 'id': 'Q4936952'},
}

In [3]:
# Edge-type registry: full name -> abbreviation and Wikidata property ID.
# An optional 'rev_id' names a property that expresses the same relationship
# in the opposite direction.
edge_info = {
    'BIOLOGICAL_PROCESS': {'abbrev': 'bp', 'id': 'P682'},
    'BIOLOGICAL_VARIANT_OF': {'abbrev': 'v', 'id': 'P3433'},
    'CELL_COMPONENT': {'abbrev': 'cc', 'id': 'P681'},
    'MEDICAL_CONDITION_TREATED': {'abbrev': 't', 'id': 'P2175', 'rev_id': 'P2176'},
    'GENETIC_ASSOCIATION': {'abbrev': 'a', 'id': 'P2293'},
    'HAS_ACTIVE_INGREDIENT': {'abbrev': 'ai', 'id': 'P3781'},
    'HAS_CAUSE': {'abbrev': 'hc', 'id': 'P828'},
    'HAS_PART': {'abbrev': 'hp', 'id': 'P527'},
    'MOLECULAR_FUNCTION': {'abbrev': 'mf', 'id': 'P680'},
    'NEGATIVE_DIAGNOSTIC_PREDICTOR': {'abbrev': 'nd', 'id': 'P3357'},
    'NEGATIVE_THERAPEUTIC_PREDICTOR': {'abbrev': 'nt', 'id': 'P3355'},
    'PHYSICALLY_INTERACTS_WITH': {'abbrev': 'iw', 'id': 'P129'},
    'POSITIVE_DIAGNOSTIC_PREDICTOR': {'abbrev': 'pd', 'id': 'P3356'},
    'POSITIVE_THERAPEUTIC_PREDICTOR': {'abbrev': 'pt', 'id': 'P3354'},
    'SUBCLASS_OF': {'abbrev': 's', 'id': 'P279'},
    'SYMPTOMS': {'abbrev': 'sy', 'id': 'P780'},
    'MEDICAL_SPECIALTY': {'abbrev': 'ms', 'id': 'P1995'},
    'SIGNIFICANT_DRUG_INTERACTION': {'abbrev': 'di', 'id': 'P769'},
    'AFFLICTS': {'abbrev': 'af', 'id': 'P689'},
    'HAS_ROLE': {'abbrev': 'r', 'id': 'P2868'},
    'THERAPEUTIC_AREA': {'abbrev': 'ta', 'id': 'P4044'},
    'ANATOMICAL_LOCATION': {'abbrev': 'l', 'id': 'P927'},
}


In [4]:
# Full name -> abbreviation lookups for node types and edge types
node_abbreviations = {name: spec['abbrev'] for name, spec in node_info.items()}
edge_abbreviations = {name: spec['abbrev'] for name, spec in edge_info.items()}

In [5]:
# Inverse lookups: abbreviation -> full name
node_abv_to_full = dict(zip(node_abbreviations.values(), node_abbreviations.keys()))
edge_abv_to_full = dict(zip(edge_abbreviations.values(), edge_abbreviations.keys()))

## Generate queries for the edges

In [6]:
import sys
sys.path.append('../../hetnet-ml/src/')
import graph_tools as gt

In [7]:
def generate_instance_tag(qname, n_id):
    """Return a SPARQL line typing `qname` as an instance or subclass of `n_id`.

    The line starts with a newline and a four-space indent so it can be
    appended directly to a query body.
    """
    template = "\n    {} wdt:P31|wdt:P279 wd:{} ."
    return template.format(qname, n_id)

def to_query_name(name):
    """Turn a node-type name into a SPARQL variable name.

    The name is lower-cased, spaces become underscores, hyphens are removed
    and a leading '?' is added.
    """
    substitutions = str.maketrans({' ': '_', '-': None})
    return '?' + name.lower().translate(substitutions)

In [8]:
def build_query_from_abbrev(abbrev, target=None):
    """Build the SPARQL query text that retrieves one metaedge.

    Parameters
    ----------
    abbrev : str
        Metaedge abbreviation (e.g. 'CtD'). It is split by
        ``gt.parse_edge_abbrev`` into subject, predicate and object
        abbreviations, which are resolved through ``node_abv_to_full`` and
        ``edge_abv_to_full``.
    target : str, optional
        Node-type name whose query variable should be used as the object of
        the edge triple instead of the object's own variable. The SELECT
        clause still returns the object named in `abbrev`.

    Returns
    -------
    str
        The query text. Its first line is the SELECT clause listing the
        subject, subject label, object and object label variables.

    Raises
    ------
    KeyError
        If an abbreviation is not present in the lookup dictionaries.
    """
    # Split the edge abbreviation into subject, predicate, object
    s_abv, p_abv, o_abv = gt.parse_edge_abbrev(abbrev)

    # Get the full versions of names
    s_name = node_abv_to_full[s_abv]
    p_name = edge_abv_to_full[p_abv]
    o_name = node_abv_to_full[o_abv]

    # Sanitize names for query purposes
    s_qname = to_query_name(s_name)
    o_qname = to_query_name(o_name)

    # Need to differentiate start and end if self-referential edge
    # NOTE(review): qualifiers are rendered by generate_qualifier with the
    # un-suffixed variable name, so a self-referential edge on a node type
    # that has qualifiers would not constrain ?x1 / ?x2 -- confirm before
    # adding such an edge.
    if s_qname == o_qname:
        s_qname += '1'
        o_qname += '2'

    # Get the proper return types
    query_text = "SELECT DISTINCT {s} {s}Label {o} {o}Label".format(s=s_qname, o=o_qname)
    query_text += "\nWHERE {"

    # Node types that already have (or will get) their own typing line.
    # Seeded with the object so subject qualifiers don't type it a second time.
    return_types = {o_name}

    # Build the appropriate info to get the correct subject and object node-types
    for name, qname in [(s_name, s_qname), (o_name, o_qname)]:
        # Types without a Q-ID are defined purely by the edge itself
        if node_info[name]['id']:
            query_text += '\n\n    # Initial typing for {}'.format(name)
            query_text += generate_instance_tag(qname, node_info[name]['id'])
            return_types.update({name})

        # Get any qualifiers for subjects and objects
        for qual in node_info[name].get('qualifiers', []):
            query_text += '\n    # Qualifier for {}'.format(name)
            query_text += generate_qualifier(qual, return_types=return_types)
            # Ensure there isn't the same node-type definition multiple times
            # E.g. ?gene wdt:P31|wdt:P279 wd:Q7187 # ?gene subclass/instance of Gene
            return_types.update({qual['s'], qual['o']})

    # Allows for changing the target of an edge. For example, for
    # Sequence Variant - variant of - Protein: variants only point to Genes, so
    # SV - v - Gene - encodes - Protein is used to get the relationship.
    # Qualifiers in the Protein definition handle the Gene-encodes-Protein part,
    # while `target` handles the fact that variants point to genes.
    if target:
        o_qname = to_query_name(target)

    # The actual edge we're interested in
    query_text += "\n\n    # Edge of interest {s} {p} {o}".format(s=s_name, p=p_name, o=o_name)

    # Some edges are bi-directional, e.g. drug-used-for-treatment and
    # medical-condition-treated: get edges in either direction
    rev_id = edge_info[p_name].get('rev_id', None)
    if rev_id is not None:
        query_text += "\n    {{ {s} wdt:{p} {o} }}".format(s=s_qname, p=edge_info[p_name]['id'], o=o_qname)
        query_text += "\n    UNION {{ {s} wdt:{p} {o} }}".format(s=o_qname, p=rev_id, o=s_qname)
    else:
        query_text += "\n    {s} wdt:{p} {o} ".format(s=s_qname, p=edge_info[p_name]['id'], o=o_qname)

    # Make sure labels return in English
    # (the placeholder is spelled [AUTO_LANGUAGE]; it was previously misspelled)
    query_text += '\n\n    SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en" }\n}'

    return query_text
    
    

In [9]:
def generate_qualifier(qual, return_types=None):
    """Render one qualifier dict as SPARQL lines.

    Parameters
    ----------
    qual : dict
        Qualifier with keys 's' (subject), 'p' (property ID), 'o' (object)
        and 'not' (True to exclude matches via FILTER NOT EXISTS).
        's' / 'o' may be a node-type name from ``node_info``, a raw Wikidata
        ID, or None (rendered as the throw-away variable ``?na``).
    return_types : collection of str, optional
        Node-type names that are already typed elsewhere in the query; no
        typing line is generated for them. Defaults to an empty set.

    Returns
    -------
    str
        The SPARQL lines, each starting with a newline and four-space indent.
    """
    # Previously a None default crashed on the `not in` check below
    if return_types is None:
        return_types = set()

    node_q_names = {}

    for node in ['s', 'o']:
        node_name = qual[node]
        if node_name in node_info:
            node_q_names[node] = to_query_name(node_name)
        elif node_name is None:
            node_q_names[node] = '?na'
        else:
            node_q_names[node] = 'wd:' + node_name

    qual_out = ''

    # If we have true node-types, we need to ensure they're typed in the query
    for semtype, qname in node_q_names.items():
        node_name = qual[semtype]
        # Only real node types get a typing line; raw IDs and the ?na
        # placeholder do not. (Membership test replaces the former
        # `qname is not '?na'` identity comparison on a string literal.)
        if node_name not in node_info or node_name in return_types:
            continue
        n_id = node_info[node_name]['id']
        # Node types without a Q-ID are defined by their edges only, so there
        # is nothing to type against (avoids emitting `wd:None`)
        if n_id:
            qual_out += generate_instance_tag(qname, n_id)

    # Negative qualifiers need to filter out edges
    if qual['not']:
        qual_out += '\n    FILTER NOT EXISTS {{ {s} wdt:{p} {o} }} .'.format(s=node_q_names['s'],
                                                                            p=qual['p'], o=node_q_names['o'])
    # Otherwise add the edges
    else:
        qual_out += '\n    {s} wdt:{p} {o} .'.format(s=node_q_names['s'],
                                                    p=qual['p'], o=node_q_names['o'])

    return qual_out

In [10]:
# Metaedges to query. Each entry is the keyword arguments for
# build_query_from_abbrev: the metaedge abbreviation and, where the edge
# has to be routed through another node type, that type as 'target'.
graph_edge_info = [
    dict(abbrev='CdiC'),
    dict(abbrev='CtD'),
    dict(abbrev='PPaiC'),
    dict(abbrev='CHhcC'),
    dict(abbrev='PWhpC'),
    dict(abbrev='PiwC'),
    dict(abbrev='VntC'),
    dict(abbrev='VptC'),
    dict(abbrev='DaP', target='Gene'),
    dict(abbrev='DaG'),
    dict(abbrev='DsyS'),
    dict(abbrev='DmsMS'),
    dict(abbrev='CHsyS'),
    dict(abbrev='CHsyD'),
    dict(abbrev='VndD'),
    dict(abbrev='VpdD'),
    dict(abbrev='VvP', target='Gene'),
    dict(abbrev='VvG'),
    dict(abbrev='PWhpP', target='Gene'),
    dict(abbrev='PWhpG'),
    dict(abbrev='PccCC'),
    dict(abbrev='PbpBP'),
    dict(abbrev='PmfMF'),
    dict(abbrev='PhpPD'),
    dict(abbrev='PhpSS'),
    dict(abbrev='PsPF'),
    dict(abbrev='PhpBS'),
    dict(abbrev='PhpAS'),
    dict(abbrev='PhpSM'),
    dict(abbrev='PPtaD'),
    dict(abbrev='CrCR'),
    dict(abbrev='DlA'),
    dict(abbrev='CHafA'),
]

In [11]:
# One SPARQL query per metaedge, in the same order as graph_edge_info
queries = []
for g_edge in graph_edge_info:
    queries.append(build_query_from_abbrev(**g_edge))

## Run the queries

In [12]:
import functools
from wikidataintegrator.wdi_core import WDItemEngine
from tqdm import tqdm

# SPARQL endpoint URL that the edge queries are sent to
endpoint='http://avalanche.scripps.edu:9999/bigdata/sparql'

def parse_result_uris(result):
    """Reduce Wikidata entity URIs in a result frame to their bare IDs.

    Every column whose name does not contain 'Label' is scanned; values that
    are strings starting with 'http://www.wikidata.org/entity' are replaced
    by the text after their last '/'. Other values (non-URI strings, missing
    values, non-strings) are left untouched.

    Parameters
    ----------
    result : pandas.DataFrame
        Query result. Its non-label columns are replaced in place.

    Returns
    -------
    pandas.DataFrame
        A new frame with duplicate rows dropped.
    """
    uri_prefix = 'http://www.wikidata.org/entity'

    def _strip_uri(value):
        # Row-wise rsplit keeps the final path segment whatever the URI depth,
        # and tolerates NaN / non-string cells that broke the .str-based mask
        if isinstance(value, str) and value.startswith(uri_prefix):
            return value.rsplit('/', 1)[-1]
        return value

    for c in list(result.columns):
        if 'Label' not in c:
            # Whole-column assignment instead of chained `result[c][idx] = ...`
            result[c] = result[c].map(_strip_uri)
    return result.drop_duplicates()

# Pre-bind the endpoint and DataFrame output on WDItemEngine.execute_sparql_query,
# so callers only need to pass the query text
query_func = functools.partial(WDItemEngine.execute_sparql_query, endpoint=endpoint, as_dataframe=True)

def execute_sparql_query(query_text):
    """Run a query and return its de-duplicated result in SELECT order.

    The column order is taken from the first line of `query_text`: every
    token that follows a ' ?' on that line is treated as a column name.
    """
    select_line = query_text.split('\n')[0]
    # Enforce the proper column order
    col_order = select_line.split(' ?')[1:]
    raw_result = query_func(query_text)
    return parse_result_uris(raw_result)[col_order]

In [14]:
# Run every query; results are keyed by the metaedge abbreviation
res = {}
for e, q in tqdm(zip(graph_edge_info, queries), total=len(graph_edge_info)):
    res[e['abbrev']] = execute_sparql_query(q)

100%|██████████| 33/33 [02:43<00:00,  1.66s/it]


In [15]:
# Total number of rows retrieved, summed over all edge types
'{:,} raw edges'.format(sum(map(len, res.values())))

'460,735 raw edges'

## Two potential issues to look into...

1. Self-referential edges...  
    These edges may contain `node1 - node2` edges as well as `node2 - node1` edges
    
2. Multi- or mis-typed nodes...  
    Some node IDs may show up as multiple types... especially chemical roles and chemical compounds.

In [16]:
# node_ids:   node type (query-variable name) -> set of ids seen for it
# id_to_name: id -> label returned by the query
# self_ref:   abbreviations of edges whose two ends are the same node type
node_ids = {}
id_to_name = {}
self_ref = set()

for edge_abbrev, frame in res.items():
    id_columns = [col for col in frame.columns if not col.endswith('Label')]

    for id_col in id_columns:
        # A trailing 1 / 2 marks the two ends of a self-referential edge
        if id_col.endswith(('1', '2')):
            node_type = id_col[:-1]
            self_ref.add(edge_abbrev)
        else:
            node_type = id_col

        node_ids.setdefault(node_type, set()).update(frame[id_col])
        id_to_name.update(frame.set_index(id_col)[id_col + 'Label'].to_dict())
    

### Fix the Self-Ref edges where forward and backward edges may be duplicated

In [17]:
# Number of edge types that have a result frame
len(res.keys())

33

In [18]:
# Inspect the raw result frame of the 'VvP' query
res['VvP']

Unnamed: 0,sequence_variant,sequence_variantLabel,protein,proteinLabel
0,Q32965028,ABL1 BCR-ABL F311L,Q587961,"ABL proto-oncogene 1, non-receptor tyrosine ki..."
1,Q32965040,ABL1 BCR-ABL H396P,Q587961,"ABL proto-oncogene 1, non-receptor tyrosine ki..."
2,Q29938707,ABL1 BCR-ABL F359C,Q587961,"ABL proto-oncogene 1, non-receptor tyrosine ki..."
3,Q29938716,ABL1 BCR-ABL V299L,Q587961,"ABL proto-oncogene 1, non-receptor tyrosine ki..."
4,Q29938702,ABL1 BCR-ABL E255V,Q587961,"ABL proto-oncogene 1, non-receptor tyrosine ki..."
5,Q32964912,ABL1 E450G,Q587961,"ABL proto-oncogene 1, non-receptor tyrosine ki..."
6,Q32965282,ABL1 M237V,Q587961,"ABL proto-oncogene 1, non-receptor tyrosine ki..."
7,Q32965302,ABL1 E281K,Q587961,"ABL proto-oncogene 1, non-receptor tyrosine ki..."
8,Q32965315,ABL1 A365V,Q587961,"ABL proto-oncogene 1, non-receptor tyrosine ki..."
9,Q32964949,ABL1 L298V,Q587961,"ABL proto-oncogene 1, non-receptor tyrosine ki..."


In [19]:
# De-duplicated versions of the undirected self-referential edges,
# keyed by edge abbreviation
fixed = dict()

for kind in tqdm(self_ref):

    # No need to worry about forward vs reverse in directed edges
    if '>' in kind or '<' in kind:
        continue

    # Only look at 1 kind of edge at a time
    this_edges = res[kind]
    col_names = this_edges.columns

    # Columns are (start id, start label, end id, end label), so the ids sit
    # at positions 0 and 2. Sorting each pair puts the lowest ID first:
    #     if both 'Q00001 -- Q00002' and 'Q00002 -- Q00001' exist, they are
    #     effectively standardized to 'Q00001 -- Q00002'.
    # The outer sort fixes the row order, which otherwise followed set
    # iteration order and could change from one kernel run to the next.
    edge_ids = sorted({
        tuple(sorted(pair))
        for pair in zip(this_edges.iloc[:, 0], this_edges.iloc[:, 2])
    })

    start_ids = [start for start, _ in edge_ids]
    end_ids = [end for _, end in edge_ids]

    fixed[kind] = pd.DataFrame({
        col_names[0]: start_ids,
        col_names[1]: [id_to_name[start] for start in start_ids],
        col_names[2]: end_ids,
        col_names[3]: [id_to_name[end] for end in end_ids]})

100%|██████████| 1/1 [00:00<00:00, 114.88it/s]


In [20]:
# Row counts for 'CdiC': raw query result vs. de-duplicated frame in `fixed`
len(res['CdiC']), len(fixed['CdiC'])

(1807, 1105)

In [21]:
# Total node count summed over node types; an id that appears under several
# types is counted once per type, so this is not yet the number of unique ids
'{:,} nodes (possibly sume duplicats across type)'.format(sum(len(v) for v in node_ids.values()))

'66,677 nodes (possibly sume duplicats across type)'

In [22]:
# Number of distinct ids per node type
for node_type, ids in node_ids.items():
    print(node_type, ':', len(ids))

compound : 8087
disease : 6205
pharmaceutical_product : 2305
chemical_hazard : 689
biological_pathway : 563
protein : 22742
sequence_variant : 1758
gene : 215
symptom : 253
medical_specialty : 69
cellular_component : 1729
biological_process : 12126
molecular_function : 4267
protein_domain : 4296
supersecondary_structure : 455
protein_family : 16
binding_site : 55
active_site : 89
structural_motif : 108
chemical_role : 494
anatomical_structure : 156


In [23]:
# Union of the ids of every node type
total = set().union(*node_ids.values())
'{:,} unique nodes ids'.format(len(total))

'66,673 unique nodes ids'

In [24]:
# Difference between the per-type id count and the number of unique ids
'{:,} total nodes with multiple types'.format(sum(len(v) for v in node_ids.values()) - len(total))

'4 total nodes with multiple types'

## Format nodes into Hetnet

In [25]:
# Build one (id, label, name) frame per node type, then stack them
node_frames = []
for node_type, ids in node_ids.items():
    id_list = list(ids)
    type_frame = pd.DataFrame({'id': id_list, 'label': [node_type] * len(id_list)})
    type_frame['name'] = type_frame['id'].map(id_to_name)
    node_frames.append(type_frame)
nodes = pd.concat(node_frames).reset_index(drop=True)

In [26]:
# Show every row whose id occurs more than once in the node table
nodes[nodes.duplicated(keep=False, subset=['id'])].sort_values('id')

Unnamed: 0,id,label,name
5545,Q127060,compound,folic acid
66483,Q127060,chemical_role,folic acid
12187,Q408089,disease,mercury poisoning
16826,Q408089,chemical_hazard,mercury poisoning
1883,Q414964,compound,Vasoactive intestinal peptide
31845,Q414964,protein,Vasoactive intestinal peptide
11420,Q7860879,disease,type IV hypersensitivity
47860,Q7860879,biological_process,type IV hypersensitivity


In [27]:
# Query-variable name (without the leading '?') -> original node-type name
label_map = {}
for node_type in node_info:
    label_map[to_query_name(node_type)[1:]] = node_type

In [28]:
# Convert query-variable labels to node-type names via label_map.
# Labels that are not keys of label_map are kept as they are, so re-running
# this cell after the conversion no longer turns every label into NaN.
nodes['label'] = nodes['label'].map(lambda label: label_map.get(label, label))

In [29]:
# Preview the first rows of the node table
nodes.head(10)

Unnamed: 0,id,label,name
0,Q3429577,Compound,Rhodamine 123
1,Q474880,Compound,(21R)-argatroban
2,Q5282570,Compound,Disperse Orange 1
3,Q27278051,Compound,transclopenthixol
4,Q27114806,Compound,metamizole sodium hydrate
5,Q5986473,Compound,icilin
6,Q368222,Compound,azapropazone
7,Q3698301,Compound,Cromakalim
8,Q27108895,Compound,2-hydroxyglutarate(2-)
9,Q5065694,Compound,3-[4-[[4-(aminomethyl)cyclohexyl]-oxomethoxy]p...


## Format Edges into Hetnet

In [30]:
# Build one (start_id, end_id, type) frame per edge type, then stack them
edge_frames = []

for abbrev, frame in res.items():
    # Prefer the de-duplicated frame when one was built for this edge
    frame = fixed.get(abbrev, frame)

    id_cols = [col for col in frame.columns if not col.endswith('Label')]
    renames = {id_cols[0]: 'start_id', id_cols[1]: 'end_id'}

    # Replace Proteins with Genes, to merge the protein and gene metanodes
    parsed = gt.parse_edge_abbrev(abbrev)
    if 'P' in parsed:
        parsed = list(parsed)
        parsed[parsed.index('P')] = 'G'
        abbrev = ''.join(parsed)

    frame = frame[id_cols].rename(columns=renames)
    # Edge type is '<FULL_EDGE_NAME>_<abbreviation>'
    frame['type'] = edge_abv_to_full[parsed[1]] + '_' + abbrev

    edge_frames.append(frame)

edges = pd.concat(edge_frames).reset_index(drop=True)

In [31]:
# Number of edges per edge type
edges['type'].value_counts()

BIOLOGICAL_PROCESS_GbpBP               161545
CELL_COMPONENT_GccCC                    99926
MOLECULAR_FUNCTION_GmfMF                87111
HAS_PART_GhpPD                          30490
HAS_PART_PWhpG                          24882
HAS_ROLE_CrCR                            9933
MEDICAL_SPECIALTY_DmsMS                  6531
MEDICAL_CONDITION_TREATED_CtD            5213
HAS_PART_GhpSS                           4130
PHYSICALLY_INTERACTS_WITH_GiwC           3645
SYMPTOMS_CHsyS                           3450
HAS_PART_PWhpC                           3384
GENETIC_ASSOCIATION_DaG                  3043
BIOLOGICAL_VARIANT_OF_VvG                2528
HAS_PART_GhpSM                           2238
HAS_ACTIVE_INGREDIENT_PPaiC              2166
SYMPTOMS_CHsyD                           1481
AFFLICTS_CHafA                           1170
THERAPEUTIC_AREA_PPtaD                   1124
SIGNIFICANT_DRUG_INTERACTION_CdiC        1105
HAS_PART_GhpAS                           1046
HAS_PART_GhpBS                    

In [32]:
# Merge the Protein node type into Gene in the node table
is_protein = nodes['label'] == 'Protein'
nodes.loc[is_protein, 'label'] = 'Gene'

In [33]:
# Number of nodes per label
nodes['label'].value_counts()

Gene                         22957
Biological Process           12126
Compound                      8087
Disease                       6205
Protein Domain                4296
Molecular Function            4267
Pharmaceutical Product        2305
Sequence Variant              1758
Cellular Component            1729
Chemical Hazard                689
Biological Pathway             563
Chemical Role                  494
Super-Secondary Structure      455
Symptom                        253
Anatomical Structure           156
Structural Motif               108
Active Site                     89
Medical Specialty               69
Binding Site                    55
Protein Family                  16
Name: label, dtype: int64

In [34]:
# Ids that occur in more than one row of the node table
has_repeated_id = nodes.duplicated(subset=['id'], keep=False)
duplicated_nodes = nodes.loc[has_repeated_id, 'id'].unique()

In [35]:
# Display the ids that occur in more than one row
duplicated_nodes

array(['Q414964', 'Q127060', 'Q7860879', 'Q408089'], dtype=object)

In [36]:
# Combine the node and edge tables with the graph_tools helper.
# NOTE(review): presumably this attaches start/end names and labels to each
# edge row -- confirm against graph_tools.combine_nodes_and_edges
combo = gt.combine_nodes_and_edges(nodes, edges)

In [37]:
# Keep the first row of each edge type as an example
combo.drop_duplicates(subset='type')

Unnamed: 0,start_id,end_id,type,start_name,end_name,start_label,end_label
0,Q18216,Q414275,SIGNIFICANT_DRUG_INTERACTION_CdiC,aspirin,tolbutamide,Compound,Compound
1105,Q408535,Q131755,MEDICAL_CONDITION_TREATED_CtD,quetiapine,bipolar disorder,Compound,Disease
6318,Q47521690,Q212272,HAS_ACTIVE_INGREDIENT_PPaiC,Hydrea,hydroxyurea,Pharmaceutical Product,Compound
8484,Q21174183,Q416728,HAS_CAUSE_CHhcC,Diethylenetriamine exposure,diethylenetriamine,Chemical Hazard,Compound
9081,Q44015511,Q190012,HAS_PART_PWhpC,Methionine De Novo and Salvage Pathway,adenosine,Biological Pathway,Compound
12465,Q21132915,Q265352,PHYSICALLY_INTERACTS_WITH_GiwC,Carbonic anhydrase 7,ethoxzolamide,Gene,Compound
16110,Q28546459,Q18936,NEGATIVE_THERAPEUTIC_PREDICTOR_VntC,RB1 OVEREXPRESSION,doxorubicin,Sequence Variant,Compound
16651,Q28531486,Q32089,POSITIVE_THERAPEUTIC_PREDICTOR_VptC,TSC1 FRAMESHIFT TRUNCATION,Sirolimus,Sequence Variant,Compound
17372,Q1476525,Q21117791,GENETIC_ASSOCIATION_DaG,lipid metabolism disorder,"Carnitine O-palmitoyltransferase 1, liver isoform",Disease,Gene
20415,Q4454701,Q186889,SYMPTOMS_DsyS,Taeniasis saginata,nausea,Disease,Symptom


In [84]:
# Chosen label for each id that showed up under more than one label
resolved_multiples = dict(
    Q414964='Gene',
    Q127060='Compound',
    Q7860879='Biological Process',
    Q408089='Chemical Hazard',
)