In [1]:
# create CheBI Graph using a saved local chebi file
# this will ensure the reproducilibility of chebi ontology graph
# possibly.. also using bioservices
# from bioservices import *

# Create CHEIB-G (Last updated Apr. 29, 2022)
## 1. Graph with only primary IDs
## 2. Dictionary that maps all secondary IDs to primary ids
## 3. Dictionary: chebi to full formula 
## 4. Dictionary: chebi to shortened formula

In [1]:
import collections
import compress_pickle
import networkx as nx
import os
import owlready2
import pickle
import re
import xmltodict



In [2]:
DATA_DIR = '/Users/woosubs/Desktop/AutomateAnnotation/DATA'
CHEBI_DIR = os.path.join(DATA_DIR, "chebi")

In [3]:
# onto = owlready2.get_ontology("chebi.owl").load()

In [4]:
# onto.save("chebi.xml")

In [5]:
contents = open("chebi.xml").read()
ch = xmltodict.parse(contents)
chebis = ch['rdf:RDF']['owl:Class']
len(chebis)

178349

In [6]:
chebis[1]['oboI:id']

OrderedDict([('@rdf:datatype', 'http://www.w3.org/2001/XMLSchema#string'),
             ('#text', 'CHEBI:18357')])

In [7]:
# with open(os.path.join(CHEBI_DIR, 'chebis.pickle'), 'wb') as handle:
#     pickle.dump(chebis, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [8]:
## TODO: chebi ID: Chebi Label (name)
## mapping is done only with primary ids
chebi2label = dict()
for one_chebi in chebis:
  if 'oboI:id' in one_chebi:
    primary_id = one_chebi['oboI:id']['#text']
    chebi_label = one_chebi['rdfs:label']['#text']
    chebi2label[primary_id] = chebi_label

In [9]:
compress_pickle.dump(chebi2label, 'chebi2label_18jan2023.lzma',
                     compression="lzma", set_default_extension=False)

In [10]:
# map secondary ID to primary ID
second2prime_dict = dict()
for one_chebi in chebis:
  if 'oboI:id' in one_chebi:
    primary_id = one_chebi['oboI:id']['#text']
    if 'oboI:hasAlternativeId' in one_chebi:
      # if there is only one secondary ID, it produces an ordered dict
      if isinstance(one_chebi['oboI:hasAlternativeId'], collections.OrderedDict):
        secondary_id = []
        secondary_id = secondary_id.append(one_chebi['oboI:hasAlternativeId']['#text'])
      # if there is more than one secondary ID, you get a list; treat properly
      else:
        secondary_id = [val['#text'] for val in one_chebi['oboI:hasAlternativeId']]
    else:
      secondary_id = None
    second2prime_dict[primary_id] = primary_id
    if secondary_id:
      for one_secondary in secondary_id:
        second2prime_dict[one_secondary] = primary_id

In [11]:
len(second2prime_dict)

176629

In [12]:
# save the dictionary and graph
with open(os.path.join(CHEBI_DIR, 'chebi_second2prime_29apr2022.pickle'), 'wb') as handle:
    pickle.dump(second2prime_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

## First of all, this version assumes that every CHEBI term has been mapped into its primary form

In [13]:
def getCHEBIEdges(inp_one_chebi):
  """
  Parse ordered dictionary of a chebi term
  and extract a list of (children, parent) pairs. 
  Result will get directly fed into a
  networkx object.
  
  Currently the relationship is restricted to 
  'is' relationship, so 'owl:Restriction' qualifiers
  will not be considered. 
  
  If no such is-pair exists, return None. 
  
  Paremters
  ---------
  inp_one_chebi: collections.OrderedDict
  
  Returns
  -------
  edge_mapper: list-str/None
  """
  # if an edge is detected, flag will be True
  flag = False
  if 'oboI:id' in inp_one_chebi and 'rdfs:subClassOf' in inp_one_chebi:
    # if list, check whether terms with '@rdf:resource' key exists
    if isinstance(inp_one_chebi['rdfs:subClassOf'], list):
      filtered_terms = [val['@rdf:resource'] for val in inp_one_chebi['rdfs:subClassOf'] \
                     if '@rdf:resource' in val.keys()]
      # if at least one term with a regular parent exists, create edge
      if len(filtered_terms) > 0:
        edge_tos = [val.split('/')[-1].replace('_', ':') \
                    for val in filtered_terms]
        flag = True
    # if not list (i.e., orderedDict, determine if '@rdf:resource' is given
    elif '@rdf:resource' in inp_one_chebi['rdfs:subClassOf']: 
      edge_tos = [inp_one_chebi['rdfs:subClassOf']['@rdf:resource'].split('/')[-1].replace('_', ':')]
      flag = True
  # if flag=True, edge_tos should have been already created
  if flag:
    edge_from = [inp_one_chebi['oboI:id']['#text']]
    edge_mapper = list(zip(edge_from*len(edge_tos), edge_tos))
    return edge_mapper
  else:
    return None
#                   for val in inp_one_chebi['rdfs:subClassOf'] \
#       edge_tos = [val['@rdf:resource'].split('/')[-1].replace('_', ':') \
#                   for val in inp_one_chebi['rdfs:subClassOf'] \
#                   if '@rdf:resource' in val]

In [14]:
getCHEBIEdges(one_chebi)

[('CHEBI:99999', 'CHEBI:24995'), ('CHEBI:99999', 'CHEBI:52898')]

In [15]:
# tested a few caes :) 
# for one_chebi in chebis:
#   res = getCHEBIEdges(one_chebi)
#   if res is None and 'oboI:id' in one_chebi:
#     print("======================================")
#     print("found :) ")
#     print(one_chebi)
#     print("======================================")
#     print()
#   else:
#     print(getCHEBIEdges(one_chebi))

In [16]:
g = nx.DiGraph()
for one_chebi in chebis:
  chebi_edges = getCHEBIEdges(one_chebi)
  if chebi_edges is not None:
    g.add_edges_from(chebi_edges)

In [17]:
# ## original code :) 
# g = nx.DiGraph()
# for one_chebi in chebis:
#   if 'oboI:id' in one_chebi and 'rdfs:subClassOf' in one_chebi:
#     edge_from = [one_chebi['oboI:id']['#text']]
#     if isinstance(one_chebi['rdfs:subClassOf'], list):
#       edge_tos = [val['@rdf:resource'].split('/')[-1].replace('_', ':') for val in one_chebi['rdfs:subClassOf'] if\
#                  '@rdf:resource' in val]
#     else:
#       edge_tos = [one_chebi['rdfs:subClassOf']['@rdf:resource'].split('/')[-1].replace('_', ':')]
#     edge_mapper = list(zip(edge_from*len(edge_tos), edge_tos))
#     g.add_edges_from(edge_mapper)

In [18]:
g.number_of_edges()

231283

In [19]:
'CHEBI:24431' in g.nodes

True

In [20]:
with open(os.path.join(CHEBI_DIR, 'chebi_primary_graph_full_30apr2022.pickle'), 'wb') as handle:
    pickle.dump(g, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Below is another, comprehensive version of mapping CHEBI -> Formula
# Previous mapper was just confined within RHEA-components
# This one will include all CHEBI terms with formula

In [21]:
# just get the first of the formulae if multiple are given. 
#######################################
## In case multiple formulae are given, 
## just return the list of formula-str
#######################################
chebi_full_formula = dict()
for one_chebi in chebis:
  if 'oboI:id' in one_chebi and 'cheb:formula' in one_chebi:
    if isinstance(one_chebi['cheb:formula'], list):
      chebi_formula = [val['#text'] for val in one_chebi['cheb:formula']]
    else:
      chebi_formula = one_chebi['cheb:formula']['#text']
    chebi_full_formula[one_chebi['oboI:id']['#text']] = chebi_formula

In [22]:
len(chebi_full_formula)

148331

In [23]:
chebi_full_formula['CHEBI:189081']

'C41H77NO7'

In [24]:
with open(os.path.join(CHEBI_DIR, 'chebi_full_formula_30apr2022.pickle'), 'wb') as handle:
    pickle.dump(chebi_full_formula, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [122]:
# one_chebi['cheb:formula']

In [114]:
# re.findall('[A-Za-z]+','(C5H7O4R)n.(C5H8O4)n')

['C', 'H', 'O', 'R', 'n', 'C', 'H', 'O', 'n']

In [25]:
# one example; multiple formulae minus R
print(one_chebi['oboI:id']['#text'])
print(one_chebi['cheb:formula'])

CHEBI:99999
OrderedDict([('@rdf:datatype', 'http://www.w3.org/2001/XMLSchema#string'), ('#text', 'C25H30N4O4S')])


In [26]:
def findCHEBITerm(search_term, inp_chebis=chebis):
  """
  List of CHEBI terms, where
  CHEBI terms are given as ordered dictionary.
  
  Paramters
  ---------
  inp_chebis: list-orderedDict
  
  Returns
  -------
  : orderedDict
  """
  for one_chebi in inp_chebis:
    if 'oboI:id' in one_chebi:
      chebi_id = one_chebi['oboI:id']['#text']
      if search_term == chebi_id:
        return one_chebi
  return None

In [27]:
def removeAtom(input_formula, atoms_to_remove):
  """
  Remove a list of Atoms
  from the string formula.
  Primary goal is to remove
  H and D (heavy hydrogen). 
  :param str input_formula:
  :param list-str atoms_to_remove:
  :return str:
  """
  idx_to_remove = []
  letters = re.findall('[A-Z][a-z]?|\d+|.', input_formula)
  for one_letter in atoms_to_remove:
    if one_letter in letters:
      one_idx = letters.index(one_letter)
      idx_to_remove.append(one_idx)
      # if one_idx is the last element, pass
      if len(letters) == (one_idx+1):
        pass
      elif letters[one_idx+1].isdigit():
        idx_to_remove.append(one_idx+1)
  res = [val for idx, val in enumerate(letters) if idx not in idx_to_remove]
  return "".join(res)
      
#       found_val = letters[one_id  

In [28]:
one_m = 'A2B4Hg2H10S9D2'
removeAtom(one_m, ['H', 'D'])

'A2B4Hg2S9'

In [29]:
removeAtom('HD3', ['H', 'D'])

''

In [30]:
def getCHEBIOriginalFormula(one_info):
  """
  Extract chemical formula
  and return it or None.
  Original formula conserves all information
  given as formula. 
  param: collections.OrderedDict one_info:
  return: str/None
  """
  if 'cheb:formula' in one_info:
    if isinstance(one_info['cheb:formula'], list):
      res = [val['#text'] for val in one_info['cheb:formula']]
    else:
      res = one_info['cheb:formula']['#text']
  else:
    res = None
  return res

In [31]:
# We will be conservative, will choose all formulas mapped from each CHEBI IDs
def getShortenedCHEBIFormula(one_info):
  """
  Extract chemical formula
  and return it or None.
  Formulas that contain n, ., R (radical)
  are not considered. 
  
  Parameters
  ----------
  one_info: collections.OrderedDict
  
  Returns
  -------
  return: str/None
  """
  if 'cheb:formula' in one_info:
    # if multiple formulae are given, choose the shortest (rare situation, imo)
    if isinstance(one_info['cheb:formula'], list):
      form = sorted([val['#text'] for val in one_info['cheb:formula']], key=len)[0]
    else:
      form = one_info['cheb:formula']['#text']
##############################################
#### Removed Below#############################
#     if 'n' in form or '.' in form or 'R' in form:
#       res = None
#     else:
#       # Otherwise, return after removing 'H' and 'D'
##############################################    
    
    res = removeAtom(input_formula=form, atoms_to_remove=['H', 'D'])
    # if res is only composed of 'H' or 'D'. assign 'H'
    if res == '':
      res = 'H'
  else:
    res = None
  return res

In [32]:
chebis[0]

OrderedDict([('@rdf:about', 'http://purl.obolibrary.org/obo/CHEBI_1'),
             ('obo:IAO_0000231',
              OrderedDict([('@rdf:resource',
                            'http://purl.obolibrary.org/obo/IAO_0000227')])),
             ('obo:IAO_0100001',
              OrderedDict([('@rdf:resource',
                            'http://purl.obolibrary.org/obo/CHEBI_18357')])),
             ('owl:deprecated',
              OrderedDict([('@rdf:datatype',
                            'http://www.w3.org/2001/XMLSchema#boolean'),
                           ('#text', 'true')]))])

In [33]:
res_chebi = findCHEBITerm('CHEBI:16136')
s = getShortenedCHEBIFormula(res_chebi)
print(s)

S


In [34]:
all_shortened_chebi_to_formula = {one_chebi['oboI:id']['#text']:getShortenedCHEBIFormula(one_chebi) for\
                                  one_chebi in chebis if 'oboI:id' in one_chebi}
len(all_shortened_chebi_to_formula)

159840

In [35]:
chebi_shortened_formula = dict()
for one_chebi in chebis:
  if 'oboI:id' in one_chebi:
    one_shortened_formula = getShortenedCHEBIFormula(one_chebi)
    if one_shortened_formula is not None:
      chebi_shortened_formula[one_chebi['oboI:id']['#text']] = one_shortened_formula

In [36]:
chebi_shortened_formula['CHEBI:16136']

'S'

In [37]:
print(len(chebi_shortened_formula))

148331


In [38]:
with open(os.path.join(CHEBI_DIR, 'chebi_shortened_formula_30apr2022.pickle'), 'wb') as handle:
    pickle.dump(chebi_shortened_formula, handle, protocol=pickle.HIGHEST_PROTOCOL)

## In chebi_full_formula, some have (624) multiple formulas aas list; 
## only the first are chosen in this case

In [69]:
def getOneFormula(one_f):
  """
  Get formula from full formula items.
  If item is str, just return it; 
  If item is list, sort and return the shortest
  """
  if isinstance(one_f, str):
    return one_f
  elif isinstance(one_f, list):
    return sorted(one_f)[0]
full_unique_formulas = set([getOneFormula(chebi_full_formula[k]) for k in chebi_full_formula.keys()])
len(full_unique_formulas)

49260

In [70]:
len(chebi_full_formula)

148331

In [71]:
shortened_unique_formulas = set([getOneFormula(chebi_shortened_formula[k]) for k in chebi_shortened_formula.keys()])
len(shortened_unique_formulas)

22600

In [66]:
len()

148331