# Create ChEBI reference files

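1. REF_CHEBI2LABEL: dictionary mapping each primary ChEBI ID to its label (name)
2. REF_CHEBI2FORMULA: dictionary mapping each primary ChEBI ID to its shortened formula (H and D removed)
3. CHARCOUNT_COMB_DF: dataframe of scaled (L2-normalized) character counts for each ChEBI synonym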

In [1]:
import collections
import gzip
import os
import re
import shutil
import string
import urllib.request

import compress_pickle
import networkx as nx
import numpy as np
import owlready2
import pandas as pd
import xmltodict
# Directory holding the downloaded ChEBI files and the generated reference files.
# Set the AMAS_DATA_DIR environment variable to use another location; the
# original hard-coded path remains the default.
DATA_DIR = os.environ.get('AMAS_DATA_DIR', '/Users/luna/Desktop/CRBM/AMAS_proj/Data')

## Data
Data are downloaded from the ChEBI website: https://www.ebi.ac.uk/chebi/downloadsForward.do.   
The ontology file (.owl.gz) is obtained from: https://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz.

In [38]:
# Download chebi.owl.gz and decompress it next to the archive
chebi_owl_url = "https://ftp.ebi.ac.uk/pub/databases/chebi/ontology/chebi.owl.gz"
chebi_owl_file = os.path.join(DATA_DIR, "chebi.owl.gz")
urllib.request.urlretrieve(chebi_owl_url, chebi_owl_file)
# Stream the decompressed bytes in chunks so the whole ontology is never
# held in memory at once.
with gzip.open(chebi_owl_file, 'rb') as f_in, \
        open(os.path.join(DATA_DIR, "chebi.owl"), 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# convert chebi.owl to chebi.xml
onto = owlready2.get_ontology(os.path.join(DATA_DIR, "chebi.owl")).load()
onto.save(os.path.join(DATA_DIR, "chebi.xml"))

In [2]:
# Parse the XML ontology into nested dicts and keep the owl:Class entries.
# The file is read inside a context manager so the handle is closed promptly,
# and it is decoded explicitly as UTF-8 instead of the locale default.
with open(os.path.join(DATA_DIR, "chebi.xml"), encoding="utf-8") as xml_file:
    contents = xml_file.read()
ch = xmltodict.parse(contents)
chebis = ch['rdf:RDF']['owl:Class']
len(chebis)

220816

In [13]:
# Inspect the 'oboI:id' element of one entry: a dict whose '#text' holds the ChEBI ID
chebis[1]['oboI:id']

{'@rdf:datatype': 'http://www.w3.org/2001/XMLSchema#string',
 '#text': 'CHEBI:10'}

## Create REF_CHEBI2LABEL

In [15]:
## Map each primary ChEBI ID to its label (name).
## Entries without an 'oboI:id' element are skipped.
chebi2label = {}
for one_chebi in chebis:
  if 'oboI:id' not in one_chebi:
    continue
  primary_id = one_chebi['oboI:id']['#text']
  chebi2label[primary_id] = one_chebi['rdfs:label']['#text']

In [17]:
# Number of primary ChEBI IDs that received a label
len(chebi2label)

202209

In [18]:
# Persist the ID-to-label dictionary as an lzma-compressed pickle
chebi2label_path = os.path.join(DATA_DIR, 'chebi2label_2jan2025.lzma')
compress_pickle.dump(chebi2label, chebi2label_path,
                     compression="lzma", set_default_extension=False)

## Create REF_CHEBI2FORMULA

In [28]:
# Collect the unshortened formula(e) of every primary ChEBI ID that has any.
# The stored value is a str when one formula is given and a list of str
# (all of them) when several are given.
chebi_full_formula = {}
for one_chebi in chebis:
  if 'oboI:id' not in one_chebi or 'cheb:formula' not in one_chebi:
    continue
  formula_node = one_chebi['cheb:formula']
  if isinstance(formula_node, list):
    chebi_formula = [val['#text'] for val in formula_node]
  else:
    chebi_formula = formula_node['#text']
  chebi_full_formula[one_chebi['oboI:id']['#text']] = chebi_formula

In [29]:
# Number of primary ChEBI IDs that have at least one formula
len(chebi_full_formula)

190461

In [32]:
def removeAtom(input_formula, atoms_to_remove):
  """
  Remove a list of Atoms
  from the string formula.
  Primary goal is to remove
  H and D (heavy hydrogen).

  The formula is split into element symbols (an uppercase letter with an
  optional lowercase letter), runs of digits, and any other single
  character. Every occurrence of a listed atom is dropped, together with
  the count that directly follows it, so multi-part formulae such as
  'C2H6O.H2O' are cleaned in all of their parts.

  :param str input_formula:
  :param list-str atoms_to_remove:
  :return str:
  """
  # Raw string: '\d' in a plain literal is an invalid escape sequence.
  tokens = re.findall(r'[A-Z][a-z]?|\d+|.', input_formula)
  targets = set(atoms_to_remove)
  kept = []
  # True while the previous token was a removed atom, i.e. a count that
  # comes next belongs to that atom and must be dropped as well.
  drop_count = False
  for token in tokens:
    if token in targets:
      drop_count = True
      continue
    if drop_count and token.isdigit():
      drop_count = False
      continue
    drop_count = False
    kept.append(token)
  return "".join(kept)

# Shortened formula of one ChEBI entry; when several formulae are listed,
# the shortest string is the one that gets shortened.
def getShortenedCHEBIFormula(one_info):
  """
  Return the chemical formula of an entry with H and D removed.

  Parameters
  ----------
  one_info: collections.OrderedDict
      One ontology entry; its formula is read from the 'cheb:formula' key.

  Returns
  -------
  str/None
      The shortened formula, 'H' when nothing is left after the removal,
      or None when the entry has no 'cheb:formula' key.
  """
  if 'cheb:formula' not in one_info:
    return None

  formula_node = one_info['cheb:formula']
  if isinstance(formula_node, list):
    # several formulae: order by length and take the shortest
    candidates = [val['#text'] for val in formula_node]
    candidates.sort(key=len)
    form = candidates[0]
  else:
    form = formula_node['#text']

  shortened = removeAtom(input_formula=form, atoms_to_remove=['H', 'D'])
  # a formula made up only of H and/or D shortens to '', which is stored as 'H'
  return shortened if shortened != '' else 'H'

# Shortened formula for every primary ChEBI ID that has a formula
chebi_shortened_formula = {}
for one_chebi in chebis:
  if 'oboI:id' not in one_chebi:
    continue
  one_shortened_formula = getShortenedCHEBIFormula(one_chebi)
  if one_shortened_formula is None:
    # entry without a formula
    continue
  chebi_shortened_formula[one_chebi['oboI:id']['#text']] = one_shortened_formula

print(len(chebi_shortened_formula))

190461


In [33]:
# Persist the shortened-formula dictionary as an lzma-compressed pickle
shortened_formula_path = os.path.join(DATA_DIR, 'chebi_shortened_formula_2jan2025.lzma')
with open(shortened_formula_path, 'wb') as handle:
    compress_pickle.dump(chebi_shortened_formula, handle,
                         compression="lzma", set_default_extension=False)

## Create CHARCOUNT_COMB_DF

In [3]:
# Gather every exact and related synonym of each primary ChEBI ID.
# IDs without any synonym are left out; the result is used to create
# the character count dataframe.
chebi2synonyms = {}
for one_chebi in chebis:
    if 'oboI:id' not in one_chebi:
        continue
    primary_id = one_chebi['oboI:id']['#text']
    synonyms = []
    # exact synonyms first, then related ones
    for synonym_key in ('oboI:hasExactSynonym', 'oboI:hasRelatedSynonym'):
        if synonym_key not in one_chebi:
            continue
        entries = one_chebi[synonym_key]
        # a single synonym comes as one dict, several as a list of dicts
        if not isinstance(entries, list):
            entries = [entries]
        synonyms.extend(entry['#text'] for entry in entries)
    if synonyms:
        chebi2synonyms[primary_id] = synonyms

len(chebi2synonyms)

151547

In [24]:
# Flatten chebi2synonyms into one {"chebi", "synonym"} record per synonym
# and load the records into a DataFrame
chebi2synonyms_expanded = [
    {"chebi": cid, "synonym": name}
    for cid, names in chebi2synonyms.items()
    for name in names
]

expanded_chebi_df = pd.DataFrame(chebi2synonyms_expanded)
expanded_chebi_df

Unnamed: 0,chebi,synonym
0,CHEBI:10,(+)-Atherospermoline
1,CHEBI:100,(-)-Medicarpin
2,CHEBI:100,(-)-medicarpin
3,CHEBI:100,"(6aR,11aR)-9-methoxy-6a,11a-dihydro-6H-[1]benz..."
4,CHEBI:100,medicarpin
...,...,...
385524,CHEBI:9986,Vinclozoline
385525,CHEBI:9986,rac-vinclozolin
385526,CHEBI:9986,racemic vinclozolin
385527,CHEBI:9996,Virgaureasaponin I


In [25]:
# Scaled character frequencies of a single synonym row
def scaled_char_frequency_row(row, char_columns):
    """
    Count the given characters in a synonym and scale the counts to unit length.

    The synonym is lower-cased before counting, so counting is case
    insensitive for lowercase entries of char_columns. Counts are divided
    by their L2 norm; when no listed character occurs, all counts are 0.

    :param row: mapping that provides the 'synonym' and 'chebi' entries
    :param char_columns: characters to count
    :return dict: one entry per character plus the 'chebi' and 'synonym'
                  values copied from row
    """
    text = str(row['synonym']).lower()
    raw_counts = {}
    for one_char in char_columns:
        raw_counts[one_char] = text.count(one_char)

    # Euclidean length of the count vector
    norm = np.sqrt(sum(n ** 2 for n in raw_counts.values()))
    if norm > 0:
        result = {one_char: n / norm for one_char, n in raw_counts.items()}
    else:
        # nothing counted: keep zeros rather than dividing by zero
        result = dict.fromkeys(raw_counts, 0)

    result['chebi'] = row['chebi']
    result['synonym'] = row['synonym']
    return result

# Define the characters for the first 36 columns
letters = list(string.ascii_lowercase)  # a-z
numbers = list(string.digits)  # 0-9
char_columns = letters + numbers

# Compute the scaled character frequencies, one record per synonym row.
# to_dict("records") hands each row over as a plain dict, which avoids the
# per-row Series that iterrows() constructs.
char_freq_data = [scaled_char_frequency_row(row, char_columns)
                  for row in expanded_chebi_df.to_dict("records")]
CHARCOUNT_COMB_DF = pd.DataFrame(char_freq_data)

# Persist the dataframe as an lzma-compressed pickle
with open(os.path.join(DATA_DIR, 'charcount_df_scaled_2jan2025.lzma'), 'wb') as handle:
    compress_pickle.dump(CHARCOUNT_COMB_DF, handle, compression="lzma", set_default_extension=False)

In [39]:
# Display the scaled character-count dataframe
CHARCOUNT_COMB_DF

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,...,2,3,4,5,6,7,8,9,chebi,synonym
0,0.196116,0.000000,0.000000,0.000000,0.588348,0.000000,0.000000,0.196116,0.196116,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,CHEBI:10,(+)-Atherospermoline
1,0.288675,0.000000,0.288675,0.288675,0.288675,0.000000,0.000000,0.000000,0.577350,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,CHEBI:100,(-)-Medicarpin
2,0.288675,0.000000,0.288675,0.288675,0.288675,0.000000,0.000000,0.000000,0.577350,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,CHEBI:100,(-)-medicarpin
3,0.306786,0.076696,0.153393,0.153393,0.230089,0.076696,0.000000,0.306786,0.076696,0.0,...,0.076696,0.153393,0.0,0.0,0.230089,0.0,0.0,0.076696,CHEBI:100,"(6aR,11aR)-9-methoxy-6a,11a-dihydro-6H-[1]benz..."
4,0.288675,0.000000,0.288675,0.288675,0.288675,0.000000,0.000000,0.000000,0.577350,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,CHEBI:100,medicarpin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385524,0.000000,0.000000,0.223607,0.000000,0.223607,0.000000,0.000000,0.000000,0.447214,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,CHEBI:9986,Vinclozoline
385525,0.204124,0.000000,0.408248,0.000000,0.000000,0.000000,0.000000,0.000000,0.408248,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,CHEBI:9986,rac-vinclozolin
385526,0.166667,0.000000,0.500000,0.000000,0.166667,0.000000,0.000000,0.000000,0.500000,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,CHEBI:9986,racemic vinclozolin
385527,0.522233,0.000000,0.000000,0.000000,0.174078,0.000000,0.174078,0.000000,0.522233,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,CHEBI:9996,Virgaureasaponin I
