# Prepare the molecules

## Prepare chemical ontologies

Let's prepare the edges of ChemOnt, the chemical ontology used by ClassyFire.

In [1]:
import obonet
import networkx as nx
import pandas as pd

In [3]:
# Read the ClassyFire ontology (ChemOnt) into a networkx graph.
chemont = obonet.read_obo('./data/molecules/ChemOnt_2_1.obo')

# Build the edge table straight from the graph instead of writing a temporary
# headerless edge list to disk and reading it back: each graph edge is a
# (source, target) pair, stored here as (child, parent) and typed as a
# Biolink subclass relation.
# The output is comma-separated and includes the index as its first column
# (DataFrame.to_csv defaults), exactly as before.
(
    pd.DataFrame(list(chemont.edges()), columns=['child', 'parent'])
    .assign(type='biolink:subclass_of')
    .to_csv("./data/molecules/chemont_edges.csv")
)

Now let's prepare the ChemOnt nodes (the identifier and name of each term).

In [4]:
def parse_file(file_path):
    """Extract term identifiers and names from an OBO file.

    The file is scanned line by line: every line starting with ``id: ``
    contributes an identifier and every line starting with ``name: ``
    contributes a name. Identifiers and names are paired by order of
    appearance, so the file must contain exactly one ``name:`` line per
    ``id:`` line.

    Parameters
    ----------
    file_path : str or path-like
        Path of the OBO file to read.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``id`` and ``name``, with one row per id/name pair.

    Raises
    ------
    ValueError
        If the number of ``id:`` lines differs from the number of ``name:``
        lines, since positional pairing would then be meaningless.
    """
    id_prefix = 'id: '
    name_prefix = 'name: '
    id_list = []
    name_list = []

    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Keep everything after the tag: values may themselves contain
            # ': ' (e.g. "name: Foo: bar"), which must not be truncated.
            if line.startswith(id_prefix):
                id_list.append(line[len(id_prefix):].strip())
            elif line.startswith(name_prefix):
                name_list.append(line[len(name_prefix):].strip())

    if len(id_list) != len(name_list):
        raise ValueError(
            f"Cannot pair ids with names in {file_path!r}: found "
            f"{len(id_list)} 'id:' lines but {len(name_list)} 'name:' lines"
        )

    return pd.DataFrame({
        'id': id_list,
        'name': name_list,
    })

In [5]:
# Build the ChemOnt node table: every parsed term is tagged as a Biolink
# chemical entity.
chemontid_to_name = parse_file("./data/molecules/ChemOnt_2_1.obo").assign(
    type='biolink:ChemicalEntity'
)

# Written tab-separated, with the index as the first column.
chemontid_to_name.to_csv("./data/molecules/chemont_nodes.csv", sep="\t")

## Link the molecules to the chemical ontology

In [6]:
# Load the frozen LOTUS metadata table; low_memory=False makes pandas read the
# file in one pass instead of in chunks.
lotus = pd.read_csv(
    filepath_or_buffer="./data/molecules/230106_frozen_metadata.csv.gz",
    low_memory=False,
)

In [7]:
# Pull the Wikidata item id (Q followed by digits) out of `structure_wikidata`
# and prefix it with "wd:". Rows without a match stay NaN.
lotus['wd_structure'] = 'wd:' + lotus['structure_wikidata'].str.extract(
    r"(Q\d+)", expand=False
)

In [17]:
# One row per distinct (molecule, ClassyFire direct-parent name) pair; pairs
# with a missing value on either side are discarded.
edges = (
    lotus[['wd_structure', 'structure_taxonomy_classyfire_04directparent']]
    .rename(columns={
        'wd_structure': 'child',
        'structure_taxonomy_classyfire_04directparent': 'parent_name',
    })
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [14]:
# Reload the ChemOnt node table; the first column of the file holds the index.
chemont_nodes = pd.read_csv(
    filepath_or_buffer="./data/molecules/chemont_nodes.csv",
    sep="\t",
    index_col=0,
)

In [16]:
# Lookup table from ChemOnt term name to ChemOnt identifier (if a name occurs
# more than once, the last identifier wins).
mapping = dict(zip(chemont_nodes['name'], chemont_nodes['id']))

In [19]:
# Translate each ClassyFire direct-parent name into its ChemOnt identifier.
edges['parent'] = edges['parent_name'].map(mapping)

# Series.map leaves NaN wherever a name has no entry in `mapping`. Such rows
# would be exported as edges without a parent (and would add a NaN entry to the
# node table built from `edges.parent`), so report how many there are and drop
# them.
unmapped = edges['parent'].isna()
if unmapped.any():
    print(f"Dropping {unmapped.sum()} molecule edges whose parent class name is not in ChemOnt")
edges = edges.loc[~unmapped].reset_index(drop=True)

In [21]:
# The human-readable parent name is no longer needed once the identifier is set.
edges = edges.drop(columns='parent_name')

In [43]:
# Every molecule -> class edge carries the same Biolink predicate.
edges = edges.assign(type="biolink:subclass_of")

In [44]:
# Node table: every distinct identifier seen on either end of an edge, each
# tagged as a Biolink chemical entity.
node_ids = pd.concat([edges['child'], edges['parent']])
nodes = (
    node_ids.to_frame(name='node')
    .assign(type='biolink:ChemicalEntity')
    .drop_duplicates()
    .reset_index(drop=True)
)

In [45]:
# Persist the molecule -> ChemOnt edge and node tables (comma-separated, with
# the index as the first column).
edges.to_csv(path_or_buf="./data/molecules/mol_to_chemont_edges.csv")
nodes.to_csv(path_or_buf="./data/molecules/mol_to_chemont_nodes.csv")