In [1]:
import json
import pandas as pd

In [2]:
# Load the NPClassifier index file; the cells below read its 'Class',
# 'Superclass', 'Pathway' and 'Class_hierarchy' entries.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open('./data/molecules/NPClassifier_index.json', 'r', encoding='utf-8') as index_file:
    index = json.load(index_file)

# Names (dictionary keys) of every class, superclass and pathway.
index_class = list(index['Class'])
index_superclass = list(index['Superclass'])
index_pathway = list(index['Pathway'])

In [3]:
# Lookup table for NP Classifier classes: one row per class, holding its
# numeric id (`class_num`) and its name (`class`).
# Everything happens in one expression so that re-running the cell always
# starts from the raw index: the names must be copied out of the row labels
# before those labels are replaced by a plain 0..n-1 range.
class_ = (
    pd.DataFrame(index['Class'], index=['class_num'])
    .T  # the dictionary keys (class names) become the row labels
    .assign(**{'class': lambda frame: frame.index})  # `class` is a keyword, hence **{}
    .reset_index(drop=True)
)

In [6]:
# Superclass lookup table: numeric id in `superclass_num`, name in `superclass`.
# The names also remain as the row labels (the index is not reset here).
superclass = (
    pd.DataFrame(index['Superclass'], index=['superclass_num'])
    .T
    .assign(superclass=lambda frame: frame.index)
)

In [7]:
# Pathway lookup table: numeric id in `pathway_num`, name in `pathway`.
# The names also remain as the row labels (the index is not reset here).
pathway = (
    pd.DataFrame(index['Pathway'], index=['pathway_num'])
    .T
    .assign(pathway=lambda frame: frame.index)
)

In [8]:
# Parent links of every class. After transposing there is one row per class id
# (the JSON keys, still strings at this point); the later merges rely on the
# 'Superclass' and 'Pathway' columns of this table.
# Each cell holds a sequence and only its first element is kept.
# NOTE(review): if a class ever lists several superclasses or pathways, all but
# the first are silently dropped here - confirm this is intended.
# Series.map is applied column by column instead of DataFrame.applymap, which
# is deprecated since pandas 2.1; Series.map is available in every version.
class_hierarchy = (
    pd.DataFrame(index['Class_hierarchy'])
    .T
    .apply(lambda column: column.map(lambda parents: parents[0]))
    # Integer copy of the row labels, used as the join key against `class_`.
    .assign(class_num=lambda frame: frame.index.astype(int))
)

In [12]:
# Resolve each class to its superclass and pathway names by joining the lookup
# tables on their numeric ids. The merges use the default inner join, so a row
# without a match on the other side is dropped.
step1 = class_.merge(class_hierarchy, on='class_num')
step2 = step1.merge(superclass, left_on="Superclass", right_on="superclass_num")

# The last join and the removal of the numeric id columns form one expression:
# a stand-alone `df = df.drop(...)` cell raises KeyError when it is re-run,
# because the columns it names are already gone.
df = (
    step2
    .merge(pathway, left_on="Pathway", right_on="pathway_num")
    .drop(columns=['class_num', 'Pathway', 'Superclass', 'superclass_num', 'pathway_num'])
)

We now have a dataframe `df` that links each NP Classifier class $\rightarrow$ superclass $\rightarrow$ pathway.

Have a look:

In [16]:
# Preview the first five rows of the class / superclass / pathway table.
df.head(n=5)

Unnamed: 0,class,superclass,pathway
0,12-oxophytodienoic acid metabolites,Octadecanoids,Fatty acids
1,Jasmonic acids,Octadecanoids,Fatty acids
2,Other Octadecanoids,Octadecanoids,Fatty acids
3,Phytofurans,Octadecanoids,Fatty acids
4,Phytoprostanes,Octadecanoids,Fatty acids


In [17]:
# Hierarchy edges (child -> parent): every class points to its superclass and
# every superclass points to its pathway. The two edge sets are stacked,
# labelled with the relation type, and de-duplicated.
np_classifier_edges = (
    pd.concat(
        [
            df[['class', 'superclass']].rename(
                columns={'class': 'child', 'superclass': 'parent'}
            ),
            df[['superclass', 'pathway']].rename(
                columns={'superclass': 'child', 'pathway': 'parent'}
            ),
        ],
        ignore_index=True,
    )
    .assign(type='biolink:subclass_of')
    .drop_duplicates()
    .reset_index(drop=True)
)

In [18]:
# Node table for the hierarchy: every distinct name that occurs as a child or
# as a parent in `np_classifier_edges`, each tagged with the same node type.
np_classifier_nodes = (
    pd.concat(
        [np_classifier_edges['child'], np_classifier_edges['parent']],
        ignore_index=True,
    )
    .to_frame(name='node')
    .assign(type='biolink:ChemicalEntity')
    .drop_duplicates()
    .reset_index(drop=True)
)

# Load LOTUS to link molecules to NP Classifier

In [19]:
# Read the LOTUS metadata table. `low_memory=False` makes pandas parse the
# file in one pass instead of in chunks, which avoids mixed-dtype columns.
lotus = pd.read_csv(
    "./data/molecules/230106_frozen_metadata.csv.gz",
    low_memory=False,
)

In [20]:
# Short Wikidata identifier of each structure: the first `Q<digits>` token of
# `structure_wikidata`, prefixed with 'wd:'. Rows without such a token get NaN.
# `expand=False` makes str.extract return a Series; with the default it returns
# a one-column DataFrame, and the column assignment would then depend on pandas
# coercing that frame into a single column.
lotus['wd'] = 'wd:' + lotus['structure_wikidata'].str.extract(r"(Q\d+)", expand=False)

In [21]:
# Molecule -> NP Classifier class edges: the distinct (Wikidata id, class)
# pairs found in LOTUS, without rows where either value is missing, in the
# same child / parent / type layout as `np_classifier_edges`.
mol_to_np = (
    lotus[['wd', 'structure_taxonomy_npclassifier_03class']]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
    .rename(columns={
        'wd': 'child',
        'structure_taxonomy_npclassifier_03class': 'parent',
    })
    .assign(type='biolink:subclass_of')
)

In [33]:
# Full edge list: molecule -> class edges followed by the class hierarchy
# edges, with duplicate rows and rows containing missing values removed.
mol_to_np_edges = (
    pd.concat((mol_to_np, np_classifier_edges), ignore_index=True)
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [34]:
# Node table for the full graph: every distinct identifier that appears as a
# child or a parent in `mol_to_np_edges`, each tagged with the same node type.
mol_to_np_nodes = (
    pd.concat(
        [mol_to_np_edges['child'], mol_to_np_edges['parent']],
        ignore_index=True,
    )
    .to_frame(name='node')
    .assign(type='biolink:ChemicalEntity')
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [35]:
# Write the edge and node tables to disk.
# NOTE(review): the files are tab-separated although they carry a .csv
# extension, and the row index is written as an unnamed first column - confirm
# that downstream readers expect both.
mol_to_np_edges.to_csv(path_or_buf="./data/molecules/mol_to_np_edges.csv", sep="\t")
mol_to_np_nodes.to_csv(path_or_buf="./data/molecules/mol_to_np_nodes.csv", sep="\t")