## Testing out the API on the Cheminformatics Modules

In [1]:
from pathlib import Path

In [2]:
# Locate the project root. The notebook is expected to be launched from the
# project's `notebooks/` directory, in which case the root is its parent;
# otherwise the current working directory is used as the root.
# Only the *final* path component is inspected: a plain substring replace of
# 'notebooks' would also mangle any parent directory whose name merely
# contains that text (e.g. `/home/me/notebooks_repo/notebooks`).
_cwd = Path.cwd()
TOP = (_cwd.parent if _cwd.name == 'notebooks' else _cwd).as_posix()  # kept as str

# Input CSVs are read from (and result CSVs written to) <root>/data/raw.
raw_dir = Path(TOP) / 'data' / 'raw'
# <root>/arn_cats/data
model_dir = Path(TOP) / 'arn_cats' / 'data'

In [3]:
from rdkit import Chem
import pickle
import numpy as np
import pandas as pd
from glob import glob
import textwrap

In [4]:
import json, requests
import urllib.parse, urllib.request



In [33]:
def arn(smi, model="RF",
        base_url="https://hazard-dev.sciencedataexperts.com/api/arn_cats",
        timeout=30):
    """Query the `arn_cats` web API for a single SMILES string.

    Parameters
    ----------
    smi : str
        SMILES string of the chemical; it is URL-encoded before being sent.
    model : str, optional
        Value of the `model` query parameter (default ``"RF"``).
    base_url : str, optional
        Endpoint URL without the query string.
    timeout : float, optional
        Seconds to wait for the server before `urlopen` gives up, so that a
        stalled request cannot hang the notebook indefinitely.

    Returns
    -------
    The decoded JSON body of the response. For the example call in this
    notebook that is a dict with the keys ``'chemical'``, ``'in_domain'``
    and ``'group'``.

    Raises
    ------
    urllib.error.URLError
        If the request fails or times out (raised by `urlopen`).
    json.JSONDecodeError
        If the response body is not valid JSON.
    """
    encoded_smiles = urllib.parse.quote_plus(smi)
    encoded_model = urllib.parse.quote_plus(model)
    url = f"{base_url}?smiles={encoded_smiles}&model={encoded_model}"
    # The context manager closes the HTTP response (and its socket) even if
    # reading or decoding fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return json.loads(response.read().decode())
   

In [34]:
# Smoke-test the API wrapper on a single example structure.
example_smiles = "CC(C)c1ccc2sc3ccccc3c(=O)c2c1"
test = arn(example_smiles)

In [35]:
# Show the decoded API response for the example SMILES.
test

{'chemical': 'CC(C)c1ccc2sc3ccccc3c(=O)c2c1',
 'in_domain': True,
 'group': 'thioxanthenones'}

In [6]:
# Load the REACH example substances from the raw-data directory.
reach_csv_path = raw_dir / 'echa_50_arns.csv'
reach_examples = pd.read_csv(reach_csv_path)

In [8]:
# Preview the first rows of the loaded example table.
reach_examples.head()

Unnamed: 0.1,Unnamed: 0,ec_number,substance_name,smiles,orig_grp1,orig_grp2,orig_grp3,in_domain,predicted group 1,predicted group 2,predicted group 3,predicted group 1 probability,predicted group 2 probability,predicted group 3 probability
0,0,213-668-5,"1,1,1,3,3,3-hexamethyldisilazane",C[Si](C)(C)N[Si](C)(C)C,hydrocarbyl siloxanes,Simple Lithium compounds,miscellaneous chemistry,False,hydrocarbyl siloxanes,Simple Lithium compounds,miscellaneous chemistry,0.274541,0.159919,0.089563
1,1,202-808-0,4-nitrotoluene,Cc1ccc([N+](=O)[O-])cc1,nitroalkanes,miscellaneous chemistry,chlorinated aromatic hydrocarbons,False,nitroalkanes,miscellaneous chemistry,chlorinated aromatic hydrocarbons,0.240193,0.112222,0.094778
2,2,202-805-4,"N,N-dimethyl-p-toluidine",Cc1ccc(N(C)C)cc1,chlorinated aromatic hydrocarbons,miscellaneous chemistry,Unsubstituted and linear aliphatic-substituted...,True,chlorinated aromatic hydrocarbons,miscellaneous chemistry,Unsubstituted and linear aliphatic-substituted...,0.161841,0.14031,0.127222
3,3,202-804-9,4-hydroxybenzoic acid,O=C(O)c1ccc(O)cc1,"Paraben acid, salts and esters","Isophthalates, Terephthalates and Trimellitates",miscellaneous chemistry,True,"Paraben acid, salts and esters","Isophthalates, Terephthalates and Trimellitates",miscellaneous chemistry,0.509563,0.169198,0.113762
4,4,202-803-3,p-toluic acid,Cc1ccc(C(=O)O)cc1,"Isophthalates, Terephthalates and Trimellitates",miscellaneous chemistry,"Paraben acid, salts and esters",True,"Isophthalates, Terephthalates and Trimellitates",miscellaneous chemistry,"Paraben acid, salts and esters",0.268333,0.150849,0.11731


In [47]:
# Query the API once per example substance and collect one record per row.
data_list = []

# Only the EC number and the SMILES are needed, so iterate over those two
# columns directly instead of materialising every row with iterrows().
for ec_number, smiles in zip(reach_examples['ec_number'], reach_examples['smiles']):
    # arn() returns the decoded JSON response; 'chemical' and 'group' are
    # required below, 'in_domain' is treated as optional.
    smiles_output = arn(smiles)

    data_list.append({
        'ec_number': ec_number,
        'chemical': smiles_output['chemical'],
        'group': smiles_output['group'],
        'in_domain': smiles_output.get('in_domain'),  # None when the key is absent
    })

# Build the frame in one go. Fixing the column list keeps the schema stable
# even when no rows were processed, so later column selections still work.
result_df = pd.DataFrame(
    data_list,
    columns=['ec_number', 'chemical', 'group', 'in_domain'],
)



In [51]:
# Prefix the API-derived columns with 'modules_' so they cannot be confused
# with the identically named columns already present in the example table.
module_column_names = {'group': 'modules_group', 'in_domain': 'modules_in_domain'}
result_df = result_df.rename(columns=module_column_names)

In [57]:
# Attach the API results to the example table by EC number. A left join keeps
# every example row even if no API result exists for it.
module_predictions = result_df[['ec_number', 'modules_group', 'modules_in_domain']]
compare = reach_examples.merge(module_predictions, on='ec_number', how='left')
# Discard the leftover index column carried in from the CSV.
compare = compare.drop(columns=['Unnamed: 0'])

In [76]:
# Save the original, previously predicted and API-returned groups side by side.
prediction_columns = [
    'ec_number',
    'orig_grp1',
    'in_domain',
    'predicted group 1',
    'modules_group',
    'modules_in_domain',
]
compare[prediction_columns].to_csv(raw_dir / 'all_preds.csv')

In [64]:
# Row-wise flag: True where the original group, the stored first prediction
# and the group returned by the API all agree.
original_matches_prediction = compare['orig_grp1'].eq(compare['predicted group 1'])
prediction_matches_modules = compare['predicted group 1'].eq(compare['modules_group'])
are_identical = original_matches_prediction & prediction_matches_modules

# Single boolean: True only if the three columns agree on every row.
all_identical1 = are_identical.all()


In [68]:
# True only if the original first group equals the API-returned group on every row.
(compare['orig_grp1'] == compare['modules_group']).all()

False

In [75]:
# List the substances whose original first group differs from the API-returned group.
compare.loc[
    compare['orig_grp1'] != compare['modules_group'],
    ['ec_number', 'orig_grp1', 'modules_group'],
]

Unnamed: 0,ec_number,orig_grp1,modules_group
0,213-668-5,hydrocarbyl siloxanes,miscellaneous chemistry
11,202-795-1,Unsubstituted and linear aliphatic-substituted...,miscellaneous chemistry
12,202-794-6,Unsubstituted and linear aliphatic-substituted...,miscellaneous chemistry
15,213-650-7,Branched/cyclic dialiphatic ethers (excluding ...,miscellaneous chemistry
17,700-377-8,chlorinated aromatic hydrocarbons,miscellaneous chemistry
22,213-637-6,hydrocarbyl siloxanes,miscellaneous chemistry
25,213-635-5,Simple Lithium compounds,miscellaneous chemistry
29,202-772-6,Aralkylaldehydes,miscellaneous chemistry
37,202-765-8,nitroalkanes,miscellaneous chemistry
40,202-762-1,nitroalkanes,miscellaneous chemistry


In [74]:
# Write the rows where the original first group and the API-returned group disagree.
mismatch_mask = compare['orig_grp1'] != compare['modules_group']
mismatch_columns = ['ec_number', 'orig_grp1', 'modules_group']
compare.loc[mismatch_mask, mismatch_columns].to_csv(raw_dir / 'mismatches_reach.csv')