# Walking through the application of the RF model developed for the ARN Categories for a new set of chemicals

In [1]:
from pathlib import Path

In [2]:
# Project root as a POSIX-style string.
# Only a trailing 'notebooks' directory is stripped: a blanket
# str.replace('notebooks', '') would also mangle any other path component
# that merely contains that substring (e.g. '/home/me/notebooks_old/proj').
_cwd = Path.cwd()
if _cwd.name == 'notebooks':
    _parent = _cwd.parent.as_posix()
    # Keep the trailing slash that the earlier string-replace approach produced,
    # in case TOP is concatenated as a plain string elsewhere.
    TOP = _parent if _parent.endswith('/') else _parent + '/'
else:
    TOP = _cwd.as_posix()

In [3]:
# Data locations, both resolved relative to the project root held in TOP.
project_top = Path(TOP)
raw_dir = project_top.joinpath('data', 'raw')
model_dir = project_top.joinpath('arn_cats', 'data')

In [4]:
import sys
import os

In [5]:
# Make the parent of the working directory importable so that the local
# `arn_cats` package resolves.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Re-running this cell must not stack duplicate entries on sys.path: remove
# any existing occurrence first, then put the root at the front so it still
# takes precedence over other locations.
while project_root in sys.path:
    sys.path.remove(project_root)
sys.path.insert(0, project_root)

In [7]:
import matplotlib.pyplot as plt
import seaborn as sns
#from tqdm import tqdm
import textwrap

In [8]:
from arn_cats.chm.cheminfo_toolkit import Molecule,Fingerprint_engine

In [9]:
from arn_cats.model.build_model import group_predictor_rf,  build_random_forest_classifier, select_groups

In [10]:
from arn_cats.model.model_domain import Domain

In [11]:
from rdkit import Chem
import pickle
import numpy as np
import pandas as pd
from glob import glob
import textwrap

In [12]:
from arn_cats.chm import cheminfo_toolkit

Load the set of molecules from the ARN groups themselves

In [14]:
from arn_cats.data.data_load import arn_groupings

Resolved path: C:\Users\GPatlewi\OneDrive - Environmental Protection Agency (EPA)\TSCA\ECHA-TSCA\ARN\arn_cats\arn_cats\data\molecules_all.pickle


In [15]:
from arn_cats.data.data_load import molecules, best_model_rf

In [16]:
# Regroup the ARN molecules before building the domain.
# NOTE(review): presumably groups with fewer than `minimum_group_size` members
# are pooled under the "miscellaneous chemistry" label -- confirm against
# arn_cats.model.build_model.select_groups.
regrouping_options = dict(
    minimum_group_size=10,
    small_groups_as_negative=True,
    pulled_small_group_name="miscellaneous chemistry",
)
molecules_regrouped = select_groups(molecules, **regrouping_options)


In [17]:
# Morgan fingerprint settings, gathered in one place so they are easy to spot.
# NOTE(review): these presumably need to match the settings used when the
# loaded RF model was trained -- confirm.
morgan_settings = {'radius': 2, 'nBits': 2560}
fingerprint_engine = Fingerprint_engine.Morgan(**morgan_settings)

Instantiate domain for the substances

In [18]:
# Domain object built from the regrouped ARN molecules and the fingerprint
# engine above; its `in_domain` method is used on each new substance below.
domain_rf = Domain(
    molecules_regrouped,
    fingerprint_engine=fingerprint_engine,
)

Load a new dataset - for the sake of example, the first 50 substances in the TSCA inventory set are loaded and then filtered down to the specific columns of interest

In [22]:
# Only the first 50 rows are needed for this walkthrough, so let read_excel
# stop parsing there (nrows) rather than loading the whole workbook and
# discarding everything after the first 50 rows with .head(50).
tsca_head = pd.read_excel(
    raw_dir / 'tsca_categorisation_071124_wmappingdict.xlsx',
    nrows=50,
)

In [24]:
#tsca_head

In [25]:
# Keep only the identifier, name, CAS number and structure columns.
# DataFrame.filter silently ignores any listed column that is absent.
columns_of_interest = ['dtxsid', 'PREFERRED_NAME', 'CASRN', 'smiles']
tsca_head = tsca_head.filter(items=columns_of_interest)

Loop over the substances in this set: each SMILES is converted to an RDKit mol (wrapped in a `Molecule`) so that we can evaluate whether the substance falls within the applicability domain of the "training set chemicals" and record its three most probable ARN groups

In [26]:
# Build one record per TSCA substance: the original columns plus the domain
# flag and the three most probable ARN groups with their probabilities.
mol_entries = []
# SMILES values that could not be turned into a Molecule are collected here so
# that skipped substances stay visible instead of disappearing silently.
tsca_skipped_smiles = []
for _, row in tsca_head.iterrows():
    smiles = row['smiles']
    # Missing values arrive as non-string (NaN), and RDKit reports an
    # unparsable SMILES by returning None rather than raising, so both cases
    # are checked explicitly instead of relying on an exception.
    rdkit_mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if rdkit_mol is None:
        tsca_skipped_smiles.append(smiles)
        continue
    try:
        mol = Molecule(rdkit_mol)
    except Exception:
        # Best-effort, as before: skip structures the toolkit rejects.
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        tsca_skipped_smiles.append(smiles)
        continue

    mol_entry = dict(row)
    mol_entry['mol'] = mol
    mol_entry['in_domain'] = domain_rf.in_domain(mol)

    # Rank all groups by predicted probability and keep the top three.
    top_groups = (
        pd.Series(group_predictor_rf(mol, model_details=best_model_rf, all_groups=True))
        .sort_values(ascending=False)
        .head(3)
    )
    # Two passes keep the column order: the three group names first, then the
    # three probabilities. Fewer than three groups leaves the remaining
    # columns unset rather than raising on tuple unpacking.
    for rank, group_name in enumerate(top_groups.index, start=1):
        mol_entry[f'predicted group {rank}'] = group_name
    for rank, probability in enumerate(top_groups.to_list(), start=1):
        mol_entry[f'predicted group {rank} probability'] = probability
    mol_entries.append(mol_entry)




In [27]:
# Turn the records into a DataFrame and drop the Molecule objects, which were
# only needed to compute the predictions.
# errors='ignore' keeps this cell re-runnable (on a second run 'mol' is
# already gone) and tolerant of an empty result with no 'mol' column.
mol_entries = pd.DataFrame(mol_entries).drop(columns=['mol'], errors='ignore')

The resulting dataframe keeps the original columns and adds new columns for the chemicals of interest, indicating whether each substance is in/out of domain and what the top 3 arn_groups (with their probabilities) are for the substances profiled

In [28]:
# Show a bounded preview rather than dumping the whole table.
mol_entries.head(10)

Unnamed: 0,dtxsid,PREFERRED_NAME,CASRN,smiles,in_domain,predicted group 1,predicted group 2,predicted group 3,predicted group 1 probability,predicted group 2 probability,predicted group 3 probability
0,DTXSID4063036,1-Nonyne,3452-09-3,CCCCCCCC#C,True,Aliphatic nitriles,Simple Lithium compounds,tetrahydroxymethyl and tetraalkyl phosphonium ...,0.462056,0.143764,0.036833
1,DTXSID30870753,1-Hexyne,693-02-7,CCCCC#C,True,Aliphatic nitriles,Simple Lithium compounds,tetrahydroxymethyl and tetraalkyl phosphonium ...,0.391984,0.137514,0.055325
2,DTXSID7062374,"1,8-Nonadiyne",2396-65-8,C#CCCCCCC#C,False,Aliphatic nitriles,primary aliphatic diamines and their salts,Simple Lithium compounds,0.405481,0.170556,0.071101
3,DTXSID9061097,1-Pentadecyne,765-13-9,CCCCCCCCCCCCCC#C,True,Aliphatic nitriles,Simple Lithium compounds,tetrahydroxymethyl and tetraalkyl phosphonium ...,0.462056,0.143764,0.036833
4,DTXSID1061233,"1,7-Octadiyne",871-84-1,C#CCCCCC#C,False,Aliphatic nitriles,primary aliphatic diamines and their salts,Simple Lithium compounds,0.410799,0.133905,0.066176
5,DTXSID4061096,1-Tetradecyne,765-10-6,CCCCCCCCCCCCC#C,True,Aliphatic nitriles,Simple Lithium compounds,tetrahydroxymethyl and tetraalkyl phosphonium ...,0.462056,0.143764,0.036833
6,DTXSID8072405,"1,11-Hexadecadiyne",71673-32-0,CCCCC#CCCCCCCCCC#C,True,Aliphatic nitriles,Simple Lithium compounds,"Organic phosphonic acids, salts and esters",0.454389,0.12432,0.042762
7,DTXSID0066955,"1-Octyne, 8-chloro-",24088-97-9,ClCCCCCCC#C,False,Aliphatic nitriles,chlorinated aromatic hydrocarbons,primary aliphatic diamines and their salts,0.296815,0.114706,0.098722
8,DTXSID6026379,Ethyne,74-86-2,C#C,False,Zirconium and its simple inorganic compounds,simple vanadium compounds,Simple Lithium compounds,0.194734,0.176813,0.162029
9,DTXSID7060835,1-Pentyne,627-19-0,CCCC#C,False,Aliphatic nitriles,Simple Lithium compounds,tetrahydroxymethyl and tetraalkyl phosphonium ...,0.382653,0.093899,0.082024


In [35]:
# Write the TSCA predictions to CSV in the raw data directory
# (the DataFrame index is written too, as per the to_csv default).
tsca_predictions_csv = raw_dir / 'tsca_50_arns.csv'
mol_entries.to_csv(tsca_predictions_csv)

In [29]:
import openpyxl

Attempt to do the same for a small subset of the REACH substances that were part of the Supplementary Information (SI) of the original publication

In [30]:
# Load the redacted REACH results workbook, using its first column as the index.
reach_results_xlsx = raw_dir / 'S5_rf_application_1_results_redacted.xlsx'
reach_redact = pd.read_excel(reach_results_xlsx, index_col=[0])

In [31]:
# Rename the published columns so the original predictions do not clash with
# the 'predicted group N' columns created further down.
# NOTE(review): the misspelt 'invetory' presumably mirrors the header in the
# workbook -- check the file before correcting it here.
reach_column_map = {
    'SMILES used for the invetory predictions': 'smiles',
    'predicted group 1': 'orig_grp1',
    'predicted group 2': 'orig_grp2',
    'predicted group 3': 'orig_grp3',
}
# Columns to retain; DataFrame.filter silently skips any that are absent.
reach_keep_columns = ['ec_number', 'substance_name', 'CASRN', 'smiles', 'orig_grp1', 'orig_grp2', 'orig_grp3']

reach_redact = reach_redact.rename(columns=reach_column_map)
reach_redact = reach_redact.filter(reach_keep_columns)
# Only substances that actually have a SMILES can be profiled.
reach_redact = reach_redact.query('smiles.notnull()')


In [37]:
# Preview the first five REACH rows after renaming and filtering.
reach_redact.head(n=5)

Unnamed: 0,ec_number,substance_name,smiles,orig_grp1,orig_grp2,orig_grp3
3353,213-668-5,"1,1,1,3,3,3-hexamethyldisilazane",C[Si](C)(C)N[Si](C)(C)C,hydrocarbyl siloxanes,Simple Lithium compounds,miscellaneous chemistry
985,202-808-0,4-nitrotoluene,Cc1ccc([N+](=O)[O-])cc1,nitroalkanes,miscellaneous chemistry,chlorinated aromatic hydrocarbons
984,202-805-4,"N,N-dimethyl-p-toluidine",Cc1ccc(N(C)C)cc1,chlorinated aromatic hydrocarbons,miscellaneous chemistry,Unsubstituted and linear aliphatic-substituted...
983,202-804-9,4-hydroxybenzoic acid,O=C(O)c1ccc(O)cc1,"Paraben acid, salts and esters","Isophthalates, Terephthalates and Trimellitates",miscellaneous chemistry
982,202-803-3,p-toluic acid,Cc1ccc(C(=O)O)cc1,"Isophthalates, Terephthalates and Trimellitates",miscellaneous chemistry,"Paraben acid, salts and esters"


In [32]:
# Build one record per REACH substance (first 50 rows): the original columns
# plus the domain flag and the three most probable ARN groups with their
# probabilities.
mol_entries1 = []
# SMILES values that could not be turned into a Molecule are collected here so
# that skipped substances stay visible instead of disappearing silently.
reach_skipped_smiles = []
for _, row in reach_redact.head(50).iterrows():
    smiles = row['smiles']
    # RDKit reports an unparsable SMILES by returning None rather than
    # raising, so that case is checked explicitly; non-string values are
    # skipped as well.
    rdkit_mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if rdkit_mol is None:
        reach_skipped_smiles.append(smiles)
        continue
    try:
        mol = Molecule(rdkit_mol)
    except Exception:
        # Best-effort, as before: skip structures the toolkit rejects.
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still works.
        reach_skipped_smiles.append(smiles)
        continue

    mol_entry = dict(row)
    mol_entry['mol'] = mol
    mol_entry['in_domain'] = domain_rf.in_domain(mol)

    # Rank all groups by predicted probability and keep the top three.
    top_groups = (
        pd.Series(group_predictor_rf(mol, model_details=best_model_rf, all_groups=True))
        .sort_values(ascending=False)
        .head(3)
    )
    # Two passes keep the column order: the three group names first, then the
    # three probabilities. Fewer than three groups leaves the remaining
    # columns unset rather than raising on tuple unpacking.
    for rank, group_name in enumerate(top_groups.index, start=1):
        mol_entry[f'predicted group {rank}'] = group_name
    for rank, probability in enumerate(top_groups.to_list(), start=1):
        mol_entry[f'predicted group {rank} probability'] = probability
    mol_entries1.append(mol_entry)

In [33]:
# Turn the records into a DataFrame and drop the Molecule objects, which were
# only needed to compute the predictions.
# errors='ignore' keeps this cell re-runnable (on a second run 'mol' is
# already gone) and tolerant of an empty result with no 'mol' column.
mol_entries1 = pd.DataFrame(mol_entries1).drop(columns=['mol'], errors='ignore')

In [34]:
# Preview the first five REACH rows: original groups alongside the new predictions.
mol_entries1.head(n=5)

Unnamed: 0,ec_number,substance_name,smiles,orig_grp1,orig_grp2,orig_grp3,in_domain,predicted group 1,predicted group 2,predicted group 3,predicted group 1 probability,predicted group 2 probability,predicted group 3 probability
0,213-668-5,"1,1,1,3,3,3-hexamethyldisilazane",C[Si](C)(C)N[Si](C)(C)C,hydrocarbyl siloxanes,Simple Lithium compounds,miscellaneous chemistry,False,hydrocarbyl siloxanes,Simple Lithium compounds,miscellaneous chemistry,0.274541,0.159919,0.089563
1,202-808-0,4-nitrotoluene,Cc1ccc([N+](=O)[O-])cc1,nitroalkanes,miscellaneous chemistry,chlorinated aromatic hydrocarbons,False,nitroalkanes,miscellaneous chemistry,chlorinated aromatic hydrocarbons,0.240193,0.112222,0.094778
2,202-805-4,"N,N-dimethyl-p-toluidine",Cc1ccc(N(C)C)cc1,chlorinated aromatic hydrocarbons,miscellaneous chemistry,Unsubstituted and linear aliphatic-substituted...,True,chlorinated aromatic hydrocarbons,miscellaneous chemistry,Unsubstituted and linear aliphatic-substituted...,0.161841,0.14031,0.127222
3,202-804-9,4-hydroxybenzoic acid,O=C(O)c1ccc(O)cc1,"Paraben acid, salts and esters","Isophthalates, Terephthalates and Trimellitates",miscellaneous chemistry,True,"Paraben acid, salts and esters","Isophthalates, Terephthalates and Trimellitates",miscellaneous chemistry,0.509563,0.169198,0.113762
4,202-803-3,p-toluic acid,Cc1ccc(C(=O)O)cc1,"Isophthalates, Terephthalates and Trimellitates",miscellaneous chemistry,"Paraben acid, salts and esters",True,"Isophthalates, Terephthalates and Trimellitates",miscellaneous chemistry,"Paraben acid, salts and esters",0.268333,0.150849,0.11731


In [36]:
# Write the REACH predictions to CSV in the raw data directory
# (the DataFrame index is written too, as per the to_csv default).
reach_predictions_csv = raw_dir / 'echa_50_arns.csv'
mol_entries1.to_csv(reach_predictions_csv)