# Apply the model applicability domain developed for the ARN categories

In [1]:
from pathlib import Path

In [2]:
# Project root as a POSIX string. When the kernel is started inside the
# `notebooks` folder, the root is its parent directory (kept with a trailing
# slash, as before); otherwise the current working directory is used as-is.
# Only the final path component is examined, so an unrelated "notebooks"
# substring elsewhere in the path (e.g. /home/me/notebooks_work/...) is no
# longer stripped out of the result.
_cwd = Path.cwd()
TOP = (_cwd.parent.as_posix().rstrip('/') + '/') if _cwd.name == 'notebooks' else _cwd.as_posix()

In [3]:
# Data locations, both anchored at the project root held in TOP.
# raw_dir holds the raw input files read later in this notebook;
# model_dir points at the data folder inside the arn_cats package.
raw_dir = Path(TOP, 'data', 'raw')
model_dir = Path(TOP, 'arn_cats', 'data')

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import textwrap

In [23]:
import sys
import os

# Absolute path of the directory one level above the current working
# directory, placed at the front of sys.path so that the `arn_cats` imports
# in the following cells take precedence over any other copy on the path.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, project_root)


In [24]:
from arn_cats.utl import logger
# Notebook-level logger obtained from the project's logger utility, keyed by
# this module's __name__.
log = logger.get_logger(__name__)

In [25]:
from arn_cats.chm.cheminfo_toolkit import Molecule,Fingerprint_engine

In [26]:
from arn_cats.model.build_model import group_predictor_rf, group_predictor_kn, build_random_forest_classifier, select_groups

In [28]:
from arn_cats.model.model_domain import Domain

In [9]:
from arn_cats.visualisation.visualise_ARN_groups import visualise_ARN_groups

In [8]:
from rdkit import Chem
import pickle
import numpy as np
import pandas as pd
from glob import glob
import textwrap

In [11]:
from arn_cats.chm import cheminfo_toolkit

Load the set of molecules from the ARN groups themselves

In [9]:
from arn_cats.data.data_load import arn_groups

Resolved path: /home/grace/Documents/python/arn_cats/arn_cats/data/molecules_all.pickle


In [10]:
from arn_cats.data.data_load import molecules

In [15]:
# Regroup the loaded ARN molecules before building the domain.
# NOTE(review): judging by the keyword names, groups with fewer than 10
# members are presumably pooled under "miscellaneous chemistry" - confirm
# against select_groups in arn_cats.model.build_model.
molecules_regrouped = select_groups(
    molecules,
    minimum_group_size=10,
    small_groups_as_negative=True,
    pulled_small_group_name="miscellaneous chemistry",
)


In [19]:
# Morgan fingerprint engine (radius 2, 2560 bits) that is handed to the
# Domain constructor below.
# NOTE(review): these settings presumably need to match the ones used when
# the ARN model was built - confirm.
fingerprint_engine = Fingerprint_engine.Morgan(
    radius=2,
    nBits=2560,
)

Instantiate the applicability domain from the regrouped ARN substances

In [29]:
# Domain object built from the regrouped ARN molecules and the Morgan
# fingerprint engine; its in_domain() method is queried for each new
# substance further down.
domain_rf = Domain(
    molecules_regrouped,
    fingerprint_engine=fingerprint_engine,
)

Load a new dataset. For the sake of example, the first 5 substances in the TSCA inventory set are loaded and then filtered down to the specific columns of interest.

In [32]:
# Example input: only the first 5 data rows of the TSCA inventory workbook.
# nrows=5 stops parsing after five rows instead of reading the whole
# spreadsheet and discarding all but .head() (which also defaults to 5 rows).
# Column dtypes are inferred from these five rows only.
tsca_head = pd.read_excel(
    raw_dir / 'tsca_categorisation_071124_wmappingdict.xlsx',
    nrows=5,
)

In [42]:
# Keep just the identifier and structure columns needed for the domain check.
# DataFrame.filter retains only the listed labels that are present and does
# not raise if one of them is missing.
tsca_head = tsca_head.filter(items=['dtxsid', 'PREFERRED_NAME', 'CASRN', 'smiles'])

Loop over the substances in this set, converting each SMILES string into a molecule object, and evaluate whether each one falls within the applicability domain of the "training set chemicals".

In [45]:
# Build one record per substance: the original row values plus the parsed
# molecule ('mol') and the applicability-domain verdict ('in_domain').
# Rows whose SMILES cannot be turned into a molecule are skipped, but each
# skip is logged rather than silently swallowed.
mol_entries = []
for row_label, row in tsca_head.iterrows():
    # Read outside the try so a missing 'smiles' column fails loudly instead
    # of quietly producing an empty result.
    smiles = row['smiles']
    try:
        rd_mol = Chem.MolFromSmiles(smiles)
        # RDKit reports an unparsable SMILES by returning None, not by raising.
        if rd_mol is None:
            raise ValueError("RDKit could not parse the SMILES string")
        mol = Molecule(rd_mol)
    except Exception as err:  # e.g. missing (NaN) or malformed SMILES
        # Assumes `log` exposes the standard logging.Logger API - confirm.
        log.warning("Skipping row %s (smiles=%r): %s", row_label, smiles, err)
        continue
    mol_entry = dict(row)
    mol_entry['mol'] = mol
    mol_entry['in_domain'] = domain_rf.in_domain(mol)
    mol_entries.append(mol_entry)




In [46]:
# Turn the per-substance records into a DataFrame and drop the molecule
# objects, leaving the original columns plus 'in_domain'.
# errors='ignore' keeps this cell safe to re-run (when 'mol' has already been
# dropped) and safe when no substance could be parsed (empty frame, no 'mol').
mol_entries = pd.DataFrame(mol_entries).drop(columns=['mol'], errors='ignore')

Wrangle the results into a dataframe that keeps the original columns and adds a new `in_domain` column indicating whether each substance is inside or outside the applicability domain.

In [47]:
# Display the per-substance table, including the in_domain flag.
mol_entries

Unnamed: 0,dtxsid,PREFERRED_NAME,CASRN,smiles,in_domain
0,DTXSID4063036,1-Nonyne,3452-09-3,CCCCCCCC#C,True
1,DTXSID30870753,1-Hexyne,693-02-7,CCCCC#C,True
2,DTXSID7062374,"1,8-Nonadiyne",2396-65-8,C#CCCCCCC#C,False
3,DTXSID9061097,1-Pentadecyne,765-13-9,CCCCCCCCCCCCCC#C,True
4,DTXSID1061233,"1,7-Octadiyne",871-84-1,C#CCCCCC#C,False
