In [1]:
import dill as pickle

In [2]:
import pymongo
from pymongo import MongoClient

In [3]:
client = MongoClient('mongodb://removed:27017/')

In [4]:
client.list_database_names()

['admin', 'config', 'genra_db', 'genra_dev_v5', 'local']

In [5]:
DB = client.genra_dev_v5

In [6]:
loael_smiles=DB.compounds.find({},{'_id':0,'dsstox_sid':1,'smiles':1}).limit(10)

In [7]:
smiles_dict={record['dsstox_sid']:record['smiles'] for record in loael_smiles if record['smiles']}

In [8]:
loael_logp = [e for e in DB.physprop.find({'dsstox_sid': {'$in': list(smiles_dict.keys())}}, {"dsstox_sid":1, "predicted_props.OPERA_LogP" : 1, "_id":0})]

In [9]:
logp_dict = {record['dsstox_sid']: record.get('predicted_props',{})['OPERA_LogP'][0] for record in loael_logp}

In [10]:
loael_ws = [e for e in DB.physprop.find({'dsstox_sid': {'$in': list(smiles_dict.keys())}}, {"dsstox_sid":1, "predicted_props.OPERA_WS" : 1, "_id":0})]

In [11]:
ws_dict = {record['dsstox_sid']: record.get('predicted_props',{})['OPERA_WS'][0] for record in loael_ws}


In [12]:
loael_weight = [e for e in DB.compounds.find({'dsstox_sid': {'$in': list(smiles_dict.keys())}}, {"dsstox_sid":1, "mol_weight" : 1, "_id":0})]

In [13]:
weight_dict={record['dsstox_sid']:record['mol_weight'] for record in loael_weight}

In [14]:
sids=set(logp_dict.keys())&set(ws_dict.keys())&set(weight_dict.keys())&set(smiles_dict.keys())

In [16]:
from rdkit import Chem

In [17]:
import os

In [18]:
TOP = os.getcwd().replace('notebooks', '')
raw_dir = TOP + 'data/raw/'
interim_dir = TOP + 'data/interim/'
external_dir = TOP + 'data/external/'
figures_dir = TOP + 'reports/figures/'

In [19]:

records=[]
for sid in sids:
    records.append({'dsstox_sid':sid,'smiles':smiles_dict[sid],'logp':logp_dict[sid],'ws':ws_dict[sid],'mol_weight':weight_dict[sid],'mol':Chem.MolFromSmiles(smiles_dict[sid])})
records=[record for record in records if record['mol']]

In [21]:
with open(external_dir+"epa_cats.pkl", "rb") as output_file:
     new_tests = pickle.load(output_file)

In [23]:
#new_tests

In [25]:
def convert_ppb(x): #OPERA results stored as mol/L
    ws=x['ws']
    mol_weight=x['mol_weight']
    return ws*mol_weight*10**6

In [27]:
import operator as op
op_dict={
    'GreaterThan': op.gt,
    'GreaterThanOrEqualTo': op.ge,
    'LessThan': op.lt,
    'LessThanOrEqualTo': op.le
}
prop_dict={
    'log Kow':'logp',
    'Molecular Weight':'mol_weight',
    'Molecular weight':'mol_weight',
    'Water Solubility': 'ws'
}

In [28]:
import math
for record in records:
    #print(record)
    epa_categories=sorted([category for category,test in new_tests.items() if test(record)])
    if 'Neutral Organics' in epa_categories and len(epa_categories)>1:
        epa_categories.remove('Neutral Organics')
    record['categories']=tuple(epa_categories)

In [30]:
import pandas as pd

In [31]:
pd.DataFrame(records)

Unnamed: 0,dsstox_sid,smiles,logp,ws,mol_weight,mol,categories
0,DTXSID7020009,CC#N,-0.316894,8.59913,41.053,<rdkit.Chem.rdchem.Mol object at 0x76801ab43610>,()
1,DTXSID7020007,CC(=O)C1=CC=C(C=C1)S(=O)(=O)NC(=O)NC1CCCCC1,1.65144,0.001395,324.4,<rdkit.Chem.rdchem.Mol object at 0x76801ab43680>,()
2,DTXSID6020010,CC(C)=NO,0.882667,1.69356,73.095,<rdkit.Chem.rdchem.Mol object at 0x76801ab436f0>,()
3,DTXSID2020008,CC(C)=NNC1=NC=C(S1)C1=CC=C(O1)[N+]([O-])=O,1.54363,3.3e-05,266.28,<rdkit.Chem.rdchem.Mol object at 0x76801ab43760>,"(Hydrazines and Related Compounds,)"
4,DTXSID7020005,CC(N)=O,-1.2057,19.0029,59.068,<rdkit.Chem.rdchem.Mol object at 0x76801ab437d0>,()
5,DTXSID1020013,CC(=O)NNC(=O)C1=CC=NC=C1,-0.577089,0.380852,179.179,<rdkit.Chem.rdchem.Mol object at 0x76801ab43840>,"(Hydrazines and Related Compounds,)"
6,DTXSID2020006,CC(=O)NC1=CC=C(O)C=C1,0.875449,1.11026,151.165,<rdkit.Chem.rdchem.Mol object at 0x76801ab438b0>,"(Phenols (Acute toxicity),)"
7,DTXSID7020001,NC1=NC2=C(C=C1)C1=CC=CC=C1N2,2.63969,0.015397,183.214,<rdkit.Chem.rdchem.Mol object at 0x76801ab43920>,()
8,DTXSID6020012,CC(=O)NNC1=CC=C(CO)C=C1,0.764281,0.686068,180.207,<rdkit.Chem.rdchem.Mol object at 0x76801ab43990>,"(Hydrazines and Related Compounds,)"
9,DTXSID2020004,CC=NO,0.267325,5.8831,59.068,<rdkit.Chem.rdchem.Mol object at 0x76801ab43a00>,()


In [33]:
tsca_df = pd.read_excel(interim_dir+'TSCA_OPERA_predictions_080722.xlsx')
tsca_df2 = tsca_df[tsca_df['errors'].isnull()]
tsca_df2 = tsca_df2.reset_index()
epa_cats_df = tsca_df2[['dtxsid','qsar_ready_smiles','LogP_pred','WS_pred_mol/L','MolWeight']]

In [34]:
epa_cats_df.columns = ['dsstox_sid', 'smiles', 'logp', 'ws', 'mol_weight']

In [36]:
#epa_cats_df

In [37]:
epa_cats_df['mol'] = [Chem.MolFromSmiles(e) for e in epa_cats_df['smiles']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  epa_cats_df['mol'] = [Chem.MolFromSmiles(e) for e in epa_cats_df['smiles']]


In [38]:
epa_cats_dict = epa_cats_df.to_dict('records')

In [40]:
import re

In [41]:
for record in epa_cats_dict:
    #print(record)
    epa_categories=sorted([category for category,test in new_tests.items() if test(record)])
    if 'Neutral Organics' in epa_categories and len(epa_categories)>1:
        epa_categories.remove('Neutral Organics')
    record['categories']=tuple(epa_categories)

In [42]:
epa_cats_dict_df = pd.DataFrame(epa_cats_dict)

In [43]:
epa_cats_dict_df['categories'].value_counts()

categories
()                                                                        5699
(Neutral Organics,)                                                       2667
(Esters (Acute toxicity),)                                                1310
(Phenols (Acute toxicity),)                                                502
(Anilines (Acute toxicity),)                                               407
                                                                          ... 
(Esters (Chronic toxicity), Substituted Triazines (Chronic toxicity))        1
(Benzotriazoles (Acute toxicity), Polynitroaromatics (Acute toxicity))       1
(Phenols (Acute toxicity), Thiols (Acute toxicity))                          1
(Hydrazines and Related Compounds, Imides (Acute toxicity))                  1
(Epoxides, Esters (Chronic toxicity))                                        1
Name: count, Length: 143, dtype: int64

In [45]:
import asyncio
import aiohttp