In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import openpyxl
from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage

#Show mols in dataframes
from rdkit.Chem import PandasTools
from rdkit import Chem
from rdkit.Chem.Draw import MolsToGridImage
from IPython.core.display import HTML
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit.Chem import rdDepictor
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit import DataStructs
import os

In [2]:
from scipy.spatial.distance import pdist, squareform

In [3]:
pwd

'/ccte/home1/gpatlewi/python/tsca_categories/notebooks'

In [4]:
TOP = os.getcwd().replace('notebooks', '')
raw_dir = TOP + 'data/raw/'
processed_dir = TOP + 'data/processed/'
interim_dir = TOP + 'data/interim/'
external_dir = TOP + 'data/external/'
figures_dir = TOP + 'reports/figures/'

Re-evaluate ClassyFire results to evaluate whether any substances can be reassigned. Check OPERA predictions for substances to do sanity check

In [5]:
classyfire_out = pd.read_csv(processed_dir+'tsca_classyfied.csv', index_col = 0)

In [6]:
classyfire_out.shape

(14247, 9)

In [7]:
classyfire_out.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14247 entries, 0 to 14246
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   INCHIKEY    14247 non-null  object 
 1   Kingdom     14247 non-null  object 
 2   Superclass  14247 non-null  object 
 3   Class       14247 non-null  object 
 4   Subclass    14247 non-null  object 
 5   Unnamed: 6  0 non-null      float64
 6   INCHIKEY.1  14247 non-null  object 
 7   Unnamed: 8  14247 non-null  object 
 8   dtxsid      14247 non-null  object 
dtypes: float64(1), object(8)
memory usage: 1.1+ MB


In [8]:
classyfire_out['Unnamed: 8'].value_counts(dropna = False)

OK    14247
Name: Unnamed: 8, dtype: int64

In [9]:
classyfire_out.drop(['Unnamed: 8', 'Unnamed: 6'], axis = 1, inplace = True)

In [10]:
cols = classyfire_out.columns.tolist()

In [11]:
cols = [cols[-1]] + cols[:-1]


In [12]:
cols = cols[:-1]
cols

['dtxsid', 'INCHIKEY', 'Kingdom', 'Superclass', 'Class', 'Subclass']

In [13]:
classyfire_out = classyfire_out[cols]

In [14]:
missing_cats = classyfire_out[classyfire_out['Kingdom']=="['err', 'err', 'err', 'err']"]

In [16]:
missing_df = missing_cats[['INCHIKEY']].copy()


In [17]:
missing_df

Unnamed: 0,INCHIKEY
12,AASDJASZOZGYMM-UHFFFAOYSA-N
130,AGQKROMSWCHOND-UHFFFAOYSA-N
344,AQMKNVZQXJVVNL-UHFFFAOYNA-N
359,ARIDPXUNLFCABW-UHFFFAOYSA-N
395,ASWYBBQZAQJKMQ-UHFFFAOYNA-N
...,...
14085,ZSMNRKGGHXLZEC-UHFFFAOYSA-N
14139,ZUVBIBLYOCVYJU-UHFFFAOYSA-N
14149,ZVEZMVFBMOOHAT-UHFFFAOYSA-N
14177,ZWNPUELCBZVMDA-UHFFFAOYSA-N


In [19]:
import re
import time
import requests

In [21]:
groups = ['kingdom','superclass','class','subclass']

classy_data = []
for idx, inchikey in missing_df['INCHIKEY'].items():
    classyfire = [inchikey]

    response = requests.get(f'http://classyfire.wishartlab.com/entities/{inchikey}.json').json()
    if 'code' in response:
        time.sleep(1)
        response = requests.get(f'http://classyfire.wishartlab.com/entities/{inchikey}.json').json()
        try:
            for group in groups:
                if response[group] is not None:        
                    category_name = response[group]['name']
                    classyfire.append(category_name)
                else:
                    classyfire.append('N/a')
        except:
            classyfire.append(['err','err','err','err'])
    classy_data.append(classyfire)

In [23]:
df = pd.DataFrame(classy_data,columns=['INCHIKEY','Kingdom','Superclass','Class','Subclass'])

In [27]:
df

Unnamed: 0,INCHIKEY,Kingdom,Superclass,Class,Subclass
0,AASDJASZOZGYMM-UHFFFAOYSA-N,,,,
1,AGQKROMSWCHOND-UHFFFAOYSA-N,,,,
2,AQMKNVZQXJVVNL-UHFFFAOYNA-N,,,,
3,ARIDPXUNLFCABW-UHFFFAOYSA-N,,,,
4,ASWYBBQZAQJKMQ-UHFFFAOYNA-N,,,,
...,...,...,...,...,...
189,ZSMNRKGGHXLZEC-UHFFFAOYSA-N,Organic compounds,Organometallic compounds,Organometalloid compounds,Organosilicon compounds
190,ZUVBIBLYOCVYJU-UHFFFAOYSA-N,,,,
191,ZVEZMVFBMOOHAT-UHFFFAOYSA-N,,,,
192,ZWNPUELCBZVMDA-UHFFFAOYSA-N,,,,


In [30]:
df2 = pd.merge(classyfire_out, df, on = 'INCHIKEY', how = 'left')

In [39]:
df2['Kingdom'] = df2[['Kingdom_x', 'Kingdom_y']].apply(lambda x: x['Kingdom_x'] if np.all(pd.isnull(x['Kingdom_y'])) else x['Kingdom_y'], axis = 1)

#apply(lambda x: my_func(x) if(np.all(pd.notnull(x[1]))) else x, axis = 1)

In [41]:
df2['Superclass'] = df2[['Superclass_x', 'Superclass_y']].apply(lambda x: x['Superclass_x'] if np.all(pd.isnull(x['Superclass_y'])) else x['Superclass_y'], axis = 1)

In [42]:
df2['Class'] = df2[['Class_x', 'Class_y']].apply(lambda x: x['Class_x'] if np.all(pd.isnull(x['Class_y'])) else x['Class_y'], axis = 1)

In [44]:
df2.to_csv(interim_dir+'chk.csv')

In [50]:
df2[df2['Kingdom_x'] == "['err', 'err', 'err', 'err']"].head()

Unnamed: 0,dtxsid,INCHIKEY,Kingdom_x,Superclass_x,Class_x,Subclass_x,Kingdom_y,Superclass_y,Class_y,Subclass_y,Kingdom,Superclass,Class
12,DTXSID90505110,AASDJASZOZGYMM-UHFFFAOYSA-N,"['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']",,,,,"['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']"
130,DTXSID901015293,AGQKROMSWCHOND-UHFFFAOYSA-N,"['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']",,,,,"['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']"
344,DTXSID101021036,AQMKNVZQXJVVNL-UHFFFAOYNA-N,"['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']",,,,,"['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']"
359,DTXSID30893947,ARIDPXUNLFCABW-UHFFFAOYSA-N,"['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']",,,,,"['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']"
395,DTXSID901119506,ASWYBBQZAQJKMQ-UHFFFAOYNA-N,"['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']",,,,,"['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']","['err', 'err', 'err', 'err']"


In [None]:
for index in df_rev[mask_gene].index:
    if df_rev.loc[index, 'standard_assay_type'] == 'Other mutation':
        df_rev.loc[index, 'standard_assay_type'] = 'bacterial reverse mutation test'


In [62]:
for index in df2.index:
    if df2.loc[index, 'Kingdom'] == "['err', 'err', 'err', 'err']":
        df2.loc[index, 'Kingdom'] = 'Other'

In [63]:
df2['Kingdom'].value_counts()

Organic compounds       13477
Inorganic compounds       593
Other                     133
[err, err, err, err]       46
Name: Kingdom, dtype: int64

In [64]:
for index in df2.index:
    if df2.loc[index, 'Superclass'] == "['err', 'err', 'err', 'err']":
        df2.loc[index, 'Superclass'] = 'Other'

In [65]:
for index in df2.index:
    if df2.loc[index, 'Class'] == "['err', 'err', 'err', 'err']":
        df2.loc[index, 'Class'] = 'Other'

In [74]:
df2['Kingdom'].astype(str).unique()

array(['Organic compounds', 'Other', 'Inorganic compounds',
       "['err', 'err', 'err', 'err']"], dtype=object)

In [82]:
for index in df2[~df2['Kingdom'].isin(['Organic compounds', 'Other', 'Inorganic compounds'])].index:
    df2[~df2['Kingdom'].isin(['Organic compounds', 'Other', 'Inorganic compounds'])].loc[index, 'Kingdom'] = 'Other'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[~df2['Kingdom'].isin(['Organic compounds', 'Other', 'Inorganic compounds'])].loc[index, 'Kingdom'] = 'Other'


In [84]:
df2.Kingdom.value_counts()

Organic compounds       13477
Inorganic compounds       593
Other                     133
[err, err, err, err]       46
Name: Kingdom, dtype: int64

In [91]:
df2['Kingdom'] = df2['Kingdom'].astype(str)

In [92]:
df2.Kingdom.value_counts()

Organic compounds               13477
Inorganic compounds               593
Other                             133
['err', 'err', 'err', 'err']       46
Name: Kingdom, dtype: int64

In [93]:
for index in df2.index:
    if df2.loc[index, 'Kingdom'] == "['err', 'err', 'err', 'err']":
        df2.loc[index, 'Kingdom'] = 'Other'

In [94]:
df2.Kingdom.value_counts()

Organic compounds      13477
Inorganic compounds      593
Other                    179
Name: Kingdom, dtype: int64

In [96]:
df2.to_csv(interim_dir+'chk.csv')

In [98]:
df3 = df2.copy()

In [102]:
df3 = df3[['dtxsid', 
       'Kingdom', 'Superclass', 'Class']]

In [103]:
df3.to_csv(processed_dir+'ClassyFire.csv')