In [1]:
import pandas as pd
import numpy as np
import numba
#import dask.dataframe as dd

import sys

from IPython.display import HTML
from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools # headsup: this import change the behavior of dataframes with mols in them
# some global configuration of the pandastools
PandasTools.molRepresentation = 'svg'
PandasTools.molSize = (200,200)

#constatns
HA_TO_EV = 27.211399
PERIODICTABLE = Chem.GetPeriodicTable()

# this is a little helper function to render images inside a dataframe
# once again, there are ways to monkey patch the rendering of dataframes, but I am trying to 
# avoid most of that to make things a bit easier to understand

def show(df):
    return HTML(df.to_html(escape=False))

In [3]:
# from dask.distributed import Executor
# e = Executor('localhost:8786', set_as_default=True)

In [4]:
import os
import glob

path = r'/home/denn/home/ml/data/sf/'
all_files = glob.glob(os.path.join(path, "MW120_1000_*.xls"))
all_files.append('/home/denn/home/ml/data/sf/reaxys_uv_all_anthracene_plus_mw1300_1_4999.xls')
all_files.append('/home/denn/home/ml/data/sf/reaxys_uv_all_anthracene_plus_mw1300_5000_9361.xls')
all_files.append('/home/denn/home/ml/data/sf/reaxys_uv_all_perylene_1_2250.xls')
pds = []
for f in all_files:
    print(f)
    pds.append(pd.read_csv(f,delimiter='\t',low_memory=False))
    
reax_full=pd.concat(pds, ignore_index=True)
print(reax_full.shape)

/home/denn/home/ml/data/sf/MW120_1000_UV360_440_80000_84999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_100000_104999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_25000_29999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_5_9999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_185000_189999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_140000_144999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_15000_19999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_55000_59999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_50000_54999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV440_70000_74999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_125000_129999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV440_90000_92739.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_105000_109999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_195000_199999.xls
/home/denn/home/ml/data/sf/MW120_1000_UV360_440_135000_139999.xls
/home/denn/home/ml/data/sf/MW120_1000_U

In [5]:
# uv_cols = [col for col in reax_full.columns if 'UV' in col]
# cols=uv_cols+['Structure']
#print(reax_full.columns)
#reax_full[cols].head()
need_cols=['Ext./Abs. Coefficient [l·mol-1cm-1]','Comment (UV/VIS Spectroscopy)','Molecular Formula','Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)','Solvent (UV/VIS Spectroscopy)','Linear Structure Formula','Structure','Number of References','Absorption Maxima (UV/VIS) [nm]']
ref_cols_name=[col for col in reax_full.columns if col.startswith('Ref')]

reax_uv=reax_full[need_cols+ref_cols_name].dropna(subset=['Absorption Maxima (UV/VIS) [nm]']).dropna(axis=1,how='all')
reax_uv=reax_uv[reax_uv['Comment (UV/VIS Spectroscopy)'].isnull()]
# reax_uv=reax_uv[reax_uv.Structure.str.match('([0-9CNOFS@+\-\[\]\(\)\\\/%=#$]+)$',case=False,as_indexer=True)]
#reax_uv=reax_uv[reax_uv.Structure.str.match('^((?!(Co|Se|Cs|Os|\[O\+8\]|\[C-3\]|\[N\+5\]|\[O\+3\]|\.)).)*$',case=True,as_indexer=True)]
reax_uv=reax_uv[~reax_uv.Structure.str.contains('.',regex=False)]
#reax_uv[cols][:30]
#print(reax_uv.columns)
#reax_full=reax_full.compute()
# reax_uv=reax_uv.drop_duplicates()
# print(reax_uv.shape)
solvent_syn = {'Solvent (UV/VIS Spectroscopy)': 
               {'CH2Cl2': 'dichloromethane',
                'CHCl3':'chloroform',
                'H2O':'water',
                'dimethyl sulfoxide':'dimethylsulfoxide',
               'N,N-dimethyl-formamide':'dimethylformamide'}
              }
reax_uv.replace(solvent_syn,inplace=True)

#print(reax_uv['Absorption Maxima (UV/VIS) [nm]'].count())
print(reax_uv.shape)

(363851, 97)


In [6]:
reax_uv=reax_uv.drop_duplicates()
print(reax_uv.shape)

(358006, 97)


In [7]:
import re
def mf2dict(mf_str):
    molfor=re.findall(r'([A-Z][a-z]*)(\d*)', mf_str)
    molfor1=map(lambda e: (e[0], 1) if e[1] == '' else (e[0],int(e[1])), molfor)
    return dict(molfor1)
reax_uv['mf']=reax_uv['Molecular Formula'].apply(mf2dict)
reax_uv['C_c']=reax_uv['mf'].apply(lambda d: d.get('C',0))

In [8]:
reax_uv=reax_uv[reax_uv.mf.apply(lambda m: bool(set(m.keys()) - set(['H','C','N','O','F','S','Cl','Br','I','P']))==False)]
print(reax_uv.shape)

(299341, 99)


In [9]:
print(reax_uv['Absorption Maxima (UV/VIS) [nm]'].count())

299341


In [10]:
reax_uv['Solvent (UV/VIS Spectroscopy)']=reax_uv['Solvent (UV/VIS Spectroscopy)'].fillna('not given')
#reax_uv.replace({‘a’: {‘b’: nan}})

In [11]:

# class1=''
# solvent_syn = {'Solvent (UV/VIS Spectroscopy)': 
#                {'CH2Cl2': class1,
#                 'CHCl3':'clorohydrocarbon',
#                 'dichloromethane':'clorohydrocarbon',
                
#                 'H2O':'water',
#                 'dimethyl sulfoxide':'dimethylsulfoxide'}
#               }
sol=reax_uv['Solvent (UV/VIS Spectroscopy)'].value_counts()
sol_count=sol[sol>600]
print(sol_count)
#print(set(sol_count.index.get_values())-set(['not given','methanol; various solvent(s)','various solvent(s)'])) #,'aq. NaOH','aq. buffer',''
#reax_uv[reax_uv['Solvent (UV/VIS Spectroscopy)']=='ethanol'][cols+['nm']]
#reax_full[reax_full['Solvent (UV/VIS Spectroscopy)']=='ethanol'][cols].sample(10)

methanol                        47132
ethanol                         37237
dichloromethane                 32770
chloroform                      28047
not given                       26624
acetonitrile                    18261
tetrahydrofuran                  7664
toluene                          7642
dimethylsulfoxide                7494
dimethylformamide                6692
cyclohexane                      5981
hexane                           5955
water                            5637
dioxane                          4636
benzene                          4510
various solvent(s)               3829
acetone                          3171
ethyl acetate                    2207
diethyl ether                    1674
aq. ethanol                      1643
1,4-dioxane                      1608
methanol; various solvent(s)     1605
acetic acid                      1053
CCl4                             1045
H2SO4                             837
aq. NaOH                          827
propan-2-ol 

In [12]:
reax_uv=reax_uv[reax_uv['Solvent (UV/VIS Spectroscopy)'].isin(set(sol_count.index.get_values())-set(['methanol; various solvent(s)','various solvent(s)']))]
print(reax_uv.shape)


(264379, 99)


In [13]:
reax_uv=reax_uv[(reax_uv.C_c>6) & (reax_uv.C_c<51)]
print(reax_uv.shape)

reax_uv=reax_uv[~(reax_uv['Chemical Name'].fillna('').str.contains('radical',regex=False))]
print(reax_uv.shape)
reax_uv=reax_uv[~(reax_uv['Chemical Name'].fillna('').str.contains('cation',regex=False))]
print(reax_uv.shape)
reax_uv=reax_uv[~(reax_uv['Chemical Name'].fillna('').str.contains('anion',regex=False))]
print(reax_uv.shape)
reax_uv=reax_uv[~(reax_uv['UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)'].fillna('').str.contains('two-photon',regex=False))]
print(reax_uv.shape)
reax_uv=reax_uv[~(reax_uv['Linear Structure Formula'].fillna('').str.contains('(1+)',regex=False))]
print(reax_uv.shape)
reax_uv=reax_uv[~(reax_uv['Linear Structure Formula'].fillna('').str.contains('(1-)',regex=False))]
print(reax_uv.shape)
reax_uv=reax_uv[~(reax_uv['Linear Structure Formula'].fillna('').str.contains('(2-)',regex=False))]
print(reax_uv.shape)
reax_uv=reax_uv[~(reax_uv['Linear Structure Formula'].fillna('').str.contains('(2+)',regex=False))]
print(reax_uv.shape)
reax_uv=reax_uv[~(reax_uv['Linear Structure Formula'].fillna('').str.contains('(3-)',regex=False))]
print(reax_uv.shape)
reax_uv=reax_uv[~(reax_uv['Linear Structure Formula'].fillna('').str.contains('(3+)',regex=False))]
print(reax_uv.shape)

(245762, 99)
(244478, 99)
(244338, 99)
(244138, 99)
(243874, 99)
(242292, 99)
(241635, 99)
(241463, 99)
(241311, 99)
(241279, 99)
(241259, 99)


In [25]:
#'C_c',

# ref_cols_name=[col for col in reax_uv.columns if col.startswith('Ref')]
# def fix_ref(rows):
#     ref_col=rows.dropna(axis=1,how='all')
#     if len(ref_col)>0:
#         return ref_col.ix[:,0]
#     else:
#         return None

# # def fix_ref(rows):
# #     ref_col=rows[ref_cols_name].dropna(axis=1,how='all').columns
# #     if len(ref_col)>0:
# #         return rows[ref_col[0]]
# #     else:
# #         return None
# #reax_ref=reax.set_index(['InChI Key','Solvent (UV/VIS Spectroscopy)'])
# reference=reax_uv[need_cols+ref_cols_name].groupby(['InChI Key','Solvent (UV/VIS Spectroscopy)'],group_keys=False).apply(lambda rows: fix_ref(rows[ref_cols_name]))
# #reax_uv['ref']
# #reference=reax_uv[:100].set_index(['InChI Key','Solvent (UV/VIS Spectroscopy)'])[ref_cols_name].groupby(level=(0,1)).apply(lambda rows: fix_ref(rows[ref_cols_name]))


In [16]:
need_cols=['Ext./Abs. Coefficient [l·mol-1cm-1]','Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)','Solvent (UV/VIS Spectroscopy)','Linear Structure Formula','Structure','Number of References','Absorption Maxima (UV/VIS) [nm]']
ref_cols_name=[col for col in reax_uv.columns if col.startswith('Ref')]
def fix_ref(rows):
    counts=rows.count()
    max_indx=counts.argmax()
    #ref_col=rows.dropna(axis=1,how='any')
    if counts[max_indx]>0:
        return rows.loc[:,max_indx]
    else:
        return None

reference=reax_uv[need_cols+ref_cols_name].groupby(['Substance Identification: Reaxys Registry Number','Solvent (UV/VIS Spectroscopy)','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)'],group_keys=False).apply(lambda rows: fix_ref(rows[ref_cols_name]))


In [66]:
#reference.fillna('').count()

150087

In [17]:
reax_uv['ref']=reference.fillna('')


In [94]:
reax_uv['ref']=reax_uv['ref'].fillna('')
reax_uv['UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)']=reax_uv['UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)'].fillna('')
#reax_uv.sample(10)

In [95]:
#'C_c',
need_cols=['ref','Ext./Abs. Coefficient [l·mol-1cm-1]','Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)','Solvent (UV/VIS Spectroscopy)','Linear Structure Formula','Structure','Number of References','Absorption Maxima (UV/VIS) [nm]']
def fix_reax(rows):
    if rows['ref'].iloc[0]!='':
        if rows.shape[0]==2:
            nms1=[float(x) for x in rows['Absorption Maxima (UV/VIS) [nm]'].iloc[0].split(';')]
            nms2=[float(x) for x in rows['Absorption Maxima (UV/VIS) [nm]'].iloc[1].split(';')]
            nms12 = nms1 + nms2
            nms21 = nms2 + nms1
            if nms12==sorted(nms12) or nms12==sorted(nms12,reverse=True) or nms21==sorted(nms21) or nms21==sorted(nms21,reverse=True):
                try:
                    eps=len([float(x) for x in rows['Ext./Abs. Coefficient [l·mol-1cm-1]'].str.cat(sep='; ').split(';')])
                except ValueError:
                    eps=0
                if eps==0 or eps==len(nms12):
                    ret_r=rows.head(1)
                    ret_r['Absorption Maxima (UV/VIS) [nm]']=rows['Absorption Maxima (UV/VIS) [nm]'].str.cat(sep='; ') #['Absorption Maxima (UV/VIS) [nm]'].apply(lambda a: a.str.cat(sep=';'))  .apply(lambda a: print(a.dropna(axis=1)))
                    ret_r['Ext./Abs. Coefficient [l·mol-1cm-1]']=rows['Ext./Abs. Coefficient [l·mol-1cm-1]'].str.cat(sep='; ')
                    return ret_r
        elif rows.shape[0]==3:
            nms1=[float(x) for x in rows['Absorption Maxima (UV/VIS) [nm]'].iloc[0].split(';')]
            nms2=[float(x) for x in rows['Absorption Maxima (UV/VIS) [nm]'].iloc[1].split(';')]
            nms3=[float(x) for x in rows['Absorption Maxima (UV/VIS) [nm]'].iloc[2].split(';')]
            nms123 = nms1 + nms2 + nms3
            nms321 = nms3 + nms2 + nms1
            nms231 = nms2 + nms3 + nms1
            if nms123==sorted(nms123) or nms123==sorted(nms123,reverse=True) or nms321==sorted(nms321) or nms321==sorted(nms321,reverse=True) or (nms231==sorted(nms231) and len(nms2)==4 and len(nms3)==4):
                try:
                    eps=len([float(x) for x in rows['Ext./Abs. Coefficient [l·mol-1cm-1]'].str.cat(sep='; ').split(';')])
                except ValueError:
                    eps=0
                if eps==0 or eps==len(nms123):
                    ret_r=rows.head(1)
                    ret_r['Absorption Maxima (UV/VIS) [nm]']=rows['Absorption Maxima (UV/VIS) [nm]'].str.cat(sep='; ') #['Absorption Maxima (UV/VIS) [nm]'].apply(lambda a: a.str.cat(sep=';'))  .apply(lambda a: print(a.dropna(axis=1)))
                    ret_r['Ext./Abs. Coefficient [l·mol-1cm-1]']=rows['Ext./Abs. Coefficient [l·mol-1cm-1]'].str.cat(sep='; ')
                    return ret_r
        else:
            return rows
    return rows
    
reax_fix1=reax_uv[:][need_cols].drop_duplicates().groupby(['Substance Identification: Reaxys Registry Number','Solvent (UV/VIS Spectroscopy)','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)','ref'],group_keys=False).apply(lambda rows: fix_reax(rows))


#,group_keys=False

In [96]:
print(sum(reax_uv.index.duplicated(keep=False)))
print(sum(reax_fix1.index.duplicated(keep=False)))
reax_uv[:][need_cols].drop_duplicates().info()
reax_fix1.info()

0
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 232371 entries, 0 to 700664
Data columns (total 12 columns):
ref                                                       232371 non-null object
Ext./Abs. Coefficient [l·mol-1cm-1]                       119913 non-null object
Substance Identification: Reaxys Registry Number          232371 non-null int64
InChI Key                                                 231528 non-null object
CAS Registry Number                                       135335 non-null object
Chemical Name                                             180765 non-null object
UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)    232371 non-null object
Solvent (UV/VIS Spectroscopy)                             232371 non-null object
Linear Structure Formula                                  232371 non-null object
Structure                                                 232371 non-null object
Number of References                                      232371 non-null in

In [97]:
print(sum(reax_uv.duplicated(subset='InChI Key')))
print(sum(reax_uv.duplicated(subset='Substance Identification: Reaxys Registry Number')))

114247
113049


In [99]:
col='Substance Identification: Reaxys Registry Number'
col='InChI Key'
print(reax_uv[col].value_counts()[:10])
print(reax_uv[col].unique().shape)
print(reax_fix1[col].value_counts()[:10])
print(reax_fix1[col].unique().shape)


YNHJECZULSZAQK-LWQDQPMZSA-N    346
REFJWTPEDVJJIY-UHFFFAOYSA-N    334
IKGXIBQEEMLURG-NVPNHPEKSA-N    233
KZNIFHPLKGYRTM-UHFFFAOYSA-N    206
IYRMWMYZSQPJKC-UHFFFAOYSA-N    191
IQPNAANSBPBGFQ-UHFFFAOYSA-N    183
OVSQVDMCBVZWGM-QSOFNFLRSA-N    182
JPUKWEQWGBDDQB-QSOFNFLRSA-N    170
UWOVWIIOKHRNKU-UHFFFAOYSA-N    159
OENHQHLEOONYIE-JLTXGRSLSA-N    150
Name: InChI Key, dtype: int64
(127012,)
REFJWTPEDVJJIY-UHFFFAOYSA-N    322
YNHJECZULSZAQK-LWQDQPMZSA-N    303
IKGXIBQEEMLURG-NVPNHPEKSA-N    230
KZNIFHPLKGYRTM-UHFFFAOYSA-N    200
IYRMWMYZSQPJKC-UHFFFAOYSA-N    183
OVSQVDMCBVZWGM-QSOFNFLRSA-N    181
IQPNAANSBPBGFQ-UHFFFAOYSA-N    180
JPUKWEQWGBDDQB-QSOFNFLRSA-N    164
UWOVWIIOKHRNKU-UHFFFAOYSA-N    152
OENHQHLEOONYIE-JLTXGRSLSA-N    148
Name: InChI Key, dtype: int64
(127012,)


In [100]:
#print(reax_fix1[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']][reax_fix1['Absorption Maxima (UV/VIS) [nm]'].str.contains('|',regex=False)].count())
#reax_fix1[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']][reax_fix1['Absorption Maxima (UV/VIS) [nm]'].str.contains('|',regex=False)]
reax_fix1.shape

(220551, 12)

In [31]:
#show(reax_uv.loc[28351:28353].dropna(axis=1,how='all'))

In [None]:
#reax_uv1=reax_uv

In [101]:
reax_uv1=reax_fix1.copy()

In [102]:
# [Br,C,Ca,Cl,F,H,I,K,Mg,N,Na,O,P,S,Zn]
# a = set(['H','C','N','O','F','S','Cl','Br','I'])
# b = set(['C','O','Fe'])

# b-a

In [103]:
# ref_cols_name=[col for col in reax_uv.columns if col.startswith('Ref')]
# def fix_reax(rows):
#     if rows.shape[1]>1:
#         ref_col=rows[ref_cols_name].dropna(axis=1).columns[0]
#         print(rows.groupby(ref_col)['Absorption Maxima (UV/VIS) [nm]'].apply(lambda a: a.str.cat(sep=';'))) #['Absorption Maxima (UV/VIS) [nm]'].apply(lambda a: a.str.cat(sep=';'))  .apply(lambda a: print(a.dropna(axis=1)))
#     return rows
    
# t1=reax_uv[reax_uv['InChI Key']=='WCSKDPQVNKZNQD-LWQDQPMZSA-N'].groupby(['InChI Key','Solvent (UV/VIS Spectroscopy)']).apply(lambda rows: fix_reax(rows))



In [104]:
#t1

In [105]:
# from cytoolz.curried import pipe, map, filter
# def max_nm(nms_str:str):
#     #nms = [float(x) for x in nms_str.split(';')]
#     nms_split=nms_str.split(';')
#     nms=pipe(nms_split, map(float),filter(lambda x: x<2000),list)
#     return np.max(nms)

# %timeit max_nm('401.929; 238.095; 250; 285.714')

from builtins import map, filter
def max_nm(nms_str:str):
    nms = [float(x) for x in nms_str.split(';')]
    #there are some very high values >10000 nm
    #nms = list(filter(lambda x: x<2000,map(float, nms_str.split(';'))))
    if len(nms)==0:
        return np.nan
    else:
        return np.max(nms)

def max_eps(nms_str:str):
    nms = [float(x) for x in str(nms_str).split(';')]
    if len(nms)==0:
        return np.nan
    else:
        return nms[0]

    
def s2_nm(nms_str:str):
    nms = [float(x) for x in nms_str.split(';')]
    #there are some very high values >10000 nm
    #nms = list(filter(lambda x: x<2000,map(float, nms_str.split(';'))))7110
    if len(nms)<2:
        return np.nan
    else:
        return nms[1]

def s2_eps(nms_str:str):
    nms = [float(x) for x in str(nms_str).split(';')]
    if len(nms)<2:
        return np.nan
    else:
        return nms[1]

    
def s3_nm(nms_str:str):
    #nms = [float(x) for x in nms_str.split(';')]
    #there are some very high values >10000 nm7110
    nms = list(filter(lambda x: x<2000,map(float, nms_str.split(';'))))
    if len(nms)<3:
        return np.nan
    else:
        return nms[2]

def uv_parse(abs_nm,eps):
    abs_nms = np.array([float(x) for x in str(abs_nm).split(';')])
    s1_nm=np.nan
    s1_eps=np.nan
    s2_nm=np.nan
    s2_eps=np.nan
    s3_nm=np.nan
    s3_eps=np.nan
    len_a = len(abs_nms)
    idx_a = np.argsort(abs_nms)
    if len_a>0:
        s1_nm=abs_nms[idx_a][-1]
    if len_a>1:
        s2_nm=abs_nms[idx_a][-2]
    if len_a>2:
        s3_nm=abs_nms[idx_a][-3]
    try:
        epss = np.array([float(x) for x in str(eps).split(';')])
        if (len_a==len(epss)):
            if len_a>0:
                s1_eps=epss[idx_a][-1]
            if len_a>1:
                s2_eps=epss[idx_a][-2]
            if len_a>2:
                s3_eps=epss[idx_a][-3]
    except ValueError:
        pass
    return pd.Series([s1_nm,s1_eps,s2_nm,s2_eps,s3_nm,s3_eps,len_a], ['s1_nm','s1_eps','s2_nm','s2_eps','s3_nm','s3_eps','abs_count'])


# def prs1(abs_nms,epss):
#     s1_nm=np.nan
#     s1_eps=np.nan
#     s2_nm=np.nan
#     s2_eps=np.nan
#     s3_nm=np.nan
#     s3_eps=np.nan
#     len_a = len(abs_nms)
#     idx_a = np.argsort(abs_nms)
#     if len_a>0:
#         s1_nm=abs_nms[idx_a][-1]
#     if len_a>1:
#         s2_nm=abs_nms[idx_a][-2]
#     if len_a>2:
#         s3_nm=abs_nms[idx_a][-3]
#     if (len_a==len(epss)):
#         if len_a>0:
#             s1_eps=epss[idx_a][-1]
#         if len_a>1:
#             s2_eps=epss[idx_a][-2]
#         if len_a>2:
#             s3_eps=epss[idx_a][-3]
    
# def uv_parse(abs_nm,eps):
#     abs_nms = np.array([float(x) for x in str(abs_nm).split(';')])
#     epss = np.array([float(x) for x in str(eps).split(';')])

#     return pd.Series([s1_nm,s1_eps,s2_nm,s2_eps,s3_nm,s3_eps], ['s1_nm','s1_eps','s2_nm','s2_eps','s3_nm','s3_eps',abs])

res = reax_uv1[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']].apply(lambda row: uv_parse(row['Absorption Maxima (UV/VIS) [nm]'],row['Ext./Abs. Coefficient [l·mol-1cm-1]']),axis=1)
#fnuv_parse = np.vectorize(uv_parse)
reax_uv1 = reax_uv1.join(res)
#reax_uv['s1_nm']=reax_uv['Absorption Maxima (UV/VIS) [nm]'].apply(max_nm)#, meta=('x', 'f8'))
# reax_uv['eps']=reax_uv['Ext./Abs. Coefficient [l·mol-1cm-1]'].apply(max_eps)
# reax_uv['s2_nm']=reax_uv['Absorption Maxima (UV/VIS) [nm]'].apply(s2_nm)#, meta=('x', 'f8'))
# reax_uv['s2_eps']=reax_uv['Ext./Abs. Coefficient [l·mol-1cm-1]'].apply(s2_eps)
# reax_uv['s3_nm']=reax_uv['Absorption Maxima (UV/VIS) [nm]'].apply(s3_nm)#, meta=('x', 'f8'))
#reax_uv['nm']=reax_uv['Absorption Maxima (UV/VIS) [nm]'].apply(lambda nms_str: max_nm(nms_str.split(';')))#, meta=('x', 'f8'))
#print(max_nm('401.929; 238.095; 250; 285.714'))
#reax_uv[cols+['nm']][:50]
#reax_uv=reax_uv.dropna(subset=['nm'])
#%timeit max_nm1('401.929; 238.095; 250; 285.714')

In [34]:
#reference
#t1=t1.dropna(how='all')
#t1['Absorption Maxima (UV/VIS) [nm]'].str.contains('|',regex=False)
#t1[t1['Absorption Maxima (UV/VIS) [nm]'].str.contains('|',regex=False)]
#reax_fix1.to_pickle('/home/denn/home/ml/data/sf/reaxys4_fix.pickle')

In [35]:
#reax_uv1.ix[225444]
#reax_uv[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']].values[:10]
#res=fnuv_parse(reax_uv[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']].values[:10])
#res=fnuv_parse(reax_uv['Absorption Maxima (UV/VIS) [nm]'].values[:10],reax_uv['Ext./Abs. Coefficient [l·mol-1cm-1]'].values[:10])

In [26]:
#res[res.s1_nm>2000].count()

In [134]:
# sample=reax_uv.sample(10).copy()
# sample[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']]

In [133]:
#sample.dropna(axis=1,how='all')
#sample.count()

In [141]:
# reax_uv=reax_uv[(reax_uv['Solvent (UV/VIS Spectroscopy)']!='water')&(reax_uv['Solvent (UV/VIS Spectroscopy)']!='various solvent(s)')&(reax_uv['Solvent (UV/VIS Spectroscopy)']!='methanol; various solvent(s)')\
#                &(reax_uv['Solvent (UV/VIS Spectroscopy)']!='aq. ethanol')]

In [28]:
#print(sum(reax_uv1.index.duplicated(keep=False)))
#reax_uv1.info()

0


In [106]:
reax_uv1=reax_uv1[(reax_uv1.s1_nm<1600) & (reax_uv1.s1_nm>150)]
print(reax_uv1.shape)


(220431, 19)


In [276]:
#reax_uv1.sample(5)

In [None]:
# reax_solv.shape

In [None]:
#reax_solv['Molecular Weight'].median()
#reax_solv[reax_solv['Number of References']==1]['Number of References'].count()

In [72]:
#reax_solv[reax_solv.mf.apply(len)==2].sample(10).dropna(axis=1,how='all')
#reax_uv[reax_uv.C_c==0].sample(5).dropna(axis=1,how='all')
#reax_uv[reax_uv['InChI Key']=='MWPLVEDNUUSJAV-UHFFFAOYSA-N'].dropna(axis=1,how='all')

In [None]:
#reax_solv.count()

In [None]:
#reax_solv['Comment (UV/VIS Spectroscopy)'].dropna()[:100]
#'UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)'

In [146]:
# need_cols=['s1_nm','s1_eps','s2_nm','s2_eps','s3_nm','s3_eps','Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)','Solvent (UV/VIS Spectroscopy)','C_c','Linear Structure Formula','Structure','Number of References','Absorption Maxima (UV/VIS) [nm]']

In [30]:
# print(reax_uv1.shape)
# reax=reax_uv1.drop_duplicates()
# #reax_solv_i=reax_solv[need_cols].set_index(['InChI Key']) #,'Solvent (UV/VIS Spectroscopy)'
# #reax_solv[reax_solv.C_c==0 | [reax_solv['CAS Registry Number'].isnull()][need_cols]
# print(reax.shape)

(134474, 19)
(134474, 19)


In [120]:
#show(reax.sort_values(by='s1_nm',ascending=False)[:30].dropna(axis=1,how='all'))

In [278]:
# def reject_outliers(sr, iq_range=0.5):
#     pcnt = ((1 - iq_range) / 2)*100
#     qlow, median, qhigh = np.percentile(sr,[pcnt, 50, 100-pcnt])
#     iqr = qhigh - qlow
#     return sr[ np.abs(sr - median) <= iqr]

In [293]:
# reax1=reax.groupby(['InChI Key','Solvent (UV/VIS Spectroscopy)'])['s1_nm'].agg({
# #                                                    'mode':lambda x: x.value_counts().idxmax(),
# #                                                    'mode_count':lambda x: x.value_counts().max(),
#                                                    'std':'std',
# #                                                    'median':np.median,
#                                                    'max':'max',
#                                                    'min':'min',
#                                                    'max_min':lambda x: np.max(x)-np.min(x),
#                                                    'count':'count',
#                                                    'mean':'mean',
#                                                    'mean_rj':lambda x: np.mean(reject_outliers(x,iq_range=0.5)),
#                                                    'std_rj':lambda x: np.std(reject_outliers(x,iq_range=0.5))
#                                                    })


In [31]:
# print(reax1[(reax1['max_min']<20)&(reax1['count']>1)].count())
# reax1[(reax1['max_min']>100)&(reax1['count']>0)].sort_values(by='max_min',ascending=False)[:30]

In [32]:
# #reax[reax['InChI Key']=='OYJOYFYJDAFOEI-UHFFFAOYSA-N']
# #show(reax[reax['InChI Key']=='ZGUGWUXLJSTTMA-UHFFFAOYSA-N'].dropna(axis=1))
# show(reax[reax['Substance Identification: Reaxys Registry Number']==2121226].dropna(axis=1))

In [99]:
#df_solvents=reax_solv[['Solvent (UV/VIS Spectroscopy)','nm','Structure','InChI Key']].set_index([]).pivot(columns='Solvent (UV/VIS Spectroscopy)')
#df_solvents=reax_solv_i[['Solvent (UV/VIS Spectroscopy)','nm']].pivot(columns='Solvent (UV/VIS Spectroscopy)')

In [33]:


# #dft = np.logical_not(reax_uv['Solvent (UV/VIS Spectroscopy)'].str.contains('acid|buffer|H2O',na=False))
# dft = (reax_uv['Solvent (UV/VIS Spectroscopy)']=='methanol')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='ethanol')
# #dft = (reax_uv['Solvent (UV/VIS Spectroscopy)']=='methanol')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='ethanol')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='acetonitrile')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='tetrahydrofuran')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='dioxane')
# #dft = (reax_uv['Solvent (UV/VIS Spectroscopy)']=='CHCl3')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='CH2Cl2')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='dichloromethane')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='hexane')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='cyclohexane')
# reax_uv1=reax_uv[dft]
# reax_uv1.nm.count()

In [107]:
reax_uv1['s1_ev']=1240./reax_uv1['s1_nm']

In [108]:
reax_s1=reax_uv1[:].groupby('Substance Identification: Reaxys Registry Number')['s1_ev','s1_eps'].agg({
                                                   #'mode':lambda x: x.value_counts().idxmax(),
                                                   #'mode_count':lambda x: x.value_counts().max(),
                                                   's1_ev': {
                                                       'std':'std',
                                                       'median':'median',
                                                       'max':'max',
                                                       'min':'min',
                                                       #'max_min':lambda x: np.max(x)-np.min(x),
                                                       'count':'count',
                                                       'mean':'mean',
                                                       #'mean_rj':lambda x: np.mean(reject_outliers(x.values,iq_range=0.5)),
                                                       #'std_rj':lambda x: np.std(reject_outliers(x,iq_range=0.5))
                                                   },
                                                    's1_eps': {'max_eps':'max'}
                                                    })

reax_s1['max_min']=reax_s1['max']-reax_s1['min']

In [109]:
print(sum(reax_s1.index.duplicated(keep=False)))
reax_s1.info()
#reax_s1['count'].value_counts()

0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 128171 entries, 1807 to 30480261
Data columns (total 8 columns):
max_eps    76690 non-null float64
median     128171 non-null float64
count      128171 non-null int64
mean       128171 non-null float64
std        31933 non-null float64
min        128171 non-null float64
max        128171 non-null float64
max_min    128171 non-null float64
dtypes: float64(7), int64(1)
memory usage: 8.8 MB


In [110]:
import math
@numba.jit(nopython=True,cache=True)
def percentile(N, percent):
    """
    Find the percentile of a list of values.

    @parameter N - is a list of values. Note N MUST BE already sorted.
    @parameter percent - a float value from 0.0 to 1.0.
    @parameter key - optional key function to compute value from each element of N.

    @return - the percentile of the values
    """
    k = (N.shape[0]-1) * percent
    f = math.floor(k)
    c = math.ceil(k)
    if f == c:
        return (N[int(k)])
    d0 = (N[int(f)]) * (c-k)
    d1 = (N[int(c)]) * (k-f)
    return d0+d1

@numba.jit(nopython=True,cache=True)
def reject_outliers3(sr, iq_range=0.5):
    pcnt = ((1-iq_range) / 2)
    N=sr
    qlow = percentile(N,pcnt)
    median = percentile(N,0.5)
    qhigh = percentile(N,1-pcnt)
    iqr = qhigh - qlow
    return sr[ np.abs(sr - median) <= iqr]

@numba.jit(nopython=True,cache=True)
def mean_rj(sr, iq_range=0.5):
    sr.sort()
    return np.mean(reject_outliers3(sr,iq_range=0.5))

@numba.jit(nopython=True,cache=True)
def std_rj(sr, iq_range=0.5):
    sr.sort()
    return np.std(reject_outliers3(sr,iq_range=0.5))    
    
def reject_outliers1(sr, iq_range=0.5):
    pcnt = (1 - iq_range) / 2
    qlow, median, qhigh = sr.dropna().quantile([pcnt, 0.50, 1-pcnt])
    iqr = qhigh - qlow
    return sr[ (sr - median).abs() <= iqr]

# ar=np.array([1,2,3,4,5,5,5,5,4,7,8,9],dtype=int)
# sr=pd.Series(ar)
# %timeit mean_rj(np.array(sr.values,dtype=int),0.1)
# %timeit std_rj(np.array(sr.values,dtype=int),0.1)
#%timeit reject_outliers1(sr,0.1)
#print(reject_outliers(sr,0.5))
#print(reject_outliers1(sr,0.5))

#help(sr.values)

In [111]:
reax_s1_rj=reax_uv1[:].groupby('Substance Identification: Reaxys Registry Number')['s1_ev'].agg({
                                                   #'mode':lambda x: x.value_counts().idxmax(),
                                                   #'mode_count':lambda x: x.value_counts().max(),
                                                   's1_ev': {
                                                      'mean_rj':lambda x: mean_rj(x.values,iq_range=0.5) if x.shape[0]>2 else x.mean(),
                                                      'std_rj':lambda x: std_rj(x.values,iq_range=0.5) if x.shape[0]>2 else np.nan,
                                                   },
                                                    })

In [39]:
#reax_s1_rj.dropna().sample(10)

In [112]:
reax_s1[['std_rj','mean_rj']]=reax_s1_rj['s1_ev'][['std_rj','mean_rj']]
#reax_s1['std_rj']=reax_s1_rj['std_rj']
#reax_s1.columns


In [113]:
#'Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name',
print(reax_s1.count())
print(reax_s1.shape)
print(reax_s1['max_min'].mean())

reax_s1[reax_s1['std_rj']>0.1][:10]
print(sum(reax_s1.index.duplicated(keep=False)))
reax_s1.info()

max_eps     76690
median     128171
count      128171
mean       128171
std         31933
min        128171
max        128171
max_min    128171
std_rj      14512
mean_rj    128171
dtype: int64
(128171, 10)
0.112650039786
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 128171 entries, 1807 to 30480261
Data columns (total 10 columns):
max_eps    76690 non-null float64
median     128171 non-null float64
count      128171 non-null int64
mean       128171 non-null float64
std        31933 non-null float64
min        128171 non-null float64
max        128171 non-null float64
max_min    128171 non-null float64
std_rj     14512 non-null float64
mean_rj    128171 non-null float64
dtypes: float64(9), int64(1)
memory usage: 10.8 MB


In [114]:


# clean=reax_uv_gr[(reax_uv_gr['std_rj']>10)]
# print(.count())
df_filter=reax_s1.copy()
# df_filter=df_filter[np.logical_not((df_filter['std_rj']>10) & (df_filter['count']>2))]
# print(df_filter['count'].count())
#df_filter=df_filter[(df_filter['std'].isnull())|(df_filter['std']<10)]
#df_filter=df_filter[(df_filter['std_rj'].isnull())|(df_filter['std_rj']<20)]
df_filter=df_filter[df_filter['max_min']<0.5]
print(df_filter['count'].count())
print(df_filter['max_min'].max())
print(df_filter['max_min'].mean())
#print(df_filter[df_filter['count']==1].count())
print(df_filter.shape)
show(df_filter.sample(10))


118417
0.499876593994
0.0231741482123
(118417, 10)


Unnamed: 0_level_0,max_eps,median,count,mean,std,min,max,max_min,std_rj,mean_rj
Substance Identification: Reaxys Registry Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6762322,,2.616034,1,2.616034,,2.616034,2.616034,0.0,,2.616034
5150902,8128.0,2.119658,1,2.119658,,2.119658,2.119658,0.0,,2.119658
6604652,13500.0,2.171629,1,2.171629,,2.171629,2.171629,0.0,,2.171629
803687,,3.289125,5,3.291051,0.027331,3.263158,3.333333,0.070175,0.013722,3.280481
341301,15488.0,3.39726,1,3.39726,,3.39726,3.39726,0.0,,3.39726
5316275,12303.0,2.87703,1,2.87703,,2.87703,2.87703,0.0,,2.87703
175360,21380.0,3.604651,3,3.525369,0.175334,3.324397,3.647059,0.322662,0.021204,3.625855
7881471,,2.206406,1,2.206406,,2.206406,2.206406,0.0,,2.206406
4212388,14125.0,2.818182,1,2.818182,,2.818182,2.818182,0.0,,2.818182
30293429,,2.039474,1,2.039474,,2.039474,2.039474,0.0,,2.039474


In [115]:
df_filter[df_filter['max_min']>0.3].count()

max_eps    1855
median     2887
count      2887
mean       2887
std        2887
min        2887
max        2887
max_min    2887
std_rj     1573
mean_rj    2887
dtype: int64

In [116]:
print(sum(df_filter.index.duplicated(keep=False)))
df_filter.info()

0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 118417 entries, 2372 to 30480261
Data columns (total 10 columns):
max_eps    69761 non-null float64
median     118417 non-null float64
count      118417 non-null int64
mean       118417 non-null float64
std        22179 non-null float64
min        118417 non-null float64
max        118417 non-null float64
max_min    118417 non-null float64
std_rj     9201 non-null float64
mean_rj    118417 non-null float64
dtypes: float64(9), int64(1)
memory usage: 9.9 MB


In [144]:
cols_merge=['Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name','Structure','Linear Structure Formula']
reax_merge=reax_fix1[cols_merge].drop_duplicates(subset='Substance Identification: Reaxys Registry Number').set_index('Substance Identification: Reaxys Registry Number')#.drop_duplicates(subset='InChI Key')
df_flt1 = pd.merge(reax_merge,df_filter, how='inner',left_index=True,right_index=True)
print(df_flt1.shape)
df_flt1.count()

(118417, 15)


InChI Key                   117842
CAS Registry Number          62261
Chemical Name                84504
Structure                   118417
Linear Structure Formula    118417
max_eps                      69761
median                      118417
count                       118417
mean                        118417
std                          22179
min                         118417
max                         118417
max_min                     118417
std_rj                        9201
mean_rj                     118417
dtype: int64

In [145]:
print(sum(df_filter.index.duplicated(keep=False)))
df_filter.info()
print(sum(reax_merge.index.duplicated(keep=False)))
reax_merge.info()
print(sum(df_flt1.index.duplicated(keep=False)))
df_flt1.info()

0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 118417 entries, 2372 to 30480261
Data columns (total 10 columns):
max_eps    69761 non-null float64
median     118417 non-null float64
count      118417 non-null int64
mean       118417 non-null float64
std        22179 non-null float64
min        118417 non-null float64
max        118417 non-null float64
max_min    118417 non-null float64
std_rj     9201 non-null float64
mean_rj    118417 non-null float64
dtypes: float64(9), int64(1)
memory usage: 9.9 MB
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 128210 entries, 1807 to 30480261
Data columns (total 5 columns):
InChI Key                   127615 non-null object
CAS Registry Number         67851 non-null object
Chemical Name               92751 non-null object
Structure                   128210 non-null object
Linear Structure Formula    128210 non-null object
dtypes: object(5)
memory usage: 5.9+ MB
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 118417 entries, 2372 to 30

In [136]:
#reax_merge[reax_merge.index.duplicated(keep=False)]
#df_flt1[df_flt1.mean_rj.isnull()][:10]
#df_flt1.dropna(subset=['mean_rj']).info()

In [83]:
#show(df_flt1[:10])
# df_flt1=df_filter.reset_index()
# df_flt1=df_flt1[df_flt1.Structure.str.match('([0-9CNOFS@+\-\[\]\(\)\\\/%=#$]+)$',case=False,as_indexer=True)]
# df_flt1=df_flt1[df_flt1.Structure.str.match('^((?!(\[Co\]|\[Se\]|\[O\+8\]|\[C-3\]|\[N\+5\]|\[O\+3\]|\.)).)*$',case=True,as_indexer=True)]
# print(df_flt1.Structure.count())
# df_flt1.sample(10)

In [164]:
#np.save('/home/denn/home/ml/data/reaxys3_MW120-1000_UV360-440_ftr1.npy',df_flt1.as_matrix())

In [146]:
import pybel

def obl(xyz):
    m=pybel.readstring('smiles',xyz)
    return pd.Series({'can':m.write('can'),'check_inchi':m.write('inchikey')})

#df_check['check_inchi']=df_check['xyz'].apply(lambda m: pybel.readstring('xyz',m).write('inchikey').split('-')[0])
#df_check['check_smi']=df_check['xyz'].apply(lambda m: pybel.readstring('xyz',m).write('can'))
#df_check=df_check1['xyz'].apply(obl))
df_obl=df_flt1.Structure.apply(obl)
#df_flt2=df_flt1.join(df_flt1.Structure.apply(obl),how)
#%timeit obl('BrC1=CC=C(C=C1)C1CC(=NN1C1=CC=CC=C1)C1=CC=C(C=C1)C#N')

In [147]:
df_obl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118417 entries, 2372 to 30480261
Data columns (total 2 columns):
can            118417 non-null object
check_inchi    118417 non-null object
dtypes: object(2)
memory usage: 2.7+ MB


In [161]:
df_flt2=df_flt1.join(df_obl,how='inner')
df_flt2['can']=df_flt2.can.str.strip()
df_flt2['can']=df_flt2.can.str.strip('\\n')
df_flt2[:10]

Unnamed: 0_level_0,InChI Key,CAS Registry Number,Chemical Name,Structure,Linear Structure Formula,max_eps,median,count,mean,std,min,max,max_min,std_rj,mean_rj,can,check_inchi
Substance Identification: Reaxys Registry Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2372,NUJOEMIUONHBSQ-UHFFFAOYSA-N,,"Pyridinium-cyclopentadienid-Betain; 1-cyclopenta-2,4-dienyl-pyridinium betaine; 1-Cyclopenta-2,4-dienyl-pyridinium-betain; Cyclopentadien-pyridinium-ylid; Pyridiniumcyclopentadienylid; Pyridinium-cyclopentadienid",C1=CC(C=C1)=N1=CC=CC=C1,C10H9N,,2.34982,1,2.34982,,2.34982,2.34982,0.0,,2.34982,C1=CC(=n2ccccc2)C=C1,NUJOEMIUONHBSQ-UHFFFAOYSA-N\n
3107,QKHRGPYNTXRMSL-VOTSOKGWSA-N,5097-93-8,trans-4-styrylpyride; 4-Styrylpyridin,C(=C/C1=CC=NC=C1)C1=CC=CC=C1,(C5H4NCHCHC6H5),33113.0,4.092409,5,4.112881,0.087596,4.025974,4.246575,0.220601,0.045688,4.079457,c1ccc(cc1)C=Cc1ccncc1,QKHRGPYNTXRMSL-UHFFFAOYSA-N\n
3850,JWMUHTIFNGYNFA-UHFFFAOYSA-N,260-32-2,2-azaanthracene; benzo(g)isoquinoline; benzo[g]isoquinoline; benz[g]isoquinoline; Benz[g]isochinolin,C1=CC=C2C=C3C=NC=CC3=CC2=C1,C13H9N,4467.0,3.166982,3,3.169911,0.008499,3.163265,3.179487,0.016222,0.001858,3.165123,c1ccc2c(c1)cc1c(c2)ccnc1,JWMUHTIFNGYNFA-UHFFFAOYSA-N\n
3929,DWYFUJJWTRPARQ-UHFFFAOYSA-N,135-00-2,2-Benzoylthiophene; 2-benzoylthiophene,O=C(C1=CC=CS1)C1=CC=CC=C1,C11H7OSH,11220.1,4.366197,3,4.321492,0.077431,4.232082,4.366197,0.134115,0.0,4.366197,O=C(c1cccs1)c1ccccc1,DWYFUJJWTRPARQ-UHFFFAOYSA-N\n
4028,VHNXFKTYNFJDKY-SHYZEUOFSA-N,62251-43-8; 62251-46-1,2beta-bromo-8-methyl-8-azabicyclo<3.2.1>octan-3-one; 2beta-bromotropan-3-one; (+-)-2exo-bromo-tropan-3-one; (+-)-2exo-Brom-tropan-3-on,CN1[C@H]2CC[C@@H]1[C@@H](Br)C(=O)C2,C8H12BrNO,,3.444444,1,3.444444,,3.444444,3.444444,0.0,,3.444444,O=C1C[C@@H]2CC[C@H]([C@H]1Br)N2C,VHNXFKTYNFJDKY-SHYZEUOFSA-N\n
4435,DEFXDSGVNUQNJW-UHFFFAOYSA-N,525-24-6,"fluorazone; 9-H-pyrrolo<1,2-a>indole-9-one; 9-keto-9H-pyrrolo<1,2-a>indole; 9H-pyrrolo<1,2-a>indol-9-one; 9H-pyrrolo-[1,2a]-indol-9-one; 9H-pyrrolo[1,2-a]indol-9-one; pyrrolo[1,2-a]indol-4-one",O=C1C2=CC=CN2C2=CC=CC=C12,C11H7NO,470.0,3.1,1,3.1,,3.1,3.1,0.0,,3.1,O=C1c2ccccc2n2c1ccc2,DEFXDSGVNUQNJW-UHFFFAOYSA-N\n
4522,QFKBOYSUWYOBIS-KHRNHKAPSA-N,5963-15-5,"9t(?)-[2]furyl-nona-2t(?),4t(?),6t(?),8-tetraenal",O=CC=CC=CC=CC=CC1=CC=CO1,C13H12O2,,2.066667,1,2.066667,,2.066667,2.066667,0.0,,2.066667,O=CC=CC=CC=CC=Cc1ccco1,QFKBOYSUWYOBIS-UHFFFAOYSA-N\n
4601,SJTOQNJGUHMDEZ-UHFFFAOYSA-N,66729-00-8,4-chloroisoquinolin-3-amine; 4-chloro-[3]isoquinolylamine; 4-Chlor-[3]isochinolylamin; 3-Amino-4-chlorisochinolin,NC1=C(Cl)C2=C(C=CC=C2)C=N1,C9H7ClN2,,3.333333,1,3.333333,,3.333333,3.333333,0.0,,3.333333,Nc1ncc2c(c1Cl)cccc2,SJTOQNJGUHMDEZ-UHFFFAOYSA-N\n
4759,OZKOMUDCMCEDTM-UHFFFAOYSA-N,230-46-6,"1,7-phenanthroline; phenanthroline; 1 10-phenanthroline; 1-10-phenanthroline; 1,7-phenantroline; phenantroline; [1,7]phenanthroline",C1=CN=C2C=CC3=C(N=CC=C3)C2=C1,C12H8N2,,3.387978,1,3.387978,,3.387978,3.387978,0.0,,3.387978,c1cnc2c(c1)ccc1c2cccn1,OZKOMUDCMCEDTM-UHFFFAOYSA-N\n
4842,IVVGFUBQYUHOKG-BUHFOSPRSA-N,2569-55-3,(E)-3-phenylazopyridine; trans-3-(phenylazo)pyridine; trans-(phenylazo)pyridine; trans-3-phenylazopyridine; 3-phenylazopiridine; 3-phenylazopyridine; 3-phenylazo-pyridine,C1=CC=C(C=C1)N=NC1=CN=CC=C1,C11H9N3,923.0,2.792793,5,2.815121,0.052765,2.767857,2.903981,0.136124,0.017793,2.792906,c1ccc(cc1)N=Nc1cccnc1,IVVGFUBQYUHOKG-UHFFFAOYSA-N\n


In [163]:
#df_flt2[df_flt2.duplicated(subset='InChI Key')].sort_values('InChI Key').head(10)

In [164]:
df_flt2.to_pickle('/home/denn/home/ml/data/sf/Reaxys4_fix_uv.pickle')

In [165]:
import logging

from rdkit import RDLogger

RDLogger.logger().setLevel(getattr(RDLogger, 'ERROR'))

logger = logging.getLogger()
loglevel = getattr(logging, 'ERROR')
logger.setLevel(loglevel)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loglevel)




def get_finger_rdk(smiles):
    mol=Chem.MolFromSmiles(str(smiles))
    if mol is not None:
        Chem.SanitizeMol(mol)
        return pd.Series({'mol__inchikey':Chem.InchiToInchiKey(Chem.MolToInchi(mol)),'morgan':np.array(Chem.GetMorganFingerprintAsBitVect(mol,nBits=8192,radius=6),dtype=np.int8),'mol':mol})
    else:
#         print(smiles) 
        return pd.Series({'mol__inchikey':None,'morgan':np.nan,'mol':None})

df_fps=df_flt2[:].Structure.apply(get_finger_rdk)
#print()
#df_fps=df_flt2[:100].can.str.slice(0,-2).apply(get_finger_rdk)
#print(df_flt1.Structure.count())
#print(df_fps.count())
#df_fps[df_fps.mol__inchikey.isnull()]
#df_fps

RDKit ERROR: [15:06:20] Explicit valence for atom # 5 N, 5, is greater than permitted
RDKit ERROR: [15:06:20] Explicit valence for atom # 6 N, 5, is greater than permitted
RDKit ERROR: [15:06:21] Explicit valence for atom # 5 N, 5, is greater than permitted
RDKit ERROR: [15:06:23] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:06:25] Explicit valence for atom # 11 N, 5, is greater than permitted
RDKit ERROR: [15:06:25] Explicit valence for atom # 20 N, 5, is greater than permitted
RDKit ERROR: [15:06:29] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:06:35] Explicit valence for atom # 23 N, 5, is greater than permitted
RDKit ERROR: [15:07:15] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:07:15] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:07:15] Explicit valence for atom # 5 N, 5, is greater than permitted
RDKit ERROR: [15:07:15] Explicit valence for atom 

RDKit ERROR: [15:11:25] Explicit valence for atom # 18 N, 5, is greater than permitted
RDKit ERROR: [15:11:36] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [15:11:36] Explicit valence for atom # 10 N, 5, is greater than permitted
RDKit ERROR: [15:11:37] Explicit valence for atom # 11 N, 5, is greater than permitted
RDKit ERROR: [15:11:39] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 5 C, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 5 C, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 5 C, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 15 N, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 16 N, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 17 N, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for ato

RDKit ERROR: [15:14:35] Explicit valence for atom # 11 N, 5, is greater than permitted
RDKit ERROR: [15:15:07] Explicit valence for atom # 10 N, 5, is greater than permitted
RDKit ERROR: [15:15:09] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:15:10] Explicit valence for atom # 2 N, 5, is greater than permitted
RDKit ERROR: [15:15:12] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:15:12] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:15:23] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:15:23] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:15:24] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:15:25] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:15:25] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:15:27] Explicit valence for ato

RDKit ERROR: [15:16:49] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 8 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 5 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 21 N, 5, is greater than permitted
RDKit ERROR: [15:16:52] Explicit valence for atom # 26 N, 5, is greater than permitted
RDKit ERROR: [15:16:53] Explicit valence for atom # 8 N, 5, is greater than permitted
RDKit ERROR: [15:16:53] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:16:53] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:16:56] Explicit valence for atom # 

RDKit ERROR: [15:19:21] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:19:29] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:19:29] Explicit valence for atom # 15 N, 5, is greater than permitted
RDKit ERROR: [15:19:45] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:19:47] Explicit valence for atom # 2 N, 5, is greater than permitted
RDKit ERROR: [15:19:47] Explicit valence for atom # 6 N, 5, is greater than permitted
RDKit ERROR: [15:19:48] Explicit valence for atom # 5 N, 5, is greater than permitted
RDKit ERROR: [15:19:48] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:19:49] Explicit valence for atom # 11 N, 5, is greater than permitted
RDKit ERROR: [15:19:50] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:19:50] Explicit valence for atom # 10 N, 5, is greater than permitted
RDKit ERROR: [15:19:50] Explicit valence for atom #

RDKit ERROR: [15:25:39] Explicit valence for atom # 10 N, 5, is greater than permitted
RDKit ERROR: [15:25:46] Explicit valence for atom # 17 N, 5, is greater than permitted
RDKit ERROR: [15:25:49] Explicit valence for atom # 14 N, 5, is greater than permitted
RDKit ERROR: [15:25:51] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [15:26:19] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [15:26:24] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:26:24] Explicit valence for atom # 14 N, 5, is greater than permitted
RDKit ERROR: [15:26:24] Explicit valence for atom # 15 N, 5, is greater than permitted
RDKit ERROR: [15:26:24] Explicit valence for atom # 6 N, 5, is greater than permitted
RDKit ERROR: [15:26:30] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:26:33] Explicit valence for atom # 21 N, 5, is greater than permitted
RDKit ERROR: [15:26:46] Explicit valence for a

RDKit ERROR: [15:31:08] Explicit valence for atom # 38 C, 5, is greater than permitted
RDKit ERROR: [15:31:10] Explicit valence for atom # 26 N, 4, is greater than permitted
RDKit ERROR: [15:31:15] Explicit valence for atom # 19 N, 5, is greater than permitted
RDKit ERROR: [15:31:15] Explicit valence for atom # 20 N, 5, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for

In [558]:
#%timeit get_finger_rdk('N#C[C-]([N+]1=CC=CC=C1)C(C2=CC=CC=C2)=O')

array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)

In [167]:
#show(df_fps.sample(2))
df_fps.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118417 entries, 2372 to 30480261
Data columns (total 3 columns):
mol              117826 non-null object
mol__inchikey    117826 non-null object
morgan           117826 non-null object
dtypes: object(3)
memory usage: 3.6+ MB


In [168]:
#help(df_flt1.join)
df_flt3=df_flt2.join(df_fps,how='inner',rsuffix='_fps')

In [177]:
show(df_flt3.loc[2745177:2745178,][['mol','Structure']])
show(df_flt3.loc[2745177:2745178,])

Unnamed: 0_level_0,InChI Key,CAS Registry Number,Chemical Name,Structure,Linear Structure Formula,max_eps,median,count,mean,std,min,max,max_min,std_rj,mean_rj,can,check_inchi,mol,mol__inchikey,morgan
Substance Identification: Reaxys Registry Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2745177,DHDYTSMZSUHCIP-VAWYXSNFSA-N,27781-35-7,"2,3-dicyano-3t-(4-dimethylamino-phenyl)-acrylic acid amide; 2,3-Dicyan-3t-(4-dimethylamino-phenyl)-acrylsaeure-amid; trans-alpha.beta-Dicyano-p-(N,N-dimethylamino)-cinnamamid",CN(C)C1=CC=C(C=C1)C(C#N)=C(/C#N)C(N)=O,C13H12N4O,,2.589855,2,2.589855,0.076464,2.535787,2.643923,0.108136,,2.589855,CN(c1ccc(cc1)C(=C(C(=O)N)C#N)C#N)C,DHDYTSMZSUHCIP-UHFFFAOYSA-N\n,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nN\nN\nN\nNH2\nO\n,DHDYTSMZSUHCIP-UHFFFAOYSA-N,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"


In [178]:
df_flt3.to_pickle('/home/denn/home/ml/data/sf/Reaxys4__fix_fps8k_uv.pickle')

In [9]:
from blocks import block
from rdkit.Chem.Descriptors import MolWt
def blockify(s):
    try:
        return block.Block(smiles=s)
    except:
        return None

import logging

from rdkit import RDLogger

RDLogger.logger().setLevel(getattr(RDLogger, 'ERROR'))

logger = logging.getLogger()
loglevel = getattr(logging, 'ERROR')
logger.setLevel(loglevel)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loglevel)



dfexp=reax_uv_gr.reset_index()
dfexp['block'] = dfexp.Structure.apply(blockify)
dfexp=dfexp.dropna(subset=['block'])
dfexp['mol'] = dfexp.block.apply(lambda b: b.mol)
dfexp['smiles']=dfexp.block.apply(lambda b: b.smiles())
dfexp['MolWt']=dfexp.mol.apply(lambda m: MolWt(m))
dfexp['mol__inchikey'] = dfexp.block.apply(lambda b: b.inchikey)
dfexp['mol__inchikey']=dfexp['mol__inchikey'].str.split('-').str[0]


RDKit ERROR: [16:23:27] Explicit valence for atom # 10 C, 6, is greater than permitted
ERROR:root:Could not create RdKit mol from smiles: C(C1=C2C=CC=CC2=C(C[C]2345[BH]678[BH]9%10%11[BH]%12%13%14[BH]696[BH]%129%12[BH]%13%13%15[BH]%10%14%10[BH]27%11[BH]3%13%10[C]49%15(C2=CC=CC=C2)[BH]586%12)C2=CC=CC=C12)[C]1234[BH]567[BH]89%10[BH]%11%12%13[BH]585[BH]%118%11[BH]%12%12%14[BH]9%139[BH]16%10[BH]2%129[C]38%14(C1=CC=CC=C1)[BH]475%11, inchi: None
Traceback (most recent call last):
  File "/home/denn/home/a2g2/blocks/block.py", line 53, in __init__
    raise Exception("Mol not created")
Exception: Mol not created
RDKit ERROR: [16:23:27] Explicit valence for atom # 15 C, 6, is greater than permitted
ERROR:root:Could not create RdKit mol from smiles: C(C1=C2C=CC=CC2=CC2=CC=CC=C12)[C]1234[BH]567[BH]89%10[BH]%11%12%13[BH]585[BH]%118%11[BH]%12%12%14[BH]9%139[BH]16%10[BH]2%129[C]38%14(C1=CC=CC=C1)[BH]475%11, inchi: None
Traceback (most recent call last):
  File "/home/denn/home/a2g2/blocks/block.py",

In [10]:
#show(dfexp[dfexp['mol__inchikey'].str.contains('CSHWQDPOILHKBI')])

In [11]:
dfexp = dfexp.set_index('mol__inchikey')
print(dfexp['Structure'].count())
df_filter=dfexp[(dfexp['MolWt']<1000)]
print(df_filter['Structure'].count())

df_filter=df_filter[df_filter['mol'].apply(lambda m: not m.HasSubstructMatch(Chem.MolFromSmarts('[Si,Se,Li,Na]')))]
print(df_filter['Structure'].count())
#df_filter=df_filter[df_filter['mol'].apply(lambda m: m.HasSubstructMatch(Chem.MolFromSmarts('O=C([#6;D2])[O;H1]')))]
#print(df_filter['Structure'].count())

df_filter=df_filter[df_filter['mol'].apply(lambda m: not m.HasSubstructMatch(Chem.MolFromSmarts('CCCC')))]
print(df_filter['Structure'].count())

df_filter=df_filter[df_filter['mol'].apply(lambda m: not m.HasSubstructMatch(Chem.MolFromSmarts('c1ccc2cc3ccccc3cc2c1.c1ccc2cc3ccccc3cc2c1')))]
print(df_filter['Structure'].count())

# df_filter=df_filter[np.logical_not((df_filter['std_rj']>10) & (df_filter['count']>2))]
# print(df_filter['Structure'].count())
# df_filter=df_filter[(df_filter['std_rj']>10)]
# print(df_filter['Structure'].count())
#print(df_filter[df_filter['count']==1].count())
#print(Chem.Mol.GetAtoms(df_filter.iloc[1]['mol']).)
#show(df_filter[['mol']].sample(10))

#print(reax_uv[reax_uv['Structure']=='N#CC1=C2C=CC=CC2=CC2=CC=CC=C12'][cols])

6810
6530
6138
4070
3532


In [12]:
df_filter.to_pickle('/home/denn/harvard/SF/Library/Reaxys/anth_peryl_uv_below1000_reaxys2.pickle')

In [13]:
df_filter['mol'].apply(Chem.MolToSmiles).to_csv('/home/denn/harvard/SF/Library/Reaxys/anth_peryl_uv_below1000_reaxys2.smi',index=False)