In [2]:
import pandas as pd
import numpy as np
import numba
#import dask.dataframe as dd

import sys

from IPython.display import HTML
from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools # headsup: this import change the behavior of dataframes with mols in them
# some global configuration of the pandastools
PandasTools.molRepresentation = 'svg'
PandasTools.molSize = (200,200)

#constatns
HA_TO_EV = 27.211399
PERIODICTABLE = Chem.GetPeriodicTable()

# this is a little helper function to render images inside a dataframe
# once again, there are ways to monkey patch the rendering of dataframes, but I am trying to 
# avoid most of that to make things a bit easier to understand

def show(df):
    return HTML(df.to_html(escape=False))

In [87]:
# from dask.distributed import Executor
# e = Executor('localhost:8786', set_as_default=True)

In [88]:
import os
import glob

path = r'/home/denn/home/ml/data/reaxys/bp/'
all_files = glob.glob(os.path.join(path, "*.xls"))
pds = []
for f in all_files:
    print(f)
    pds.append(pd.read_csv(f,delimiter='\t',low_memory=False))
    
reax_full=pd.concat(pds, ignore_index=True)
print(reax_full.shape)

/home/denn/home/ml/data/reaxys/bp/bp_165000.xls
/home/denn/home/ml/data/reaxys/bp/bp_60000.xls
/home/denn/home/ml/data/reaxys/bp/bp_155000.xls
/home/denn/home/ml/data/reaxys/bp/bp_1000.xls
/home/denn/home/ml/data/reaxys/bp/bp_135000.xls
/home/denn/home/ml/data/reaxys/bp/bp_110000.xls
/home/denn/home/ml/data/reaxys/bp/bp_210000.xls
/home/denn/home/ml/data/reaxys/bp/bp_195000.xls
/home/denn/home/ml/data/reaxys/bp/bp_10000.xls
/home/denn/home/ml/data/reaxys/bp/bp_190000.xls
/home/denn/home/ml/data/reaxys/bp/bp_75000.xls
/home/denn/home/ml/data/reaxys/bp/bp_35000.xls
/home/denn/home/ml/data/reaxys/bp/bp_115000.xls
/home/denn/home/ml/data/reaxys/bp/bp_90000.xls
/home/denn/home/ml/data/reaxys/bp/bp_185000.xls
/home/denn/home/ml/data/reaxys/bp/bp_2000.xls
/home/denn/home/ml/data/reaxys/bp/bp_15000.xls
/home/denn/home/ml/data/reaxys/bp/bp_205000.xls
/home/denn/home/ml/data/reaxys/bp/bp_95000.xls
/home/denn/home/ml/data/reaxys/bp/bp_120000.xls
/home/denn/home/ml/data/reaxys/bp/bp_20000.xls
/hom

In [89]:
# uv_cols = [col for col in reax_full.columns if 'UV' in col]
# cols=uv_cols+['Structure']
print(reax_full.columns)
#reax_full[cols].head()
need_cols=['Boiling Point: Boiling Point [C]','Pressure (Boiling Point) [Torr]', 'Refractive Index', 'Wavelength (Refractive Index) [nm]',
           'Density: Density [g·cm-3]', 'Reference Temperature [C]', 'Measurement Temperature [C]', 'Type (Density)',
           'Temperature (Refractive Index) [C]','Molecular Formula','Substance Identification: Reaxys Registry Number',
           'InChI Key','CAS Registry Number','Chemical Name','Linear Structure Formula','Structure','Number of References']
#reax_full[need_cols].dropna(subset=['Type (Density)']).sample(10)

Index(['Structure', 'Structure: Markush', 'Substance Identification: Reaxys Registry Number', 'Links to Reaxys', 'Data Count', 'CAS Registry Number', 'Chemical Name', 'Linear Structure Formula', 'Molecular Formula', 'Molecular Weight', 'Type of Substance', 'Type and Modification', 'InChI Key', 'Composition: Comp. Name', 'Composition: Comp. Conc.', 'Composition: Comp. Attrib.', 'Field Availability', 'Number of Reactions', 'Number of References', 'Boiling Point: Boiling Point [C]', 'Pressure (Boiling Point) [Torr]', 'Location', 'Comment (Boiling Point)', 'References', 'Links to Reaxys.1', 'Refractive Index', 'Wavelength (Refractive Index) [nm]', 'Temperature (Refractive Index) [C]', 'Location.1', 'Comment (Refractive Index)', 'References.1', 'Links to Reaxys.2', 'Density: Density [g·cm-3]', 'Reference Temperature [C]', 'Measurement Temperature [C]', 'Type (Density)', 'Location.2', 'Comment (Density)', 'References.2', 'Links to Reaxys.3', 'Unnamed: 40'], dtype='object')


In [90]:

need_cols=['Boiling Point: Boiling Point [C]','Pressure (Boiling Point) [Torr]', 'Location', 'Comment (Boiling Point)', 
#           'Refractive Index','Wavelength (Refractive Index) [nm]','Temperature (Refractive Index) [C]',
#           'Density: Density [g·cm-3]', 'Reference Temperature [C]', 'Measurement Temperature [C]', 'Type (Density)',
           'Molecular Formula','Substance Identification: Reaxys Registry Number',
           'InChI Key','CAS Registry Number','Chemical Name','Linear Structure Formula','Structure','Number of References']
ref_cols_name=[col for col in reax_full.columns if col.startswith('Ref')]

reax_bp=reax_full[need_cols+ref_cols_name].dropna(subset=['Boiling Point: Boiling Point [C]']).dropna(axis=1,how='all')
#del reax_full
print(reax_bp.shape)
# reax_uv=reax_uv[reax_uv.Structure.str.match('([0-9CNOFS@+\-\[\]\(\)\\\/%=#$]+)$',case=False,as_indexer=True)]
#reax_uv=reax_uv[reax_uv.Structure.str.match('^((?!(Co|Se|Cs|Os|\[O\+8\]|\[C-3\]|\[N\+5\]|\[O\+3\]|\.)).)*$',case=True,as_indexer=True)]
reax_bp=reax_bp[~reax_bp.Structure.str.contains('.',regex=False)]
print(reax_bp.shape)
reax_bp=reax_bp[reax_bp['Comment (Boiling Point)'].isnull()]
print(reax_bp.shape)

(554076, 17)
(552763, 17)
(537776, 17)


In [29]:
reax_bp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 486114 entries, 0 to 529381
Data columns (total 18 columns):
Boiling Point: Boiling Point [C]                    486114 non-null object
Pressure (Boiling Point) [Torr]                     439980 non-null object
Refractive Index                                    199790 non-null object
Location                                            5501 non-null object
Comment (Boiling Point)                             0 non-null object
Molecular Formula                                   486114 non-null object
Substance Identification: Reaxys Registry Number    486114 non-null int64
InChI Key                                           486111 non-null object
CAS Registry Number                                 479619 non-null object
Chemical Name                                       482662 non-null object
Linear Structure Formula                            486114 non-null object
Structure                                           486114 non-null obje

In [33]:
# reax_bp=reax_bp.drop_duplicates()
# print(reax_bp.shape)

In [7]:
# import re
# def mf2dict(mf_str):
#     molfor=re.findall(r'([A-Z][a-z]*)(\d*)', mf_str)
#     molfor1=map(lambda e: (e[0], 1) if e[1] == '' else (e[0],int(e[1])), molfor)
#     return dict(molfor1)
# reax_uv['mf']=reax_uv['Molecular Formula'].apply(mf2dict)
# reax_uv['C_c']=reax_uv['mf'].apply(lambda d: d.get('C',0))

In [8]:
# reax_uv=reax_uv[reax_uv.mf.apply(lambda m: bool(set(m.keys()) - set(['H','C','N','O','F','S','Cl','Br','I','P']))==False)]
# print(reax_uv.shape)

(299341, 99)


In [31]:
#reax_bp['Pressure (Boiling Point) [Torr]'].value_counts()
#reax_bp['Location'].value_counts()

In [10]:
#reax_uv['Solvent (UV/VIS Spectroscopy)']=reax_uv['Solvent (UV/VIS Spectroscopy)'].fillna('not given')
#reax_uv.replace({‘a’: {‘b’: nan}})

In [91]:
# reax_bp=reax_bp[(reax_bp.C_c>6) & (reax_bp.C_c<51)]
# print(reax_bp.shape)

reax_bp=reax_bp[~(reax_bp['Chemical Name'].fillna('').str.contains('radical',regex=False))]
print(reax_bp.shape)
reax_bp=reax_bp[~(reax_bp['Chemical Name'].fillna('').str.contains('cation',regex=False))]
print(reax_bp.shape)
reax_bp=reax_bp[~(reax_bp['Chemical Name'].fillna('').str.contains('anion',regex=False))]
print(reax_bp.shape)
# reax_bp=reax_bp[~(reax_bp['UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)'].fillna('').str.contains('two-photon',regex=False))]
# print(reax_bp.shape)
# reax_bp=reax_bp[~(reax_bp['Linear Structure Formula'].fillna('').str.contains('(1+)',regex=False))]
# print(reax_bp.shape)
# reax_bp=reax_bp[~(reax_bp['Linear Structure Formula'].fillna('').str.contains('(1-)',regex=False))]
# print(reax_bp.shape)
reax_bp=reax_bp[~(reax_bp['Linear Structure Formula'].fillna('').str.contains('(2-)',regex=False))]
print(reax_bp.shape)
reax_bp=reax_bp[~(reax_bp['Linear Structure Formula'].fillna('').str.contains('(2+)',regex=False))]
rint(reax_bp.shape)
reax_bp=reax_bp[~(reax_bp['Linear Structure Formula'].fillna('').str.contains('(3-)',regex=False))]
print(reax_bp.shape)
reax_bp=reax_bp[~(reax_bp['Linear Structure Formula'].fillna('').str.contains('(3+)',regex=False))]
print(reax_bp.shape)

(537735, 17)
(537731, 17)
(537721, 17)
(537721, 17)


NameError: name 'rint' is not defined

In [36]:
reax_bp.sample(5)

Unnamed: 0,Boiling Point: Boiling Point [C],Pressure (Boiling Point) [Torr],Refractive Index,Location,Comment (Boiling Point),Molecular Formula,Substance Identification: Reaxys Registry Number,InChI Key,CAS Registry Number,Chemical Name,Linear Structure Formula,Structure,Number of References,References,Refractive Index.1,References.1,Reference Temperature [C],References.2
435786,87,11.0,,,,C9H16O,2076858,RNZGQRHWGNRMPR-UHFFFAOYSA-N,41641-14-9,"1-hydroxymethyl-3,3-dimethyl-cyclohex-1-ene; 1-Hydroxymethyl-3,3-dimethylcyclohex-1-ene; 3,3-Dimethyl-1-hydroxymethyl-1-cyclohexene; (3,3-dimethylcyclohex-1-en-1-yl)methanol; 1-hydroxymethyl-3,3-dimethylcyclohexene; 3,3-Dimethyl-1-cyclohexene-1-methanol; 3,3-Dimethyl-1-cyclohexen-1-methanol",C8H13CH3O,CC1(C)CCCC(CO)=C1,16.0,"Article; Seifert; Schinz; Helvetica Chimica Acta; vol. 34; (1951); p. 728,735;",,,,
288401,72 - 73,9.5,1.424,,,C8H14O3,1769180,GKUQREVLPWOEEG-UHFFFAOYSA-N,3592-22-1,3-acetoxy-3-methyl-pentan-2-one; 3-Acetoxy-3-methyl-pentan-2-on,C8H14O3,CCC(C)(OC(C)=O)C(C)=O,4.0,"Article; Matsoyan,S.G. et al.; J. Gen. Chem. USSR (Engl. Transl.); vol. 30; 4; (1960); p. 1202 - 1207,1223 - 1228;",1.424,"Article; Matsoyan,S.G. et al.; J. Gen. Chem. USSR (Engl. Transl.); vol. 30; 4; (1960); p. 1202 - 1207,1223 - 1228;",4.0,"Article; Matsoyan,S.G. et al.; J. Gen. Chem. USSR (Engl. Transl.); vol. 30; 4; (1960); p. 1202 - 1207,1223 - 1228;"
154715,184,10.0,,,,C9H8N2,114479,XMIAFAKRAAMSGX-UHFFFAOYSA-N,611-34-7,5-Aminoquinoline,NC9H6NH2,NC1=C2C=CC=NC2=CC=C1,345.0,Article; Kaufmann; Zeller; Chemische Berichte; vol. 50; (1917); p. 1630;,,,,
305868,206.5 - 207.5,750.0,,,,C9H12S,1906756,SNOAHAUUBQMVGW-UHFFFAOYSA-N,3019-20-3,isopropylthiobenzene; i-propyl phenyl sulfide; 1-(1-methylethylthio)benzene; isopropyl phenyl sulfide; phenyl isopropyl sulfide; isopropylsulfanyl-benzene; isopropyl-phenyl sulfide,(CH3)2CHS(C6H5),CC(C)SC1=CC=CC=C1,286.0,Article; Ipatieff; Pines; Friedman; Journal of the American Chemical Society; vol. 60; (1938); p. 2732;,,,,
217055,159 - 162,44.0,,,,C11H12N2,132874,IIIQRAJKCBNLIZ-UHFFFAOYSA-N,32570-88-0,"2-(2,5-dimethyl-1H-pyrrol-1-yl)pyridine; 2-(2,5-dimethyl-1H-pyrrol-1-yl) pyridine; 2-(2,5-dimethyl-1H-pyrrol-1-yl)-pyridine; 1-(2-pyridyl)-2,5-dimethyl-1H-pyrrole; 2-(2,5-dimethyl-pyrrol-1-yl)-pyridine; 2-(2,5-Dimethyl-pyrrol-1-yl)-pyridin",C11H12N2,CC1=CC=C(C)N1C1=NC=CC=C1,35.0,"Article; Vorkapic-Furac, Jasna; Mintas, Mladen; Kastner, Fritz; Mannschreck, Albrecht; Journal of Heterocyclic Chemistry; vol. 29; 2; (1992); p. 327 - 333;",,,,


In [37]:
print(sum(reax_bp.duplicated(subset='InChI Key')))
print(sum(reax_bp.duplicated(subset='Substance Identification: Reaxys Registry Number')))

312085
311501


In [92]:
def uv_parse(bp,p):
    try:
        bp_arr = np.array([float(x) for x in str(bp).split('-')])
        bp_m=bp_arr.mean()
        bp_s=bp_arr.std()
    except ValueError as e:
        print(e)
        bp_m=None
        bp_s=None
    try:
        p_m = np.array([float(x) for x in str(p).split('-')]).mean()
    except ValueError as e:
        print(e)
        p_m=None

    return pd.Series([bp_m,bp_s,p_m], ['bp','bp_std','p'])

res = reax_bp[['Boiling Point: Boiling Point [C]','Pressure (Boiling Point) [Torr]']].apply(lambda row: uv_parse(row['Boiling Point: Boiling Point [C]'],row['Pressure (Boiling Point) [Torr]']),axis=1)
reax_bp = reax_bp.join(res)

could not convert string to float: '1E'
could not convert string to float: '1E'
could not convert string to float: '1E'
could not convert string to float: '8E'
could not convert string to float: 
could not convert string to float: '1E'
could not convert string to float: '1E'
could not convert string to float: '5E'
could not convert string to float: '7.50075E'
could not convert string to float: '1E'
could not convert string to float: '1E'
could not convert string to float: '5E'
could not convert string to float: '4E'
could not convert string to float: '8E'
could not convert string to float: '4E'
could not convert string to float: '6E'
could not convert string to float: 
could not convert string to float: '1.95016E'
could not convert string to float: 
could not convert string to float: 
could not convert string to float: 
could not convert string to float: 
could not convert string to float: 
could not convert string to float: 
could not convert string to float: 
could not convert string

could not convert string to float: '5E'
could not convert string to float: 
could not convert string to float: 
could not convert string to float: 
could not convert string to float: '1E'
could not convert string to float: '5.62556E'
could not convert string to float: 
could not convert string to float: '3E'
could not convert string to float: 
could not convert string to float: '1E'
could not convert string to float: '2E'
could not convert string to float: '1E'
could not convert string to float: '7E'
could not convert string to float: '7E'
could not convert string to float: '7.50075E'
could not convert string to float: '1E'
could not convert string to float: '7.5E'
could not convert string to float: 
could not convert string to float: '1E'
could not convert string to float: '2E'
could not convert string to float: '8E'
could not convert string to float: 
could not convert string to float: 
could not convert string to float: 
could not convert string to float: 
could not convert string t

could not convert string to float: '1E'
could not convert string to float: '9E'
could not convert string to float: 
could not convert string to float: '3E'
could not convert string to float: '5E'
could not convert string to float: '1E'
could not convert string to float: '8E'
could not convert string to float: '1E'
could not convert string to float: '5E'
could not convert string to float: '1E'
could not convert string to float: '2.17517E'
could not convert string to float: 
could not convert string to float: 
could not convert string to float: '1E'
could not convert string to float: '2E'
could not convert string to float: '3E'
could not convert string to float: '1E'
could not convert string to float: '8E'
could not convert string to float: '1E'
could not convert string to float: '1E'
could not convert string to float: '1E'
could not convert string to float: '1E'
could not convert string to float: '3E'
could not convert string to float: '5E'
could not convert string to float: '5E'
could 

In [93]:
reax_bp=reax_bp.dropna(subset=['bp'])
print(reax_bp.shape)

(537514, 20)


In [16]:
need_cols=['Ext./Abs. Coefficient [l·mol-1cm-1]','Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)','Solvent (UV/VIS Spectroscopy)','Linear Structure Formula','Structure','Number of References','Absorption Maxima (UV/VIS) [nm]']
ref_cols_name=[col for col in reax_uv.columns if col.startswith('Ref')]
def fix_ref(rows):
    counts=rows.count()
    max_indx=counts.argmax()
    #ref_col=rows.dropna(axis=1,how='any')
    if counts[max_indx]>0:
        return rows.loc[:,max_indx]
    else:
        return None

reference=reax_uv[need_cols+ref_cols_name].groupby(['Substance Identification: Reaxys Registry Number','Solvent (UV/VIS Spectroscopy)','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)'],group_keys=False).apply(lambda rows: fix_ref(rows[ref_cols_name]))


In [66]:
#reference.fillna('').count()

150087

In [17]:
reax_uv['ref']=reference.fillna('')


In [94]:
reax_uv['ref']=reax_uv['ref'].fillna('')
reax_uv['UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)']=reax_uv['UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)'].fillna('')
#reax_uv.sample(10)

In [95]:
#'C_c',
need_cols=['ref','Ext./Abs. Coefficient [l·mol-1cm-1]','Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)','Solvent (UV/VIS Spectroscopy)','Linear Structure Formula','Structure','Number of References','Absorption Maxima (UV/VIS) [nm]']
def fix_reax(rows):
    if rows['ref'].iloc[0]!='':
        if rows.shape[0]==2:
            nms1=[float(x) for x in rows['Absorption Maxima (UV/VIS) [nm]'].iloc[0].split(';')]
            nms2=[float(x) for x in rows['Absorption Maxima (UV/VIS) [nm]'].iloc[1].split(';')]
            nms12 = nms1 + nms2
            nms21 = nms2 + nms1
            if nms12==sorted(nms12) or nms12==sorted(nms12,reverse=True) or nms21==sorted(nms21) or nms21==sorted(nms21,reverse=True):
                try:
                    eps=len([float(x) for x in rows['Ext./Abs. Coefficient [l·mol-1cm-1]'].str.cat(sep='; ').split(';')])
                except ValueError:
                    eps=0
                if eps==0 or eps==len(nms12):
                    ret_r=rows.head(1)
                    ret_r['Absorption Maxima (UV/VIS) [nm]']=rows['Absorption Maxima (UV/VIS) [nm]'].str.cat(sep='; ') #['Absorption Maxima (UV/VIS) [nm]'].apply(lambda a: a.str.cat(sep=';'))  .apply(lambda a: print(a.dropna(axis=1)))
                    ret_r['Ext./Abs. Coefficient [l·mol-1cm-1]']=rows['Ext./Abs. Coefficient [l·mol-1cm-1]'].str.cat(sep='; ')
                    return ret_r
        elif rows.shape[0]==3:
            nms1=[float(x) for x in rows['Absorption Maxima (UV/VIS) [nm]'].iloc[0].split(';')]
            nms2=[float(x) for x in rows['Absorption Maxima (UV/VIS) [nm]'].iloc[1].split(';')]
            nms3=[float(x) for x in rows['Absorption Maxima (UV/VIS) [nm]'].iloc[2].split(';')]
            nms123 = nms1 + nms2 + nms3
            nms321 = nms3 + nms2 + nms1
            nms231 = nms2 + nms3 + nms1
            if nms123==sorted(nms123) or nms123==sorted(nms123,reverse=True) or nms321==sorted(nms321) or nms321==sorted(nms321,reverse=True) or (nms231==sorted(nms231) and len(nms2)==4 and len(nms3)==4):
                try:
                    eps=len([float(x) for x in rows['Ext./Abs. Coefficient [l·mol-1cm-1]'].str.cat(sep='; ').split(';')])
                except ValueError:
                    eps=0
                if eps==0 or eps==len(nms123):
                    ret_r=rows.head(1)
                    ret_r['Absorption Maxima (UV/VIS) [nm]']=rows['Absorption Maxima (UV/VIS) [nm]'].str.cat(sep='; ') #['Absorption Maxima (UV/VIS) [nm]'].apply(lambda a: a.str.cat(sep=';'))  .apply(lambda a: print(a.dropna(axis=1)))
                    ret_r['Ext./Abs. Coefficient [l·mol-1cm-1]']=rows['Ext./Abs. Coefficient [l·mol-1cm-1]'].str.cat(sep='; ')
                    return ret_r
        else:
            return rows
    return rows
    
reax_fix1=reax_uv[:][need_cols].drop_duplicates().groupby(['Substance Identification: Reaxys Registry Number','Solvent (UV/VIS Spectroscopy)','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)','ref'],group_keys=False).apply(lambda rows: fix_reax(rows))


#,group_keys=False

In [96]:
print(sum(reax_uv.index.duplicated(keep=False)))
print(sum(reax_fix1.index.duplicated(keep=False)))
reax_uv[:][need_cols].drop_duplicates().info()
reax_fix1.info()

0
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 232371 entries, 0 to 700664
Data columns (total 12 columns):
ref                                                       232371 non-null object
Ext./Abs. Coefficient [l·mol-1cm-1]                       119913 non-null object
Substance Identification: Reaxys Registry Number          232371 non-null int64
InChI Key                                                 231528 non-null object
CAS Registry Number                                       135335 non-null object
Chemical Name                                             180765 non-null object
UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)    232371 non-null object
Solvent (UV/VIS Spectroscopy)                             232371 non-null object
Linear Structure Formula                                  232371 non-null object
Structure                                                 232371 non-null object
Number of References                                      232371 non-null in

In [99]:
col='Substance Identification: Reaxys Registry Number'
col='InChI Key'
print(reax_uv[col].value_counts()[:10])
print(reax_uv[col].unique().shape)
print(reax_fix1[col].value_counts()[:10])
print(reax_fix1[col].unique().shape)


YNHJECZULSZAQK-LWQDQPMZSA-N    346
REFJWTPEDVJJIY-UHFFFAOYSA-N    334
IKGXIBQEEMLURG-NVPNHPEKSA-N    233
KZNIFHPLKGYRTM-UHFFFAOYSA-N    206
IYRMWMYZSQPJKC-UHFFFAOYSA-N    191
IQPNAANSBPBGFQ-UHFFFAOYSA-N    183
OVSQVDMCBVZWGM-QSOFNFLRSA-N    182
JPUKWEQWGBDDQB-QSOFNFLRSA-N    170
UWOVWIIOKHRNKU-UHFFFAOYSA-N    159
OENHQHLEOONYIE-JLTXGRSLSA-N    150
Name: InChI Key, dtype: int64
(127012,)
REFJWTPEDVJJIY-UHFFFAOYSA-N    322
YNHJECZULSZAQK-LWQDQPMZSA-N    303
IKGXIBQEEMLURG-NVPNHPEKSA-N    230
KZNIFHPLKGYRTM-UHFFFAOYSA-N    200
IYRMWMYZSQPJKC-UHFFFAOYSA-N    183
OVSQVDMCBVZWGM-QSOFNFLRSA-N    181
IQPNAANSBPBGFQ-UHFFFAOYSA-N    180
JPUKWEQWGBDDQB-QSOFNFLRSA-N    164
UWOVWIIOKHRNKU-UHFFFAOYSA-N    152
OENHQHLEOONYIE-JLTXGRSLSA-N    148
Name: InChI Key, dtype: int64
(127012,)


In [100]:
#print(reax_fix1[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']][reax_fix1['Absorption Maxima (UV/VIS) [nm]'].str.contains('|',regex=False)].count())
#reax_fix1[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']][reax_fix1['Absorption Maxima (UV/VIS) [nm]'].str.contains('|',regex=False)]
reax_fix1.shape

(220551, 12)

In [31]:
#show(reax_uv.loc[28351:28353].dropna(axis=1,how='all'))

In [None]:
#reax_uv1=reax_uv

In [101]:
reax_uv1=reax_fix1.copy()

In [102]:
# [Br,C,Ca,Cl,F,H,I,K,Mg,N,Na,O,P,S,Zn]
# a = set(['H','C','N','O','F','S','Cl','Br','I'])
# b = set(['C','O','Fe'])

# b-a

In [103]:
# ref_cols_name=[col for col in reax_uv.columns if col.startswith('Ref')]
# def fix_reax(rows):
#     if rows.shape[1]>1:
#         ref_col=rows[ref_cols_name].dropna(axis=1).columns[0]
#         print(rows.groupby(ref_col)['Absorption Maxima (UV/VIS) [nm]'].apply(lambda a: a.str.cat(sep=';'))) #['Absorption Maxima (UV/VIS) [nm]'].apply(lambda a: a.str.cat(sep=';'))  .apply(lambda a: print(a.dropna(axis=1)))
#     return rows
    
# t1=reax_uv[reax_uv['InChI Key']=='WCSKDPQVNKZNQD-LWQDQPMZSA-N'].groupby(['InChI Key','Solvent (UV/VIS Spectroscopy)']).apply(lambda rows: fix_reax(rows))



In [104]:
#t1

In [34]:
#reference
#t1=t1.dropna(how='all')
#t1['Absorption Maxima (UV/VIS) [nm]'].str.contains('|',regex=False)
#t1[t1['Absorption Maxima (UV/VIS) [nm]'].str.contains('|',regex=False)]
#reax_fix1.to_pickle('/home/denn/home/ml/data/sf/reaxys4_fix.pickle')

In [35]:
#reax_uv1.ix[225444]
#reax_uv[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']].values[:10]
#res=fnuv_parse(reax_uv[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']].values[:10])
#res=fnuv_parse(reax_uv['Absorption Maxima (UV/VIS) [nm]'].values[:10],reax_uv['Ext./Abs. Coefficient [l·mol-1cm-1]'].values[:10])

In [26]:
#res[res.s1_nm>2000].count()

In [134]:
# sample=reax_uv.sample(10).copy()
# sample[['Absorption Maxima (UV/VIS) [nm]','Ext./Abs. Coefficient [l·mol-1cm-1]']]

In [133]:
#sample.dropna(axis=1,how='all')
#sample.count()

In [141]:
# reax_uv=reax_uv[(reax_uv['Solvent (UV/VIS Spectroscopy)']!='water')&(reax_uv['Solvent (UV/VIS Spectroscopy)']!='various solvent(s)')&(reax_uv['Solvent (UV/VIS Spectroscopy)']!='methanol; various solvent(s)')\
#                &(reax_uv['Solvent (UV/VIS Spectroscopy)']!='aq. ethanol')]

In [28]:
#print(sum(reax_uv1.index.duplicated(keep=False)))
#reax_uv1.info()

0


In [50]:
reax_bp1=reax_bp[(reax_bp.p<900) & (reax_bp.p>500)]
print(reax_bp1.shape)


(38567, 21)


In [52]:
sum(reax_bp.p==0.0)

65

In [67]:
reax_bp[reax_bp.p!=0.0].p.sort_values().head(10)
reax_bp.p.value_counts()

1.000000e+00    22659
1.000000e+01    21080
7.600000e+02    19462
2.000000e+00    18925
1.500000e+01    17324
1.200000e+01    17107
1.000000e-01    16728
3.000000e+00    14482
2.000000e+01    14086
5.000000e-01    12760
2.000000e-01    12067
5.000000e+00    11431
3.000000e-01    10874
4.000000e+00     9896
1.300000e+01     9297
1.400000e+01     9019
1.100000e+01     8773
5.000000e-02     7173
8.000000e+00     6886
1.800000e+01     6856
1.600000e+01     6540
1.500000e+00     6414
4.000000e-01     6392
1.000000e-02     5915
6.000000e+00     5788
7.000000e+00     5363
9.000000e+00     5360
1.700000e+01     5231
2.500000e+01     4904
3.000000e+01     4065
6.000000e-01     3898
8.000000e-01     3879
2.500000e+00     3311
2.000000e-02     3241
7.000000e-01     3069
2.200000e+01     2748
1.900000e+01     2615
3.000000e-02     2565
5.000000e+01     2199
4.000000e+01     2073
2.300000e+01     2018
1.000000e-03     2015
3.500000e+00     1940
2.100000e+01     1852
4.000000e-02     1796
1.000000e+

In [71]:
np.logspace(-5.0, 3.0, num=9)

array([  1.00000000e-05,   1.00000000e-04,   1.00000000e-03,
         1.00000000e-02,   1.00000000e-01,   1.00000000e+00,
         1.00000000e+01,   1.00000000e+02,   1.00000000e+03])

In [94]:
reax_bp1=reax_bp[:].groupby(['Substance Identification: Reaxys Registry Number',pd.cut(reax_bp.p, np.logspace(-5.0, 3.0, num=9))])['bp'].agg({
                                                   #'mode':lambda x: x.value_counts().idxmax(),
                                                   #'mode_count':lambda x: x.value_counts().max(),
#                                                    's1_ev': {
#                                                        'std':'std',
                                                       'median':'median',
                                                       'max':'max',
                                                       'min':'min',
                                                       #'max_min':lambda x: np.max(x)-np.min(x),
                                                       'count':'count',
                                                       'mean':'mean',
                                                       #'mean_rj':lambda x: np.mean(reject_outliers(x.values,iq_range=0.5)),
                                                       #'std_rj':lambda x: np.std(reject_outliers(x,iq_range=0.5))
#                                                    },
#                                                     's1_eps': {'max_eps':'max'}
                                                    })

reax_bp1['max_min']=reax_bp1['max']-reax_bp1['min']

In [95]:
reax_bp1.sample(10)
print(reax_bp1['max_min'].max())
#reax_bp1['mean'].value_counts()

600.0


In [96]:
df_filter=reax_bp1.copy()
# df_filter=df_filter[np.logical_not((df_filter['std_rj']>10) & (df_filter['count']>2))]
# print(df_filter['count'].count())
#df_filter=df_filter[(df_filter['std'].isnull())|(df_filter['std']<10)]
#df_filter=df_filter[(df_filter['std_rj'].isnull())|(df_filter['std_rj']<20)]
df_filter=df_filter[df_filter['max_min']<10]
print(df_filter['count'].count())
print(df_filter['max_min'].max())
print(df_filter['max_min'].mean())
#print(df_filter[df_filter['count']==1].count())
print(df_filter.shape)
show(df_filter.sample(10))

251146
9.9
0.565411095538
(251146, 6)


Unnamed: 0_level_0,Unnamed: 1_level_0,max,median,min,count,mean,max_min
Substance Identification: Reaxys Registry Number,p,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1942612,"(1, 10]",102.5,102.5,102.5,1,102.5,0.0
18821428,"(0.001, 0.01]",129.0,129.0,129.0,1,129.0,0.0
1726674,"(1, 10]",125.0,125.0,125.0,1,125.0,0.0
1707463,"(0.1, 1]",63.0,63.0,63.0,1,63.0,0.0
2961094,"(0.01, 0.1]",90.0,90.0,90.0,1,90.0,0.0
2716299,"(1, 0.0001]",53.0,53.0,53.0,1,53.0,0.0
1874194,"(0.01, 0.1]",86.0,86.0,86.0,1,86.0,0.0
20100,"(1, 10]",94.0,94.0,94.0,1,94.0,0.0
1707818,"(0.1, 1]",101.5,101.5,101.5,1,101.5,0.0
2037053,"(100, 1000]",125.75,123.625,121.5,2,123.625,4.25


In [110]:
df_filter.reset_index().p.value_counts()


(1, 10]            67691
(10, 100]          59354
(0.1, 1]           59281
(0.01, 0.1]        33679
(100, 1000]        18895
(0.001, 0.01]       9056
(0.0001, 0.001]     2695
(1, 0.0001]          495
Name: p, dtype: int64

In [109]:
df_filter.xs('(1, 10]',level=1).max_min.max()
# reax_solv.shape

9.9000000000000057

In [None]:
#reax_solv['Molecular Weight'].median()
#reax_solv[reax_solv['Number of References']==1]['Number of References'].count()

In [72]:
#reax_solv[reax_solv.mf.apply(len)==2].sample(10).dropna(axis=1,how='all')
#reax_uv[reax_uv.C_c==0].sample(5).dropna(axis=1,how='all')
#reax_uv[reax_uv['InChI Key']=='MWPLVEDNUUSJAV-UHFFFAOYSA-N'].dropna(axis=1,how='all')

In [None]:
#reax_solv.count()

In [None]:
#reax_solv['Comment (UV/VIS Spectroscopy)'].dropna()[:100]
#'UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)'

In [146]:
# need_cols=['s1_nm','s1_eps','s2_nm','s2_eps','s3_nm','s3_eps','Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name','UV/VIS Spectroscopy: Description (UV/VIS Spectroscopy)','Solvent (UV/VIS Spectroscopy)','C_c','Linear Structure Formula','Structure','Number of References','Absorption Maxima (UV/VIS) [nm]']

In [30]:
# print(reax_uv1.shape)
# reax=reax_uv1.drop_duplicates()
# #reax_solv_i=reax_solv[need_cols].set_index(['InChI Key']) #,'Solvent (UV/VIS Spectroscopy)'
# #reax_solv[reax_solv.C_c==0 | [reax_solv['CAS Registry Number'].isnull()][need_cols]
# print(reax.shape)

(134474, 19)
(134474, 19)


In [120]:
#show(reax.sort_values(by='s1_nm',ascending=False)[:30].dropna(axis=1,how='all'))

In [278]:
# def reject_outliers(sr, iq_range=0.5):
#     pcnt = ((1 - iq_range) / 2)*100
#     qlow, median, qhigh = np.percentile(sr,[pcnt, 50, 100-pcnt])
#     iqr = qhigh - qlow
#     return sr[ np.abs(sr - median) <= iqr]

In [293]:
# reax1=reax.groupby(['InChI Key','Solvent (UV/VIS Spectroscopy)'])['s1_nm'].agg({
# #                                                    'mode':lambda x: x.value_counts().idxmax(),
# #                                                    'mode_count':lambda x: x.value_counts().max(),
#                                                    'std':'std',
# #                                                    'median':np.median,
#                                                    'max':'max',
#                                                    'min':'min',
#                                                    'max_min':lambda x: np.max(x)-np.min(x),
#                                                    'count':'count',
#                                                    'mean':'mean',
#                                                    'mean_rj':lambda x: np.mean(reject_outliers(x,iq_range=0.5)),
#                                                    'std_rj':lambda x: np.std(reject_outliers(x,iq_range=0.5))
#                                                    })


In [31]:
# print(reax1[(reax1['max_min']<20)&(reax1['count']>1)].count())
# reax1[(reax1['max_min']>100)&(reax1['count']>0)].sort_values(by='max_min',ascending=False)[:30]

In [32]:
# #reax[reax['InChI Key']=='OYJOYFYJDAFOEI-UHFFFAOYSA-N']
# #show(reax[reax['InChI Key']=='ZGUGWUXLJSTTMA-UHFFFAOYSA-N'].dropna(axis=1))
# show(reax[reax['Substance Identification: Reaxys Registry Number']==2121226].dropna(axis=1))

In [99]:
#df_solvents=reax_solv[['Solvent (UV/VIS Spectroscopy)','nm','Structure','InChI Key']].set_index([]).pivot(columns='Solvent (UV/VIS Spectroscopy)')
#df_solvents=reax_solv_i[['Solvent (UV/VIS Spectroscopy)','nm']].pivot(columns='Solvent (UV/VIS Spectroscopy)')

In [33]:


# #dft = np.logical_not(reax_uv['Solvent (UV/VIS Spectroscopy)'].str.contains('acid|buffer|H2O',na=False))
# dft = (reax_uv['Solvent (UV/VIS Spectroscopy)']=='methanol')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='ethanol')
# #dft = (reax_uv['Solvent (UV/VIS Spectroscopy)']=='methanol')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='ethanol')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='acetonitrile')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='tetrahydrofuran')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='dioxane')
# #dft = (reax_uv['Solvent (UV/VIS Spectroscopy)']=='CHCl3')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='CH2Cl2')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='dichloromethane')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='hexane')|(reax_uv['Solvent (UV/VIS Spectroscopy)']=='cyclohexane')
# reax_uv1=reax_uv[dft]
# reax_uv1.nm.count()

In [107]:
reax_uv1['s1_ev']=1240./reax_uv1['s1_nm']

In [108]:
reax_s1=reax_uv1[:].groupby('Substance Identification: Reaxys Registry Number')['s1_ev','s1_eps'].agg({
                                                   #'mode':lambda x: x.value_counts().idxmax(),
                                                   #'mode_count':lambda x: x.value_counts().max(),
                                                   's1_ev': {
                                                       'std':'std',
                                                       'median':'median',
                                                       'max':'max',
                                                       'min':'min',
                                                       #'max_min':lambda x: np.max(x)-np.min(x),
                                                       'count':'count',
                                                       'mean':'mean',
                                                       #'mean_rj':lambda x: np.mean(reject_outliers(x.values,iq_range=0.5)),
                                                       #'std_rj':lambda x: np.std(reject_outliers(x,iq_range=0.5))
                                                   },
                                                    's1_eps': {'max_eps':'max'}
                                                    })

reax_s1['max_min']=reax_s1['max']-reax_s1['min']

In [109]:
print(sum(reax_s1.index.duplicated(keep=False)))
reax_s1.info()
#reax_s1['count'].value_counts()

0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 128171 entries, 1807 to 30480261
Data columns (total 8 columns):
max_eps    76690 non-null float64
median     128171 non-null float64
count      128171 non-null int64
mean       128171 non-null float64
std        31933 non-null float64
min        128171 non-null float64
max        128171 non-null float64
max_min    128171 non-null float64
dtypes: float64(7), int64(1)
memory usage: 8.8 MB


In [110]:
import math
@numba.jit(nopython=True,cache=True)
def percentile(N, percent):
    """
    Find the percentile of a list of values.

    @parameter N - is a list of values. Note N MUST BE already sorted.
    @parameter percent - a float value from 0.0 to 1.0.
    @parameter key - optional key function to compute value from each element of N.

    @return - the percentile of the values
    """
    k = (N.shape[0]-1) * percent
    f = math.floor(k)
    c = math.ceil(k)
    if f == c:
        return (N[int(k)])
    d0 = (N[int(f)]) * (c-k)
    d1 = (N[int(c)]) * (k-f)
    return d0+d1

@numba.jit(nopython=True,cache=True)
def reject_outliers3(sr, iq_range=0.5):
    pcnt = ((1-iq_range) / 2)
    N=sr
    qlow = percentile(N,pcnt)
    median = percentile(N,0.5)
    qhigh = percentile(N,1-pcnt)
    iqr = qhigh - qlow
    return sr[ np.abs(sr - median) <= iqr]

@numba.jit(nopython=True,cache=True)
def mean_rj(sr, iq_range=0.5):
    sr.sort()
    return np.mean(reject_outliers3(sr,iq_range=0.5))

@numba.jit(nopython=True,cache=True)
def std_rj(sr, iq_range=0.5):
    sr.sort()
    return np.std(reject_outliers3(sr,iq_range=0.5))    
    
def reject_outliers1(sr, iq_range=0.5):
    pcnt = (1 - iq_range) / 2
    qlow, median, qhigh = sr.dropna().quantile([pcnt, 0.50, 1-pcnt])
    iqr = qhigh - qlow
    return sr[ (sr - median).abs() <= iqr]

# ar=np.array([1,2,3,4,5,5,5,5,4,7,8,9],dtype=int)
# sr=pd.Series(ar)
# %timeit mean_rj(np.array(sr.values,dtype=int),0.1)
# %timeit std_rj(np.array(sr.values,dtype=int),0.1)
#%timeit reject_outliers1(sr,0.1)
#print(reject_outliers(sr,0.5))
#print(reject_outliers1(sr,0.5))

#help(sr.values)

In [111]:
reax_s1_rj=reax_uv1[:].groupby('Substance Identification: Reaxys Registry Number')['s1_ev'].agg({
                                                   #'mode':lambda x: x.value_counts().idxmax(),
                                                   #'mode_count':lambda x: x.value_counts().max(),
                                                   's1_ev': {
                                                      'mean_rj':lambda x: mean_rj(x.values,iq_range=0.5) if x.shape[0]>2 else x.mean(),
                                                      'std_rj':lambda x: std_rj(x.values,iq_range=0.5) if x.shape[0]>2 else np.nan,
                                                   },
                                                    })

In [39]:
#reax_s1_rj.dropna().sample(10)

In [112]:
reax_s1[['std_rj','mean_rj']]=reax_s1_rj['s1_ev'][['std_rj','mean_rj']]
#reax_s1['std_rj']=reax_s1_rj['std_rj']
#reax_s1.columns


In [113]:
#'Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name',
print(reax_s1.count())
print(reax_s1.shape)
print(reax_s1['max_min'].mean())

reax_s1[reax_s1['std_rj']>0.1][:10]
print(sum(reax_s1.index.duplicated(keep=False)))
reax_s1.info()

max_eps     76690
median     128171
count      128171
mean       128171
std         31933
min        128171
max        128171
max_min    128171
std_rj      14512
mean_rj    128171
dtype: int64
(128171, 10)
0.112650039786
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 128171 entries, 1807 to 30480261
Data columns (total 10 columns):
max_eps    76690 non-null float64
median     128171 non-null float64
count      128171 non-null int64
mean       128171 non-null float64
std        31933 non-null float64
min        128171 non-null float64
max        128171 non-null float64
max_min    128171 non-null float64
std_rj     14512 non-null float64
mean_rj    128171 non-null float64
dtypes: float64(9), int64(1)
memory usage: 10.8 MB


In [114]:


# clean=reax_uv_gr[(reax_uv_gr['std_rj']>10)]
# print(.count())
df_filter=reax_s1.copy()
# df_filter=df_filter[np.logical_not((df_filter['std_rj']>10) & (df_filter['count']>2))]
# print(df_filter['count'].count())
#df_filter=df_filter[(df_filter['std'].isnull())|(df_filter['std']<10)]
#df_filter=df_filter[(df_filter['std_rj'].isnull())|(df_filter['std_rj']<20)]
df_filter=df_filter[df_filter['max_min']<0.5]
print(df_filter['count'].count())
print(df_filter['max_min'].max())
print(df_filter['max_min'].mean())
#print(df_filter[df_filter['count']==1].count())
print(df_filter.shape)
show(df_filter.sample(10))


118417
0.499876593994
0.0231741482123
(118417, 10)


Unnamed: 0_level_0,max_eps,median,count,mean,std,min,max,max_min,std_rj,mean_rj
Substance Identification: Reaxys Registry Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6762322,,2.616034,1,2.616034,,2.616034,2.616034,0.0,,2.616034
5150902,8128.0,2.119658,1,2.119658,,2.119658,2.119658,0.0,,2.119658
6604652,13500.0,2.171629,1,2.171629,,2.171629,2.171629,0.0,,2.171629
803687,,3.289125,5,3.291051,0.027331,3.263158,3.333333,0.070175,0.013722,3.280481
341301,15488.0,3.39726,1,3.39726,,3.39726,3.39726,0.0,,3.39726
5316275,12303.0,2.87703,1,2.87703,,2.87703,2.87703,0.0,,2.87703
175360,21380.0,3.604651,3,3.525369,0.175334,3.324397,3.647059,0.322662,0.021204,3.625855
7881471,,2.206406,1,2.206406,,2.206406,2.206406,0.0,,2.206406
4212388,14125.0,2.818182,1,2.818182,,2.818182,2.818182,0.0,,2.818182
30293429,,2.039474,1,2.039474,,2.039474,2.039474,0.0,,2.039474


In [115]:
df_filter[df_filter['max_min']>0.3].count()

max_eps    1855
median     2887
count      2887
mean       2887
std        2887
min        2887
max        2887
max_min    2887
std_rj     1573
mean_rj    2887
dtype: int64

In [111]:
print(sum(df_filter.index.duplicated(keep=False)))
df_filter.info()

0
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 251146 entries, (1060, (100, 1000]) to (30633090, (0.1, 1])
Data columns (total 6 columns):
max        251146 non-null float64
median     251146 non-null float64
min        251146 non-null float64
count      251146 non-null int64
mean       251146 non-null float64
max_min    251146 non-null float64
dtypes: float64(5), int64(1)
memory usage: 14.3+ MB


In [115]:
cols_merge=['Substance Identification: Reaxys Registry Number','InChI Key','CAS Registry Number','Chemical Name','Structure','Linear Structure Formula']
reax_merge=reax_bp[cols_merge].drop_duplicates(subset='Substance Identification: Reaxys Registry Number').set_index('Substance Identification: Reaxys Registry Number')#.drop_duplicates(subset='InChI Key')
df_flt1 = pd.merge(reax_merge,df_filter.xs('(1, 10]',level=1), how='inner',left_index=True,right_index=True)
print(df_flt1.shape)
df_flt1.info()

(67691, 11)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 67691 entries, 1064 to 29746456
Data columns (total 11 columns):
InChI Key                   67689 non-null object
CAS Registry Number         65018 non-null object
Chemical Name               66294 non-null object
Structure                   67691 non-null object
Linear Structure Formula    67691 non-null object
max                         67691 non-null float64
median                      67691 non-null float64
min                         67691 non-null float64
count                       67691 non-null int64
mean                        67691 non-null float64
max_min                     67691 non-null float64
dtypes: float64(5), int64(1), object(5)
memory usage: 6.2+ MB


In [121]:
df_flt1[df_flt1.duplicated(subset='InChI Key',keep=False)]

Unnamed: 0_level_0,InChI Key,CAS Registry Number,Chemical Name,Structure,Linear Structure Formula,max,median,min,count,mean,max_min
Substance Identification: Reaxys Registry Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1121,KDBXRAQKSXYXFU-BQBZGAKWSA-N,278-38-6; 77369-77-8,(+-)-cis-hexahydro-benzoxete; (+-)-cis-Hexahydro-benzoxet,[H][C@]12CO[C@@]1([H])CCCC2,C7H12O,35.5,35.5,35.5,1,35.5,0.0
3484,NHKDLPHFEIBSQR-WFYOFFIASA-N,14202-27-8; 51830-12-7; 67315-16-6; 70095-87-3; 70494-79-0; 139686-29-6; 139686-31-0; 139686-32-1,"(+/-) methyl (4,5)-trans-epoxy-(2E)-hexenoate; (+/-) methyl 4,5-trans-epoxy-(2E)-hexenoate; (E)-methyl 3-(3-methyloxiran-2-yl)acrylate; (4,5)-trans-epoxy-2(E)-hexenoate; methyl 4,5-epoxy-2-hexenoate; 4,5-epoxy-2-hexenoate; (+-)-3t-(trans-3-methyl-oxiranyl)-acrylic acid methyl ester",COC(=O)C=C[C@@H]1O[C@H]1C,C7H10O3,89.0,89.0,89.0,1,89.0,0.0
18394,BWXXSADMQUTHRY-ROUUACIJSA-N,68972-94-1; 68973-05-7; 80183-15-9; 81177-24-4; 110660-95-2; 136458-56-5,"(2R*,3R*)-2,3-bis((benzyloxy)methyl)oxirane; trans-1,4-Di(benzyloxy)-2,3-epoxybutane; di-O-benzyl-2,3-anhydro-DL-threitol; Di-O-benzyl-2,3-anhydro-DL-threit",C(OCC1=CC=CC=C1)[C@@H]1O[C@H]1COCC1=CC=CC=C1,C18H20O3,237.0,237.0,237.0,1,237.0,0.0
80469,OBOOTZLJSBIXKK-OPRDCNLKSA-N,13414-15-8; 14595-61-0; 14715-43-6; 14747-63-8; 14762-02-8,"(+-)-2-methyl-(4ar,8at)-decahydro-isoquinolin-7t-ol; (+-)-2-Methyl-(4ar,8at)-decahydro-isochinolin-7t-ol",[H][C@]12CC[C@@H](O)C[C@]1([H])CN(C)CC2,C10H19NO,100.0,100.0,100.0,1,100.0,0.0
81112,YVCOJTATJWDGEU-VXNVDRBHSA-N,4541-87-1,cis-1-phenylpropene oxide; cis-beta-methylstyrene oxide; (+/-)-cis-beta-methylstyrene oxide; rac-(Z)-beta-methylstyrene oxide; cis-beta-methylstyrene epoxide; (Z)-beta-methylstyrene oxide; cis-beta-methylstyrene oxide,C[C@H]1O[C@H]1C1=CC=CC=C1,C9H10O,81.0,81.0,81.0,1,81.0,0.0
773678,HKDCIIMOALDWHF-UHFFFAOYSA-N,628-61-5,2-chlorooctane; (+/-)-2-chlorooctane; rac-2-chlorooctane; 2-octyl chloride; 2-octylchloride; 2-Chlorooctane; 2-chloro-octane,CCCCCCC(C)Cl,C8H17Cl,60.5,56.0,55.0,3,57.166667,5.5
956611,WLNPJBFMCCXPLZ-UHFFFAOYSA-N,54316-25-5,3-isobutenylpyrazole; 3-(2-methyl-propenyl)-1(2)H-pyrazole; 3-<2-Methyl-prop-1-enyl>-pyrazol,CC(C)=C/C1=NNC=C1,C7H10N2,99.0,99.0,99.0,1,99.0,0.0
956663,WLNPJBFMCCXPLZ-UHFFFAOYSA-N,,5-(2-Methyl-1-propenyl)pyrazol; 3-(2-methyl-propenyl)-1(2)H-pyrazole; 5-<2-Methyl-prop-1-enyl>-pyrazol; 5-Isobutenylpyrazol,CC(C)=CC1=CC=NN1,C7H10N2,99.0,99.0,99.0,1,99.0,0.0
1280137,QQHZNUPEBVRUFO-IMJSIDKUSA-N,50468-21-8,"(2S,3S)-2,3-epoxy-1-butanol; [(2S,3S)-3-methyloxiran-2-yl]methanol; (2S,3S)-3-methyl(oxiranemethanol); (2S,3S)-2,3-epoxybutan-1-ol; trans-2,3-epoxybutanol; ((2S)-trans-3-methyl-oxiranyl)-methanol; L-2,3-anhydro-1-deoxy-threitol",C[C@@H]1O[C@H]1CO,C4H8O2,58.5,58.5,58.5,1,58.5,0.0
1280193,KDBXRAQKSXYXFU-BQBZGAKWSA-N,278-38-6; 77369-77-8,cis-7-Oxa-bicyclo<4.2.0>octan; cis-7-Oabicyclo<4.2.0>-octan; cis-7-Oxa-bicyclo[4.2.0]octan,C1O[C@H]2CCCC[C@@H]12,C7H12O,35.5,35.5,35.5,1,35.5,0.0


In [145]:
print(sum(df_filter.index.duplicated(keep=False)))
df_filter.info()
print(sum(reax_merge.index.duplicated(keep=False)))
reax_merge.info()
print(sum(df_flt1.index.duplicated(keep=False)))
df_flt1.info()

0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 118417 entries, 2372 to 30480261
Data columns (total 10 columns):
max_eps    69761 non-null float64
median     118417 non-null float64
count      118417 non-null int64
mean       118417 non-null float64
std        22179 non-null float64
min        118417 non-null float64
max        118417 non-null float64
max_min    118417 non-null float64
std_rj     9201 non-null float64
mean_rj    118417 non-null float64
dtypes: float64(9), int64(1)
memory usage: 9.9 MB
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 128210 entries, 1807 to 30480261
Data columns (total 5 columns):
InChI Key                   127615 non-null object
CAS Registry Number         67851 non-null object
Chemical Name               92751 non-null object
Structure                   128210 non-null object
Linear Structure Formula    128210 non-null object
dtypes: object(5)
memory usage: 5.9+ MB
0
<class 'pandas.core.frame.DataFrame'>
Int64Index: 118417 entries, 2372 to 30

In [136]:
#reax_merge[reax_merge.index.duplicated(keep=False)]
#df_flt1[df_flt1.mean_rj.isnull()][:10]
#df_flt1.dropna(subset=['mean_rj']).info()

In [83]:
#show(df_flt1[:10])
# df_flt1=df_filter.reset_index()
# df_flt1=df_flt1[df_flt1.Structure.str.match('([0-9CNOFS@+\-\[\]\(\)\\\/%=#$]+)$',case=False,as_indexer=True)]
# df_flt1=df_flt1[df_flt1.Structure.str.match('^((?!(\[Co\]|\[Se\]|\[O\+8\]|\[C-3\]|\[N\+5\]|\[O\+3\]|\.)).)*$',case=True,as_indexer=True)]
# print(df_flt1.Structure.count())
# df_flt1.sample(10)

In [164]:
#np.save('/home/denn/home/ml/data/reaxys3_MW120-1000_UV360-440_ftr1.npy',df_flt1.as_matrix())

In [122]:
import pybel

def obl(xyz):
    m=pybel.readstring('smiles',xyz)
    return pd.Series({'can':m.write('can'),'check_inchi':m.write('inchikey')})

#df_check['check_inchi']=df_check['xyz'].apply(lambda m: pybel.readstring('xyz',m).write('inchikey').split('-')[0])
#df_check['check_smi']=df_check['xyz'].apply(lambda m: pybel.readstring('xyz',m).write('can'))
#df_check=df_check1['xyz'].apply(obl))
df_obl=df_flt1.Structure.apply(obl)
#df_flt2=df_flt1.join(df_flt1.Structure.apply(obl),how)
#%timeit obl('BrC1=CC=C(C=C1)C1CC(=NN1C1=CC=CC=C1)C1=CC=C(C=C1)C#N')

In [123]:
df_obl.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67691 entries, 1064 to 29746456
Data columns (total 2 columns):
can            67691 non-null object
check_inchi    67691 non-null object
dtypes: object(2)
memory usage: 1.5+ MB


In [133]:
df_flt2=df_flt1.join(df_obl,how='inner')
df_flt2['can']=df_flt2.can.str.strip()
df_flt2['check_inchi']=df_flt2.check_inchi.str.strip()
df_flt2[:10]

Unnamed: 0_level_0,InChI Key,CAS Registry Number,Chemical Name,Structure,Linear Structure Formula,max,median,min,count,mean,max_min,can,check_inchi
Substance Identification: Reaxys Registry Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1064,WWFRMNXCYGEFPA-UHFFFAOYSA-N,4740-43-6,pentamethylene selenide; Selenane; tetrahydro-selenopyran; Pentamethylenselenid; Selenacyclohexan; Selenan; selenane,C1CC[Se]CC1,Se(CH2)5,67.5,67.5,67.5,1,67.5,0.0,C1CCC[Se]C1,WWFRMNXCYGEFPA-UHFFFAOYSA-N
1078,APJPWWDTSMWKNE-UHFFFAOYSA-N,5974-87-8,"1-oxa-4-telluracyclohexane; 1,4-oxatellurane; [1,4]oxatellurane; [1,4]Oxatelluran",C1C[Te]CCO1,TeC4H8O,60.0,60.0,60.0,1,60.0,0.0,C1COCC[Te]1,APJPWWDTSMWKNE-UHFFFAOYSA-N
1099,RZMUEDPRCNHRJF-UHFFFAOYSA-N,72552-73-9,"N-2-propylidenemethanamide N-oxide; C,C-dimethyl-N-methylnitrone; N-methyl-C,C-dimethylnitrone; N-methyl dimethyl nitrone; acetone-(N-methyl oxime ); C,C,N-Trimethyl-nitron; Aceton-(N-methyl-oxim)",CC(C)=N(C)=O,C4H9NO,77.0,77.0,77.0,1,77.0,0.0,CN(=C(C)C)=O,RZMUEDPRCNHRJF-UHFFFAOYSA-N
1101,FLVFPAIGVBQGET-UHFFFAOYSA-N,13220-33-2,"1-methyl-3-pyrrolidinol; 3-hydroxy-1-methylpyrrolidine; 1-methyl-3-hydroxypyrrolidine; 3-hydroxy-N-methylpyrrolidine; N-methyl-3-hydroxypyrrolidine; (R,S)-1-methyl-3-pyrrolidinol; (R,S)-1-methylpyrrolidin-3-ol",CN1CCC(O)C1,C5H11NO,78.0,78.0,78.0,1,78.0,0.0,CN1CCC(C1)O,FLVFPAIGVBQGET-UHFFFAOYSA-N
1108,DHRSKOBIDIDMJZ-UHFFFAOYSA-N,1768-64-5,4-chlorotetrahydro-2H-pyran; 4-chloro-2H-tetrahydropyran; 4-chlorotetrahydro-pyran; 4-chlorotetrahydropyrane; 4-chlorotetrahydropyran; 4-chloro-tetrahydro-pyran; 4-Chlor-tetrahydro-pyran,ClC1CCOCC1,C5H9ClO,38.0,38.0,38.0,1,38.0,0.0,ClC1CCOCC1,DHRSKOBIDIDMJZ-UHFFFAOYSA-N
1121,KDBXRAQKSXYXFU-BQBZGAKWSA-N,278-38-6; 77369-77-8,(+-)-cis-hexahydro-benzoxete; (+-)-cis-Hexahydro-benzoxet,[H][C@]12CO[C@@]1([H])CCCC2,C7H12O,35.5,35.5,35.5,1,35.5,0.0,C1CC[C@@H]2[C@H](C1)OC2,KDBXRAQKSXYXFU-BQBZGAKWSA-N
1139,RYZAMMMGBAOMMC-UHFFFAOYSA-N,38447-82-4,4-Hydroxy-4-methyltetrahydro-2(H)-thiapyran; 4-hydroxy-4-methylthiacyclohexane; 4-methylthian-4-ol; 4-Methyl-tetrahydro-thiopyran-4-ol; 4-hydroxy-4-methyltetrahydrothiopyran; 4-methyltetrahydro-2H-thiopyran-4-ol,CC1(O)CCSCC1,C6H12OS,78.0,78.0,78.0,1,78.0,0.0,CC1(O)CCSCC1,RYZAMMMGBAOMMC-UHFFFAOYSA-N
1155,JMPSWKMWNJPVSV-UHFFFAOYSA-N,286-26-0,"1,3-dioxabicyclo<3.1.0>heptane; Tetrahydro-oxireno[b]pyran",C1COC2OC2C1,C5H8O2,82.0,82.0,82.0,1,82.0,0.0,C1COC2C(C1)O2,JMPSWKMWNJPVSV-UHFFFAOYSA-N
1165,TTYZTSXCQYWNSS-UHFFFAOYSA-N,6012-19-7,"5-ethyl-1,2-dimethyl-piperidine; 5-Aethyl-1,2-dimethyl-piperidin; 3-Aethyl-1,6-dimethyl-piperidin; 1,2-Dimethyl-5-ethyl-piperidin; 3-Ethyl-1,6-dimethyl-piperidin; 5-Ethyl-1,2-dimethyl-piperidin; N-Methyl-isocopellidin",CCC1CCC(C)N(C)C1,C9H19N,60.0,60.0,60.0,1,60.0,0.0,CCC1CCC(N(C1)C)C,TTYZTSXCQYWNSS-UHFFFAOYSA-N
1166,FLNRILFMKQBAOQ-UHFFFAOYSA-N,31263-47-5,1-n-butylborinane; 1-butyl-borinane; 1-Butyl-borinan; B-n-Butylborinan,CCCCB1CCCCC1,CH3CH2CH2CH2B(CH2)5,61.0,61.0,61.0,1,61.0,0.0,CCCCB1CCCCC1,FLNRILFMKQBAOQ-UHFFFAOYSA-N


In [144]:
df_flt2[df_flt2.duplicated(subset='InChI Key',keep=False)].sort_values('InChI Key').shape#head(10)
df_flt2[df_flt2.duplicated(subset='check_inchi',keep=False)].sort_values('check_inchi').shape#head(10)
#df_flt2[df_flt2.duplicated(subset='Structure',keep=False)].sort_values('Structure').head(10)
df_flt2[df_flt2.duplicated(subset='can',keep='first')].sort_values('can').shape


(767, 13)

In [146]:
df_flt3=df_flt2.drop_duplicates(subset='can')

In [153]:
df_flt3.shape

(66924, 13)

In [152]:
df_flt3[['can','mean']].to_csv('/home/denn/home/ml/data/reaxys/bp/reaxys_bp_1_10_torr.csv',index=False)

In [164]:
df_flt2.to_pickle('/home/denn/home/ml/data/sf/Reaxys4_fix_uv.pickle')

In [165]:
import logging

from rdkit import RDLogger

RDLogger.logger().setLevel(getattr(RDLogger, 'ERROR'))

logger = logging.getLogger()
loglevel = getattr(logging, 'ERROR')
logger.setLevel(loglevel)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loglevel)




def get_finger_rdk(smiles):
    mol=Chem.MolFromSmiles(str(smiles))
    if mol is not None:
        Chem.SanitizeMol(mol)
        return pd.Series({'mol__inchikey':Chem.InchiToInchiKey(Chem.MolToInchi(mol)),'morgan':np.array(Chem.GetMorganFingerprintAsBitVect(mol,nBits=8192,radius=6),dtype=np.int8),'mol':mol})
    else:
#         print(smiles) 
        return pd.Series({'mol__inchikey':None,'morgan':np.nan,'mol':None})

df_fps=df_flt2[:].Structure.apply(get_finger_rdk)
#print()
#df_fps=df_flt2[:100].can.str.slice(0,-2).apply(get_finger_rdk)
#print(df_flt1.Structure.count())
#print(df_fps.count())
#df_fps[df_fps.mol__inchikey.isnull()]
#df_fps

RDKit ERROR: [15:06:20] Explicit valence for atom # 5 N, 5, is greater than permitted
RDKit ERROR: [15:06:20] Explicit valence for atom # 6 N, 5, is greater than permitted
RDKit ERROR: [15:06:21] Explicit valence for atom # 5 N, 5, is greater than permitted
RDKit ERROR: [15:06:23] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:06:25] Explicit valence for atom # 11 N, 5, is greater than permitted
RDKit ERROR: [15:06:25] Explicit valence for atom # 20 N, 5, is greater than permitted
RDKit ERROR: [15:06:29] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:06:35] Explicit valence for atom # 23 N, 5, is greater than permitted
RDKit ERROR: [15:07:15] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:07:15] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:07:15] Explicit valence for atom # 5 N, 5, is greater than permitted
RDKit ERROR: [15:07:15] Explicit valence for atom 

RDKit ERROR: [15:11:25] Explicit valence for atom # 18 N, 5, is greater than permitted
RDKit ERROR: [15:11:36] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [15:11:36] Explicit valence for atom # 10 N, 5, is greater than permitted
RDKit ERROR: [15:11:37] Explicit valence for atom # 11 N, 5, is greater than permitted
RDKit ERROR: [15:11:39] Explicit valence for atom # 1 N, 4, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 5 C, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 5 C, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 5 C, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 15 N, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 16 N, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for atom # 17 N, 5, is greater than permitted
RDKit ERROR: [15:11:50] Explicit valence for ato

RDKit ERROR: [15:14:35] Explicit valence for atom # 11 N, 5, is greater than permitted
RDKit ERROR: [15:15:07] Explicit valence for atom # 10 N, 5, is greater than permitted
RDKit ERROR: [15:15:09] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:15:10] Explicit valence for atom # 2 N, 5, is greater than permitted
RDKit ERROR: [15:15:12] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:15:12] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:15:23] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:15:23] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:15:24] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:15:25] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:15:25] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:15:27] Explicit valence for ato

RDKit ERROR: [15:16:49] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 8 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 5 N, 5, is greater than permitted
RDKit ERROR: [15:16:50] Explicit valence for atom # 21 N, 5, is greater than permitted
RDKit ERROR: [15:16:52] Explicit valence for atom # 26 N, 5, is greater than permitted
RDKit ERROR: [15:16:53] Explicit valence for atom # 8 N, 5, is greater than permitted
RDKit ERROR: [15:16:53] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:16:53] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:16:56] Explicit valence for atom # 

RDKit ERROR: [15:19:21] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:19:29] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:19:29] Explicit valence for atom # 15 N, 5, is greater than permitted
RDKit ERROR: [15:19:45] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:19:47] Explicit valence for atom # 2 N, 5, is greater than permitted
RDKit ERROR: [15:19:47] Explicit valence for atom # 6 N, 5, is greater than permitted
RDKit ERROR: [15:19:48] Explicit valence for atom # 5 N, 5, is greater than permitted
RDKit ERROR: [15:19:48] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:19:49] Explicit valence for atom # 11 N, 5, is greater than permitted
RDKit ERROR: [15:19:50] Explicit valence for atom # 3 N, 5, is greater than permitted
RDKit ERROR: [15:19:50] Explicit valence for atom # 10 N, 5, is greater than permitted
RDKit ERROR: [15:19:50] Explicit valence for atom #

RDKit ERROR: [15:25:39] Explicit valence for atom # 10 N, 5, is greater than permitted
RDKit ERROR: [15:25:46] Explicit valence for atom # 17 N, 5, is greater than permitted
RDKit ERROR: [15:25:49] Explicit valence for atom # 14 N, 5, is greater than permitted
RDKit ERROR: [15:25:51] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [15:26:19] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [15:26:24] Explicit valence for atom # 4 N, 5, is greater than permitted
RDKit ERROR: [15:26:24] Explicit valence for atom # 14 N, 5, is greater than permitted
RDKit ERROR: [15:26:24] Explicit valence for atom # 15 N, 5, is greater than permitted
RDKit ERROR: [15:26:24] Explicit valence for atom # 6 N, 5, is greater than permitted
RDKit ERROR: [15:26:30] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:26:33] Explicit valence for atom # 21 N, 5, is greater than permitted
RDKit ERROR: [15:26:46] Explicit valence for a

RDKit ERROR: [15:31:08] Explicit valence for atom # 38 C, 5, is greater than permitted
RDKit ERROR: [15:31:10] Explicit valence for atom # 26 N, 4, is greater than permitted
RDKit ERROR: [15:31:15] Explicit valence for atom # 19 N, 5, is greater than permitted
RDKit ERROR: [15:31:15] Explicit valence for atom # 20 N, 5, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 16 N, 4, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 9 N, 5, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for atom # 12 N, 5, is greater than permitted
RDKit ERROR: [15:31:16] Explicit valence for

In [558]:
#%timeit get_finger_rdk('N#C[C-]([N+]1=CC=CC=C1)C(C2=CC=CC=C2)=O')

array([ 0.,  0.,  0., ...,  0.,  0.,  0.], dtype=float32)

In [167]:
#show(df_fps.sample(2))
df_fps.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118417 entries, 2372 to 30480261
Data columns (total 3 columns):
mol              117826 non-null object
mol__inchikey    117826 non-null object
morgan           117826 non-null object
dtypes: object(3)
memory usage: 3.6+ MB


In [168]:
#help(df_flt1.join)
df_flt3=df_flt2.join(df_fps,how='inner',rsuffix='_fps')

In [177]:
show(df_flt3.loc[2745177:2745178,][['mol','Structure']])
show(df_flt3.loc[2745177:2745178,])

Unnamed: 0_level_0,InChI Key,CAS Registry Number,Chemical Name,Structure,Linear Structure Formula,max_eps,median,count,mean,std,min,max,max_min,std_rj,mean_rj,can,check_inchi,mol,mol__inchikey,morgan
Substance Identification: Reaxys Registry Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2745177,DHDYTSMZSUHCIP-VAWYXSNFSA-N,27781-35-7,"2,3-dicyano-3t-(4-dimethylamino-phenyl)-acrylic acid amide; 2,3-Dicyan-3t-(4-dimethylamino-phenyl)-acrylsaeure-amid; trans-alpha.beta-Dicyano-p-(N,N-dimethylamino)-cinnamamid",CN(C)C1=CC=C(C=C1)C(C#N)=C(/C#N)C(N)=O,C13H12N4O,,2.589855,2,2.589855,0.076464,2.535787,2.643923,0.108136,,2.589855,CN(c1ccc(cc1)C(=C(C(=O)N)C#N)C#N)C,DHDYTSMZSUHCIP-UHFFFAOYSA-N\n,\n \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nN\nN\nN\nNH2\nO\n,DHDYTSMZSUHCIP-UHFFFAOYSA-N,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]"


In [178]:
df_flt3.to_pickle('/home/denn/home/ml/data/sf/Reaxys4__fix_fps8k_uv.pickle')

In [9]:
from blocks import block
from rdkit.Chem.Descriptors import MolWt
def blockify(s):
    try:
        return block.Block(smiles=s)
    except:
        return None

import logging

from rdkit import RDLogger

RDLogger.logger().setLevel(getattr(RDLogger, 'ERROR'))

logger = logging.getLogger()
loglevel = getattr(logging, 'ERROR')
logger.setLevel(loglevel)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loglevel)



dfexp=reax_uv_gr.reset_index()
dfexp['block'] = dfexp.Structure.apply(blockify)
dfexp=dfexp.dropna(subset=['block'])
dfexp['mol'] = dfexp.block.apply(lambda b: b.mol)
dfexp['smiles']=dfexp.block.apply(lambda b: b.smiles())
dfexp['MolWt']=dfexp.mol.apply(lambda m: MolWt(m))
dfexp['mol__inchikey'] = dfexp.block.apply(lambda b: b.inchikey)
dfexp['mol__inchikey']=dfexp['mol__inchikey'].str.split('-').str[0]


RDKit ERROR: [16:23:27] Explicit valence for atom # 10 C, 6, is greater than permitted
ERROR:root:Could not create RdKit mol from smiles: C(C1=C2C=CC=CC2=C(C[C]2345[BH]678[BH]9%10%11[BH]%12%13%14[BH]696[BH]%129%12[BH]%13%13%15[BH]%10%14%10[BH]27%11[BH]3%13%10[C]49%15(C2=CC=CC=C2)[BH]586%12)C2=CC=CC=C12)[C]1234[BH]567[BH]89%10[BH]%11%12%13[BH]585[BH]%118%11[BH]%12%12%14[BH]9%139[BH]16%10[BH]2%129[C]38%14(C1=CC=CC=C1)[BH]475%11, inchi: None
Traceback (most recent call last):
  File "/home/denn/home/a2g2/blocks/block.py", line 53, in __init__
    raise Exception("Mol not created")
Exception: Mol not created
RDKit ERROR: [16:23:27] Explicit valence for atom # 15 C, 6, is greater than permitted
ERROR:root:Could not create RdKit mol from smiles: C(C1=C2C=CC=CC2=CC2=CC=CC=C12)[C]1234[BH]567[BH]89%10[BH]%11%12%13[BH]585[BH]%118%11[BH]%12%12%14[BH]9%139[BH]16%10[BH]2%129[C]38%14(C1=CC=CC=C1)[BH]475%11, inchi: None
Traceback (most recent call last):
  File "/home/denn/home/a2g2/blocks/block.py",

In [10]:
#show(dfexp[dfexp['mol__inchikey'].str.contains('CSHWQDPOILHKBI')])

In [11]:
dfexp = dfexp.set_index('mol__inchikey')
print(dfexp['Structure'].count())
df_filter=dfexp[(dfexp['MolWt']<1000)]
print(df_filter['Structure'].count())

df_filter=df_filter[df_filter['mol'].apply(lambda m: not m.HasSubstructMatch(Chem.MolFromSmarts('[Si,Se,Li,Na]')))]
print(df_filter['Structure'].count())
#df_filter=df_filter[df_filter['mol'].apply(lambda m: m.HasSubstructMatch(Chem.MolFromSmarts('O=C([#6;D2])[O;H1]')))]
#print(df_filter['Structure'].count())

df_filter=df_filter[df_filter['mol'].apply(lambda m: not m.HasSubstructMatch(Chem.MolFromSmarts('CCCC')))]
print(df_filter['Structure'].count())

df_filter=df_filter[df_filter['mol'].apply(lambda m: not m.HasSubstructMatch(Chem.MolFromSmarts('c1ccc2cc3ccccc3cc2c1.c1ccc2cc3ccccc3cc2c1')))]
print(df_filter['Structure'].count())

# df_filter=df_filter[np.logical_not((df_filter['std_rj']>10) & (df_filter['count']>2))]
# print(df_filter['Structure'].count())
# df_filter=df_filter[(df_filter['std_rj']>10)]
# print(df_filter['Structure'].count())
#print(df_filter[df_filter['count']==1].count())
#print(Chem.Mol.GetAtoms(df_filter.iloc[1]['mol']).)
#show(df_filter[['mol']].sample(10))

#print(reax_uv[reax_uv['Structure']=='N#CC1=C2C=CC=CC2=CC2=CC=CC=C12'][cols])

6810
6530
6138
4070
3532


In [12]:
df_filter.to_pickle('/home/denn/harvard/SF/Library/Reaxys/anth_peryl_uv_below1000_reaxys2.pickle')

In [13]:
df_filter['mol'].apply(Chem.MolToSmiles).to_csv('/home/denn/harvard/SF/Library/Reaxys/anth_peryl_uv_below1000_reaxys2.smi',index=False)