In [None]:
import os
import sys
import pprint
import pandas as pd
import itertools
import numpy as np
import numba
import h5py

# this is all setup for the notebook
from IPython.display import HTML
import matplotlib
%matplotlib inline
from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools # headsup: this import change the behavior of dataframes with mols in them
# Global PandasTools configuration: molecule representation is set to SVG and the
# molecule size to (200, 200). Presumably these control how mol columns are drawn
# when a DataFrame is rendered -- see the PandasTools import note above.
PandasTools.molRepresentation = 'svg'
PandasTools.molSize = (200,200)

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot,plot
from plotly.graph_objs import *
init_notebook_mode()
from plotly.tools import FigureFactory as FF


# constants
# Hartree -> eV conversion factor (reading based on the name; value not cross-checked here).
HA_TO_EV = 27.211399
# RDKit periodic table; to_xyz() uses it to map atomic numbers to element symbols.
PERIODICTABLE = Chem.GetPeriodicTable()
# Presumably used as E[eV] ~= NM_TO_EV / wavelength[nm] (h*c ~ 1240 eV*nm) -- TODO confirm at use site.
NM_TO_EV=1240

# this is a little helper function to render images inside a dataframe
# once again, there are ways to monkey patch the rendering of dataframes, but I am trying to 
# avoid most of that to make things a bit easier to understand
def show(df):
    """Return an IPython ``HTML`` object holding ``df`` rendered as an HTML table.

    The table is produced with ``escape=False``, so markup stored in the cells
    is emitted as-is instead of being escaped to text.
    """
    table_markup = df.to_html(escape=False)
    return HTML(table_markup)

# convert a database geoms object to an XYZ-file-format string
def to_xyz(geoms):
    """Serialise ``geoms`` into XYZ text.

    ``geoms`` must support ``len()`` and iteration; each item is indexed as
    ``item[0]`` (converted with ``int`` and looked up as an element symbol) and
    ``item[1]``..``item[3]`` (written with ``str``).

    Returns the atom count, a blank comment line, then one
    space-separated "symbol v1 v2 v3" line per item, each newline-terminated.
    """
    header = str(len(geoms)) + "\n\n"
    atom_lines = [
        " ".join([
            PERIODICTABLE.GetElementSymbol(int(atom[0])),
            str(atom[1]),
            str(atom[2]),
            str(atom[3]),
        ]) + "\n"
        for atom in geoms
    ]
    return header + "".join(atom_lines)

from tqdm import tnrange

In [None]:
# Load the pickled reference table; it is later joined against the Reaxys keys on
# the 'mol__inchikey' column.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df_all=pd.read_pickle('/home/denn/home/ml/data/sf/reaxys2_all.pickle')

In [None]:
#show(df_all.loc['WHVPQBSNFXGXCL'])

In [None]:
import os
import glob

path = r'/home/denn/home/ml/data/sf/'
# glob returns matches in arbitrary, filesystem-dependent order. Sorting fixes the
# row order of the concatenated frame, which matters because a later
# drop_duplicates keeps the first occurrence of each key.
all_files = sorted(glob.glob(os.path.join(path, "reaxys5_*.xls")))
pds = []
for f in all_files:
    print(f)
    # The files carry an .xls extension but are parsed here as tab-delimited text.
    pds.append(pd.read_csv(f,delimiter='\t',low_memory=False))

# One frame with a fresh 0..n-1 index; raises ValueError if no file matched.
reax_full=pd.concat(pds, ignore_index=True)


In [None]:
# Keep only records whose Structure string (parsed as SMILES further down) has no
# '.', i.e. drop multi-fragment entries. The explicit .copy() makes reax_uv an
# independent frame, so the column assignments and in-place edits that follow do
# not operate on a slice of reax_full (SettingWithCopyWarning / ambiguous writes).
reax_uv=reax_full[~reax_full.Structure.str.contains('.',regex=False)].copy()
import re
def mf2dict(mf_str):
    """Parse a molecular-formula string into an ``{element_symbol: count}`` dict.

    An element symbol is one uppercase letter followed by any lowercase letters;
    an optional run of digits after it is the count (missing digits mean 1).
    Counts of a symbol that occurs more than once are summed, so ``'CH3COOH'``
    yields ``{'C': 2, 'H': 4, 'O': 2}`` rather than keeping only the last
    occurrence.

    Parameters
    ----------
    mf_str : str
        Formula text, e.g. ``'C10H12N2O'``.

    Returns
    -------
    dict
        Symbol -> integer count, in order of first appearance; empty for ``''``.

    Raises
    ------
    TypeError
        If ``mf_str`` is not a string (propagated from ``re.findall``).
    """
    counts = {}
    for symbol, digits in re.findall(r'([A-Z][a-z]*)(\d*)', mf_str):
        # accumulate instead of overwrite so repeated symbols are not lost
        counts[symbol] = counts.get(symbol, 0) + (int(digits) if digits else 1)
    return counts
# Parse each molecular formula once and cache the carbon count. assign() builds a
# new frame instead of writing columns into what may be a slice of reax_full.
reax_uv = reax_uv.assign(mf=reax_uv['Molecular Formula'].apply(mf2dict))
reax_uv = reax_uv.assign(C_c=reax_uv['mf'].apply(lambda d: d.get('C',0)))

# Normalise solvent spellings (applied to the solvent column only).
solvent_syn = {'Solvent (UV/VIS Spectroscopy)': 
               {'CH2Cl2': 'dichloromethane',
                'CHCl3':'chloroform',
                'H2O':'water',
                'dimethyl sulfoxide':'dimethylsulfoxide',
               'N,N-dimethyl-formamide':'dimethylformamide'}
              }
reax_uv = reax_uv.replace(solvent_syn)

print(reax_uv['Absorption Maxima (UV/VIS) [nm]'].count())

# Keep molecules built only from the allowed elements, with 7..50 carbon atoms.
allowed_elements = {'H','C','N','O','F','S','Cl','Br','I','P'}
reax_uv = reax_uv[reax_uv['mf'].apply(lambda m: set(m.keys()) <= allowed_elements)]
reax_uv = reax_uv[(reax_uv.C_c>6) & (reax_uv.C_c<51)]
print(reax_uv.shape)

# Drop rows whose text columns contain these substrings (radicals, named ions,
# charge markers). Literal substring match; missing values count as "no match".
# The shape is printed after every step, in the same order as before.
excluded_substrings = [
    ('Chemical Name', 'radical'),
    ('Chemical Name', 'cation'),
    ('Chemical Name', 'anion'),
    ('Linear Structure Formula', '(1+)'),
    ('Linear Structure Formula', '(1-)'),
    ('Linear Structure Formula', '(2-)'),
    ('Linear Structure Formula', '(2+)'),
    ('Linear Structure Formula', '(3-)'),
    ('Linear Structure Formula', '(3+)'),
]
for column, marker in excluded_substrings:
    reax_uv = reax_uv[~(reax_uv[column].fillna('').str.contains(marker,regex=False))]
    print(reax_uv.shape)

In [None]:
# Non-null count per column of the filtered frame, largest first.
reax_uv.count().sort_values(ascending=False)

In [None]:
# First hyphen-separated block of every InChI Key, computed once and reused.
reax_ik = reax_uv['InChI Key'].str.split('-').str[0]
# assign() returns a new frame, so the 'ik' column is not written into a filtered slice.
reax_uv = reax_uv.assign(ik=reax_ik)
# Distinct key prefixes; these are matched against df_all below.
reax_inchi = reax_ik.drop_duplicates()
# Last expression of the cell so the number of distinct prefixes is actually displayed.
reax_inchi.count()

In [None]:
#pd.merge(df_all,reax_inchi.to_frame(name='mol__inchikey'),how='inner',left_on=,right_on='mol__inchikey').count()
# Inner join of the distinct Reaxys key prefixes with df_all on 'mol__inchikey':
# the result holds the df_all rows whose key also occurs in the Reaxys subset.
# reset_index() presumably turns 'mol__inchikey' from (part of) df_all's index into
# a regular column -- TODO confirm against how df_all was built.
merge=pd.merge(reax_inchi.to_frame('mol__inchikey'),df_all.reset_index(),on='mol__inchikey',how='inner')
#print(df_all.index.get_level_values(0))

In [None]:
#reax_inchi.to_frame('mol__inchikey').drop_duplicates()
# Distinct keys found in both data sets, as a plain list; rows with these keys are
# removed from the Reaxys selection further down.
exclude=list(merge['mol__inchikey'].drop_duplicates().values)

In [None]:
# NOTE(review): displays the whole list; for long lists prefer len(exclude) or exclude[:10].
exclude

In [None]:
# Rows whose key prefix is NOT in `exclude`, restricted to the two columns needed
# downstream. A single .loc call replaces the chained df[cols][mask] indexing, and
# .copy() makes reax5 an independent frame so later column assignments are safe.
reax5 = reax_uv.loc[~reax_uv['ik'].isin(exclude), ['ik','Structure']].copy()

In [None]:
# One row per key prefix: only the first occurrence of each 'ik' value is kept.
reax5=reax5.drop_duplicates(subset='ik')

In [None]:
def molify(s):
    """Build an RDKit molecule from SMILES ``s`` via a canonical-SMILES round trip.

    ``s`` is parsed, written back out with ``canonical=True``, parsed again, and
    the re-parsed molecule is passed through ``Chem.SanitizeMol``.

    Returns the re-parsed molecule, or ``None`` if either parse returns ``None``.
    Exceptions raised by the RDKit calls are not caught here.
    """
    first_pass = Chem.MolFromSmiles(s)
    if first_pass is None:
        return None

    canonical_smiles = Chem.MolToSmiles(first_pass, canonical=True)
    mol = Chem.MolFromSmiles(canonical_smiles)
    if mol is None:
        return None

    # Sanitised as parsed; explicit hydrogens are not added first.
    Chem.SanitizeMol(mol)
    return mol

In [None]:
# Parse every Structure string with molify(); entries that fail to parse become None.
reax5['mol']=reax5.Structure.apply(molify)

In [None]:
# Drop rows with a missing value in ANY column -- this removes the structures that
# molify() could not parse (mol is None), and also rows missing 'ik' or 'Structure'.
reax5=reax5.dropna()

In [None]:
# SMILES string written back from each parsed molecule.
reax5['smiles']=reax5.mol.apply(Chem.MolToSmiles)
# Morgan fingerprint bit vector (radius 6, 8192 bits) converted to a NumPy array per molecule.
reax5['morgan'] = reax5.mol.apply(lambda m: np.array(Chem.GetMorganFingerprintAsBitVect(m,nBits=8192,radius=6)))

In [None]:
# Spot-check: render one randomly chosen row (unseeded, so it differs between runs).
show(reax5.sample(1))

In [None]:
# Persist the final table (ik, Structure, mol, smiles, morgan) as a pickle.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
reax5.to_pickle('/home/denn/home/ml/data/sf/reaxys5.pickle')

In [None]:
#reax5.smiles.to_csv('/home/denn/harvard/SF/Library/Reaxys/reaxys5_dicyanoanth.smi',index=False)

In [None]:
# NOTE(review): leftover interactive help lookup for plotly's plot(); not part of the
# data pipeline -- consider removing before sharing the notebook.
help(plot)