In [None]:
import os
import sys
import django
import pprint
import pandas as pd
from django_pandas.io import read_frame
import itertools
from blocks import block
import numpy as np

# Set up the Django settings module. Change this to the settings file that connects you to your desired database.
os.environ["DJANGO_SETTINGS_MODULE"] = "djangochem.settings.denn"
# This must be run to set up access to the Django settings and make database access work
# (the model imports further down come after this call).
django.setup()

# import the models that you want to access
from pgmols.models import Mol, Calc, Geom
from jobs.models import Job, JobConfig

# this is all setup for the notebook
from IPython.display import HTML
import matplotlib
%matplotlib inline
from rdkit.Chem import AllChem as Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools # headsup: this import change the behavior of dataframes with mols in them
# some global configuration of the pandastools
PandasTools.molRepresentation = 'svg'
PandasTools.molSize = (200,200)

from plotly.offline import download_plotlyjs, init_notebook_mode, iplot,plot
from plotly.graph_objs import *
init_notebook_mode()
from plotly.tools import FigureFactory as FF


# constants
# Multiplied onto total-energy differences further down to express them in eV (Hartree -> eV factor).
HA_TO_EV = 27.211399
# RDKit periodic table; used to translate atomic numbers into element symbols.
PERIODICTABLE = Chem.GetPeriodicTable()
# NOTE(review): presumably hc in eV*nm (E[eV] ~ 1240 / wavelength[nm]) -- confirm against its users.
NM_TO_EV=1240

def show(df):
    """Render a DataFrame as an IPython HTML object without escaping cell contents.

    Cells that hold markup (e.g. embedded images) are emitted verbatim so the
    notebook renders them. This is a deliberately small helper: pandas
    rendering could be monkey patched instead, but that is avoided here to
    keep things easier to understand.
    """
    markup = df.to_html(escape=False)
    return HTML(markup)

def to_xyz(geoms):
    """Convert a database geometry into an XYZ-format string.

    geoms -- sequence of rows whose first entry is the atomic number and whose
             next three entries are the x, y, z coordinates.

    Returns the atom count, an empty comment line, then one
    "<symbol> <x> <y> <z>" line per atom.
    """
    header = str(len(geoms)) + "\n\n"
    atom_lines = [
        " ".join([PERIODICTABLE.GetElementSymbol(int(atom[0])),
                  str(atom[1]), str(atom[2]), str(atom[3])]) + "\n"
        for atom in geoms
    ]
    return header + "".join(atom_lines)




In [None]:
import IPython.display
import time, json

# Surface type constants.
# NOTE(review): presumably these mirror the 3Dmol.js surface types
# (van der Waals, molecular, solvent-accessible, solvent-excluded) -- confirm.
VDW =1
MS=2
SAS=3
SES=4

class view3dMol(object):
    '''A class for constructing embedded 3Dmol.js views in ipython notebooks.
       The results are completely static which means there is no need for there
       to be an active kernel but also that there is no communication between
       the javascript viewer and ipython.

       Any attribute whose name does not start with an underscore is treated as
       a $3Dmol.GLViewer method: calling it appends the statement
       viewer.<name>(<JSON-encoded args>); to the generated script and returns
       this object, so calls can be chained.
       http://3dmol.csb.pitt.edu/doc/$3Dmol.GLViewer.html
    '''
    def __init__(self,width=640,height=480,query='',options=dict(),js='http://3dmol.csb.pitt.edu/build/3Dmol.js'):
        '''Create a 3Dmol.js view.
            width -- width in pixels of container
            height -- height in pixels of container
            query -- optional argument to provide to $3Dmol.download
            options -- optional options to provide to $3Dmol.download
                       (only serialised with json.dumps, never mutated)
            js -- url for 3Dmol.js'''
        # Placeholder id; UNIQUEID is substituted when the HTML is rendered.
        divid = "3dmolviewer_UNIQUEID"

        # startjs collects everything up to and including the user's viewer
        # calls; endjs holds the closing statements and is built inside-out.
        self.startjs = '<div id="%s"  style="position: relative; width: %dpx; height: %dpx">' % (divid,width,height)
        self.startjs += '<script>'
        self.endjs = '</script>'

        #load 3dmol, but only once
        self.startjs += "if(typeof $3Dmolpromise === 'undefined') $3Dmolpromise = $.when($.getScript('%s'))" % js

        self.startjs += ";$3Dmolpromise.done(function() {"
        self.endjs = "});" + self.endjs

        self.startjs += 'var viewer = $3Dmol.createViewer($("#%s"),{backgroundColor:"white"});' % divid
        if query:
            # Wrap the user's calls in the download callback.
            self.startjs += '$3Dmol.download("%s", viewer, %s, function() {' % (query,json.dumps(options))
            self.endjs = "})" + self.endjs
        self.endjs = "viewer.render();" + self.endjs

    def show(self):
        '''Return the view wrapped in an IPython HTML display object.'''
        return IPython.display.HTML(self._repr_html_())

    def _repr_html_(self):
        '''Return the HTML/JS for this view with a timestamp-based container id.'''
        html = (self.startjs+self.endjs).replace('UNIQUEID',str(time.time()).replace('.',''))
        return html

    def _repr_html1_(self,conformer):
        '''Return the HTML/JS for this view using str(conformer) as the container id suffix.'''
        html = (self.startjs+self.endjs).replace('UNIQUEID',str(conformer))
        return html

    def __getattr__(self,name):
        '''auto-instantiate javascript calls based on whatever the user provided'''
        if name.startswith('_'): #object to ipython canary functions
            # Must be AttributeError so hasattr()/getattr(default) and
            # IPython's repr probing keep working.
            raise AttributeError("%r object has no attribute %r" %
                         (self.__class__, name))
        def makejs(*args):
            # Each argument is JSON-encoded so Python values become JS literals.
            cmd = 'viewer.%s(%s);' % (name, ','.join(json.dumps(arg) for arg in args))
            self.startjs += cmd
            return self

        return makejs

In [None]:
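# Header and footer templates for the standalone HTML report table.
# The header loads the DataTables bundle, Semantic UI and 3Dmol.js from CDNs
# and initialises the DataTable on the element with id "example".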
html_start = """
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-type" content="text/html; charset=utf-8">
<meta name="viewport" content="width=device-width,initial-scale=1">
<title>SF Candidates</title>

<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/v/dt/jqc-1.12.3/jszip-2.5.0/pdfmake-0.1.18/dt-1.10.12/b-1.2.2/b-colvis-1.2.2/b-html5-1.2.2/fh-3.1.2/r-2.1.0/sc-1.4.2/datatables.min.css"/>
 
<script type="text/javascript" src="https://cdn.datatables.net/v/dt/jqc-1.12.3/jszip-2.5.0/pdfmake-0.1.18/dt-1.10.12/b-1.2.2/b-colvis-1.2.2/b-html5-1.2.2/fh-3.1.2/r-2.1.0/sc-1.4.2/datatables.min.js"></script>



<link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/semantic-ui/2.2.1/semantic.min.css">
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/1.10.12/css/dataTables.semanticui.min.css">
<link rel="stylesheet" type="text/css" href="https://cdn.datatables.net/buttons/1.2.2/css/buttons.semanticui.min.css">


	<script type="text/javascript" language="javascript" src="https://cdn.datatables.net/1.10.12/js/dataTables.semanticui.min.js">
	</script>
	<script type="text/javascript" language="javascript" src="https://cdnjs.cloudflare.com/ajax/libs/semantic-ui/2.2.1/semantic.min.js">
	</script>
    <script type="text/javascript" language="javascript" src="https://cdnjs.cloudflare.com/ajax/libs/3Dmol/1.0.6/3Dmol-nojquery-min.js">
	</script>
	
    <style type="text/css" class="init">
	
	</style>
	<script type="text/javascript" class="init">
	
        $(document).ready(function() {
            var table = $('#example').DataTable( {
                lengthChange: true,
                "lengthMenu": [ 3, 10, 25, 50, 75, 100 ],
                buttons: [ 'copy', 'excel', 'colvis' ],
                
            } );
        
            table.buttons().container()
                .appendTo( $('div.eight.column:eq(0)', table.table().container()) );
        } );
        var blobObject = null;

        function createDownloadLink(anchorSelector, str, fileName){

            if(window.navigator.msSaveOrOpenBlob) {
                var fileData = [str];
                blobObject = new Blob(fileData);
                $(anchorSelector).click(function(){
                    window.navigator.msSaveOrOpenBlob(blobObject, fileName);
                });
            } else {
                var url = "data:text/plain;charset=utf-8," + encodeURIComponent(str);
                $(anchorSelector).attr("href", url);
            }
        }
	</script>
</head>
<body>
<div class="ui container">
"""

html_end = """
</div>
</body>
</html>
"""

In [None]:
from django.db.models.expressions import RawSQL
from django.core import serializers
from django.db.models import Q,F,Min,Case,When,Value
#    .filter(Q(mol__inchikey__contains='BIOPPFDHKHWJIA')|Q(mol__inchikey__contains='NXXNVJDXUHMAHU')|Q(mol__inchikey__contains='QQLRAWFMOWCSBU')|Q(mol__inchikey__contains='MWPLVEDNUUSJAV'))\
# Build the queryset of calculations used in this notebook:
#   * molecules in group 'sf_lib' whose parent job is 'done' and whose method name contains 'hybrid'
#   * restricted to the listed library tags, excluding '_tda_' methods and bridge/acceptor molecules
#   * totalenergy / homo / lumo are pulled out of the JSON `props` column as numerics
#   * s2 is only populated for the t1 / BS optimisation configs, excitedstates only for tddft configs
# The Case(...) defaults are passed to Case itself; previously `default=Value(None)` sat outside the
# Case parentheses and was therefore treated by annotate() as an extra annotation named "default".
all_calcs = Calc.objects \
    .filter(mol__group__name__exact='sf_lib',
            parentjob__status__exact='done',
            method__name__contains='hybrid') \
    .filter(Q(mol__tags__contains=['reaxys1'])|Q(mol__tags__contains=['emol1'])|Q(mol__tags__contains=['anthracene2'])|Q(mol__tags__contains=['reaxys2'])|Q(mol__tags__contains=['reaxys5']))\
    .exclude(Q(method__name__contains='_tda_')
               |
#              Q(geoms__parentjob__config__parent_class_name__exact='Mol')|
            Q(mol__tags__contains=['bridge'])|
            Q(mol__tags__contains=['acceptor'])
            )\
    .annotate(totalenergy=RawSQL("((props->>%s)::numeric)", ('totalenergy',)))\
    .annotate(homo=RawSQL("((props->>%s)::numeric)", ('homo',)))\
    .annotate(lumo=RawSQL("((props->>%s)::numeric)", ('lumo',)))\
    .annotate(s2=Case(When(Q(parentjob__config__name='b3lyp_6-31gs_opt_t1_qchem')|
                           Q(parentjob__config__name='b3lyp_6-31gs_opt_BS_qchem'),
                           then=RawSQL("((props->>%s)::numeric)", ('s2',))),
                      default=Value(None)))\
    .annotate(excitedstates=Case(When(parentjob__config__name__contains='tddft',
                                      then=RawSQL("((props->>%s)::json)", ('excitedstates',))),
                                 default=Value(None)))\
    .order_by('mol__inchikey','parentjob__config__name','totalenergy')


# Flatten the queryset into dictionaries: one entry per combination of calculation and the
# related geometry / parent-geometry rows that the double-underscore lookups join in.
all_calcs_values=all_calcs.values('id',
                                  # geometry attached to the calculation
                                  'geoms__parentjob__config__name',
                                  'geoms__xyz',
                                  'geoms__method',
                                  # parent geometry of that geometry (and its own parents)
                                  'geoms__parents',
                                  'geoms__parents__xyz',
                                  'geoms__parents__method',
                                  'geoms__parents__parents',
                                  # molecule identity
                                  'mol__inchikey',
                                  'mol__smiles',
                                  'mol__tags',
                                  # calculation metadata and annotated properties
                                  'method__name',
                                  'parentjob__config__name',
                                  'totalenergy',
                                  's2',
                                  'excitedstates',
                                  'homo',
                                  'lumo',
                                 )

# Evaluating the queryset here is what actually hits the database.
all_calcs_dict=list(all_calcs_values)
df = pd.DataFrame(all_calcs_dict)

# Show the first row as a quick check of the result.
show(df[['id','geoms__parents','mol__inchikey','parentjob__config__name','totalenergy']][:1])

In [None]:
# Replace '-' with '_' in config names; later cells access them as attribute-style column names.
df['parentjob__config__name']=df['parentjob__config__name'].str.replace('-','_')
df['geoms__parentjob__config__name']=df['geoms__parentjob__config__name'].str.replace('-','_')
# Keep only the part of the InChIKey before the first '-'.
df['mol__inchikey']=df['mol__inchikey'].str.split('-').str[0]

# Cast the annotated numeric columns to float.
df['totalenergy']=df['totalenergy'].astype(float)
df['s2']=df['s2'].astype(float)
df['homo']=df['homo'].astype(float)
df['lumo']=df['lumo'].astype(float)

# df.info() prints its report itself and returns None, so this also prints "None".
print(df.info())
# Show one random row as a sanity check.
show(df[['id','geoms__parents','mol__inchikey','parentjob__config__name','totalenergy']].sample(1))

In [None]:
# Lookup table of SMILES and tags for every molecule in group 'sf_lib'.
df_smiles=pd.DataFrame(list(Mol.objects.filter(group__name__exact='sf_lib').values('inchikey','smiles','tags')))
# Use the same column names as the calculation frame so the two can be merged.
df_smiles=df_smiles.rename(columns={'inchikey':'mol__inchikey','smiles':'mol__smiles','tags':'mol__tags'})
#df_smiles=df_smiles.drop_duplicates()
# Truncate the InChIKey to the part before the first '-', matching the calculation frame.
# NOTE(review): truncated keys may no longer be unique in this index -- confirm.
df_smiles['mol__inchikey']=df_smiles['mol__inchikey'].str.split('-').str[0]
df_smiles=df_smiles.set_index('mol__inchikey')
show(df_smiles[:2])

In [None]:
# Get B3LYP geometries in XYZ file format for conformers, indexed by geoms__parents (parent geometry ID)
df_xyz=df[['geoms__method','geoms__parents','geoms__parentjob__config__name','mol__inchikey','geoms__xyz']]
# Keep only geometries produced by the B3LYP optimisation configs.
df_xyz=df_xyz[df_xyz.geoms__parentjob__config__name.isin(['b3lyp_6_31gs_opt_qchem','b3lyp_6_31gs_opt_BS_qchem'])]
# One row per (parent geometry, molecule); .copy() so the new column below is written to an
# independent frame rather than to a slice of df.
df_xyz=df_xyz.drop_duplicates(subset=['geoms__parents','mol__inchikey']).copy()
# Convert only the retained rows (previously to_xyz ran over every row of df and the
# result was aligned back by index).
df_xyz['xyz']=df_xyz['geoms__xyz'].apply(to_xyz)
df_xyz=df_xyz.set_index(['geoms__parents'])
print(df_xyz.count())

In [None]:
# Mean of the geoms__method values over the selected geometries.
# NOTE(review): presumably a quick check that all geometries share one method id -- confirm.
df_xyz['geoms__method'].mean()
#df_xyz.count()


In [None]:
def to_sdf(geoms,name):
    """Format a geometry as a V2000 SDF record that contains atoms only.

    geoms -- sequence of rows: atomic number followed by x, y, z coordinates
    name  -- text written on the first (title) line of the record

    Returns the record as a string: title line, two empty header lines, the
    counts line (zero bonds), one atom line per row, 'M  END' and the '$$$$'
    terminator (no trailing newline).
    """
    counts_line = '{0:03d}'.format(len(geoms)) + "  0  0  0  1  0            999 V2000\n"
    atom_lines = [
        '{:10.4f}{:10.4f}{:10.4f} {:<3}{}\n'.format(
            atom[1], atom[2], atom[3],
            PERIODICTABLE.GetElementSymbol(int(atom[0])),
            " 0  0  0  0  0  0")
        for atom in geoms
    ]
    return name + '\n\n\n' + counts_line + ''.join(atom_lines) + 'M  END\n$$$$'

# Write atoms-only SDF files for the first two geometries of df_xyz.
# File name and record title are "<inchikey>_<index value>"; the index of df_xyz is geoms__parents.
sdf_savepath = "/home/denn/home2/data/"
for i,r in df_xyz[:2].iterrows():
    with(open(os.path.join(sdf_savepath,"{}_{}.sdf".format(str(r.mol__inchikey),str(i))), "w")) as sdf_file:
        sdf_file.write(to_sdf(r.geoms__xyz,"{}_{}".format(str(r.mol__inchikey),str(i))))


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
import itertools
from rdkit.Chem import rdmolops
from collections import defaultdict
import copy


def getUA(maxValence_list, valence_list):
    """Find the unsaturated atoms and how many valences each is missing.

    maxValence_list -- target valence per atom
    valence_list    -- current valence per atom

    Returns (UA, DU): the indices of atoms whose current valence is below the
    target, and the corresponding positive differences.
    """
    deficits = [
        (idx, target - current)
        for idx, (target, current) in enumerate(zip(maxValence_list, valence_list))
        if target - current > 0
    ]
    UA = [idx for idx, _ in deficits]
    DU = [gap for _, gap in deficits]
    return UA,DU


def get_BO(AC,valences):
    """Derive a bond-order matrix from a connectivity matrix for one valence assignment.

    AC       -- atom connectivity matrix (array with .copy() and .sum(axis=1))
    valences -- target valence per atom

    Starting from a copy of AC, the first bonded pair of unsaturated atoms has
    its bond order raised by one per pass, until at most one unsaturated atom
    remains or a pass leaves the list of valence deficits unchanged.
    """
    BO = AC.copy()
    UA, DU = getUA(valences, list(BO.sum(axis=1)))

    while len(DU) > 1:
        # First pair of unsaturated atoms (in combination order) that is already bonded.
        bonded_pair = next(
            ((a, b) for a, b in itertools.combinations(UA, 2) if BO[a, b] > 0),
            None)
        if bonded_pair is not None:
            a, b = bonded_pair
            BO[a, b] += 1
            BO[b, a] += 1

        UA_next, DU_next = getUA(valences, list(BO.sum(axis=1)))
        if DU_next == DU:
            # Nothing changed in this pass, so further passes cannot help.
            break
        UA = copy.copy(UA_next)
        DU = copy.copy(DU_next)

    return BO


def BO_is_OK(BO,AC,charge,DU,atomic_valence_electrons,atomicNumList,charged_fragments):
    """Check a candidate bond-order matrix against the unsaturation and the total charge.

    Returns True when the bond orders added on top of AC sum to sum(DU) and
    the accumulated charge q equals `charge`; False otherwise. q stays 0
    unless charged_fragments is true, in which case it is the running sum of
    the per-atom charges plus the carbon-specific corrections below.
    """
    q = 0
    if charged_fragments:
        BO_valences = list(BO.sum(axis=1))
        for idx, atom in enumerate(atomicNumList):
            q += get_atomic_charge(atom, atomic_valence_electrons[atom], BO_valences[idx])
            if atom != 6:
                continue
            # Carbon only: corrections depend on the number of single bonds
            # and on the running total accumulated so far.
            single_bonds = list(BO[idx,:]).count(1)
            if single_bonds == 2 and BO_valences[idx] == 2:
                q += 1
            if single_bonds == 3 and q + 1 < charge:
                q += 2

    unsaturation_matches = (BO-AC).sum() == sum(DU)
    return bool(unsaturation_matches and charge == q)


def get_atomic_charge(atom,atomic_valence_electrons,BO_valence):
    """Return the formal charge of one atom from its bond-order valence.

    atom                     -- atomic number
    atomic_valence_electrons -- number of valence electrons of that element
    BO_valence               -- sum of bond orders on the atom

    Hydrogen and boron are measured against valences of 1 and 3; phosphorus
    with valence 5 and sulfur with valence 6 are treated as neutral; all other
    cases use valence_electrons - 8 + BO_valence.
    """
    if atom == 1:
        return 1 - BO_valence
    if atom == 5:
        return 3 - BO_valence
    if (atom == 15 and BO_valence == 5) or (atom == 16 and BO_valence == 6):
        return 0
    return atomic_valence_electrons - 8 + BO_valence

def clean_charges(mol):
    """Neutralise N+/O- pairs that are connected through a conjugated path.

    This is a temporary hack. The real solution is to generate several BO
    matrices in AC2BO and pick the one with the lowest number of atomic
    charges.

    Each reaction SMARTS is applied repeatedly, always taking the first
    product, for as long as its reactant pattern still matches the molecule.
    Returns the resulting molecule.
    """
    rxn_smarts = ['[N+:1]=[*:2]-[O-:3]>>[N+0:1]-[*:2]=[O-0:3]',
                  '[N+:1]=[*:2]-[*:3]=[*:4]-[O-:5]>>[N+0:1]-[*:2]=[*:3]-[*:4]=[O-0:5]']

    for smarts in rxn_smarts:
        reactant_pattern = Chem.MolFromSmarts(smarts.split(">>")[0])
        rxn = AllChem.ReactionFromSmarts(smarts)
        while mol.HasSubstructMatch(reactant_pattern):
            products = rxn.RunReactants((mol,))
            mol = products[0][0]

    return mol


def BO2mol(mol,BO_matrix, atomicNumList,atomic_valence_electrons,mol_charge,charged_fragments):
    """Add bonds (and charges or radicals) to a bond-less molecule.

    Based on code written by Paolo Toscani.

    mol        -- molecule that already holds the atoms, in atomicNumList order
    BO_matrix  -- square bond-order matrix; orders 1, 2, 3 map to single,
                  double, triple bonds, any other non-zero order to single
    charged_fragments -- when true, formal charges are assigned, otherwise
                  radical electrons are assigned

    Raises RuntimeError if BO_matrix and atomicNumList differ in length.
    Returns the finished molecule.
    """
    n_rows = len(BO_matrix)
    n_atoms = len(atomicNumList)
    BO_valences = list(BO_matrix.sum(axis=1))

    if n_rows != n_atoms:
        raise RuntimeError('sizes of adjMat ({0:d}) and atomicNumList '
            '{1:d} differ'.format(n_rows, n_atoms))

    order_to_bond_type = {
        1: Chem.BondType.SINGLE,
        2: Chem.BondType.DOUBLE,
        3: Chem.BondType.TRIPLE
    }

    editable = Chem.RWMol(mol)
    # Upper triangle only, so each atom pair is visited once.
    for i in range(n_rows):
        for j in range(i + 1, n_rows):
            order = int(round(BO_matrix[i, j]))
            if order != 0:
                editable.AddBond(i, j, order_to_bond_type.get(order, Chem.BondType.SINGLE))
    mol = editable.GetMol()

    if charged_fragments:
        return set_atomic_charges(mol,atomicNumList,atomic_valence_electrons,BO_valences,BO_matrix,mol_charge)
    return set_atomic_radicals(mol,atomicNumList,atomic_valence_electrons,BO_valences)

def set_atomic_charges(mol,atomicNumList,atomic_valence_electrons,BO_valences,BO_matrix,mol_charge):
    """Assign formal charges to the atoms of mol, sanitize it and clean up charge pairs.

    mol           -- molecule whose atom indices follow atomicNumList
    BO_valences   -- sum of bond orders per atom
    BO_matrix     -- bond-order matrix, used to count single bonds on carbon
    mol_charge    -- total molecular charge, used by the carbon correction

    Returns the molecule produced by clean_charges().
    """
    # q is the running total of assigned charge; the carbon rules below read it,
    # so the result depends on the order in which atoms are visited.
    q = 0
    for i,atom in enumerate(atomicNumList):
        a = mol.GetAtomWithIdx(i)
        charge = get_atomic_charge(atom,atomic_valence_electrons[atom],BO_valences[i])
        q += charge
        if atom == 6:
            number_of_single_bonds_to_C = list(BO_matrix[i,:]).count(1)
            # Carbon with exactly two single bonds and nothing else: left uncharged.
            if number_of_single_bonds_to_C == 2 and BO_valences[i] == 2:
                    q += 1
                    charge = 0
            # Carbon with three single bonds while the running total is still
            # below the molecular charge: assigned +1.
            if number_of_single_bonds_to_C == 3 and q + 1 < mol_charge:
                    q += 2
                    charge = 1

        # Only non-zero charges are written to the atom.
        if (abs(charge) > 0):
            a.SetFormalCharge(int(charge))
    rdmolops.SanitizeMol(mol)

    mol = clean_charges(mol)

    return mol


def set_atomic_radicals(mol,atomicNumList,atomic_valence_electrons,BO_valences):
    """Mark atoms as radicals instead of assigning formal charges.

    The number of radical electrons set on an atom is the absolute value of
    the charge that get_atomic_charge() computes for it; atoms with zero
    charge are left untouched. Returns the same mol object.
    """
    for idx, atomic_num in enumerate(atomicNumList):
        rd_atom = mol.GetAtomWithIdx(idx)
        n_unpaired = abs(get_atomic_charge(atomic_num, atomic_valence_electrons[atomic_num], BO_valences[idx]))
        if n_unpaired > 0:
            rd_atom.SetNumRadicalElectrons(int(n_unpaired))

    return mol


def AC2BO(AC,atomicNumList,charge,charged_fragments):
    """Convert an atom connectivity matrix into a bond-order matrix.

    AC            -- atom connectivity matrix
    atomicNumList -- atomic number per atom, in matrix order
    charge        -- total molecular charge
    charged_fragments -- forwarded to BO_is_OK (controls whether charges are checked)

    Every combination of allowed valences is tried in turn. The first
    combination that passes BO_is_OK wins; if none passes, the candidate with
    the largest bond-order sum is kept (AC itself if nothing beats it).

    Returns (best_BO, atomic_valence_electrons), where the second item is the
    valence-electron table keyed by atomic number.
    """
    # Allowed valences per atomic number, tried in the listed order.
    # An atomic number missing here (e.g. boron, 5) contributes an empty list,
    # so no combination is generated and AC is returned unchanged.
    atomic_valence = {
        1: [1],
        6: [4],
        7: [4,3],
        8: [2,1],
        9: [1],
        14: [4],
        15: [5,4,3],
        16: [6,4,2],
        17: [1],
        35: [1],
        53: [1],
    }

    atomic_valence_electrons = {
        1: 1,
        5: 3,
        6: 4,
        7: 5,
        8: 6,
        9: 7,
        14: 4,
        15: 5,
        16: 6,
        17: 7,
        35: 7,
        53: 7,
    }

    # list of allowed valences per atom, e.g. for CO: [[4],[2,1]]
    valences_list_of_lists = [atomic_valence.get(atomicNum, []) for atomicNum in atomicNumList]

    best_BO = AC.copy()
    # Valence of each atom if every bond were single; does not change between combinations.
    AC_valence = list(AC.sum(axis=1))

    # implementation of algorithm shown in Figure 2
    # UA: unsaturated atoms
    # DU: degree of unsaturation (u matrix in Figure)
    # best_BO: Bcurr in Figure
    #
    # itertools.product expands [[4],[2,1]] to (4,2),(4,1). It is consumed
    # lazily: the number of combinations grows exponentially with the number
    # of multi-valent atoms and the loop usually stops early.
    for valences in itertools.product(*valences_list_of_lists):
        UA,DU_from_AC = getUA(valences, AC_valence)
        if len(UA) == 0 or BO_is_OK(AC,AC,charge,DU_from_AC,atomic_valence_electrons,atomicNumList,charged_fragments):
            best_BO = AC.copy()
            break

        BO = get_BO(AC,valences)
        if BO_is_OK(BO,AC,charge,DU_from_AC,atomic_valence_electrons,atomicNumList,charged_fragments):
            best_BO = BO.copy()
            break
        if BO.sum() > best_BO.sum():
            best_BO = BO.copy()

    return best_BO,atomic_valence_electrons


def AC2mol(mol,AC,atomicNumList,charge,charged_fragments):
    """Turn a bond-less molecule plus its connectivity matrix into a bonded molecule.

    First converts the AC matrix to a bond-order matrix, then adds the
    bond-order connectivity and charge information to the mol object.
    """
    bond_orders, valence_electrons = AC2BO(AC, atomicNumList, charge, charged_fragments)
    return BO2mol(mol, bond_orders, atomicNumList, valence_electrons, charge, charged_fragments)


def get_proto_mol(atomicNumList):
    """Build a molecule that contains the given atoms and no bonds.

    The first atom is created from a SMARTS pattern for its atomic number;
    the remaining atoms are appended in list order.
    """
    seed = Chem.MolFromSmarts("[#"+str(atomicNumList[0])+"]")
    editable = Chem.RWMol(seed)
    for position in range(1, len(atomicNumList)):
        editable.AddAtom(Chem.Atom(int(atomicNumList[position])))

    return editable.GetMol()

def get_atomicNumList(atomic_symbols):
    """Translate element symbols into atomic numbers.

    Only the elements in the table below are known; any other symbol raises
    KeyError.
    """
    symbol2number = {
        "H": 1,
        "B": 5,
        "C": 6,
        "N": 7,
        "O": 8,
        "F": 9,
        "Si": 14,
        "P": 15,
        "S": 16,
        "Cl": 17,
        "Br": 35,
        "I": 53,
    }
    return [symbol2number[symbol] for symbol in atomic_symbols]

def read_xyz_file(filename):
    """Read a single-structure XYZ file.

    Line 1 must hold the atom count (only validated as an integer), line 2 is
    the comment line and may contain "charge=<int>"; every following
    non-blank line must be "<symbol> <x> <y> <z>".

    Returns (atomicNumList, charge, xyz_coordinates). charge is 0 when the
    comment line does not specify one or is missing.

    Raises ValueError for a non-integer atom count or a malformed atom line,
    and KeyError (from get_atomicNumList) for an unsupported element symbol.
    """
    atomic_symbols = []
    xyz_coordinates = []
    charge = 0  # default when the comment line has no "charge=" or is missing

    with open(filename, "r") as xyz_file:
        for line_number, line in enumerate(xyz_file):
            if line_number == 0:
                int(line)  # header sanity check; the count itself is not used
            elif line_number == 1:
                if "charge=" in line:
                    charge = int(line.split("=")[1])
            elif line.strip():
                # Blank lines (typically at the end of the file) are skipped.
                atomic_symbol, x, y, z = line.split()
                atomic_symbols.append(atomic_symbol)
                xyz_coordinates.append([float(x),float(y),float(z)])

    atomicNumList = get_atomicNumList(atomic_symbols)

    return atomicNumList,charge,xyz_coordinates

def xyz2AC(atomicNumList,xyz):
    """Build the atom connectivity matrix from atomic numbers and coordinates.

    Two atoms are connected when their distance is at most the sum of their
    covalent radii, each scaled by 1.30.

    Returns (AC, mol): the symmetric 0/1 integer matrix and the bond-less
    molecule carrying the coordinates as a conformer.
    """
    import numpy as np
    mol = get_proto_mol(atomicNumList)

    conformer = Chem.Conformer(mol.GetNumAtoms())
    for idx in range(mol.GetNumAtoms()):
        position = (xyz[idx][0], xyz[idx][1], xyz[idx][2])
        conformer.SetAtomPosition(idx, position)
    mol.AddConformer(conformer)

    dMat = Chem.Get3DDistanceMatrix(mol)
    pt = Chem.GetPeriodicTable()

    num_atoms = len(atomicNumList)
    AC = np.zeros((num_atoms,num_atoms)).astype(int)

    # Scaled covalent radius of every atom, looked up once per atom.
    scaled_radii = [
        pt.GetRcovalent(mol.GetAtomWithIdx(k).GetAtomicNum())*1.30
        for k in range(num_atoms)
    ]

    for i in range(num_atoms):
        for j in range(i+1,num_atoms):
            if dMat[i,j] <= scaled_radii[i] + scaled_radii[j]:
                AC[i,j] = 1
                AC[j,i] = 1

    return AC,mol

def xyz2mol(atomicNumList,charge,xyz_coordinates,charged_fragments):
    """Build a molecule with bonds and charges from atomic numbers and coordinates.

    The connectivity matrix and a bond-less mol object are derived from the
    coordinates first; the matrix is then converted to bond orders, and the
    connectivity and charge information is added to the mol object.
    """
    connectivity, proto_mol = xyz2AC(atomicNumList, xyz_coordinates)
    return AC2mol(proto_mol, connectivity, atomicNumList, charge, charged_fragments)

In [None]:
from joblib import Parallel, delayed
import multiprocessing

sdf_savepath = "/home/denn/home2/data/SF/sdf/"
def write_sdf(row):
    """Write one geometry as an SDF file with perceived bonds.

    row -- an (index, Series) pair as produced by DataFrame.iterrows(); the
           Series must provide `geoms__xyz` (rows of atomic number, x, y, z)
           and `mol__inchikey`.

    The file is written to sdf_savepath as "<inchikey>_<index>.sdf" and the
    molecule name is set to the index. The molecule is built with charge 0
    and charged fragments enabled. Returns None.
    """
    i,r=row
    arr=np.vstack(np.array(r.geoms__xyz))
    # Build the molecule before opening the output so that a failed conversion
    # does not leave an empty file behind.
    mol=xyz2mol(arr[:,0].ravel().astype(np.uint8),0,arr[:,1:],True)
    mol.SetProp("_Name","{}".format(i))
    w = Chem.SDWriter(os.path.join(sdf_savepath,"{}_{}.sdf".format(str(r.mol__inchikey),str(i))))
    try:
        w.write(mol)
    finally:
        # Close explicitly so the record is flushed and the handle released.
        w.close()

# for row in df_xyz[:10].iterrows():
#     write_sdf(row)
# sdfw=Parallel(n_jobs=8)(delayed(write_sdf)(row) for row in df_xyz[:1000].iterrows())


In [None]:
# Get DFTB geometries in XYZ file format for conformers indexed by geoms__parents geomID
df_xyz=df[['geoms__parents__method','geoms__parents','mol__inchikey','geoms__parents__xyz']]
# One row per (parent geometry, molecule); .copy() so the new column below is written to an
# independent frame rather than to a slice of df.
df_xyz=df_xyz.drop_duplicates(subset=['geoms__parents','mol__inchikey']).copy()
# Convert only the retained rows (previously to_xyz ran over every row of df and the
# result was aligned back by index).
df_xyz['xyz']=df_xyz['geoms__parents__xyz'].apply(to_xyz)
df_xyz=df_xyz.set_index(['geoms__parents'])
df_xyz['geoms__parents__method'].mean()

In [None]:
from multiprocessing import Pool

# Convert every row of df_xyz to an SDF file using 16 worker processes (chunk size 1),
# printing a running counter as results arrive.
# NOTE(review): write_sdf as defined above reads r.geoms__xyz, but the preceding cell rebuilt
# df_xyz from geoms__parents__xyz and it no longer has a geoms__xyz column. Run top to bottom,
# the workers would fail on that attribute -- confirm the intended cell order.
with Pool(processes=16) as pool:
    ress=pool.imap_unordered(write_sdf,df_xyz.iterrows(),1)
    for i,res in enumerate(ress):
        print(i,end=' ')
    print()

# with Pool(processes=4) as pool:
#     ress=[pool.apply_async(write_sdf,(row,)) for row in df_xyz[:200].iterrows()]
#     for i,res in enumerate(ress):
#         res.get()
#         print(i,end=' ')
#     print()
    
    

In [None]:
sdf_savepath = "/home/denn/home2/data/SF/sdf_DFTB/"
def write_sdf(row):
    """Write one parent (DFTB) geometry as an SDF file with perceived bonds.

    row -- an (index, Series) pair as produced by DataFrame.iterrows(); the
           Series must provide `geoms__parents__xyz` (rows of atomic number,
           x, y, z) and `mol__inchikey`.

    The file is written to sdf_savepath as "<inchikey>_<index>.sdf" and the
    molecule name is set to the index. The molecule is built with charge 0
    and charged fragments enabled. Returns None.

    Note: this redefines the write_sdf from the earlier cell; only the source
    column and the output directory differ.
    """
    i,r=row
    arr=np.vstack(np.array(r.geoms__parents__xyz))
    # Build the molecule before opening the output so that a failed conversion
    # does not leave an empty file behind.
    mol=xyz2mol(arr[:,0].ravel().astype(np.uint8),0,arr[:,1:],True)
    mol.SetProp("_Name","{}".format(i))
    w = Chem.SDWriter(os.path.join(sdf_savepath,"{}_{}.sdf".format(str(r.mol__inchikey),str(i))))
    try:
        w.write(mol)
    finally:
        # Close explicitly so the record is flushed and the handle released.
        w.close()

from multiprocessing import Pool

# Convert every row of df_xyz to an SDF file using 16 worker processes (chunk size 1),
# with the write_sdf defined just above; a running counter is printed as results arrive.
with Pool(processes=16) as pool:
    ress=pool.imap_unordered(write_sdf,df_xyz.iterrows(),1)
    for i,res in enumerate(ress):
        print(i,end=' ')
    print()

In [None]:
# np.vstack(list((df_xyz[:1].geoms__xyz)))

In [None]:
# Inspect dtypes and non-null counts of the current geometry table.
df_xyz.info()

In [None]:
# Non-null count per column of the current geometry table.
df_xyz.count()

In [None]:
# Reshape the long calculation table into one row per (molecule, parent geometry),
# with one column per (property, job config name).
cols=['geoms__parents','mol__inchikey','parentjob__config__name','totalenergy','s2','excitedstates','homo','lumo']
df_p=df[cols].set_index(['mol__inchikey','geoms__parents'])

dfp=df_p.pivot(columns='parentjob__config__name')
#print(dfp[:1])
# Lowest ground-state optimisation energy per molecule (index level 0 is mol__inchikey).
dft=dfp['totalenergy']['b3lyp_6_31gs_opt_qchem']
dft=dft.groupby(level=0).min().to_frame()

#Get geometries in XYZ file format for conformers indexed by geoms__parents geomID
# df_xyz=df[['geoms__parents','mol__inchikey','geoms__parents__xyz']]
# df_xyz=df_xyz.drop_duplicates(subset=['geoms__parents','mol__inchikey'])
# df_xyz['xyz']=df['geoms__parents__xyz'].apply(to_xyz)
# df_xyz=df_xyz.set_index(['geoms__parents'])



# Derived per-conformer quantities; energy differences are multiplied by HA_TO_EV
# (assumes totalenergy is stored in Hartree -- TODO confirm).
df_calc=pd.DataFrame(index=dfp.index)
# T1-optimised energy relative to the ground-state optimised energy.
df_calc['t1_opt']=(dfp['totalenergy']['b3lyp_6_31gs_opt_t1_qchem']-dfp['totalenergy']['b3lyp_6_31gs_opt_qchem'])*HA_TO_EV
# Same, but relative to the total energy of the tddft job.
df_calc['t1_opt1']=(dfp['totalenergy']['b3lyp_6_31gs_opt_t1_qchem']-dfp['totalenergy']['b3lyp_6_31gs_tddft'])*HA_TO_EV
# Conformer energy relative to the lowest-energy conformer of the same molecule.
df_calc['dft_energy']=(dfp['totalenergy']['b3lyp_6_31gs_opt_qchem']-dft['b3lyp_6_31gs_opt_qchem'])*HA_TO_EV
df_calc['s2_t1']=dfp['s2']['b3lyp_6_31gs_opt_t1_qchem']

df_calc['homo']=dfp['homo']['b3lyp_6_31gs_tddft']*HA_TO_EV
df_calc['lumo']=dfp['lumo']['b3lyp_6_31gs_tddft']*HA_TO_EV

df_calc.index = df_calc.index.set_names(['mol__inchikey','geoms__parents'])

# Within each molecule, order conformers by relative energy.
df_calc=df_calc.groupby(level=0, group_keys=False).apply(lambda x: x.sort_values('dft_energy'))

cols_calc=['t1_opt','t1_opt1']



In [None]:
# Expand the excited-state entries of the b3lyp tddft job: first into one column per
# state, then stacked into one row per state, then into one column per state property.
dfe = dfp.excitedstates.b3lyp_6_31gs_tddft.apply(pd.Series)
dfe1 = dfe.stack()
dfe2 = dfe1.apply(pd.Series)

dfe2 = dfe2[['energy','multiplicity','oscillator_strength','pbht']]

# Split by multiplicity and name the energy column after the state it will represent.
df_s = dfe2[dfe2.multiplicity=='singlet'].rename(columns={'energy':'s1_vert'})
df_t = dfe2[dfe2.multiplicity=='triplet'].rename(columns={'energy':'t1_vert'})

# Per (molecule, parent geometry): lowest singlet, lowest triplet and second-lowest triplet.
# NOTE(review): .iloc[1] requires at least two triplet states per group -- confirm this always holds.
dfs1=df_s.groupby(level=[0,1], group_keys=False).apply(lambda x: x.sort_values('s1_vert').iloc[0])
dft1=df_t.groupby(level=[0,1], group_keys=False).apply(lambda x: x.sort_values('t1_vert').iloc[0])
dft2=df_t.groupby(level=[0,1], group_keys=False).apply(lambda x: x.sort_values('t1_vert').iloc[1]).rename(columns={'t1_vert':'t2_vert'})

# Columns shared between the three frames are disambiguated with _s1 / _t1 / _t2 suffixes.
df_ex = dfs1.join(dft1,lsuffix='_s1').join(dft2, lsuffix='_t1',rsuffix='_t2')

# Outer merge on the index keeps conformers that appear in only one of the two frames.
df_calcs = pd.merge(df_calc, df_ex, how='outer', right_index=True, left_index=True)

df_all=df_calcs


In [None]:
#df_calc

In [None]:
# Optional: add the lowest singlet from the lcwpbe tddft job, if that config is present in the pivot.
# Same expansion as for the b3lyp tddft job above, singlets only.
if 'lcwpbe_6_31gs_tddft_qchem' in dfp.excitedstates.columns:
    dfe = dfp.excitedstates.lcwpbe_6_31gs_tddft_qchem.apply(pd.Series)
    dfe1 = dfe.stack()
    dfe2 = dfe1.apply(pd.Series)

    dfe2 = dfe2[['energy','multiplicity','oscillator_strength','pbht']]

    df_s = dfe2[dfe2.multiplicity=='singlet'].rename(columns={'energy':'s1_vert_lcpbe'})

    # Lowest singlet per (molecule, parent geometry).
    dfs1=df_s.groupby(level=[0,1], group_keys=False).apply(lambda x: x.sort_values('s1_vert_lcpbe').iloc[0])


    # Columns that already exist in df_calcs keep their name; the new duplicates get '_s1_lcpbe'.
    df_calcs = pd.merge(df_calcs, dfs1, how='outer', right_index=True, left_index=True,suffixes=('','_s1_lcpbe'))
    df_all=df_calcs
show(df_calcs[:3])
print(df_all.count())

In [None]:
#len(dfp.excitedstates.wb97xd_6_31gs_tddft[dfp.excitedstates.wb97xd_6_31gs_tddft.isnull()])

In [None]:
# Optional: add the lowest singlet from the wb97xd tddft job, if that config is present in the pivot.
# Same expansion as for the b3lyp tddft job above, singlets only.
if 'wb97xd_6_31gs_tddft' in dfp.excitedstates.columns:
    dfe = dfp.excitedstates.wb97xd_6_31gs_tddft.apply(pd.Series)
    dfe1 = dfe.stack()
    dfe2 = dfe1.apply(pd.Series)

    dfe2 = dfe2[['energy','multiplicity','oscillator_strength','pbht']]

    df_s = dfe2[dfe2.multiplicity=='singlet'].rename(columns={'energy':'s1_vert_wb97xd'})

    # Lowest singlet per (molecule, parent geometry).
    dfs1=df_s.groupby(level=[0,1], group_keys=False).apply(lambda x: x.sort_values('s1_vert_wb97xd').iloc[0])


    # Columns that already exist in df_calcs keep their name; the new duplicates get '_s1_wb97xd'.
    df_calcs = pd.merge(df_calcs, dfs1, how='outer', right_index=True, left_index=True,suffixes=('','_s1_wb97xd'))
    df_all=df_calcs
show(df_calcs[:3])
print(df_all.count())

In [None]:
#df_all.s1_vert_wb97xd.sample(10)

In [None]:
#df_check=pd.merge(df_dnn.reset_index(),df_xyz.reset_index()).set_index('mol__inchikey').reset_index()#.join(df_smiles,how='inner')
# Attach the geometry table to the per-conformer results. No `on=` is given, so pandas joins on
# the columns the two frames have in common (inner join by default).
df_check1=pd.merge(df_all.reset_index(),df_xyz.reset_index()).reset_index(drop=True)#.join(df_smiles,how='inner')

In [None]:
import pybel
#from dask import dataframe as dd
#pybel.informats
#pybel.outformats
#dd_check=dd.from_pandas(df_check,10)
#%timeit pybel.readstring('xyz',df_check.loc['YZLGAWUIHCFDLO']['xyz']).write('inchikey')

def obl(xyz):
    """Re-derive identifiers from an XYZ string with pybel.

    Returns a Series with 'can' (the 'can' format output for the parsed
    structure) and 'check_inchi' (the part of the 'inchikey' output before
    the first '-').
    """
    parsed = pybel.readstring('xyz',xyz)
    canonical = parsed.write('can')
    key_prefix = parsed.write('inchikey').split('-')[0]
    return pd.Series({'can':canonical,'check_inchi':key_prefix})

#df_check['check_inchi']=df_check['xyz'].apply(lambda m: pybel.readstring('xyz',m).write('inchikey').split('-')[0])
#df_check['check_smi']=df_check['xyz'].apply(lambda m: pybel.readstring('xyz',m).write('can'))
# Add the 'can' and 'check_inchi' columns derived from each row's XYZ string.
df_check=df_check1.join(df_check1['xyz'].apply(obl))
#dd_check['check_inchi']=dd_check['xyz'].apply(func=lambda m: pybel.readstring('xyz',m).write('inchikey'),meta=dd_check['xyz'])



In [None]:
#df_check.sample(5)

In [None]:
#df_check.sample(1)
#df_check.mol__smiles
# Conformers whose geometry-derived InChIKey prefix differs from the stored one.
df_not_valid=df_check[df_check.mol__inchikey!=df_check.check_inchi]
# Bring in SMILES and tags; with no `on=` given the merge uses the columns the frames share.
df_not_valid_s=pd.merge(df_not_valid,df_smiles.reset_index())
#df_not_valid_s[['mol__smiles','can']].sample(10)
# RDKit molecule built from the stored SMILES.
df_not_valid_s['mol']=df_not_valid_s.mol__smiles.apply(Chem.MolFromSmiles)

In [None]:
# (rows, columns) of the mismatching set.
print(df_not_valid_s.shape)

In [None]:
#df_check.sample(1)
#df_check.mol__smiles
# Conformers whose geometry-derived InChIKey prefix matches the stored one.
df_valid=df_check[df_check.mol__inchikey==df_check.check_inchi]
# Bring in SMILES and tags; with no `on=` given the merge uses the columns the frames share.
df_valid_s=pd.merge(df_valid,df_smiles.reset_index())
#df_not_valid_s[['mol__smiles','can']].sample(10)
#df_not_valid_s['mol']=df_not_valid_s.mol__smiles.apply(Chem.MolFromSmiles)
df_valid_s.count()

In [None]:
# Generate an HTML report as an interactive table with a 3D view.
# The rows come from df_not_valid_s, i.e. the molecules whose geometry-derived InChIKey
# does not match the stored one.

report_file_name="/home/denn/home/ml/not_valid.html"

def pop3d(conformer):
    """Return the HTML for a "View 3D" button whose element id is view3d_<conformer>."""
    button_template = '<div class="ui primary button" id="view3d_{}">View 3D</div>'
    return button_template.format(conformer)

def buy_emol(tags):
    """Return a shopping-bag icon cell when 'emol1' is in tags, otherwise an empty string.

    The hidden "buy" text is part of the returned cell markup.
    """
    icon_cell = '<i class="shopping bag icon"></i><div style="display: none;">buy</div>'
    return icon_cell if 'emol1' in tags else ''
def pubchem_link(inchikey):
    """Return an HTML anchor that opens a PubChem compound text search for inchikey in a new tab.

    The key is used both as the query value and as the link text.
    """
    anchor_template = '<a href="https://pubchem.ncbi.nlm.nih.gov/search/#collection=compounds&query_type=text&query={}" target="_blank">{}</a>'
    return anchor_template.format(inchikey, inchikey)

# One report row per molecule: keep the first row for each mol__inchikey.
df_html=df_not_valid_s.copy().drop_duplicates(subset=['mol__inchikey'])
# "View 3D" button, keyed by the molecule's InChIKey prefix.
df_html['mol3d']=df_html['mol__inchikey'].apply(pop3d)#+'<br>Conformers: '+df_html['conf_count'].map(str)
#df_html['Buy (eMol)']=df_html['mol__tags'].apply(buy_emol)
# PubChem link followed by the SMILES wrapped in a small segment with a tooltip.
df_html['mol__inchikey_link']=df_html['mol__inchikey'].apply(pubchem_link)+df_html['mol__smiles']\
    .apply('<div style="width:10em;word-break:break-all;" class="ui small segment" data-tooltip="SMILES could be pasted to ChemDraw or used to search in Reaxys, etc.">{}</div>'.format)
#df_html['T1, eV']=df_html['t1_opt'].apply(lambda s:"{0:.2f}".format(s))+'<br>Max-min: '+df_html['mm_t1'].apply(lambda s:"{0:.2f}".format(s))
# Static part of the report page: the 3Dmol.js modal with Prev/Next conformer
# buttons, the jQuery handlers that step through the loaded frames, and the
# page heading. Two typos were removed from the template: a stray "mi" text
# node after the Next button and a stray "t" after "if (count>0){" that made
# the Prev handler throw a ReferenceError.
html_js = """
    <div class="ui small modal" id="mol3Dmodal">
        <i class="close icon"></i>
        <div class="header">3D view (rotate with mouse)</div>
        <div class="content">
            <div style="height: 350px; width: 720px; position: relative;" class='viewer_3Dmoljs' data-backgroundcolor='0xffffff'></div>
            <div class="action">
                Conformer: <div id="conf_num" class="ui label"></div>
                <div class="ui primary labeled icon button" id="prev_conf" tabindex="0">
                  <i class="left arrow icon"></i>
                  Prev
                </div>
                <div class="ui primary right labeled icon button" id="next_conf" tabindex="0">
                  <i class="right arrow icon"></i>
                  Next
                </div>
            </div>

            <p>dft_energy: in eV from lowest energy conformer</p>
            <div id="modal_table"></div>
        </div>
    </div>
    <script>
        $('#conf_num').data('count', 0);
        $('#conf_num').html($('#conf_num').data('count'))
        $('#next_conf').click(function(){
            var count = $('#conf_num').data('count');
            var view = $3Dmol.viewers[0];
            if (count<view.getFrames()-1){
                count = count + 1;
                view.setFrame(count);
                view.zoomTo();
                view.render();
                $('#conf_num').data('count', count);
                $('#conf_num').html(count);
            }
        });
        $('#prev_conf').click(function(){
            var count = $('#conf_num').data('count');
            var view = $3Dmol.viewers[0];
            if (count>0){
                count = count - 1;
                view.setFrame(count);
                view.zoomTo();
                view.render();
                $('#conf_num').data('count', count);
                $('#conf_num').html(count);
            }
        });
    </script>
    <h1 class="ui header">Singlet Fission Failed Candidates</h1>
    <p class="ui header"></p>
"""
# Columns shown in the per-molecule table inside the 3D modal.
modal_cols=['dft_energy','t1_opt','t1_vert','t2_vert','s1_vert','oscillator_strength_s1', 'pbht_s1']
# Build one <script> element holding, for every molecule in the report, the
# modal attachment and the click handler of its "View 3D" button.
html_button_js="<script>"
for conformer in df_html.mol__inchikey.drop_duplicates():
    # `conformer` is an InChIKey here; it matches the id emitted by pop3d().
    html_button_js+="$('#mol3Dmodal').modal('attach events', '#view3d_{}');".format(conformer)
    # All rows of this molecule, ordered by ascending dft_energy.
    df_sel=df_not_valid_s[df_not_valid_s.mol__inchikey==conformer].sort_values('dft_energy').reset_index()
    # JS statement that fills the modal table; newlines are stripped so the
    # HTML fits inside a single-quoted JS string literal.
    html_modal_table= "$('#modal_table').html('%s');" % df_sel[modal_cols].to_html(escape=True).replace('<table border="1" class="dataframe">','<table class="ui celled table" cellspacing="0" width="100%">').replace("\n", "")
    # Click handler: reset the frame counter, load the concatenated xyz strings
    # (looked up in df_xyz by geoms__parents) as frames, restyle, render, then
    # fill the table. Newlines in the xyz text are escaped for the JS literal.
    html_button_js+=("$('#view3d_%s').click(function() { $('#conf_num').data('count', 0); $('#conf_num').html($('#conf_num').data('count')); var view = $3Dmol.viewers[0];view.removeAllModels();view.addModelsAsFrames('%s','xyz');view.setStyle({'stick':{}});view.addStyle({'sphere':{'radius':0.4}});view.zoomTo();view.render();%s});" \
                     % (conformer,''.join(df_xyz.loc[df_sel['geoms__parents']]['xyz']).replace("\n", "\\n"),html_modal_table))
html_button_js+="</script>"


# Temporarily render floats with two decimals while the table HTML is built.
pd.options.display.float_format = '{:,.2f}'.format
# escape=False keeps the button/link/molecule cells as raw HTML.
html_table=df_html[['mol__inchikey_link','mol','mol3d','t1_opt','s1_vert','mol__tags']].to_html(escape=False)
    #.rename(index=str, columns={'mol__inchikey_link':'InChIKey linked to PubChem (SMILES in the box)','mol':'Molecule','mol3d':'3D','t1_opt':'T1, eV','s1_vert':'S1, eV','criteria1':'S1-2T1, eV','criteria2':'T2-2T1, eV','mm_t1':'T1 max-min, eV','mm_s1':'S1 max-min, eV'}).to_html(escape=False)
# Swap the default pandas table tag for one with an id and CSS classes.
html_table=html_table.replace('<table border="1" class="dataframe">','<table id="example" class="ui celled table" cellspacing="0" width="100%">')

# Switch the notebook-wide float display to four decimals.
pd.options.display.float_format = '{:,.4f}'.format
# Writing the report to disk is currently disabled; html_start and html_end
# are not defined in this cell (presumably defined earlier in the notebook).
# with open(report_file_name, "w") as text_file:
#    text_file.write(html_start+html_js+html_table+html_button_js+html_end)


In [None]:
# Index the validated rows by (mol__inchikey, geoms__parents) so molecules can
# be selected by InChIKey and grouped on index level 0.
df_all=df_valid.set_index(['mol__inchikey','geoms__parents'])

In [None]:
#df_all.to_pickle('/home/denn/home/ml/data/sf/reaxys2_all.pickle')

In [None]:
# For each molecule (index level 0), keep the single row with the lowest dft_energy.
df_tmp1=df_all.groupby(level=0).apply(lambda x: x.sort_values('dft_energy').iloc[0])
#df_tmp1.count()

In [None]:
# Split the rows by relative dft_energy: above 0.1, below 0.1 and exactly 0.0
# (rows at exactly 0.1 fall in neither the high nor the low set).
df_high=df_all.loc[df_all['dft_energy'].gt(0.1)]
df_min=df_all.loc[df_all['dft_energy'].eq(0.0)]
df_low=df_all.loc[df_all['dft_energy'].lt(0.1)]

# Non-null dft_energy counts of each subset.
print('All: {}'.format(df_all['dft_energy'].count()))
print('High (>0.1 eV): {}'.format(df_high['dft_energy'].count()))
print('Low (<0.1 eV): {}'.format(df_low['dft_energy'].count()))
print('Min: {}'.format(df_min['dft_energy'].count()))

In [None]:
# List the columns available in df_all.
print(df_all.columns)
#df_all[['xyz','t1_opt','s1_vert','s1_vert_lcpbe','s1_vert_wb97xd']].to_pickle('/home/denn/home/ml/data/sf/anth_t1_s1_reaxys2_xyz.pickle')
#df_low.to_pickle('/home/denn/home2/data/SF/SF_low_conformers.pkl')

In [None]:
# InChIKeys that appear in only one of the two sets: the zero-energy rows
# (df_min) and the per-molecule lowest-energy rows (df_tmp1).
diffi=df_min.index.get_level_values('mol__inchikey').symmetric_difference(df_tmp1.index)
# Cell output: the lowest dft_energy recorded for those molecules.
df_tmp1.loc[diffi, 'dft_energy']
#unzip(df_min.index)[:10]

In [None]:
# # Statistics
# def reject_outliers(sr, iq_range=0.5):
#     pcnt = ((1 - iq_range) / 2)*100
#     qlow, median, qhigh = np.percentile(sr,[pcnt, 50, 100-pcnt])
#     iqr = qhigh - qlow
#     return sr[ np.abs(sr - median) <= iqr]

# df_t1_stat=df_all.dropna(subset=(['t1_opt'])).groupby(level=0)['t1_opt'].agg({
# #                                                    'mode':lambda x: x.value_counts().idxmax(),
# #                                                    'mode_count':lambda x: x.value_counts().max(),
#                                                    't1_opt_std':np.std,
# #                                                    'median':np.median,
# #                                                    'max':np.max,
# #                                                    'min':np.min,
#                                                    't1_opt_max_min':lambda x: np.max(x)-np.min(x),
# #                                                    'count':lambda x: x.count(),
# #                                                    'mean':np.mean,
# #                                                    'mad':lambda x: x.mad(),
# #                                                    'mean_rj':lambda x: np.mean(reject_outliers(x,iq_range=0.5)),
# #                                                    'std_rj':lambda x: np.std(reject_outliers(x,iq_range=0.5))
#                                                    })

In [None]:
#df_t1_stat.sample(10)

In [None]:
# df_eng_stat=df_all.dropna(subset=(['dft_energy'])).groupby(level=0)['dft_energy'].agg({
# #                                                    'mode':lambda x: x.value_counts().idxmax(),
# #                                                    'mode_count':lambda x: x.value_counts().max(),
#                                                    #'std':np.std,
#                                                    #'median':np.median,
#                                                    'dfte_max_min':np.max,
#                                                    'count':lambda x: x.count(),
#                                                    #'mean':np.mean,
#                                                    #'mad':lambda x: x.mad(),
# #                                                    'mean_rj':lambda x: np.mean(reject_outliers(x,iq_range=0.5)),
# #                                                    'std_rj':lambda x: np.std(reject_outliers(x,iq_range=0.5))
#                                                    })

In [None]:
#df_eng_stat[:10]

In [None]:
# df_dnn = df_min['t1_opt'].dropna().to_frame().reset_index().set_index('mol__inchikey').join(df_smiles,how='inner').join(df_t1_stat.fillna(0),how='left').join(df_eng_stat,how='left')
# #df_dnn[df_dnn.t1_opt.isnull()][:10]
# def get_finger_rdk(smiles):
#     try:
#         mol=Chem.MolFromSmiles(str(smiles))
#         Chem.SanitizeMol(mol)
#         return np.array(Chem.GetMorganFingerprintAsBitVect(mol,nBits=8192,radius=6),dtype=np.float32)
#     except:
#          return None
# df_dnn['fp_rdk']=df_dnn.mol__smiles.apply(get_finger_rdk)
# print(df_dnn.count())

In [None]:
#df_dnn.to_pickle('/home/denn/home/ml/data/sf/anth_t1_opt_reaxys2_morgan6_8k.pickle')

In [None]:
# X-ray table: reduce each InChIKey to the part before the first '-' and use it
# as the index.
dfxray=(
    pd.read_pickle('/home/denn/home2/ml/data/anthracene_core_xray1.pickle')
    .reset_index()
    .assign(mol__inchikey=lambda frame: frame['mol__inchikey'].str.split('-').str[0])
    .set_index('mol__inchikey')
)

#df_emol=pd.read_pickle('/home/denn/home/ml/data/emolecules.pickle')
#df_reaxys1=pd.read_pickle('/home/denn/harvard/SF/Library/Reaxys/anth_peryl_uv_below1000.pickle')#.reset_index()
# Reaxys table, used as loaded (no InChIKey truncation or re-indexing).
df_reaxys1=pd.read_pickle('/home/denn/harvard/SF/Library/Reaxys/anth_peryl_uv_below1000_reaxys2_newproc_all.pickle')
#df_reaxys1['mol__inchikey']=df_reaxys1['mol__inchikey'].str.split('-').str[0]
#df_reaxys1=df_reaxys1.set_index('mol__inchikey')
# Cell output: the first row, rendered as HTML.
show(df_reaxys1.head(1))

In [None]:
# Left-join the zero-energy rows with the X-ray table on mol__inchikey
# (clashing X-ray columns get an '_x' suffix) and restore the two-level index.
df_match_min=(
    df_min.reset_index()
    .merge(dfxray.reset_index(),how='left',on='mol__inchikey',suffixes=('','_x'))
    .set_index(['mol__inchikey','geoms__parents'])
)

In [None]:
def max_min(group, x_n):
    """Return the spread (maximum minus minimum) of ``group[x_n]``.

    ``group`` is anything indexable by ``x_n`` (here: one groupby chunk) and
    ``x_n`` is the column to measure.
    """
    column = group[x_n]
    highest, lowest = np.max(column), np.min(column)
    return highest - lowest
#['dft_energy']
                              
# Per-molecule conformer statistics over the low-energy rows (df_low).
df_=df_low.reset_index()
#df_['mol__inchikey']=df_['mol__inchikey'].str.split('-').str[0]

# Spread of t1_opt across each molecule's rows.
df_mae_t1=(
    df_.groupby('mol__inchikey')
    .apply(max_min,'t1_opt')
    .to_frame()
    .rename(columns={0:'mm_t1'})
)
# Number of non-null geoms__parents per molecule.
df_mae_count=(
    df_.groupby('mol__inchikey')
    .count()['geoms__parents']
    .to_frame()
    .rename(columns={'geoms__parents':'conf_count'})
)
# Spread of s1_vert across each molecule's rows.
df_mae_s1=(
    df_.groupby('mol__inchikey')
    .apply(max_min,'s1_vert')
    .to_frame()
    .rename(columns={0:'mm_s1'})
)
# Lowest-dft_energy row of each molecule.
df_mae1=(
    df_.set_index('mol__inchikey')
    .groupby(level=0)
    .apply(lambda rows: rows.sort_values('dft_energy').iloc[0])
)

# Right-join onto the selected Reaxys columns, then append the statistics.
df_confs=(
    df_reaxys1[['mean_rj','std_rj','min']]
    .join(df_mae1,how='right')
    .join(df_mae_t1)
    .join(df_mae_s1)
    .join(df_mae_count)
)
#df_confs['s1_exp']=NM_TO_EV/df_confs['mean_rj']
# s1_exp is mean_rj taken as-is (the nm -> eV conversion above is disabled).
df_confs['s1_exp']=df_confs['mean_rj']
print(df_confs.count())
show(df_confs.sample(1))

In [None]:
# dft5=df_confs.reset_index()
# show(dft5[dft5['mol__inchikey'].str.contains('ABT')])


In [None]:
import warnings
#warnings.simplefilter(action = "ignore")
#import subprocess

# Linear correction (slope, intercept) intended for s1_vert; currently unused
# because the scaling below is commented out.
SLOPE=1.05
INT=0

# SLOPE=1
# INT=0

#df_d=df_min[['t1_opt','s1_vert','t2_vert','dft_energy']]
# Work on a copy so df_confs itself is not modified by the new columns.
df_d=df_confs.copy()
# NOTE(review): with the scaling commented out this assignment is a no-op.
df_d['s1_vert']=df_d['s1_vert']#*SLOPE+INT
#df_d['s1_exp']=NM_TO_EV/df_d['mean_rj']
# Derived columns: twice t1_opt, and s1_vert / s1_exp / t2_vert minus that value.
df_d['2t1_opt']=df_d['t1_opt']*2
df_d['criteria1']=df_d['s1_vert']-df_d['2t1_opt']
df_d['criteria1_exp']=df_d['s1_exp']-df_d['2t1_opt']
df_d['criteria2']=df_d['t2_vert']-df_d['2t1_opt']
# Optional filters, all disabled at the moment:
# print(df_d[(df_d['criteria1']>=-0.2)&(df_d['criteria1']<=0.2)].shape)
# print(df_d[(df_d['criteria2']>=-0.0)&(df_d['criteria1']>=-0.2)&(df_d['criteria1']<=0.2)].shape)
#df_d1=df_d.reset_index().groupby('mol__inchikey').describe()#.to_frame()
# df_d=df_d[(df_d['criteria1']>=-0.0)&(df_d['criteria1']<=0.3)|(df_d['criteria1_exp']>=-0.0)]
#Criteria simple
# df_d=df_d[(df_d['criteria1']>=-0.2)&(df_d['criteria1']<=0.2)]
# df_d=df_d[(df_d['criteria1_exp']>=-0.2)&(df_d['criteria1_exp']<=0.2)]
# df_d=df_d[df_d['criteria2']>=-0.2]
#df_d=df_d[(df_d['s1_exp'].isnull())|((df_d['s1_exp'].notnull())&(np.abs(df_d['s1_exp']-df_d['s1_vert'])<0.4))]
# df_d=df_d[df_d['mm_t1']<0.2]
# df_d=df_d[df_d['mm_s1']<0.2]
#df_d=df_d[df_d['t1_opt']>1.3]
#df_d=df_d.reset_index().groupby('mol__inchikey').describe()#.to_frame()
# Collapse to one row per InChIKey, taking the column-wise maximum.
df_d=df_d.reset_index().groupby('mol__inchikey').max()#.to_frame()
#df_d=pd.merge(df_d, df_smiles, how='right', right_index=True, left_index=True)
# NOTE(review): the second sort replaces the ordering of the first, so only the
# t1_opt ordering is effective.
df_d=df_d.sort_values('criteria1_exp',ascending=False)
df_d=df_d.sort_values('t1_opt',ascending=False)
# Add SMILES and build RDKit molecules for rendering.
df_d=df_d.join(df_smiles)
df_d['mol']=df_d.mol__smiles.apply(Chem.MolFromSmiles)
#print(df_d.index)

# One X-ray row per InChIKey (first non-null value per column), left-joined so
# every candidate is kept; clashing X-ray columns get an '_x' suffix.
dfxray1=dfxray.groupby(level=0).first()
df_match=pd.merge(df_d,dfxray1,how='left',right_index=True,left_index=True,suffixes=('','_x'))
df_match=df_match.sort_values('t1_opt',ascending=False)
#df_match=df_match.sort_values('criteria1',ascending=False)
# Counts: candidates with a molecule, candidates with a ccdc entry, and the
# non-null count of every column.
print(df_d['mol'].count())
print(df_match['ccdc'].count())
print(df_match.count())

In [None]:
# df_match.to_pickle('/home/denn/home2/data/SF/SF_criteria.pkl')

In [None]:
#Generate HTML report in interactive table with 3D view

report_file_name="/home/denn/home/ml/data/sf/export_sam/antr.html"

def pop3d(conformer):
    """Return the HTML of a "View 3D" button whose element id is ``view3d_<conformer>``."""
    button_template = '<div class="ui primary button" id="view3d_{}">View 3D</div>'
    return button_template.format(conformer)

def buy_emol(tags):
    """Return a shopping-bag icon cell when ``tags`` contains ``'emol1'``.

    ``tags`` is whatever container (or string) the ``mol__tags`` column holds.
    Missing values (``None`` / ``NaN``) do not support ``in`` and used to raise
    ``TypeError``; they are now treated as "no tag" and yield an empty cell.
    The hidden ``buy`` text is emitted alongside the icon.
    """
    try:
        purchasable = 'emol1' in tags
    except TypeError:
        # None, float('nan') and other non-containers: nothing to look up.
        purchasable = False
    if purchasable:
        return '<i class="shopping bag icon"></i><div style="display: none;">buy</div>'
    return ''
def pubchem_link(inchikey):
    """Return an ``<a>`` element that opens a PubChem text search for ``inchikey`` in a new tab."""
    search_url = 'https://pubchem.ncbi.nlm.nih.gov/search/#collection=compounds&query_type=text&query={}'.format(inchikey)
    return '<a href="{}" target="_blank">{}</a>'.format(search_url, inchikey)

def download3d(conformer):
    """Return the HTML of a "Download .xyz" link whose element id is ``down3d_<conformer>``."""
    link_template = '<a href="#" class="ui primary button" id="down3d_{}">Download .xyz</a>'
    return link_template.format(conformer)

# Report table: one row per candidate molecule from df_match.
df_html=df_match.reset_index()
# "View 3D" button followed by the number of conformers.
df_html['mol3d']=(
    df_html['mol__inchikey'].apply(pop3d)
    + '<br>Conformers: '
    + df_html['conf_count'].map(str)
)
# Shopping-bag icon for molecules tagged 'emol1'.
df_html['Buy (eMol)']=df_html['mol__tags'].apply(buy_emol)
# PubChem link followed by a boxed copy of the SMILES string.
df_html['mol__inchikey_link']=(
    df_html['mol__inchikey'].apply(pubchem_link)
    + df_html['mol__smiles'].apply(
        '<div style="width:10em;word-break:break-all;" class="ui small segment" data-tooltip="SMILES could be pasted to ChemDraw or used to search in Reaxys, etc.">{}</div>'.format
    )
)
#df_html['T1, eV']=df_html['t1_opt'].apply(lambda s:"{0:.2f}".format(s))+'<br>Max-min: '+df_html['mm_t1'].apply(lambda s:"{0:.2f}".format(s))
# Static part of the report page: the 3Dmol.js modal with Prev/Next conformer
# buttons, the jQuery handlers that step through the loaded frames, a helper
# that turns text into a downloadable object URL, and the page heading.
# Fixes relative to the previous template: the stray "mi" text node after the
# Next button is gone, the MIME type is the valid 'text/plain' (was
# 'plain/text'), and down() declares its variables instead of leaking globals.
html_js = """
    <div class="ui small modal" id="mol3Dmodal">
        <i class="close icon"></i>
        <div class="header">3D view (rotate with mouse)</div>
        <div class="content">
            <div style="height: 350px; width: 720px; position: relative;" class='viewer_3Dmoljs' data-backgroundcolor='0xffffff'></div>
            <div class="action">
                Conformer: <div id="conf_num" class="ui label"></div>
                <div class="ui primary labeled icon button" id="prev_conf" tabindex="0">
                  <i class="left arrow icon"></i>
                  Prev
                </div>
                <div class="ui primary right labeled icon button" id="next_conf" tabindex="0">
                  <i class="right arrow icon"></i>
                  Next
                </div>
            </div>

            <p>dft_energy: in eV from lowest energy conformer</p>
            <div id="modal_table"></div>
        </div>
    </div>
    <script>
        $('#conf_num').data('count', 0);
        $('#conf_num').html($('#conf_num').data('count'))
        $('#next_conf').click(function(){
            var count = $('#conf_num').data('count');
            var view = $3Dmol.viewers[0];
            if (count<view.getFrames()-1){
                count = count + 1;
                view.setFrame(count);
                view.zoomTo();
                view.render();
                $('#conf_num').data('count', count);
                $('#conf_num').html(count);
            }
        });
        $('#prev_conf').click(function(){
            var count = $('#conf_num').data('count');
            var view = $3Dmol.viewers[0];
            if (count>0){
                count = count - 1;
                view.setFrame(count);
                view.zoomTo();
                view.render();
                $('#conf_num').data('count', count);
                $('#conf_num').html(count);
            }
        });
        function down(ref,data,filename) {
            var properties = {type: 'text/plain'}; // Specify the file's mime-type.
            var file;
            try {
              // Specify the filename using the File constructor, but ...
              file = new File([data], filename, properties);
            } catch (e) {
              // ... fall back to the Blob constructor if that isn't supported.
              file = new Blob([data], properties);
            }
            var url = URL.createObjectURL(file);
            console.log(ref);
            document.getElementById(ref).href = url;
        };
    </script>
    <h1 class="ui header">Relevant Singlet Fission Candidates</h1>
    <p class="ui header"></p>
"""
# Columns shown in the per-molecule conformer table inside the 3D modal.
modal_cols=['dft_energy','t1_opt','t1_vert','t2_vert','s1_vert','oscillator_strength_s1', 'homo','lumo','down3d']
# Build one <script> element holding, for every molecule in the report, the
# modal attachment and the click handler of its "View 3D" button.
html_button_js="<script>"
for conformer in df_html['mol__inchikey'][:]:
    # `conformer` is an InChIKey here; it matches the id emitted by pop3d().
    html_button_js+="$('#mol3Dmodal').modal('attach events', '#view3d_{}');".format(conformer)
    # Label-based selection of this molecule's low-energy rows. `.loc` replaces
    # the `.ix` indexer, which pandas deprecated in 0.20 and removed in 1.0.
    df_sel=df_low.loc[conformer].sort_values('dft_energy').reset_index()
    df_sel['down3d']=df_sel.geoms__parents.apply(download3d)
    # Geometries of the selected rows, in ascending dft_energy order; looked up
    # once and reused for both the viewer frames and the file export.
    df_sel_xyz=df_xyz.loc[df_sel['geoms__parents']]
    # JS statement that fills the modal table; newlines are stripped so the
    # HTML fits inside a single-quoted JS string literal.
    html_modal_table= "$('#modal_table').html('%s');" % df_sel[modal_cols].to_html(escape=False).replace('<table border="1" class="dataframe">','<table class="ui celled table" cellspacing="0" width="100%">').replace("\n", "")
    # Click handler: reset the frame counter, load the concatenated xyz strings
    # as frames, restyle, render, then fill the table.
    html_button_js+=("$('#view3d_%s').click(function() { $('#conf_num').data('count', 0); $('#conf_num').html($('#conf_num').data('count')); var view = $3Dmol.viewers[0];view.removeAllModels();view.addModelsAsFrames('%s','xyz');view.setStyle({'stick':{}});view.addStyle({'sphere':{'radius':0.4}});view.zoomTo();view.render();%s});" \
                     % (conformer,''.join(df_sel_xyz['xyz']).replace("\n", "\\n"),html_modal_table))
    # Also export each geometry to its own file named <inchikey>_<index label>.
    for i,row in df_sel_xyz.iterrows():
        print(row.mol__inchikey)
        with open('/home/denn/home/ml/data/sf/export_sam/'+row.mol__inchikey+'_'+str(i), "w") as text_file:
            text_file.write(row.xyz)
html_button_js+="</script>"

# html_button_js+="<script>"
# for conformer in df_html['mol__inchikey'][:]:
#     df_sel=df_low.ix[conformer].sort_values('dft_energy').reset_index()
# #     j1="$(function () {"
#     j2=""
#     for i,row in df_xyz.loc[df_sel['geoms__parents']].iterrows():
#         j2+=('down("down3d_%s","%s","%s.xyz");' % (i,row['xyz'],i)).replace("\n", "\\n")
#         #j2+=('createDownloadLink("#down3d_%s","%s","%s.xyz");' % (i,row['xyz'],i)).replace("\n", "\\n")
# #     j3="});"
# #     jdown=j1+j2+j3
#     jdown=j2
# #     print(jdown)
#     html_button_js+=jdown
# html_button_js+="</script>"

# Render floats with two decimals while the table HTML is built (the option is
# global and is not restored afterwards).
pd.options.display.float_format = '{:,.2f}'.format
# Select the report columns and give them readable headers; rename entries for
# columns that are not in the selection have no effect. escape=False keeps the
# button/link/molecule cells as raw HTML.
html_table=df_html[['mol__inchikey_link','mol','mol3d','t1_opt','s1_vert','s1_exp','ccdc']]\
    .rename(index=str, columns={'mol__inchikey_link':'InChIKey linked to PubChem (SMILES in the box)','mol':'Molecule','mol3d':'3D','t1_opt':'T1, eV','s1_vert':'S1, eV','criteria1':'S1-2T1, eV','criteria2':'T2-2T1, eV','mm_t1':'T1 max-min, eV','mm_s1':'S1 max-min, eV'}).to_html(escape=False)
# Swap the default pandas table tag for one with an id and CSS classes.
html_table=html_table.replace('<table border="1" class="dataframe">','<table id="example" class="ui celled table" cellspacing="0" width="100%">')

# Write the report to disk 
# NOTE(review): html_start and html_end are not defined in this cell; they are
# presumably set earlier in the notebook -- confirm before a fresh-kernel run.
with open(report_file_name, "w") as text_file:
    text_file.write(html_start+html_js+html_table+html_button_js+html_end)


In [None]:
# Cell output: the columns available in df_confs.
df_confs.columns

In [None]:
# Linear correction applied to the calculated s1_vert values before plotting.
SLOPE=1.05
INT=0

colorscale = ['#7A4579', '#D56073', 'rgb(236,158,105)', (1, 1, 0.2), (0.98,0.98,0.98)]

# Rows of df_confs with both energies present, plus SMILES and the left-joined
# dfxray1 columns (ccdc is used in the hover text).
df = pd.merge(df_confs.dropna(subset=['t1_opt','s1_vert']).join(df_smiles),dfxray1,how='left',right_index=True,left_index=True,suffixes=('','_x'))

# .values replaces .as_matrix(), which pandas deprecated in 0.23 and removed in 1.0.
x = df['t1_opt'].values
y = df['s1_vert'].values*SLOPE+INT
fig = FF.create_2D_density(
    x, y, colorscale=colorscale,
    title='',
    hist_color='rgb(255, 237, 222)', point_size=3,height=1200,width=1200
)
# Vertical extent of the guide lines: data range padded by 0.2.
y0=np.min(y)-0.2
y1=np.max(y)+0.2

# Guide lines in data coordinates: solid y = 2x, dashed y = 2x - 0.2 and
# y = 2x + 0.2 (a shift s moves the line to x = (y + s) / 2).
fig.layout.update({'shapes':[
    {
        'type': 'line',
        'xref': 'x',
        'yref': 'y',
        'x0': (y0+shift)/2,
        'y0': y0,
        'x1': (y1+shift)/2,
        'y1': y1,
        'line': line_style,
    }
    for shift, line_style in (
        (0.0, {'color': 'red', 'width': 2}),
        (0.2, {'color': 'red', 'width': 1, 'dash':'dash'}),
        (-0.2, {'color': 'red', 'width': 1, 'dash':'dash'}),
    )
]})
titlefont = 23
tickfont = 20
fig.layout.xaxis.update({'title':'Calculated S0->T1, eV','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.yaxis.update({'title':'Calculated S0->S1, eV','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.yaxis2.update({'title':'Counts','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.xaxis2.update({'title':'Counts','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})


# Hover text per point: index label, ccdc value and tags.
fig.data[0].update({'text': ['{} : {} : {}'.format(key,row['ccdc'],row['mol__tags']) for key,row in df.iterrows()]})
iplot(fig,image_height=1200,image_width=1200)


In [None]:
# SLOPE=1.05
# INT=0

colorscale = ['#7A4579', '#D56073', 'rgb(236,158,105)', (1, 1, 0.2), (0.98,0.98,0.98)]

# Rows of df_confs with both energies present, plus SMILES and the left-joined
# dfxray1 columns (ccdc is used in the hover text).
df = pd.merge(df_confs.dropna(subset=['t1_opt','s1_vert']).join(df_smiles),dfxray1,how='left',right_index=True,left_index=True,suffixes=('','_x'))

# .values replaces .as_matrix(), which pandas deprecated in 0.23 and removed in 1.0.
x = df['t1_opt'].values
y = df['s1_vert'].values#*SLOPE+INT
fig = FF.create_2D_density(
    x, y, colorscale=colorscale,
    title='',
    hist_color='rgb(255, 237, 222)', point_size=3,height=900,width=900
)
# Vertical extent of the guide lines: data range padded by 0.2.
y0=np.min(y)-0.2
y1=np.max(y)+0.2

# Guide lines in data coordinates: solid y = 2x, dashed y = 2x - 0.2 and
# y = 2x + 0.2 (a shift s moves the line to x = (y + s) / 2).
fig.layout.update({'shapes':[
    {
        'type': 'line',
        'xref': 'x',
        'yref': 'y',
        'x0': (y0+shift)/2,
        'y0': y0,
        'x1': (y1+shift)/2,
        'y1': y1,
        'line': line_style,
    }
    for shift, line_style in (
        (0.0, {'color': 'red', 'width': 2}),
        (0.2, {'color': 'red', 'width': 1, 'dash':'dash'}),
        (-0.2, {'color': 'red', 'width': 1, 'dash':'dash'}),
    )
]})
titlefont = 23
tickfont = 20
fig.layout.xaxis.update({'title':'Calculated S0->T1, eV','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.yaxis.update({'title':'Calculated S0->S1, eV','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.yaxis2.update({'title':'Counts','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.xaxis2.update({'title':'Counts','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})


# Hover text per point: index label, ccdc value and tags.
fig.data[0].update({'text': ['{} : {} : {}'.format(key,row['ccdc'],row['mol__tags']) for key,row in df.iterrows()]})
# Render to an HTML <div> string (plotly.js itself is not embedded) so the
# figure can be placed in a hand-written page.
plot_div = plot(fig,image_height=900,image_width=900,output_type='div',show_link=False,include_plotlyjs=False)

from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
def smitosvg(smiles,molSize=(450,150),kekulize=True):
    """Render a SMILES string as an SVG drawing and return the SVG text.

    Parameters
    ----------
    smiles : str
        SMILES of the molecule to draw.
    molSize : tuple
        Two values passed to ``MolDraw2DSVG`` in order (canvas size).
    kekulize : bool
        When true, try to kekulize the molecule before drawing; if that fails
        the molecule is drawn from a fresh, un-kekulized copy instead.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        raise ValueError('Could not parse SMILES: {!r}'.format(smiles))
    # Work on a copy so a failed kekulization cannot leave a half-modified mol.
    mc = Chem.Mol(parsed.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except Exception:
            # The fallback used to reference an undefined name `mol`; restart
            # from the parsed molecule instead.
            mc = Chem.Mol(parsed.ToBinary())
    if not mc.GetNumConformers():
        rdDepictor.Compute2DCoords(mc)
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    # It seems that the svg renderer used doesn't quite hit the spec.
    # Here are some fixes to make it work in the notebook, although I think
    # the underlying issue needs to be resolved at the generation step
    return svg.replace('svg:','')

# One single-line SVG per plotted point, in the same order as the rows of df
# (the hover handler indexes this list by point number).
svgs = [smitosvg(smiles).replace('\n','') for smiles in df.mol__smiles]

# Page template: first %s is the plot <div>, second %s is the JS array of SVGs
# shown next to the plot while hovering a point.
plotHTML = '''
    <html><head><meta charset="utf-8" /></head><body>
        <script type="text/javascript" src="https://cdn.plot.ly/plotly-latest.js"></script>
        <table>
            <tbody>
                <tr>
                    <td>%s</td>
                    <td><div id="mol" style="height: 300px; width: 300px;"></div></td>
                </tr>
            </tbody>
        </table>

        <script type="text/javascript">
        var myPlot = document.getElementById('myDiv');
        myPlot.on('plotly_hover', function(data){
            document.getElementById('mol').innerHTML=svgs[data.points[0].pointNumber];
        });

        var svgs = %s;

         </script></body></html>
    '''

import json
from bs4 import BeautifulSoup
soup = BeautifulSoup(plot_div, 'html.parser')

# Rename the generated plot container to the fixed id the hover script expects.
div_id=soup.find_all('div')[0].get('id')
plot_div=plot_div.replace(div_id,'myDiv')


with open("/home/denn/home/temp/plot.html", "w") as text_file:
    # json.dumps emits a valid JavaScript array literal; str(list) produced a
    # Python repr, whose escapes are not guaranteed to be valid JavaScript.
    text_file.write(plotHTML % (plot_div,json.dumps(svgs)))


In [None]:
colorscale = ['#7A4579', '#D56073', 'rgb(236,158,105)', (1, 1, 0.2), (0.98,0.98,0.98)]

# Keep rows whose s1_exp lies strictly between 1.05 and 3.85.
df = df_confs[(df_confs['s1_exp']<3.85)&(df_confs['s1_exp']>1.05)]

# .values replaces .as_matrix(), which pandas deprecated in 0.23 and removed in 1.0.
x = df['t1_opt'].values
y = df['s1_exp'].values
fig = FF.create_2D_density(
    x, y, colorscale=colorscale,
    hist_color='rgb(255, 237, 222)', point_size=3,height=1200,width=1200
)
# Fixed vertical extent of the guide lines.
y0=1.3
y1=4.1

# Guide lines in data coordinates: solid y = 2x, dashed y = 2x - 0.2 and
# y = 2x + 0.2 (a shift s moves the line to x = (y + s) / 2).
fig.layout.update({'shapes':[
    {
        'type': 'line',
        'xref': 'x',
        'yref': 'y',
        'x0': (y0+shift)/2,
        'y0': y0,
        'x1': (y1+shift)/2,
        'y1': y1,
        'line': line_style,
    }
    for shift, line_style in (
        (0.0, {'color': 'red', 'width': 2}),
        (0.2, {'color': 'red', 'width': 1, 'dash':'dash'}),
        (-0.2, {'color': 'red', 'width': 1, 'dash':'dash'}),
    )
]})
titlefont = 23
tickfont = 20
fig.layout.xaxis.update({'title':'Calculated S0->T1, eV','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.yaxis.update({'title':'Experimental S0->S1, eV','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.yaxis2.update({'title':'Counts','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.xaxis2.update({'title':'Counts','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})



# Hover text per point: the row's index label.
fig.data[0].update({'text': [str(key) for key in df.index]})
iplot(fig,image_height=1200,image_width=1200,show_link=False)

In [None]:
# SLOPE=1.05
# INT=0

colorscale = ['#7A4579', '#D56073', 'rgb(236,158,105)', (1, 1, 0.2), (0.98,0.98,0.98)]

# Rows with s1_exp strictly between 1.05 and 3.85 and both energies present,
# plus SMILES and the left-joined dfxray1 columns (ccdc is used in hover text).
df = pd.merge(df_confs[(df_confs['s1_exp']<3.85)&(df_confs['s1_exp']>1.05)].dropna(subset=['t1_opt','s1_exp']).join(df_smiles),dfxray1,how='left',right_index=True,left_index=True,suffixes=('','_x'))

# .values replaces .as_matrix(), which pandas deprecated in 0.23 and removed in 1.0.
x = df['t1_opt'].values
y = df['s1_exp'].values#*SLOPE+INT
fig = FF.create_2D_density(
    x, y, colorscale=colorscale,
    title='',
    hist_color='rgb(255, 237, 222)', point_size=3,height=900,width=900
)
# Vertical extent of the guide lines: data range padded by 0.2.
y0=np.min(y)-0.2
y1=np.max(y)+0.2

# Guide lines in data coordinates: solid y = 2x, dashed y = 2x - 0.2 and
# y = 2x + 0.2 (a shift s moves the line to x = (y + s) / 2).
fig.layout.update({'shapes':[
    {
        'type': 'line',
        'xref': 'x',
        'yref': 'y',
        'x0': (y0+shift)/2,
        'y0': y0,
        'x1': (y1+shift)/2,
        'y1': y1,
        'line': line_style,
    }
    for shift, line_style in (
        (0.0, {'color': 'red', 'width': 2}),
        (0.2, {'color': 'red', 'width': 1, 'dash':'dash'}),
        (-0.2, {'color': 'red', 'width': 1, 'dash':'dash'}),
    )
]})
titlefont = 23
tickfont = 20
fig.layout.xaxis.update({'title':'Calculated S0->T1, eV','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.yaxis.update({'title':'Experimental S0->S1, eV','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.yaxis2.update({'title':'Counts','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})
fig.layout.xaxis2.update({'title':'Counts','titlefont':{'size':titlefont},'tickfont':{'size':tickfont}})


# Hover text per point: index label, ccdc value and tags.
fig.data[0].update({'text': ['{} : {} : {}'.format(key,row['ccdc'],row['mol__tags']) for key,row in df.iterrows()]})
# Render to an HTML <div> string (plotly.js itself is not embedded) so the
# figure can be placed in a hand-written page.
plot_div = plot(fig,image_height=900,image_width=900,output_type='div',show_link=False,include_plotlyjs=False)

from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import rdMolDraw2D
def moltosvg(smiles,molSize=(450,150),kekulize=True):
    """Render a SMILES string as an SVG drawing and return the SVG text.

    Parameters
    ----------
    smiles : str
        SMILES of the molecule to draw.
    molSize : tuple
        Two values passed to ``MolDraw2DSVG`` in order (canvas size).
    kekulize : bool
        When true, try to kekulize the molecule before drawing; if that fails
        the molecule is drawn from a fresh, un-kekulized copy instead.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        raise ValueError('Could not parse SMILES: {!r}'.format(smiles))
    # Work on a copy so a failed kekulization cannot leave a half-modified mol.
    mc = Chem.Mol(parsed.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except Exception:
            # The fallback used to reference an undefined name `mol`; restart
            # from the parsed molecule instead.
            mc = Chem.Mol(parsed.ToBinary())
    if not mc.GetNumConformers():
        rdDepictor.Compute2DCoords(mc)
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    # It seems that the svg renderer used doesn't quite hit the spec.
    # Here are some fixes to make it work in the notebook, although I think
    # the underlying issue needs to be resolved at the generation step
    return svg.replace('svg:','')

# One single-line SVG per plotted point, in the same order as the rows of df
# (the hover handler indexes this list by point number).
svgs = [moltosvg(smiles).replace('\n','') for smiles in df.mol__smiles]

# Page template: first %s is the plot <div>, second %s is the JS array of SVGs
# shown next to the plot while hovering a point.
plotHTML = '''
    <html><head><meta charset="utf-8" /></head><body>
        <script type="text/javascript" src="https://cdn.plot.ly/plotly-latest.js"></script>
        <table>
            <tbody>
                <tr>
                    <td>%s</td>
                    <td><div id="mol" style="height: 300px; width: 300px;"></div></td>
                </tr>
            </tbody>
        </table>

        <script type="text/javascript">
        var myPlot = document.getElementById('myDiv');
        myPlot.on('plotly_hover', function(data){
            document.getElementById('mol').innerHTML=svgs[data.points[0].pointNumber];
        });

        var svgs = %s;

         </script></body></html>
    '''

import json
from bs4 import BeautifulSoup
soup = BeautifulSoup(plot_div, 'html.parser')

# Rename the generated plot container to the fixed id the hover script expects.
div_id=soup.find_all('div')[0].get('id')
plot_div=plot_div.replace(div_id,'myDiv')


with open("/home/denn/home/temp/plot.html", "w") as text_file:
    # json.dumps emits a valid JavaScript array literal; str(list) produced a
    # Python repr, whose escapes are not guaranteed to be valid JavaScript.
    text_file.write(plotHTML % (plot_div,json.dumps(svgs)))


In [None]:
# def view3d(conformer):
#     view = view3dMol(height=300,width=300)
#     view.addModel(''.join(df_xyz.loc[conformer]['xyz']),'xyz')
#     view.zoom(3)
#     view.setStyle({'stick':{}})
#     view.addStyle({'sphere':{'radius':0.4}})
#     return view._repr_html_()

# #df_d['mol3d']=df_d['geoms__parents'].apply(view3d)
# #df_d[['mol3d','mol']]
# #df_d3[['mol3d','mol','t1_opt','s1_vert','2t1_opt','criteria1','criteria2','geoms__parents']].to_html('/home/denn/harvard/SF/Meetings/18Oct2016/anthracene_results3d.html',escape=False)
# #show(df_d[['mol3d','mol','t1_opt','s1_vert','2t1_opt','criteria1','criteria2']][:10])

In [None]:
# #Generate report
# def pop3d(conformer):
#     popup = '<div class="ui primary button" id="view3d_{}">View 3D</div>'.format(conformer)
#     return popup
#     #<div id="mol3D"  style="position: relative; width: 600px; height: 400px"></div>
# #     <script>
# #     $(function() {
# #           let element = $('#mol3D');
# #           let config = { backgroundColor: 'white' };
# #           let view = $3Dmol.createViewer( element, config );
# #           view.setStyle({'stick':{}})
# #           view.addStyle({'sphere':{'radius':0.4}})
# #           view.render();
# #     });
# #     </script>
# df_html=df_match.reset_index()
# df_html['mol3d']=df_html['geoms__parents'].apply(pop3d)
# html_js = """
#     <div class="ui small modal" id="mol3Dmodal">
#     <i class="close icon"></i>
#     <div class="header">3D view (rotate with mouse)</div>
#     <div style="height: 350px; width: 720px; position: relative;" class='viewer_3Dmoljs' data-backgroundcolor='0xffffff'></div>
#     </div>
#     <h1 class="ui header">Singlet Fission Candidates </h1>
#     <p class="ui header">Selected using S1-2T1 and T2-2T1 > -0.2 eV. You can sort each column by values, and look associated 3D geometry</p>
# """
# html_button_js="<script>"
# for conformer in df_html['geoms__parents']:
#     html_button_js+="$('#mol3Dmodal').modal('attach events', '#view3d_{}');".format(conformer,conformer)
#     html_button_js+=("$('#view3d_%s').click(function() {var view = $3Dmol.viewers[0];view.removeAllModels();view.addModel('%s','xyz');view.setStyle({'stick':{}});view.addStyle({'sphere':{'radius':0.4}});view.zoomTo();view.render();});" % (conformer,''.join(df_xyz.loc[conformer]['xyz']))).replace("\n", "\\n")
# html_button_js+="</script>"


# pd.options.display.float_format = '{:,.2f}'.format
# html_table=df_html[['mol__inchikey','mol','mol3d','t1_opt','s1_vert','criteria1','criteria2']]\
#     .rename(index=str, columns={'mol__inchikey':'InChIKey (Unique Chemical Identifier)','mol':'Molecule','mol3d':'3D','t1_opt':'T1, eV','s1_vert':'S1, eV','criteria1':'S1-2T1, eV','criteria2':'T2-2T1, eV'}).to_html(escape=False)
# html_table=html_table.replace('<table border="1" class="dataframe">','<table id="example" class="ui celled table" cellspacing="0" width="100%">')
# with open("/home/denn/harvard/SF/Meetings/18Oct2016/SF_anthracene_results_live_nocor.html", "w") as text_file:
#     text_file.write(html_start+html_js+html_table+html_button_js+html_end)

In [None]:
df_smiles[df_smiles.mol__tags]

In [None]:
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.graph_objs import *

from numpy import arange,array,ones
from scipy import stats
init_notebook_mode()

def _mol_to_svg(mol,molSize=(450,150),kekulize=True):
    """Draw an RDKit molecule on a ``molSize`` canvas and return the SVG text."""
    from rdkit.Chem import rdDepictor
    from rdkit.Chem.Draw import rdMolDraw2D

    # Work on a copy so the caller's molecule is never modified.
    mc = Chem.Mol(mol.ToBinary())
    if kekulize:
        try:
            Chem.Kekulize(mc)
        except Exception:
            # Kekulization failed: fall back to a fresh, untouched copy.
            mc = Chem.Mol(mol.ToBinary())
    if not mc.GetNumConformers():
        rdDepictor.Compute2DCoords(mc)
    drawer = rdMolDraw2D.MolDraw2DSVG(molSize[0],molSize[1])
    drawer.DrawMolecule(mc)
    drawer.FinishDrawing()
    svg = drawer.GetDrawingText()
    # It seems that the svg renderer used doesn't quite hit the spec.
    # Here are some fixes to make it work in the notebook, although I think
    # the underlying issue needs to be resolved at the generation step
    return svg.replace('svg:','')


def _write_hover_html(plot_div,svgs,html_filename):
    """Write a page holding the plot and a panel that shows svgs[i] on hover of point i."""
    import json
    from bs4 import BeautifulSoup

    # First %s is the plotly <div>, second %s is the JS array of SVG strings.
    plotHTML = '''
        <html><head><meta charset="utf-8" /></head><body>
            <script type="text/javascript" src="https://cdn.plot.ly/plotly-latest.js"></script>
            <table>
                <tbody>
                    <tr>
                        <td>%s</td>
                        <td><div id="mol" style="height: 300px; width: 300px;"></div></td>
                    </tr>
                </tbody>
            </table>

            <script type="text/javascript">
            var myPlot = document.getElementById('myDiv');
            myPlot.on('plotly_hover', function(data){
                document.getElementById('mol').innerHTML=svgs[data.points[0].pointNumber];
            });

            var svgs = %s;

             </script></body></html>
        '''

    # Rename the generated graph div to the fixed id used by the script above.
    # Look for the first <div> that has an id; the outermost one may be a
    # wrapper without any.
    graph_div = BeautifulSoup(plot_div, 'html.parser').find('div', id=True)
    if graph_div is None:
        raise ValueError('no <div> with an id found in the plotly output')
    plot_div = plot_div.replace(graph_div.get('id'),'myDiv')

    with open(html_filename, "w") as text_file:
        # json.dumps yields a valid JavaScript array literal.
        text_file.write(plotHTML % (plot_div,json.dumps(svgs)))


def plot_calc_vs_theo(dfi:pd.DataFrame, calc, exp,height=800,width=800, errors=True, gen_html=False, html_filename=None):
    """Scatter-plot group means of column ``calc`` (y) against column ``exp`` (x).

    Rows are grouped on index level 0. Each group contributes one point at
    (mean of ``exp``, mean of ``calc``). A least-squares line is fitted through
    the points and drawn together with a dashed red diagonal; the slope and
    intercept are printed and the fit is annotated on the figure.

    Args:
        dfi: frame holding the ``exp`` and ``calc`` columns (and a ``'mol'``
            column of RDKit molecules when ``gen_html`` is true). Rows with a
            missing value in any used column are dropped first.
        calc: name of the column plotted on the y axis.
        exp: name of the column plotted on the x axis.
        height: figure height in pixels; also scales the font sizes.
        width: figure width in pixels.
        errors: when true, draw the per-group standard deviation of ``calc``
            as y error bars (a group with a single row gets 0).
        gen_html: when true, also write a standalone HTML page in which
            hovering a point shows the drawing of that group's first molecule.
        html_filename: output path for the HTML page; a fixed default path is
            used when omitted.

    Returns:
        None. The figure is displayed with ``iplot``.
    """
    if gen_html:
        df=dfi[[exp,calc,'mol']].dropna()
        if html_filename is None:
            html_filename="/home/denn/home/temp/plot.html"
    else:
        df=dfi[[exp,calc]].dropna()

    df_group=df.groupby(level=0)
    exp_mean=df_group[exp].mean()
    xt1=exp_mean.values
    yt1=df_group[calc].mean().values
    # Label every point with its own group key. The aggregated index is in the
    # same (sorted) order as xt1/yt1, unlike df_group.head(1), which keeps the
    # original row order and can therefore mislabel points.
    labels=[str(key) for key in exp_mean.index]

    slope, intercept, r_value, p_value, std_err = stats.linregress(xt1,yt1)
    line = slope*xt1+intercept

    print('Slope',slope)
    print('Intercept',intercept)

    point_kwargs = dict(
        x=xt1,
        y=yt1,
        mode='markers',
        name=calc,
        text=labels,
    )
    if errors:
        point_kwargs['error_y'] = dict(
            type='data',
            array=df_group[calc].std().fillna(0.).values,
            visible=True,
            width=2,
            color='blue'
        )
    trace0 = Scatter(**point_kwargs)

    trace2 = Scatter(
                      x=xt1,
                      y=line,
                      mode='lines',
                      marker=Marker(color='rgb(31, 119, 180)'),
                      name='Fit'
                      )

    annotation = Annotation(
                        xref='paper',
                        yref='paper',
                       x=0,
                       y=1,
                         #xanchor='left',
                         #yanchor='top',
                      text='R_value: {:.5f} Slope: {:.4f} Intercept: {:.3f}'.format(r_value,slope,intercept),
                      showarrow=False,
                      font=Font(size=height/40)
                      )

    data = [trace0,trace2]
    layout = Layout(
        xaxis= dict(
            title= exp,
            ticklen= 5,
            zeroline= False,
            gridwidth= 1,
            titlefont=Font(size=height/40),
            tickfont=Font(size=height/40)
        ),
        yaxis=dict(
            title= calc,
            ticklen= 5,
            gridwidth= 1,
            titlefont=Font(size=height/40),
            tickfont=Font(size=height/40)
        ),
        # Dashed y = x reference line spanning the range of the y values.
        shapes=[dict(
            type='line',
            xref='x',
            yref= 'y',
            x0= np.min(yt1)-0.05,
            y0= np.min(yt1)-0.05,
            x1= np.max(yt1)+0.05,
            y1= np.max(yt1)+0.05,
            line = {
                'color': 'red',
                'width': 2,
                'dash':'dash'
            }
        )],
        annotations=[annotation],
        showlegend=False,
        height=height,
        width=width,
    )

    fig = dict( data=data, layout=layout )

    iplot(fig,image_height=height,image_width=width)
    #plot(fig,image_height=height,image_width=width)

    if gen_html:
        plot_div = plot(fig,image_height=900,image_width=900,output_type='div',show_link=False,include_plotlyjs=False)
        # Iterating the groupby yields groups in the same order as the
        # aggregated means, so svgs[i] belongs to plotted point i.
        svgs = [_mol_to_svg(group['mol'].iloc[0]).replace('\n','') for _, group in df_group]
        _write_hover_html(plot_div,svgs,html_filename)



In [None]:
plot_calc_vs_theo(df_confs.join(df_smiles),'t1_vert','t1_opt',errors=False)
plot_calc_vs_theo(df_confs.join(df_smiles),'t1_vert','t1_opt',errors=False,gen_html=True)

In [None]:
#df_reaxys1=pd.read_pickle('/home/denn/home/ml/data/sf/Reaxys4_uv.pickle')
df_reaxys1=pd.read_pickle('/home/denn/harvard/SF/Library/Reaxys/anth_peryl_uv_below1000_reaxys2_newproc_all.pickle')

In [None]:
#print(df_min.reset_index(level=1)['s1_vert'])
print(df_reaxys1.count())

In [None]:
#df_reaxys1['mol__inchikey']=df_reaxys1['InChI Key'].str.split('-').str[0]

In [None]:
#df_reaxys1=df_reaxys1.set_index('mol__inchikey')

# df_s1_agg=df_low.groupby(level=1)['s1_vert','t1_opt','t2_vert','s1_vert_lcpbe','s1_vert_wb97xd'].agg({
#                                                    's1_vert': {
#                                                        'std':'std',
#                                                        'median':'median',
#                                                        'max':'max',
#                                                        'min':'min',
#                                                        'count':'count',
#                                                        'mean':'mean',
#                                                    },
#                                                     's1_vert_lcpbe': {
#                                                        'std':'std',
#                                                        'median':'median',
#                                                        'max':'max',
#                                                        'min':'min',
#                                                        'count':'count',
#                                                        'mean':'mean',
#                                                     }
#                                                     })

In [None]:
show(df_evt_b3lyp_s1[:5])

In [None]:
df_expvstheo=df_reaxys1.join(df_low.reset_index(level=1)[['s1_vert','geoms__parents','t1_opt','t2_vert','s1_vert_lcpbe','s1_vert_wb97xd']],how='inner')
#print(df_expvstheo.shape)
df_expvstheo1=df_expvstheo.reset_index().set_index(['mol__inchikey','geoms__parents'])

print(df_expvstheo1.count())
#show(df_expvstheo1.head(10))

In [None]:
# s1='t1_opt'
# df_expvstheo=df_expvstheo1.dropna(subset=[s1])
# df_expvstheo[['t1_opt','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_b3lyp_t1_reaxys2.csv',header=False,index=False)

In [None]:
# Summary statistics of each computed quantity per index-level-0 group.
# A list of function names replaces the dict-renaming form of agg, which is
# deprecated and rejected by later pandas releases. The list form yields
# (column, stat) column labels, so the levels are swapped to (stat, column),
# the layout that the later `.xs(<column>, level=1, axis=1)` calls rely on.
df_s1_agg=(
    df_low.groupby(level=0)[['s1_vert','t1_opt','t2_vert','s1_vert_lcpbe','s1_vert_wb97xd']]
    .agg(['std','median','max','min','count','mean'])
    .swaplevel(0,1,axis=1)
)

#reax_s1.loc(('s1_vert','max_min'))=reax_s1.loc(('s1_vert','max'))-reax_s1.loc(('s1_vert','max'))

In [None]:

#df_s1_agg.loc[:,(slice(None),'s1_vert')][:10]
#df_s1_agg['max_min']=(df_s1_agg.loc[:,'max']-df_s1_agg.loc[:,'min'])








In [None]:
# Compare the aggregated 's1_vert' values with the data in df_reaxys1.
# Take the per-group statistics of 's1_vert' out of df_s1_agg (column level 1
# holds the quantity name) and add the max-min spread of each group.
df_b3lyp_s1=df_s1_agg.xs('s1_vert',level=1,axis=1).copy()
df_b3lyp_s1['max_min']=df_b3lyp_s1['max']-df_b3lyp_s1['min']
# Inner join on the index; right-hand columns whose names collide with a
# df_reaxys1 column get the '_s1' suffix.
# NOTE(review): 'max_min_s1' and 'mean_s1' used below only exist if df_reaxys1
# already has 'max_min' and 'mean' columns -- confirm.
df_evt_b3lyp_s1=df_reaxys1.join(df_b3lyp_s1,how='inner',rsuffix='_s1')

# Per-row view: keep rows that have both the computed value and 'mean_rj',
# then rows whose 'max_min' is below 0.2.
s1='s1_vert'
df_expvstheo=df_expvstheo1.dropna(subset=[s1,'mean_rj']).copy()
df_expvstheo=df_expvstheo[df_expvstheo['max_min']<0.2]
#df_expvstheo=df_expvstheo[df_expvstheo['std_rj'].fillna(0.)<0.1]
#df_expvstheo=df_expvstheo[df_expvstheo['mean_rj']>320]
print(df_expvstheo[s1].count())
#df_expvstheo['s1_exp']=NM_TO_EV/df_expvstheo['max']
# NOTE(review): 'mean_rj' is used here without unit conversion -- confirm it is
# in the same units as the computed column.
df_expvstheo['s1_exp']=df_expvstheo['mean_rj']
df_expvstheo['s1_diff']=df_expvstheo['s1_exp']-df_expvstheo[s1]
#df_expvstheo=df_expvstheo[np.abs(df_expvstheo['s1_diff'])<1]
#df_expvstheo=df_expvstheo[np.abs(df_expvstheo['s1_diff'])<0.8]

#df_expvstheo[['mol',s1,'s1_exp','s1_diff']].to_pickle('/home/denn/home/ml/data/gp/anth_exp_theo_b3lyp_reaxys2.pickle')
#df_expvstheo[['s1_diff','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_diff_b3lyp_reaxys2.csv',header=False,index=False)
#df_expvstheo[['s1_exp','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_exp_b3lyp_reaxys2.csv',header=False,index=False)
#df_expvstheo[['s1_vert','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_b3lyp_b3lyp_reaxys2.csv',header=False,index=False)

#plot_calc_vs_theo(df_expvstheo,s1,'s1_exp',1200, 1200, True)
# SLOPE and INT are only referenced from commented-out code in this cell.
SLOPE=1.05
INT=0
# Per-group view: require a spread below 0.2 in both 'max_min_s1' (the
# computed-value spread added above) and 'max_min' (from df_reaxys1).
df_evt_b3lyp_s1=df_evt_b3lyp_s1[df_evt_b3lyp_s1.max_min_s1<0.2]
print(df_evt_b3lyp_s1.shape)
df_evt_b3lyp_s1=df_evt_b3lyp_s1[df_evt_b3lyp_s1.max_min<0.2]
print(df_evt_b3lyp_s1.shape)
# Reference value: the 'min' column; computed value: the group mean.
df_evt_b3lyp_s1['s1_exp']=df_evt_b3lyp_s1['min']
df_evt_b3lyp_s1['s1_vert']=df_evt_b3lyp_s1['mean_s1']#*SLOPE

# Outlier rejection on the (s1_exp, s1_vert) pairs: fit an elliptic envelope
# with a contamination of 0.04 and keep the rows predicted as inliers (+1).
# NOTE(review): no random_state is set, so the fit may differ between runs.
X=df_evt_b3lyp_s1[['s1_exp','s1_vert']].values
from sklearn.covariance import EllipticEnvelope
outliers_fraction = 0.04
clf=EllipticEnvelope(contamination=outliers_fraction)
clf.fit(X)
y_pred = clf.predict(X)
df_evt_b3lyp_s1=df_evt_b3lyp_s1[y_pred==1]
print(df_evt_b3lyp_s1.shape)

# The self-assignment is a no-op while the SLOPE scaling stays commented out.
df_evt_b3lyp_s1['s1_vert']=df_evt_b3lyp_s1['s1_vert']#*SLOPE
df_evt_b3lyp_s1['s1_diff']=df_evt_b3lyp_s1['s1_exp']-df_evt_b3lyp_s1['s1_vert']
#df_evt_b3lyp_s1=df_evt_b3lyp_s1[np.abs(df_evt_b3lyp_s1['s1_diff'])<0.6]
#print(df_evt_b3lyp_s1.shape)

#plot_calc_vs_theo(df_evt_b3lyp_s1,'s1_vert','s1_exp',800, 800, False,True)
# Positional arguments: height=800, width=800, errors=False, gen_html=False.
plot_calc_vs_theo(df_evt_b3lyp_s1,'s1_vert','s1_exp',800, 800, False,False)
#df_evt_b3lyp_s1[['mol','s1_vert','s1_exp','s1_diff']].to_pickle('/home/denn/home/ml/data/gp/reaxys2_s1_exp_theo_b3lyp.pickle')

from sklearn.metrics import mean_absolute_error,mean_squared_error
def print_errors(title:str,exp:pd.DataFrame,calc:pd.DataFrame):
    """Print the mean absolute error and the RMSE between ``exp`` and ``calc``.

    Args:
        title: label printed in front of the two numbers.
        exp: reference values (callers in this notebook pass a single column).
        calc: values compared against ``exp``; same length as ``exp``.
    """
    mae_value = mean_absolute_error(exp, calc)
    # RMSE is the square root of the mean squared error.
    rmse_value = np.sqrt(mean_squared_error(exp, calc))
    print(title, 'MAE:', mae_value, 'RMSE', rmse_value)

df_=df_evt_b3lyp_s1
print(df_[s1].count())
#print_errors('s1 mode',NM_TO_EV/df_['mode'], df_[s1])
print_errors('s1 mean',df_['mean'], df_[s1])
print_errors('s1 mean_rj',df_['mean_rj'], df_[s1])
print_errors('s1 median',df_['median'], df_[s1])
print_errors('s1 min',df_['min'], df_[s1])
print('NM:')
print_errors('s1 mean',NM_TO_EV/df_['mean'], NM_TO_EV/df_[s1])
print_errors('s1 mean_rj',NM_TO_EV/df_['mean_rj'], NM_TO_EV/df_[s1])
print_errors('s1 median',NM_TO_EV/df_['median'], NM_TO_EV/df_[s1])
print_errors('s1 min',NM_TO_EV/df_['min'], NM_TO_EV/df_[s1])

In [None]:
# Compare the aggregated 's1_vert_lcpbe' values with the data in df_reaxys1.
# Take the per-group statistics of that column out of df_s1_agg (column level 1
# holds the quantity name) and add the max-min spread of each group.
df_lcpbe_s1=df_s1_agg.xs('s1_vert_lcpbe',level=1,axis=1).copy()
df_lcpbe_s1['max_min']=df_lcpbe_s1['max']-df_lcpbe_s1['min']
# Inner join on the index; right-hand columns whose names collide with a
# df_reaxys1 column get the '_s1' suffix.
# NOTE(review): 'max_min_s1' and 'mean_s1' used below only exist if df_reaxys1
# already has 'max_min' and 'mean' columns -- confirm.
df_lcpbe_s1=df_reaxys1.join(df_lcpbe_s1,how='inner',rsuffix='_s1')

# Per-row view: keep rows that have both the computed value and 'mean_rj',
# then rows whose 'max_min' is below 0.2.
s1='s1_vert_lcpbe'
df_expvstheo=df_expvstheo1.dropna(subset=[s1,'mean_rj']).copy()
df_expvstheo=df_expvstheo[df_expvstheo['max_min']<0.2]
#df_expvstheo=df_expvstheo[df_expvstheo['std_rj'].fillna(0.)<0.1]
#df_expvstheo=df_expvstheo[df_expvstheo['mean_rj']>320]
print(df_expvstheo[s1].count())
#df_expvstheo['s1_exp']=NM_TO_EV/df_expvstheo['max']
# NOTE(review): 'mean_rj' is used here without unit conversion -- confirm it is
# in the same units as the computed column.
df_expvstheo['s1_exp']=df_expvstheo['mean_rj']
df_expvstheo['s1_diff']=df_expvstheo['s1_exp']-df_expvstheo[s1]
#df_expvstheo=df_expvstheo[np.abs(df_expvstheo['s1_diff'])<1]
#df_expvstheo=df_expvstheo[np.abs(df_expvstheo['s1_diff'])<0.8]

#df_expvstheo[['mol',s1,'s1_exp','s1_diff']].to_pickle('/home/denn/home/ml/data/gp/anth_exp_theo_b3lyp_reaxys2.pickle')
#df_expvstheo[['s1_diff','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_diff_b3lyp_reaxys2.csv',header=False,index=False)
#df_expvstheo[['s1_exp','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_exp_b3lyp_reaxys2.csv',header=False,index=False)
#df_expvstheo[['s1_vert','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_b3lyp_b3lyp_reaxys2.csv',header=False,index=False)

# Positional arguments: height=1200, width=1200, errors=True.
plot_calc_vs_theo(df_expvstheo,s1,'s1_exp',1200, 1200, True)
# SLOPE=1.05
# INT=0
# Per-group view: require a spread below 0.2 in both 'max_min_s1' (the
# computed-value spread added above) and 'max_min' (from df_reaxys1).
df_lcpbe_s1=df_lcpbe_s1[df_lcpbe_s1.max_min_s1<0.2]
print(df_lcpbe_s1.shape)
df_lcpbe_s1=df_lcpbe_s1[df_lcpbe_s1.max_min<0.2]
print(df_lcpbe_s1.shape)
# Reference value: the 'min' column; computed value: the group mean. The
# computed value is stored under the generic name 's1_vert'.
df_lcpbe_s1['s1_exp']=df_lcpbe_s1['min']
df_lcpbe_s1['s1_vert']=df_lcpbe_s1['mean_s1']#*SLOPE

# Outlier rejection is disabled for this cell.
# X=df_lcpbe_s1[['s1_exp','s1_vert']].values
# from sklearn.covariance import EllipticEnvelope
# outliers_fraction = 0.04
# clf=EllipticEnvelope(contamination=outliers_fraction)
# clf.fit(X)
# y_pred = clf.predict(X)
# df_lcpbe_s1=df_lcpbe_s1[y_pred==1]
# print(df_lcpbe_s1.shape)

# The self-assignment is a no-op while the SLOPE scaling stays commented out.
df_lcpbe_s1['s1_vert']=df_lcpbe_s1['s1_vert']#*SLOPE
df_lcpbe_s1['s1_diff']=df_lcpbe_s1['s1_exp']-df_lcpbe_s1['s1_vert']
#df_lcpbe_s1=df_lcpbe_s1[np.abs(df_lcpbe_s1['s1_diff'])<0.6]
#print(df_lcpbe_s1.shape)

#plot_calc_vs_theo(df_lcpbe_s1,'s1_vert','s1_exp',800, 800, False,True)
# Positional arguments: height=800, width=800, errors=False, gen_html=False.
plot_calc_vs_theo(df_lcpbe_s1,'s1_vert','s1_exp',800, 800, False,False)
# Side effect: writes the selected columns to a pickle file at a fixed path.
df_lcpbe_s1[['mol','s1_vert','s1_exp','s1_diff']].to_pickle('/home/denn/home/ml/data/gp/reaxys2_s1_exp_theo_lcpbe.pickle')

from sklearn.metrics import mean_absolute_error,mean_squared_error
def print_errors(title:str,exp:pd.DataFrame,calc:pd.DataFrame):
    """Print the mean absolute error and the RMSE between ``exp`` and ``calc``.

    Args:
        title: label printed in front of the two numbers.
        exp: reference values (callers in this notebook pass a single column).
        calc: values compared against ``exp``; same length as ``exp``.
    """
    mae_value = mean_absolute_error(exp, calc)
    # RMSE is the square root of the mean squared error.
    rmse_value = np.sqrt(mean_squared_error(exp, calc))
    print(title, 'MAE:', mae_value, 'RMSE', rmse_value)

df_=df_lcpbe_s1
s1='s1_vert'
print(df_[s1].count())
#print_errors('s1 mode',NM_TO_EV/df_['mode'], df_[s1])
print_errors('s1 mean',df_['mean'], df_[s1])
print_errors('s1 mean_rj',df_['mean_rj'], df_[s1])
print_errors('s1 median',df_['median'], df_[s1])
print_errors('s1 min',df_['min'], df_[s1])
print('NM:')
print_errors('s1 mean',NM_TO_EV/df_['mean'], NM_TO_EV/df_[s1])
print_errors('s1 mean_rj',NM_TO_EV/df_['mean_rj'], NM_TO_EV/df_[s1])
print_errors('s1 median',NM_TO_EV/df_['median'], NM_TO_EV/df_[s1])
print_errors('s1 min',NM_TO_EV/df_['min'], NM_TO_EV/df_[s1])

In [None]:
# Compare the aggregated 's1_vert_wb97xd' values with the data in df_reaxys1.
# Take the per-group statistics of that column out of df_s1_agg (column level 1
# holds the quantity name) and add the max-min spread of each group.
df_wb97xd_s1=df_s1_agg.xs('s1_vert_wb97xd',level=1,axis=1).copy()
df_wb97xd_s1['max_min']=df_wb97xd_s1['max']-df_wb97xd_s1['min']
# Inner join on the index; right-hand columns whose names collide with a
# df_reaxys1 column get the '_s1' suffix.
# NOTE(review): 'max_min_s1' and 'mean_s1' used below only exist if df_reaxys1
# already has 'max_min' and 'mean' columns -- confirm.
df_wb97xd_s1=df_reaxys1.join(df_wb97xd_s1,how='inner',rsuffix='_s1')

# Per-row view: keep rows that have both the computed value and 'mean_rj',
# then rows whose 'max_min' is below 0.2.
s1='s1_vert_wb97xd'
df_expvstheo=df_expvstheo1.dropna(subset=[s1,'mean_rj']).copy()
df_expvstheo=df_expvstheo[df_expvstheo['max_min']<0.2]
#df_expvstheo=df_expvstheo[df_expvstheo['std_rj'].fillna(0.)<0.1]
#df_expvstheo=df_expvstheo[df_expvstheo['mean_rj']>320]
print(df_expvstheo[s1].count())
#df_expvstheo['s1_exp']=NM_TO_EV/df_expvstheo['max']
# NOTE(review): 'mean_rj' is used here without unit conversion -- confirm it is
# in the same units as the computed column.
df_expvstheo['s1_exp']=df_expvstheo['mean_rj']
df_expvstheo['s1_diff']=df_expvstheo['s1_exp']-df_expvstheo[s1]
#df_expvstheo=df_expvstheo[np.abs(df_expvstheo['s1_diff'])<1]
#df_expvstheo=df_expvstheo[np.abs(df_expvstheo['s1_diff'])<0.8]

#df_expvstheo[['mol',s1,'s1_exp','s1_diff']].to_pickle('/home/denn/home/ml/data/gp/anth_exp_theo_b3lyp_reaxys2.pickle')
#df_expvstheo[['s1_diff','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_diff_b3lyp_reaxys2.csv',header=False,index=False)
#df_expvstheo[['s1_exp','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_exp_b3lyp_reaxys2.csv',header=False,index=False)
#df_expvstheo[['s1_vert','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_b3lyp_b3lyp_reaxys2.csv',header=False,index=False)

#plot_calc_vs_theo(df_expvstheo,s1,'s1_exp',1200, 1200, True)
# SLOPE=1.05
# INT=0
# Per-group view: require a spread below 0.2 in both 'max_min_s1' (the
# computed-value spread added above) and 'max_min' (from df_reaxys1).
df_wb97xd_s1=df_wb97xd_s1[df_wb97xd_s1.max_min_s1<0.2]
print(df_wb97xd_s1.shape)
df_wb97xd_s1=df_wb97xd_s1[df_wb97xd_s1.max_min<0.2]
print(df_wb97xd_s1.shape)
# Reference value: the 'min' column; computed value: the group mean. The
# computed value is stored under the generic name 's1_vert'.
df_wb97xd_s1['s1_exp']=df_wb97xd_s1['min']
df_wb97xd_s1['s1_vert']=df_wb97xd_s1['mean_s1']#*SLOPE

# Outlier rejection is disabled for this cell.
# X=df_wb97xd_s1[['s1_exp','s1_vert']].values
# from sklearn.covariance import EllipticEnvelope
# outliers_fraction = 0.04
# clf=EllipticEnvelope(contamination=outliers_fraction)
# clf.fit(X)
# y_pred = clf.predict(X)
# df_wb97xd_s1=df_wb97xd_s1[y_pred==1]
# print(df_wb97xd_s1.shape)

# The self-assignment is a no-op while the SLOPE scaling stays commented out.
df_wb97xd_s1['s1_vert']=df_wb97xd_s1['s1_vert']#*SLOPE
df_wb97xd_s1['s1_diff']=df_wb97xd_s1['s1_exp']-df_wb97xd_s1['s1_vert']
#df_wb97xd_s1=df_wb97xd_s1[np.abs(df_wb97xd_s1['s1_diff'])<0.6]
#print(df_wb97xd_s1.shape)

#plot_calc_vs_theo(df_wb97xd_s1,'s1_vert','s1_exp',800, 800, False,True)
# Positional arguments: height=800, width=800, errors=False, gen_html=False.
plot_calc_vs_theo(df_wb97xd_s1,'s1_vert','s1_exp',800, 800, False,False)
# Side effect: writes the selected columns to a pickle file at a fixed path.
df_wb97xd_s1[['mol','s1_vert','s1_exp','s1_diff']].to_pickle('/home/denn/home/ml/data/gp/reaxys2_s1_exp_theo_wb97xd.pickle')

from sklearn.metrics import mean_absolute_error,mean_squared_error
def print_errors(title:str,exp:pd.DataFrame,calc:pd.DataFrame):
    """Print the mean absolute error and the RMSE between ``exp`` and ``calc``.

    Args:
        title: label printed in front of the two numbers.
        exp: reference values (callers in this notebook pass a single column).
        calc: values compared against ``exp``; same length as ``exp``.
    """
    mae_value = mean_absolute_error(exp, calc)
    # RMSE is the square root of the mean squared error.
    rmse_value = np.sqrt(mean_squared_error(exp, calc))
    print(title, 'MAE:', mae_value, 'RMSE', rmse_value)

df_=df_wb97xd_s1
s1='s1_vert'
print(df_[s1].count())
#print_errors('s1 mode',NM_TO_EV/df_['mode'], df_[s1])
print_errors('s1 mean',df_['mean'], df_[s1])
print_errors('s1 mean_rj',df_['mean_rj'], df_[s1])
print_errors('s1 median',df_['median'], df_[s1])
print_errors('s1 min',df_['min'], df_[s1])
print('NM:')
print_errors('s1 mean',NM_TO_EV/df_['mean'], NM_TO_EV/df_[s1])
print_errors('s1 mean_rj',NM_TO_EV/df_['mean_rj'], NM_TO_EV/df_[s1])
print_errors('s1 median',NM_TO_EV/df_['median'], NM_TO_EV/df_[s1])
print_errors('s1 min',NM_TO_EV/df_['min'], NM_TO_EV/df_[s1])

In [None]:
#show(df_expvstheo[:5])

In [None]:
# Per-row comparison of 's1_vert_lcpbe' against 'mean_rj' in df_expvstheo1.
s1='s1_vert_lcpbe'
# Keep rows with both values present, a 'max_min' below 0.2 and a 'std_rj'
# below 0.1 (a missing 'std_rj' counts as 0).
df_expvstheo=df_expvstheo1.dropna(subset=[s1,'mean_rj']).copy()
df_expvstheo=df_expvstheo[df_expvstheo['max_min']<0.2]
df_expvstheo=df_expvstheo[df_expvstheo['std_rj'].fillna(0.)<0.1]
#df_expvstheo=df_expvstheo[df_expvstheo['mean_rj']>320]
print(df_expvstheo['Structure'].count())
#df_expvstheo['s1_exp']=NM_TO_EV/df_expvstheo['max']
# NOTE(review): 'mean_rj' is used here without unit conversion -- confirm it is
# in the same units as the computed column.
df_expvstheo['s1_exp']=df_expvstheo['mean_rj']
df_expvstheo['s1_diff']=df_expvstheo['s1_exp']-df_expvstheo[s1]
#df_expvstheo=df_expvstheo[np.abs(df_expvstheo['s1_diff'])<1]
#df_expvstheo=df_expvstheo[np.abs(df_expvstheo['s1_diff'])<0.8]

#df_expvstheo[['mol',s1,'s1_exp','s1_diff']].to_pickle('/home/denn/home/ml/data/gp/anth_exp_theo_b3lyp_reaxys2.pickle')
#df_expvstheo[['s1_diff','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_diff_b3lyp_reaxys2.csv',header=False,index=False)
#df_expvstheo[['s1_exp','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_exp_b3lyp_reaxys2.csv',header=False,index=False)
#df_expvstheo[['s1_vert','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_b3lyp_b3lyp_reaxys2.csv',header=False,index=False)

# Default size (800x800) with error bars enabled.
plot_calc_vs_theo(df_expvstheo,s1,'s1_exp')

from sklearn.metrics import mean_absolute_error,mean_squared_error
def print_errors(title:str,exp:pd.DataFrame,calc:pd.DataFrame):
    """Print the mean absolute error and the RMSE between ``exp`` and ``calc``.

    Args:
        title: label printed in front of the two numbers.
        exp: reference values (callers in this notebook pass a single column).
        calc: values compared against ``exp``; same length as ``exp``.
    """
    mae_value = mean_absolute_error(exp, calc)
    # RMSE is the square root of the mean squared error.
    rmse_value = np.sqrt(mean_squared_error(exp, calc))
    print(title, 'MAE:', mae_value, 'RMSE', rmse_value)

df_=df_expvstheo
print(df_['Structure'].count())
#print_errors('s1 mode',NM_TO_EV/df_['mode'], df_[s1])
print_errors('s1 mean',df_['mean'], df_[s1])
print_errors('s1 mean_rj',df_['mean_rj'], df_[s1])
print_errors('s1 median',df_['median'], df_[s1])
print_errors('s1 max',df_['max'], df_[s1])
print('NM:')
print_errors('s1 mean',NM_TO_EV/df_['mean'], NM_TO_EV/df_[s1])
print_errors('s1 mean_rj',NM_TO_EV/df_['mean_rj'], NM_TO_EV/df_[s1])
print_errors('s1 median',NM_TO_EV/df_['median'], NM_TO_EV/df_[s1])
print_errors('s1 max',NM_TO_EV/df_['max'], NM_TO_EV/df_[s1])


In [None]:
# Per-row comparison of 's1_vert_wb97xd' against 'mean_rj' in df_expvstheo1.
s1='s1_vert_wb97xd'
# Keep rows with both values present and a 'std_rj' below 0.1 (a missing
# 'std_rj' counts as 0). The 'max_min' filter is disabled in this cell.
df_expvstheo=df_expvstheo1.dropna(subset=[s1,'mean_rj']).copy()
#df_expvstheo=df_expvstheo[df_expvstheo['max_min']<0.2]
df_expvstheo=df_expvstheo[df_expvstheo['std_rj'].fillna(0.)<0.1]
#df_expvstheo=df_expvstheo[df_expvstheo['mean_rj']>320]
print(df_expvstheo['Structure'].count())
#df_expvstheo['s1_exp']=NM_TO_EV/df_expvstheo['max']
# NOTE(review): 'mean_rj' is used here without unit conversion -- confirm it is
# in the same units as the computed column.
df_expvstheo['s1_exp']=df_expvstheo['mean_rj']
df_expvstheo['s1_diff']=df_expvstheo['s1_exp']-df_expvstheo[s1]
#df_expvstheo=df_expvstheo[np.abs(df_expvstheo['s1_diff'])<1]
#df_expvstheo=df_expvstheo[np.abs(df_expvstheo['s1_diff'])<0.8]

#df_expvstheo[['mol',s1,'s1_exp','s1_diff']].to_pickle('/home/denn/home/ml/data/gp/anth_exp_theo_b3lyp_reaxys2.pickle')
#df_expvstheo[['s1_diff','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_diff_b3lyp_reaxys2.csv',header=False,index=False)
#df_expvstheo[['s1_exp','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_exp_b3lyp_reaxys2.csv',header=False,index=False)
#df_expvstheo[['s1_vert','smiles']].to_csv('/home/denn/home/ml/data/gp/anth_b3lyp_b3lyp_reaxys2.csv',header=False,index=False)

# Default size (800x800) with error bars enabled.
plot_calc_vs_theo(df_expvstheo,s1,'s1_exp')

from sklearn.metrics import mean_absolute_error,mean_squared_error
def print_errors(title:str,exp:pd.DataFrame,calc:pd.DataFrame):
    """Print the mean absolute error and the RMSE between ``exp`` and ``calc``.

    Args:
        title: label printed in front of the two numbers.
        exp: reference values (callers in this notebook pass a single column).
        calc: values compared against ``exp``; same length as ``exp``.
    """
    mae_value = mean_absolute_error(exp, calc)
    # RMSE is the square root of the mean squared error.
    rmse_value = np.sqrt(mean_squared_error(exp, calc))
    print(title, 'MAE:', mae_value, 'RMSE', rmse_value)

df_=df_expvstheo
print(df_['Structure'].count())
#print_errors('s1 mode',NM_TO_EV/df_['mode'], df_[s1])
print_errors('s1 mean',df_['mean'], df_[s1])
print_errors('s1 mean_rj',df_['mean_rj'], df_[s1])
print_errors('s1 median',df_['median'], df_[s1])
print_errors('s1 max',df_['max'], df_[s1])
print('NM:')
print_errors('s1 mean',NM_TO_EV/df_['mean'], NM_TO_EV/df_[s1])
print_errors('s1 mean_rj',NM_TO_EV/df_['mean_rj'], NM_TO_EV/df_[s1])
print_errors('s1 median',NM_TO_EV/df_['median'], NM_TO_EV/df_[s1])
print_errors('s1 max',NM_TO_EV/df_['max'], NM_TO_EV/df_[s1])


In [None]:
dfexp = pd.read_csv('/home/denn/harvard/SF/Library/exp_set_1_th_no_pentderiv.csv')
dfexp['block'] = dfexp.smiles.apply(lambda s: block.Block(smiles=s))
dfexp['mol'] = dfexp.block.apply(lambda b: b.mol)
dfexp['mol__inchikey'] = dfexp.block.apply(lambda b: b.inchikey).str.split('-').str[0]
dfexp = dfexp.set_index('mol__inchikey')

df_expt1 = pd.merge(df_all, dfexp, how='right', right_index=True, left_index=True)

In [None]:
show(df_expt1['mol'].to_frame())

In [None]:
df_d=df_expvstheo1.copy()
df_d['s1_exp']=NM_TO_EV/df_d['mean_rj']
df_d['2t1_opt']=df_d['t1_opt']*2
df_d['criteria1']=df_d['s1_exp']-df_d['2t1_opt']
df_d['criteria2']=df_d['t2_vert']-df_d['2t1_opt']
#df_d1=df_d.reset_index().groupby('mol__inchikey').describe()#.to_frame()
df_d=df_d[df_d['criteria1']>-0.2]
df_d=df_d[df_d['criteria2']>-0.2]
df_d=df_d[df_d['t1_opt']>1.4]
df_d=df_d[df_d['std_rj']<5]
df_d=df_d[np.abs(df_d['s1_exp']-df_d['s1_vert'])<0.3]
#df_d=df_d.reset_index().groupby('mol__inchikey').describe()#.to_frame()
df_d=df_d.reset_index().groupby('mol__inchikey').max()#.to_frame()
#df_d=pd.merge(df_d, df_smiles, how='right', right_index=True, left_index=True)
df_d=df_d.sort_values('criteria1',ascending=False)
df_d=df_d.sort_values('t1_opt',ascending=False)
df_d=df_d.join(df_smiles)
df_d['mol']=df_d.mol__smiles.apply(Chem.MolFromSmiles)
#print(df_d.index)

df_match=pd.merge(df_d,dfxray,how='left',right_index=True,left_index=True,suffixes=('','_x'))
df_match=df_match.sort_values('t1_opt',ascending=False)
#df_match=df_match.sort_values('criteria1',ascending=False)
print(df_d['mol'].count())
print(df_match['ccdc'].count())
#show(df_mae[[0,'mol']])
#pd.options.display.float_format = '{:,.2f}'.format
#show(df_d[['mol','t1_opt','s1_vert','2t1_opt','criteria1','criteria2','geoms__parents','mol__smiles']])
#show(df_d[['mol','t1_opt','s1_vert','2t1_opt','criteria1','t2_vert','criteria2']])
#show(df_match[['mol','ccdc','t1_opt','s1_vert','2t1_opt','criteria1','t2_vert','criteria2','geoms__parents']][df_match['ccdc'].notnull()])
#show(df_match[['mol','ccdc','t1_opt','s1_exp','s1_vert','2t1_opt','criteria1','t2_vert','criteria2','std_rj','count']])
#df_match[['mol','ccdc','t1_opt','s1_vert','2t1_opt','criteria1','t2_vert','criteria2']].to_html('/home/denn/harvard/SF/Meetings/18Oct2016/anthracene_derivatives_t1_1400meV.html',escape=False)
#df_match[['mol','ccdc','t1_opt','s1_vert','2t1_opt','criteria1','t2_vert','criteria2']].to_html('/home/denn/harvard/SF/Meetings/18Oct2016/anthracene_results.html',escape=False)
# subprocess.call(
#     'wkhtmltoimage -f png --width 0 table.html table.png', shell=True)

In [None]:
df_emol=pd.read_pickle('/home/denn/home/ml/data/emolecules.pickle')

In [None]:
df_match_emol=pd.merge(df_match,df_emol,right_index=True,left_index=True,how='left')

In [None]:
#print(df_match_emol.count())
show(df_match_emol[df_match_emol['isosmiles'].notnull()][:10])

In [None]:
#df_min_1=df_min.reset_index().groupby('mol__inchikey').max()#.to_frame()
#dfmatch=pd.merge(df_d,dfxray,right_index=True,left_index=True)
#show(dfmatch[['mol_x','ccdc','smiles']])
#show(df_min_1[:1])
print(dfmatch['mol_x'].count())

In [None]:
def plot_box(dfi:pd.DataFrame, calc, exp=None):
    """Draw one box plot per index-level-0 group of a column or column difference.

    Args:
        dfi: source frame. It is only read; no column is added to it.
        calc: name of the column to plot.
        exp: optional name of a reference column. When given, the plotted
            quantity is ``dfi[calc] - dfi[exp]``, otherwise ``dfi[calc]``.

    Returns:
        Whatever ``iplot`` returns for the assembled figure.
    """
    # Build the plotted quantity as a separate Series rather than writing a
    # 'diff' column into the caller's frame.
    if exp is None:
        diff = dfi[calc]
    else:
        diff = dfi[calc]-dfi[exp]

    traces = [
        Box(
            y=values.values,
            name=mol,
            boxpoints='all',
            jitter=0.1,
            whiskerwidth=0.2,
            #fillcolor=cls,
            marker=dict(
                size=2,
            ),
            line=dict(width=1),
            boxmean='sd',
        )
        for mol, values in diff.groupby(level=0)
    ]

    layout = Layout(
        title=calc,
        yaxis=dict(
            autorange=True,
            showgrid=True,
            zeroline=True,
            dtick=0.05,
            #gridcolor='rgb(255, 255, 255)',
            gridwidth=1,
            #zerolinecolor='rgb(255, 255, 255)',
            zerolinewidth=2,
        ),
        #paper_bgcolor='rgb(243, 243, 243)',
        #plot_bgcolor='rgb(243, 243, 243)',
        showlegend=False,
        height=800,
    )

    fig = dict( data=traces, layout=layout )

    return iplot(fig)

In [None]:
plot_box(df_low,'s1_vert')
plot_box(df_low,'t1_opt')

In [None]:
# def mm(group, y_true):
#     d = group[y_true]
#     #w = group[y_pred]
#     try:
#         return np.mean(np.abs((d - w)))
#     except ZeroDivisionError:
#         return d.mean()
# Work on an explicit copy: assigning new columns to a column-subset of df_all
# is chained assignment and triggers pandas' SettingWithCopyWarning.
df_d=df_all[['t1_opt','s1_vert','t2_vert','dft_energy']].copy()

# criteria1 = S1 - 2*T1 and criteria2 = T2 - 2*T1.
df_d['2t1_opt']=df_d['t1_opt']*2
df_d['criteria1']=df_d['s1_vert']-df_d['2t1_opt']
df_d['criteria2']=df_d['t2_vert']-df_d['2t1_opt']
# Keep a handle on the unfiltered frame; df_d is rebound (not mutated) below.
df_d1=df_d
#df_d1=df_d.reset_index().groupby('mol__inchikey').describe()#.to_frame()
df_d=df_d[df_d['criteria1']>-0.2]
#df_d=df_d.reset_index().groupby('mol__inchikey').describe()#.to_frame()
df_d=df_d.reset_index().groupby('mol__inchikey').max()#.to_frame()
#df_d=pd.merge(df_d, df_smiles, how='right', right_index=True, left_index=True)
df_d=df_d.sort_values('criteria1',ascending=False)
df_d=df_d.join(df_smiles)
df_d['mol']=df_d.mol__smiles.apply(Chem.MolFromSmiles)
#print(df_d.index)
#show(df_mae[[0,'mol']])
#show(df_d[['mol','t1_opt','s1_vert','2t1_opt','criteria1','criteria2']])
#show(df_d1[['mol','t1_opt','s1_vert','2t1_opt','criteria1','criteria2']])
#show(df_d1.ix['BZJISJKSSPCDGT-UHFFFAOYNA-N'])
df_d1=df_d1.join(df_smiles)
#show(df_d1.loc['BZJISJKSSPCDGT-UHFFFAOYNA-N'])
show(df_d1.loc['HCSGQHDONHRJCM-UHFFFAOYNA-N'])

# df_d1[['t1_opt','s1_vert','2t1_opt','criteria1','t2_vert','criteria2']].loc['HCSGQHDONHRJCM-UHFFFAOYNA-N'].to_html('table.html',escape=False)
# subprocess.call(
#     'wkhtmltoimage -f png --width 0 table.html table.png', shell=True)

In [None]:
#plot_calc_vs_theo(df_all,'s1_vert_lcpbe','exp_s1')
#plot_calc_vs_theo(df_all,'s1_vert','exp_s1')

In [None]:
# Apply `mae` to each molecule's rows of df_min with the column names 'exp_s1'
# and 's1_vert', outer-join the result with dfexpi on the index, and order the
# rows by the `mae` value (column 0), largest first.
# NOTE(review): `mae` is defined elsewhere; presumably a mean absolute error -- confirm.
df_mae = (
    df_min.reset_index()
    .groupby('mol__inchikey')
    .apply(mae, 'exp_s1', 's1_vert')
    .to_frame()
    .merge(dfexpi, how='outer', left_index=True, right_index=True)
    .sort_values([0], ascending=False)
)
#
#show(df_mae[[0,'mol']])

In [None]:
# error_cals = Job.objects.filter(
#             status__exact='error')\
#     .order_by('config__name')
# show(pd.DataFrame(list(error_cals.values('config__name','config__parent_class_name'))))

# Geoms that have a child job with status 'error', restricted to molecules of
# the 'sf_lib' group and ordered by InChIKey.
error_mols = Geom.objects.filter(
            childjobs__status__exact='error')\
            .filter(Q(mol__group__name__exact='sf_lib') )\
            .order_by('mol__inchikey')

error_fields=['mol__inchikey','mol__smiles','childjobs__config__name','mol__tags']
# columns= is passed explicitly so the frame still has its columns when the
# query returns no rows. Without it an empty result (i.e. no job in error)
# produces a column-less frame and the 'mol__smiles' lookup below fails.
dfm=pd.DataFrame(list(error_mols.values(*error_fields)),columns=error_fields)
# RDKit molecule objects, used for rendering structures in the table.
dfm['mol']=dfm['mol__smiles'].apply(Chem.MolFromSmiles)
#dfm=dfm.set_index(['mol__inchikey','childjobs__config__name'])
show(dfm[['childjobs__config__name','mol','mol__inchikey']])

In [None]:
#            .filter(Q(mol__group__name__exact='sf_acenes') | Q(mol__group__name__exact='sf'))\

# Geoms that have a child job whose status contains 'claimed', ordered by InChIKey.
# NOTE(review): despite its name, `error_mols` holds *claimed* jobs here; the
# name is kept because other cells may read this global.
error_mols = Geom.objects.filter(
            childjobs__status__contains='claimed')\
            .order_by('mol__inchikey')

claimed_fields=['mol__smiles','childjobs__config__name','mol__tags','childjobs__uuid']
# columns= is passed explicitly so the frame still has its columns when the
# query returns no rows; otherwise the 'mol__smiles' lookup below fails on an
# empty result (no claimed jobs).
dfm=pd.DataFrame(list(error_mols.values(*claimed_fields)),columns=claimed_fields)

# RDKit molecule objects, used for rendering structures in the table.
dfm['mol']=dfm['mol__smiles'].apply(Chem.MolFromSmiles)

#show(dfm)

In [None]:
#            .filter(Q(mol__group__name__exact='sf_acenes') | Q(mol__group__name__exact='sf'))\

# Geoms that have a child job whose status contains 'claimed', ordered by InChIKey.
# NOTE(review): `emptu_jobs` looks like a typo for `empty_jobs`; the name is
# kept in case other cells read this global.
emptu_jobs = Geom.objects.filter(
            childjobs__status__contains='claimed')\
            .order_by('mol__inchikey')

claimed_fields=['mol__smiles','childjobs__config__name','mol__tags','childjobs__uuid']
# Build the frame from the queryset defined in this cell, so the cell does not
# depend on an `error_mols` global left behind by a previously-run cell.
# columns= is passed explicitly so the frame still has its columns when the
# query returns no rows; otherwise the 'mol__smiles' lookup below fails.
dfm=pd.DataFrame(list(emptu_jobs.values(*claimed_fields)),columns=claimed_fields)

# RDKit molecule objects, used for rendering structures in the table.
dfm['mol']=dfm['mol__smiles'].apply(Chem.MolFromSmiles)

#show(dfm)

In [None]:
#grouped = df.groupby(by=['mol__inchikey'],axis=0)
#print(grouped.groups)
#print(grouped['parentjob__config__name'].head())

In [None]:
# Distinct (inchikey, job config name, job duration) rows for calcs that
#   - belong to a molecule in the 'sf' group,
#   - come from a parent job with status 'done',
#   - use a method whose name contains 'tddft',
# excluding calcs that match BOTH the 'tddft_tda_hybrid_b3lyp' method and the
# 'b3lyp_6-31gs_tddft' job config. Ordered by InChIKey.
time_calcs = (
    Calc.objects
    .filter(
        mol__group__name__exact='sf',
        parentjob__status__exact='done',
        method__name__contains='tddft',
    )
    .exclude(
        method__name__exact='tddft_tda_hybrid_b3lyp',
        parentjob__config__name__exact='b3lyp_6-31gs_tddft',
    )
    .values('mol__inchikey', 'parentjob__config__name', 'parentjob__duration')
    .distinct()
    .order_by('mol__inchikey')
)

#print(time_calcs)

# Materialise the queryset into a DataFrame, one row per values() dict.
dftime = pd.DataFrame(list(time_calcs))

#show(df.head())
#for c in all_calcs:
#    print(c.id,c.mol.inchikey, c.parentjob.config.name, c.parentjob.duration)

In [None]:
# Fetch a single molecule by InChIKey from the 'sf_acenes' or 'sf' group.
mol = (
    Mol.objects
    .filter(Q(group__name__exact='sf_acenes') | Q(group__name__exact='sf'))
    .get(inchikey='MMHOLFVKLFQKKD-UHFFFAOYNA-N')
)

# Geoms of that molecule whose parent job config has parent class name 'Mol'.
geoms = Geom.objects.filter(
    mol=mol,
    parentjob__config__parent_class_name__exact='Mol',
)

# g = geoms[0]
#
# g.
# For the first such geom only: print the calcs whose parent job's parentid
# equals the id of each child of that geom.
for g in geoms[:1]:
    for chg in g.children.all():
        calcs = Calc.objects.filter(parentjob__parentid=chg.id)
        print(calcs)


#print(geoms.values('method__name'))
#print(mols.geom_set.all().values('calcs__children__parentjob__config__name'))

In [None]:
# Job config name, job status and related geom ids for every calc of one
# molecule, selected by InChIKey and shown as a table.
jobsformol = (
    Calc.objects
    .filter(mol__inchikey__exact='MWPLVEDNUUSJAV-UHFFFAOYNA-N')
    .values(
        'parentjob__config__name',
        'parentjob__status',
        'geoms',
        'geoms__parents',
    )
)
pd.DataFrame(list(jobsformol))

In [None]:
# Distinct (inchikey, job config name, job duration) rows for every calc whose
# parent job has status 'done', excluding the 'tddft_tda_hybrid_b3lyp' method.
# Ordered by InChIKey. No molecule-group restriction is applied here.
time_calcs = (
    Calc.objects
    .filter(parentjob__status__exact='done')
    .exclude(method__name__exact='tddft_tda_hybrid_b3lyp')
    .values('mol__inchikey', 'parentjob__config__name', 'parentjob__duration')
    .distinct()
    .order_by('mol__inchikey')
)

#print(time_calcs)

# Materialise the queryset into a DataFrame, one row per values() dict.
dftime = pd.DataFrame(list(time_calcs))

#show(df.head())
#for c in all_calcs:
#    print(c.id,c.mol.inchikey, c.parentjob.config.name, c.parentjob.duration)

In [None]:
# Total of all job durations, divided by 60 twice.
# NOTE(review): this is hours only if parentjob__duration is in seconds -- confirm units.
dftime['parentjob__duration'].sum()/60/60


In [None]:
import glob
import os
import json


In [None]:
# Root directory that is searched recursively for archived job directories.
JOBS_DIR_ROOT = "/mnt/storage/jobsdirs/archive/"
#JOBS_DIR_ROOT = "/mnt/storage/jobs_sf/20Sep/"

# Every job_info.json found at any depth below the root.
job_info_glob = glob.glob(JOBS_DIR_ROOT+'/**/job_info.json',recursive=True)

# Maps the 'uuid' recorded in each job_info.json to the directory holding that file.
paths_by_uuid = {}

for job_info_path in job_info_glob:
    with open(job_info_path,'r') as job_info_file:
        json_info = json.load(job_info_file)
    # The file handle is no longer needed once the JSON has been parsed.
    job_dir_path = os.path.dirname(job_info_path)
    paths_by_uuid[json_info['uuid']] = job_dir_path

In [None]:
# Number of distinct job uuids that were mapped to a job directory.
len(paths_by_uuid)
