In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

%load_ext autotime
import os
import shutil
import yaml
import matplotlib.pyplot as plt
from scipy import interpolate
from scipy.optimize import minimize
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_hex
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import scipy.ndimage.filters as filters
import scipy.ndimage.morphology as morphology

from arc.species.species import ARCSpecies
from arc.species.converter import modify_coords, xyz_to_xyz_file_format, cluster_confs_by_rmsd, compare_confs
from arc.plotter import show_sticks
from arc.common import key_by_val
from arc.parser import parse_e_elect, parse_geometry, parse_frequencies
import rmgpy.constants as constants 
from arc.exceptions import ParserError
import re

In [2]:
cwd = os.getcwd()
print(cwd)

/home/oscarwu/Dropbox (MIT)/Scripts/TS_conf_search
time: 636 µs


In [3]:
def process_gaussian_opt_freq_output(logfile):
    if not check_gaussian_normal_termination(logfile):
        raise ParserError('Gaussian error termination.')
    info = dict()
    info['freq'] = get_gaussian_freq(logfile, checkneg=True, ts=False)
    info['xyz'] = get_gaussian_geometry(logfile, plot=False)
    info['energy'] = get_gaussian_energy(logfile)
    return info

time: 4.72 ms


In [4]:
def get_gaussian_freq(logfile, checkneg=True, ts=True):
    freq = parse_frequencies(logfile, software='gaussian')
    neg_freq = tuple([float(x) for x in freq if x < 0])
    if checkneg:
        if ts:
            if len(neg_freq) == 0:
                raise ParserError('Did not find any negative frequencies.')
            elif len(neg_freq) > 1:
                raise ParserError(f'Find more than one negative frequencies: {neg_freq}')
        else:
            if len(neg_freq):
                raise ParserError(f'Find negative frequencies for non-TS species: {neg_freq}')
    return (freq, neg_freq)

time: 2.33 ms


In [5]:
def check_gaussian_normal_termination(logfile):
    with open(logfile, 'r') as f:
        lines = f.readlines()
        forward_lines = tuple(lines)
    for line in forward_lines[-1:-20:-1]:
        if 'Normal termination' in line:
            return True
    else:
        return False

time: 1.52 ms


In [6]:
def get_gaussian_energy(logfile):
    energy_dict = dict()
    e_j_mol = parse_e_elect(logfile)
    energy_dict['J/mol'] = e_j_mol
    e_kj_mol = e_j_mol / 1000
    energy_dict['kJ/mol'] = e_kj_mol
    e_kcal_mol = e_j_mol / 4184
    energy_dict['kcal/mol'] = e_kcal_mol
    e_scf = round(e_j_mol/(constants.E_h * constants.Na / 1000), 9)
    energy_dict['scf'] = e_scf
    return energy_dict

time: 2.77 ms


In [7]:
def get_gaussian_geometry(logfile, plot=False):
    xyz = parse_geometry(logfile)
    if plot:
        show_sticks(xyz)
    return xyz

time: 1.01 ms


In [8]:
ts_name = 'imipramine_4_oo'
g16_opt_output_dir = 'imipramine_4_oo_g16_results'
g16_opt_output_dir_path = os.path.join(cwd, g16_opt_output_dir)

time: 1.13 ms


In [9]:
all_converged_ts = dict()
for f in os.listdir(g16_opt_output_dir_path):
    findex = int(f.split('_')[0])
    d1 = tuple([int(x) for x in re.search('d1(.*)deg1', f).group(1).split('_') if x.isnumeric()])
    deg1 = int(tuple([x for x in re.search('deg1(.*)n', f).group(1).split('_') if x.isnumeric()])[0])
    d2 = tuple([int(x) for x in re.search('d2(.*)deg2', f).group(1).split('_') if x.isnumeric()])
    deg2 = int(tuple([x for x in re.search('deg2(.*)g16', f).group(1).split('_') if x.isnumeric()])[0])
    
    try:
        all_converged_ts[(d1, deg1, d2, deg2, findex)] = process_gaussian_opt_freq_output(os.path.join(g16_opt_output_dir_path, f))
    except:
        continue

time: 11.2 s


In [10]:
len(all_converged_ts.keys())

222

time: 3.79 ms


In [11]:
all_converged_ts_xyz_dict = dict()
for k in all_converged_ts.keys():
    all_converged_ts_xyz_dict[k] = all_converged_ts[k]['xyz']

time: 4.28 ms


In [12]:
all_converged_ts_xyz_distinct_tuple = cluster_confs_by_rmsd(tuple(all_converged_ts_xyz_dict.values()))

time: 8.24 s


In [13]:
len(all_converged_ts_xyz_distinct_tuple)

119

time: 1.59 ms


In [14]:
all_converged_ts_energy_distinct_dict = dict()
for distinct_xyz in all_converged_ts_xyz_distinct_tuple:
    k = key_by_val(all_converged_ts_xyz_dict, distinct_xyz)
    all_converged_ts_energy_distinct_dict[k] = all_converged_ts[k]['energy']['scf']

time: 10.7 ms


In [15]:
len(all_converged_ts_energy_distinct_dict.values())

119

time: 5.41 ms


In [16]:
df = pd.DataFrame.from_dict(all_converged_ts_energy_distinct_dict, orient='index')

time: 2.15 ms


In [17]:
df

Unnamed: 0,0
"((2, 1, 7, 14), 95, (1, 3, 10, 18), 0, 110)",-457.683053
"((2, 1, 7, 14), 243, (1, 2, 4, 11), 308, 165)",-457.677178
"((1, 2, 4, 11), 0, (2, 4, 11, 21), 0, 122)",-457.688089
"((1, 3, 10, 18), 190, (2, 4, 11, 21), 205, 44)",-457.689016
"((2, 1, 7, 14), 133, (1, 3, 10, 18), 37, 112)",-457.681780
...,...
"((1, 3, 9, 15), 94, (1, 3, 10, 18), 39, 142)",-457.683053
"((2, 1, 3, 9), 91, (1, 3, 9, 15), 293, 74)",-457.677306
"((2, 1, 3, 9), 0, (1, 3, 9, 15), 295, 71)",-457.682037
"((2, 1, 3, 9), 334, (1, 2, 4, 11), 187, 33)",-457.686111


time: 15.3 ms


In [18]:
# df.to_excel('imipramine_4_oo_wb97xd.xlsx')

time: 1.05 ms


In [19]:
apfd_liq_script = """%chk={name}.chk
%mem=300000mb
%NProcShared=40

#P integral=(grid=ultrafine, Acc2E=12) SCRF=(smd,Solvent=water) uapfd/6-311+g(2d,p) scf=xqc iop(2/9=2000) 

{name}

0 2
{xyz}




"""

time: 1.62 ms


In [20]:
# apfd_liq_script = """%chk={name}.chk
# %mem=300000mb
# %NProcShared=40

# #P SCRF=(smd,Solvent=water) uwb97xd def2svp iop(2/9=2000) scf=xqc

# {name}

# 0 2
# {xyz}




# """

time: 1.63 ms


In [21]:
apfd_gas_script = """%chk={name}.chk
%mem=300000mb
%NProcShared=40

#P integral=(grid=ultrafine, Acc2E=12) uapfd/6-311+g(2d,p) scf=xqc iop(2/9=2000)   

{name}

0 2
{xyz}




"""

time: 2.05 ms


In [22]:
# apfd_gas_script = """%chk={name}.chk
# %mem=300000mb
# %NProcShared=40

# #P uwb97xd def2svp iop(2/9=2000) scf=xqc

# {name}

# 0 2
# {xyz}




# """

time: 3.79 ms


In [23]:
dlpno_script = """!uHF dlpno-ccsd(t) def2-tzvp def2-tzvp/c  
!NRSCF 
!sp 

%maxcore 6500
%pal # job parallelization settings
nprocs 20
end
%scf # recommended SCF settings 
NRMaxIt 400
NRStart 0.00005
MaxIter 500
end


* xyz 0 2
{xyz}
*


"""

time: 1.23 ms


In [24]:
# dlpno_script = """!uHF wB97X-D3 def2-svp
# !NRSCF 
# !sp 

# %CPCM SMD TRUE
#       SMDSOLVENT "WATER"
# END
# %maxcore 7000
# %pal # job parallelization settings
# nprocs 40
# end
# %scf # recommended SCF settings 
# NRMaxIt 400
# NRStart 0.00005
# MaxIter 500
# end


# * xyz 0 2
# {xyz}
# *


# """

time: 2.21 ms


In [25]:
save_folder = cwd + '/' + ts_name + '_sp_after_opt_inputs'
if os.path.exists(save_folder):
    shutil.rmtree(save_folder)
os.mkdir(save_folder)

time: 4.42 ms


In [26]:
file_counter = 0
save_batch_size = 100000
batch_folder_counter = 1  

for distinct_xyz in all_converged_ts_xyz_distinct_tuple:
    
    k = key_by_val(all_converged_ts_xyz_dict, distinct_xyz)
    
    indices_1 = k[0]
    deg_1 = k[1]
    
    indices_2 = k[2]
    deg_2 = k[3]
    
    if not file_counter % save_batch_size:
        batch_foler = 'batch_' + str(batch_folder_counter) 
        if not os.path.exists(save_folder + '/' + batch_foler):
            os.mkdir(save_folder + '/' + batch_foler)
            
            apfd_gas_dir_path = os.path.join(save_folder, batch_foler, 'apfd_gas')
            apfd_liq_dir_path = os.path.join(save_folder, batch_foler, 'apfd_liq')
#             dlpno_dir_path = os.path.join(save_folder, batch_foler, 'dlpno')

            os.mkdir(apfd_gas_dir_path)
            os.mkdir(apfd_liq_dir_path)
#             os.mkdir(dlpno_dir_path)
    
            batch_folder_counter += 1

    file_counter += 1

    xyz_str = xyz_to_xyz_file_format(distinct_xyz)
    xyz_str = '\n'.join(xyz_str.split('\n')[2:-1])

    d1str = "{0:.4g}".format(deg_1)
    d2str = "{0:.4g}".format(deg_2)

    d1name = '_'.join([str(elem) for elem in indices_1])
    d2name = '_'.join([str(elem) for elem in indices_2])
    comb_name_list = ['d1', d1name, 'deg1', d1str, 'n', 'd2', d2name, 'deg2', d2str]
    comb_name = '_'.join(comb_name_list)
    
    file_base_name = str(file_counter) + '_' + ts_name + '_' + comb_name
        
    apfd_gas_file_base_name = file_base_name + '_apfd_gas'
    apfd_liq_file_base_name = file_base_name + '_apfd_liq'
#     dlpno_file_base_name = file_base_name + '_dlpno'

#     apfd_gas_file_base_name = file_base_name + '_wb97xd_def2svp_gas'
#     apfd_liq_file_base_name = file_base_name + '_wb97xd_def2svp_liq'
#     dlpno_file_base_name = file_base_name + '_dlpno_wb97xd_def2svp_liq'
    
    apfd_gas_file_path = os.path.join(apfd_gas_dir_path, apfd_gas_file_base_name + '.gjf')
    with open(apfd_gas_file_path, 'wt') as f:
        f.write(apfd_gas_script.format(name=apfd_gas_file_base_name, xyz=xyz_str))
    
    
    apfd_liq_file_path = os.path.join(apfd_liq_dir_path, apfd_liq_file_base_name + '.gjf')
    with open(apfd_liq_file_path, 'wt') as f:
        f.write(apfd_liq_script.format(name=apfd_liq_file_base_name, xyz=xyz_str))
    
    
#     dlpno_file_path = os.path.join(dlpno_dir_path, dlpno_file_base_name + '.in')
#     with open(dlpno_file_path, 'wt') as f:
#         f.write(dlpno_script.format(xyz=xyz_str))

time: 49.5 ms


In [27]:
# with open(os.path.join(cwd, 'all_converged_ts_energy_distinct_g16.yml'), 'w') as outfile:
#     yaml.dump(all_converged_ts_energy_distinct_dict, outfile, default_flow_style=False)

time: 530 µs


In [28]:
# def highlight_max(data, color='yellow'):
#     '''
#     highlight the maximum in a Series or DataFrame
#     '''
#     attr = 'background-color: {}'.format(color)
#     if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
#         is_max = data == data.max()
#         return [attr if v else '' for v in is_max]
#     else:  # from .apply(axis=None)
#         is_max = data == data.max().max()
#         return pd.DataFrame(np.where(is_max, attr, ''),
#                             index=data.index, columns=data.columns)
# def highlight_min(data, color='lightgreen'):
#     '''
#     highlight the minimum in a Series or DataFrame
#     '''
#     attr = 'background-color: {}'.format(color)
#     if data.ndim == 1:  # Series from .apply(axis=0) or axis=1
#         is_min = data == data.min()
#         return [attr if v else '' for v in is_min]
#     else:  # from .apply(axis=None)
#         is_min = data == data.min().min()
#         return pd.DataFrame(np.where(is_min, attr, ''),
#                             index=data.index, columns=data.columns)

time: 1.63 ms


In [29]:
# df = pd.DataFrame(all_converged_ts_energy_distinct_dict.values()).sort_values(by=[0])
# # df = df - np.nanmin(df.to_numpy())
# cm = ListedColormap(sns.color_palette("coolwarm", 7))
# df.style.background_gradient(cmap=cm).highlight_null('white').apply(highlight_max, axis=None).apply(highlight_min, axis=None)

time: 2.56 ms


In [30]:
# 627.5094740631 * 0.024598

time: 8.41 ms


In [31]:
# (-573.277143084 - -573.292500) * 627.5094740631

time: 3.92 ms


In [32]:
# dummy = 0
# xyz_dict = dict()
# for i in all_converged_ts_xyz_distinct_tuple:
#     xyz_dict[dummy] = xyz_to_xyz_file_format(i)
#     dummy += 1

time: 4.44 ms


In [33]:
# with open(os.path.join(cwd, 'converged_distinct_xyz.yml'), 'w') as outfile:
#     yaml.dump(xyz_dict, outfile, default_flow_style=False)

time: 3.05 ms
