In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

%load_ext autotime
import os
import re
import shutil
import yaml
import matplotlib.pyplot as plt
from scipy import interpolate
from scipy.optimize import minimize
from matplotlib.colors import ListedColormap
from matplotlib.colors import to_hex
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
import scipy.ndimage.filters as filters
import scipy.ndimage.morphology as morphology

from arc.species.species import ARCSpecies
from arc.species.converter import modify_coords, xyz_to_xyz_file_format, cluster_confs_by_rmsd, zmat_to_xyz, check_isomorphism
from arc.plotter import show_sticks
from arc.common import key_by_val, colliding_atoms

In [2]:
def detect_local_minima(arr):
    # https://stackoverflow.com/questions/3684484/peak-detection-in-a-2d-array/3689710#3689710
    """
    Takes an array and detects the troughs using the local maximum filter.
    Returns a boolean mask of the troughs (i.e. 1 when
    the pixel's value is the neighborhood maximum, 0 otherwise)
    """
    # define an connected neighborhood
    # http://www.scipy.org/doc/api_docs/SciPy.ndimage.morphology.html#generate_binary_structure
    neighborhood = morphology.generate_binary_structure(len(arr.shape),2)
    # apply the local minimum filter; all locations of minimum value 
    # in their neighborhood are set to 1
    # http://www.scipy.org/doc/api_docs/SciPy.ndimage.filters.html#minimum_filter
    local_min = (filters.minimum_filter(arr, footprint=neighborhood)==arr)
    # local_min is a mask that contains the peaks we are 
    # looking for, but also the background.
    # In order to isolate the peaks we must remove the background from the mask.
    # 
    # we create the mask of the background
    background = (arr==0)
    # 
    # a little technicality: we must erode the background in order to 
    # successfully subtract it from local_min, otherwise a line will 
    # appear along the background border (artifact of the local minimum filter)
    # http://www.scipy.org/doc/api_docs/SciPy.ndimage.morphology.html#binary_erosion
    eroded_background = morphology.binary_erosion(
        background, structure=neighborhood, border_value=1)
    # 
    # we obtain the final mask, containing only peaks, 
    # by removing the background from the local_min mask
    detected_minima = local_min ^ eroded_background
    return np.where(detected_minima)   

time: 900 µs


In [3]:
def run_2d_params(params, func):
    """
    Evaluate a two-argument callable on a packed pair of parameters.

    Lets an optimizer that passes a single parameter vector drive a function
    that takes its two coordinates as separate positional arguments.

    Args:
        params: Iterable of exactly two values.
        func: Callable taking two positional arguments.

    Returns:
        Whatever ``func`` returns for the unpacked pair.
    """
    first, second = params
    result = func(first, second)
    return result

time: 7.61 ms


In [4]:
ts_name = 'imipramine_4_oo'
cwd = os.getcwd()
print(cwd)
# Build the output path with os.path.join instead of concatenating '/' by hand,
# so the separator is always the one of the running platform.
save_folder = os.path.join(cwd, ts_name + '_g16_inputs')
# Always start from an empty output folder.
# WARNING: input files left there by a previous run are deleted.
if os.path.exists(save_folder):
    shutil.rmtree(save_folder)
os.mkdir(save_folder)

/home/oscarwu/Dropbox (MIT)/Scripts/TS_conf_search
time: 7.41 ms


In [5]:
# Disabled alternative input template, kept for reference: its opt keywords include `ts`.
# script = """%chk={name}.chk
# %mem=300000mb
# %NProcShared=40

# #p opt=(calcfc,ts,noeigentest,maxcycles=120) freq guess=mix uwb97xd def2svp iop(2/9=2000) scf=xqc

# {name}

# 0 2
# {xyz}




# """

time: 259 µs


In [6]:
# Text template for the generated input files. The `{name}` placeholder (used in the
# `%chk` line and on a line of its own) and the `{xyz}` placeholder (the coordinates
# block) are filled in with `script.format(name=..., xyz=...)` when the files are written.
# The opt keyword list here does not contain `ts`.
# NOTE(review): the `0 2` line is presumably charge 0 / multiplicity 2, matching the
# multiplicity=2 passed to ARCSpecies elsewhere in this notebook - confirm.
# NOTE(review): the trailing blank lines are part of the string and end up in every
# file; presumably they are needed by the program reading the input - verify before trimming.
script = """%chk={name}.chk
%mem=300000mb
%NProcShared=40

#p opt=(calcfc,noeigentest,maxcycles=120) freq guess=mix uwb97xd def2svp iop(2/9=2000) scf=xqc

{name}

0 2
{xyz}




"""

time: 6.49 ms


In [7]:
def convert_gaussian_zmat_to_arc_zmat(zmat_file_path):
    """
    Read a Gaussian input file written in Z-matrix form and convert it to an ARC-style zmat dict.

    The file is expected to contain, in this order: a charge/multiplicity line,
    one line per atom (``symbol [ref1 R [ref2 A [ref3 D]]]`` with 1-indexed atom
    references), a blank line, and the variable definitions (``name value`` or
    ``name=value``, one per line). An R/A/D entry on an atom line may be either
    a variable name or a literal number.

    Args:
        zmat_file_path (str): Path to the Gaussian input file.

    Returns:
        dict: A dictionary with the keys
            - ``'symbols'``: tuple of element symbols, in file order.
            - ``'coords'``: tuple of ``(R, A, D)`` variable-name triples per atom
              (``None`` where an atom has no such coordinate). Names are built as
              ``R_i_j``, ``A_i_j_k`` and ``D_i_j_k_l`` from 0-indexed atom numbers.
            - ``'vars'``: dict mapping each variable name to its float value.
            - ``'map'``: identity dict ``{i: i}`` over the 0-indexed atoms.

    Raises:
        ValueError: If no charge/multiplicity line or no atom line is found, if an
            atom or variable line is too short, or if a referenced variable is
            neither defined nor a number.
    """
    atom_rows, variables = _read_gaussian_zmat_sections(zmat_file_path)

    def resolve(token):
        """Return the float value of a Z-matrix entry (variable name or literal number)."""
        if token in variables:
            return float(variables[token])
        try:
            return float(token)
        except ValueError:
            raise ValueError(f'Z-matrix variable "{token}" is not defined in {zmat_file_path}') from None

    symbols_list = list()
    coords_list = list()
    vars_dict = dict()
    map_dict = dict()

    for atom_num, tokens in enumerate(atom_rows):
        # The first atom has no internal coordinate, the second only a bond,
        # the third a bond and an angle, all later atoms bond, angle and dihedral.
        n_refs = min(atom_num, 3)
        if len(tokens) < 1 + 2 * n_refs:
            raise ValueError(f'Atom line {atom_num + 1} in {zmat_file_path} is too short: {" ".join(tokens)}')

        symbols_list.append(tokens[0])
        map_dict[atom_num] = atom_num

        # Reference atoms are 1-indexed in the file and 0-indexed in the ARC names.
        refs = [str(int(tokens[1 + 2 * i]) - 1) for i in range(n_refs)]

        entry = [None, None, None]
        for i, prefix in enumerate('RAD'[:n_refs]):
            var_name = '_'.join([prefix, str(atom_num)] + refs[:i + 1])
            vars_dict[var_name] = resolve(tokens[2 + 2 * i])
            entry[i] = var_name
        coords_list.append(tuple(entry))

    arc_zmat = dict()
    arc_zmat['symbols'] = tuple(symbols_list)
    arc_zmat['coords'] = tuple(coords_list)
    arc_zmat['vars'] = vars_dict
    arc_zmat['map'] = map_dict

    return arc_zmat


def _is_charge_multiplicity_line(tokens):
    """Tell whether ``tokens`` form a charge/multiplicity line, i.e. one or more pairs of integers."""
    if len(tokens) < 2 or len(tokens) % 2:
        return False
    try:
        for token in tokens:
            int(token)
    except ValueError:
        return False
    return True


def _read_gaussian_zmat_sections(zmat_file_path):
    """Split a Gaussian Z-matrix input file into its atom rows (token lists) and its variables dict."""
    with open(zmat_file_path, 'r') as f:
        lines = f.readlines()

    # Everything up to and including the charge/multiplicity line is header.
    start = None
    for line_num, line in enumerate(lines):
        if _is_charge_multiplicity_line(line.split()):
            start = line_num + 1
            break
    if start is None:
        raise ValueError(f'Could not find a charge/multiplicity line in {zmat_file_path}')

    # Atom section: runs until the first blank line (or the end of the file).
    atom_rows = list()
    cursor = start
    while cursor < len(lines) and lines[cursor].split():
        atom_rows.append(lines[cursor].split())
        cursor += 1
    if not atom_rows:
        raise ValueError(f'Could not find any atom line after the charge/multiplicity line in {zmat_file_path}')

    # Variables section: starts after the blank separator, ends at the next blank line.
    # Values are kept as strings and only converted when an atom line uses them.
    variables = dict()
    cursor += 1
    while cursor < len(lines):
        tokens = lines[cursor].replace('=', ' ').split()
        if not tokens:
            break
        if len(tokens) < 2:
            raise ValueError(f'Malformed variable line in {zmat_file_path}: {lines[cursor].strip()}')
        variables[tokens[0]] = tokens[1]
        cursor += 1

    return atom_rows, variables

time: 5.71 ms


In [8]:
# The Gaussian Z-matrix input is read from the current working directory.
zmat_file_name = 'imipramine_4_oo.gjf'
zmat_file_path = os.path.join(cwd, zmat_file_name)
arc_zmat = convert_gaussian_zmat_to_arc_zmat(zmat_file_path=zmat_file_path)

time: 5.1 ms


In [9]:
spc_name = 'imipramine_4_oo'
# Build the reference species from the converted Z-matrix (treated as a non-TS doublet),
# then perceive its molecular graph from the same coordinates.
species_kwargs = dict(
    label=spc_name,
    xyz=arc_zmat,
    is_ts=False,
    multiplicity=2,
    smiles='NCCC(O[O])N(C)C',
)
spc = ARCSpecies(**species_kwargs)
spc.mol_from_xyz(arc_zmat)

time: 275 ms


In [10]:
spc.determine_rotors()
# Gather the 'scan' entry of every rotor; rotors_dict is indexed by 0..n-1 here.
dihedrals = [spc.rotors_dict[rotor_key]['scan'] for rotor_key in range(len(spc.rotors_dict.keys()))]
dihedrals

[[3, 1, 2, 4],
 [2, 1, 3, 9],
 [2, 1, 7, 14],
 [1, 2, 4, 11],
 [1, 3, 9, 15],
 [1, 3, 10, 18],
 [2, 4, 11, 21]]

time: 2.78 ms


In [11]:
# pivot_to_scan_dict = dict()
# for k in spc.rotors_dict.keys():
#     pivot_to_scan_dict[tuple(spc.rotors_dict[k]['pivots'])] = tuple(spc.rotors_dict[k]['scan'])
# pivot_to_scan_dict[(23, 22)] = (24, 23, 22, 11)
# pivot_to_scan_dict[(22, 11)] = (23, 22, 11, 8)
# pivot_to_scan_dict

time: 6.5 ms


In [12]:
# del pivot_to_scan_dict[(1, 19)]
# del pivot_to_scan_dict[(8, 12)]
# del pivot_to_scan_dict[(23, 24)]
# del pivot_to_scan_dict[(24, 25)]

time: 2.07 ms


In [13]:
# pivot_to_scan_dict

time: 2.29 ms


In [14]:
# fragments= [[f+1 for f in range(21)],[22, 23, 24, 25, 26, 27, 28]]

time: 2.2 ms


In [15]:
# Folder holding the 2D-scan energy files, located under the working directory.
psi4_scan_dir = '/'.join([cwd, 'en_imipramine_4_oo_2d_scan'])

time: 2.09 ms


In [16]:
# Maps (indices_1, deg_1, indices_2, deg_2, fingerprint) -> xyz of the conformer
# generated by setting the two dihedrals to a fitted local minimum of the 2D scan.
all_xyz_dict = dict()

fingerprint = 0  # running counter that keeps the dictionary keys unique
scan_pts = 45  # number of grid points along each scanned dihedral
scan_deg = 360 / scan_pts  # dihedral increment (degrees) between neighboring grid points

# os.listdir returns entries in arbitrary order; sorting makes the processing order,
# and therefore the fingerprints, reproducible between runs.
for grid_search_file_name in sorted(os.listdir(psi4_scan_dir)):

    # The two scanned dihedrals are encoded in the file name; anything that does not
    # follow the pattern (e.g. a stray hidden file) is skipped instead of crashing.
    match_1 = re.search('_oo_(.*)_n_', grid_search_file_name)
    match_2 = re.search('_n_(.*)_coord', grid_search_file_name)
    if match_1 is None or match_2 is None:
        print(f'Skipping {grid_search_file_name}: name does not match the expected 2D-scan pattern')
        continue

    indices_1 = tuple(int(x) for x in match_1.group(1).split('_') if x.isnumeric())
    indices_2 = tuple(int(x) for x in match_2.group(1).split('_') if x.isnumeric())

    print('----------------------------')
    print(f'Considering dihedral combinations: {indices_1} and {indices_2}')

    # NOTE(security): yaml.FullLoader can construct some Python objects; only load trusted scan files.
    with open(os.path.join(psi4_scan_dir, grid_search_file_name), 'r') as outfile:
        energy = yaml.load(outfile, Loader=yaml.FullLoader)

    # Fill the energy grid; keys of `energy` are indexed as (row, column) grid positions.
    # NOTE(review): grid points absent from the file stay at 0.0, and the interpolation
    # below only fills NaN entries - confirm that this is the intended treatment.
    energy_grid = np.zeros((scan_pts, scan_pts))
    for k, value in energy.items():
        energy_grid[k[0], k[1]] = value

    label = [str(n * scan_deg) for n in range(scan_pts)]
    df = pd.DataFrame(energy_grid, index=label, columns=label)

    df = df.interpolate(method='linear', axis=0, limit_direction='both').interpolate(method='linear', axis=1, limit_direction='both')
    energy_values = df.values

    # Smooth surface over grid coordinates (0 .. scan_pts - 1 along both axes).
    g = interpolate.RectBivariateSpline(range(scan_pts), range(scan_pts), energy_values)

    # Grid minima are used as starting points for a local refinement on the spline.
    local_minima_locations = detect_local_minima(energy_values)

    # NOTE(review): Nelder-Mead is unbounded, so a refined point may leave the grid range
    # (out-of-range and negative grid coordinates appear in the printed results); such
    # points are kept as they are.
    res_list = list()
    for x0 in zip(local_minima_locations[0], local_minima_locations[1]):
        res = minimize(run_2d_params, x0=np.array(x0), args=(g,), method='Nelder-Mead', tol=1e-12)
        res_list.append(res)

    res_result = tuple((r.x[0], r.x[1], r.fun) for r in res_list)
    print('Fitted local minima')
    print(res_result)

    for r in res_result:

        # Convert refined grid coordinates to dihedral angles in degrees.
        xyz_1_new_dihedral = r[0] * scan_deg
        xyz_2_new_dihedral = r[1] * scan_deg

        # Set the first dihedral on the reference Z-matrix, then the second one on the result.
        xyz_1 = modify_coords(coords=arc_zmat,
                              indices=indices_1,
                              new_value=xyz_1_new_dihedral,
                              modification_type='groups',
                              mol=spc.mol,
                              index=1,
                              output_zmat=True,
                             )

        xyz_2 = modify_coords(coords=xyz_1,
                              indices=indices_2,
                              new_value=xyz_2_new_dihedral,
                              modification_type='groups',
                              mol=spc.mol,
                              index=1,
                              output_zmat=True,
                             )

        fingerprint += 1

        all_xyz_dict[(indices_1, int(round(xyz_1_new_dihedral)), indices_2, int(round(xyz_2_new_dihedral)), fingerprint)] = zmat_to_xyz(xyz_2)

----------------------------
Considering dihedral combinations: (3, 1, 2, 4) and (2, 1, 3, 9)
Fitted local minima
((0.39579509129656787, 5.924877947660185, -461.22358294161046), (0.3957951030363521, 5.924878111782172, -461.22358294161046), (0.4044709742860767, 13.336728190569838, -459.9933172435907), (0.37831294729561693, 17.37799271826996, -459.74030499543113), (0.3814966209576901, 35.55550863991819, -459.5140037337526), (0.38725425981471295, 351.19308569474913, -458.32628473245023), (7.246009765153062, 8.666498282224719, -455.95866488830194), (7.515973096987006, 24.629254316625772, -456.3352116933844), (7.822392011993491, 40.205986740534385, -456.3651384422879), (12.697182119514888, 39.91466055147659, -457.57891767773646), (16.0246480763953, 4.917367307832507, -457.5638615930331), (20.75505835384796, 11.37108604978962, -457.6513525037687), (20.600085915420912, 41.721608504148826, -457.67214699315474), (21.95575537726897, 23.808047567736878, -457.6886061188621), (20.717049571768193, 9

----------------------------
Considering dihedral combinations: (2, 1, 3, 9) and (2, 4, 11, 21)
Fitted local minima
((4.062537970631654, 5.244806473250254, -457.5883826593529), (4.162666262794708, 33.01987424467159, -457.58797298217263), (4.1626632020509415, 33.01988249564549, -457.5879729821725), (9.42407955215106, 38.75513379224377, -457.6677340100584), (9.423910270014435, 4.0550692387392235, -457.6705528992289), (9.42407915572088, 38.75512382423593, -457.6677340100585), (23.8186155253989, 10.460189277811214, -457.6886027498928), (23.989202677330287, 37.3717544418237, -457.68033908863333), (24.784696319215872, 24.89311079577023, -457.6783878598785), (41.85388130267671, 10.674895117483011, -457.67027568277666), (41.81754512170713, 22.409038194079628, -457.66670592377267), (41.74753544738428, 37.39042596675172, -457.66565649822695))
----------------------------
Considering dihedral combinations: (3, 1, 2, 4) and (2, 1, 7, 14)
Fitted local minima
((0.4732918884395666, 6.403974629770403,

----------------------------
Considering dihedral combinations: (2, 1, 3, 9) and (2, 1, 7, 14)
Fitted local minima
((0.21948024281573214, 18.34473210596296, -457.75714085364376), (-0.00040488594770431434, 22.262996864318836, -457.61234325487), (-0.00026597691213692086, 30.98911602894713, -457.5551283351648), (6.833798189199327e-06, 43.66389451499074, -457.63290808775383), (2.8884398230214714, 35.73585850618424, -457.5532880020593), (3.4793564349252826, 9.601465098948239, -457.66152173295296), (3.2949870765415934, 12.270697218501542, -457.68040534030166), (2.3722346694184724, 19.38679714213189, -457.73535015939916), (2.8884386408529688, 35.73585811441896, -457.5532880020593), (3.714776752617087, 20.51422231842502, -457.67170946673207), (8.159290579172673, 15.521044194831635, -457.66992597077547), (7.639786964061583, 31.36729432394968, -457.67222499190206), (9.323083204406919, 22.745217643100382, -457.69163555293096), (9.367296568070882, 44.389523264649696, -457.4781823850088), (10.50149

In [17]:
# Keep only the generated geometries in which no atoms collide (threshold of 0.65).
non_colliding_xyz = list()
for xyz in tuple(all_xyz_dict.values()):
    if colliding_atoms(xyz=xyz, threshold=0.65):
        continue
    non_colliding_xyz.append(xyz)

time: 168 ms


In [18]:
# Number of generated geometries that passed the collision filter.
len(non_colliding_xyz)

299

time: 1.59 ms


In [19]:
# Keep only the geometries whose perceived molecular graph is isomorphic to the reference species.
isomorphic_xyz = list()
failed_isomorphism_checks = 0  # geometries for which the check itself raised an error
for xyz in non_colliding_xyz:
    try:
        spc_to_check = ARCSpecies(label='check', xyz=xyz, is_ts=False, multiplicity=2, smiles='NCCC(O[O])N(C)C')
        spc_to_check.mol_from_xyz(xyz)
        is_isomorphic = check_isomorphism(spc.mol, spc_to_check.mol)
    except Exception:
        # Best effort: a geometry that cannot be processed is dropped, but it is counted
        # so that the loss is visible. Catching Exception (rather than a bare except)
        # lets KeyboardInterrupt / SystemExit stop the loop.
        failed_isomorphism_checks += 1
        continue
    if is_isomorphic:
        isomorphic_xyz.append(xyz)

print(f'Kept {len(isomorphic_xyz)} isomorphic geometries; '
      f'{failed_isomorphism_checks} geometries were skipped because the check raised an error')

<bound method Molecule.to_adjacency_list of Molecule(smiles="[O-]O[NH2+]C[CH2].C[N].[CH3].[CH]")>
Got the following error:
Could not map non isomorphic molecules
<bound method Molecule.to_adjacency_list of Molecule(smiles="[N-](C=[CH+])C.O[O].[CH2]N.[CH2]")>
Got the following error:
Could not map non isomorphic molecules
Unable to correctly parse InChI=1/C2H4N.CH4N.2CH2.HO.O/c1-3-2;1-2;;;;/h1H,2H3;1-2H2;2*1H2;1H; with backend try-all.
Unable to correctly parse InChI=1/C2H4N.CH4N.2CH2.HO.O/c1-3-2;1-2;;;;/h1H,2H3;1-2H2;2*1H2;1H; with backend try-all.
<bound method Molecule.to_adjacency_list of Molecule(smiles="[N-](C=[CH+])C.O[O].[CH2]N.[CH3]")>
Got the following error:
Could not map non isomorphic molecules
Unable to correctly parse InChI=1/C2H6N.C2H2.CH3N.O2.2H/c1-2-3;3*1-2;;/h1-3H2;1-2H;1H3;;; with backend try-all.
Unable to correctly parse InChI=1/C2H6N.C2H2.CH3N.O2.2H/c1-2-3;3*1-2;;/h1-3H2;1-2H;1H3;;; with backend try-all.
<bound method Molecule.to_adjacency_list of Molecule(smiles=

time: 18.5 s


In [20]:
# Cluster the isomorphic geometries by RMSD to obtain the distinct conformers.
isomorphic_xyz_tuple = tuple(isomorphic_xyz)
all_xyz_distinct_tuple = cluster_confs_by_rmsd(isomorphic_xyz_tuple)

time: 17.8 s


In [21]:
# Number of distinct conformers returned by the RMSD clustering.
len(all_xyz_distinct_tuple)

236

time: 1.87 ms


In [22]:
# Write one input file per distinct conformer, grouped into batch_<n> sub-folders
# of at most `save_batch_size` files each.
file_counter = 0  # stays 0 if there is nothing to write
save_batch_size = 200000

for file_counter, distinct_xyz in enumerate(all_xyz_distinct_tuple, start=1):

    # Recover which dihedrals / angles produced this geometry from its dictionary key.
    k = key_by_val(all_xyz_dict, distinct_xyz)

    indices_1 = k[0]
    deg_1 = k[1]

    indices_2 = k[2]
    deg_2 = k[3]

    # The batch number is derived from the file number, so it advances correctly
    # even when the batch folder already exists (e.g. when this cell is re-run).
    batch_folder_counter = (file_counter - 1) // save_batch_size + 1
    batch_folder = 'batch_' + str(batch_folder_counter)
    batch_folder_path = os.path.join(save_folder, batch_folder)
    if (file_counter - 1) % save_batch_size == 0:
        os.makedirs(batch_folder_path, exist_ok=True)

    # Drop the first two lines and the last line of the xyz-file text,
    # keeping only the coordinate lines for the template.
    xyz_str = xyz_to_xyz_file_format(distinct_xyz)
    xyz_str = '\n'.join(xyz_str.split('\n')[2:-1])

    d1str = "{0:.4g}".format(deg_1)
    d2str = "{0:.4g}".format(deg_2)

    d1name = '_'.join([str(elem) for elem in indices_1])
    d2name = '_'.join([str(elem) for elem in indices_2])
    comb_name_list = ['d1', d1name, 'deg1', d1str, 'n', 'd2', d2name, 'deg2', d2str]
    comb_name = '_'.join(comb_name_list)
    g16_file_base_name = str(file_counter) + '_' + ts_name + '_' + comb_name + '_g16'

    g16_file_path = os.path.join(batch_folder_path, g16_file_base_name + '.gjf')
    with open(g16_file_path, 'wt') as f:
        f.write(script.format(name=g16_file_base_name, xyz=xyz_str))

time: 79.5 ms
