In [83]:
import re
import numpy as np
import pandas as pd
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from ase.visualize import view
from ase.io import read
from pysmiles import write_smiles
from rdkit import Chem
from tqdm import tqdm
from rdkit.Chem import Draw
%matplotlib inline

In [2]:
smiles = []
# Clean up file with SMILE strings for molecules.
# File created from all_relaxed_molecules.xyz using Open Babel.
with open("data/all_relaxed_molecules.can", "r") as reader:
    for line in tqdm(reader):
        smile = line[:line.index("Lattice")].strip()
        smiles.append(smile)

3242it [00:00, 620570.17it/s]


In [3]:
len(smiles)

3242

In [4]:
# Naive attempt at finding aromatic rings using a regular expression based on SMILES string for benzene.
benzene_re = ".*c[0-9]+ccccc[0-9]+"
benzenes = [x for x in zip(smiles, range(len(smiles))) if re.search(benzene_re, x[0])]
b_smiles, b_idxs = list(zip(*benzenes))

In [5]:
# Find any molecule whose SMILES string contains at least one aromatic carbon (denoted with a "c").
aromatic_carbons = [t for t in enumerate(smiles) if t[1].count("c") > 0]
ac_idxs, ac_smiles = list(zip(*aromatic_carbons))

In [6]:
# List the indices of molecules with at least one aromatic carbon that were NOT detected with the 
# regular expression.
ac_no_b_idxs = [x for x in ac_idxs if not x in b_idxs]
len(ac_no_b_idxs)

1257

In [7]:
# Interactive interface for visualizing molecules with one or more aromatic carbons that were not
# detected with the regex search.
out = widgets.Output()

def picture_mols(x):
    out.clear_output()
    with out:
        print(f"showing smiles[{ac_no_b_idxs[x]}]")
        print("SMILES string:", smiles[ac_no_b_idxs[x]])
    display(out)
    mol = Chem.MolFromSmiles(smiles[ac_no_b_idxs[x]])
#    for atom in mol.GetAtoms():
#        atom.SetAtomMapNum(atom.GetIdx())
    return mol

ibox = widgets.BoundedIntText(min=0, max=1256)
interact(picture_mols, x=ibox)

interactive(children=(BoundedIntText(value=0, description='x', max=1256), Output()), _dom_classes=('widget-int…

<function __main__.picture_mols(x)>

In [None]:
molecules = [Chem.MolFromSmiles(s) for s in smiles]

failed_idxs = [i for i in range(len(molecules)) if not molecules[i]]
molecules = list(filter(lambda x: not x == None, molecules))

In [73]:
len(failed_idxs)

170

In [82]:
def is_aromatic_ring(mol, bonds):
    for idx in bonds:
        if not mol.GetBondWithIdx(idx).GetIsAromatic():
            return False
    return True

def is_only_carbon(mol, bonds):
    for idx in bonds:
        bond = mol.GetBondWithIdx(idx)
        if not bond.GetIsAromatic():
            return False
        print(type(bond))

def has_aromatic_ring(mol):
    ri = mol.GetRingInfo()
    for ring in ri.BondRings():
        if is_aromatic_ring(mol, ring):
            return True
    return False

def has_ac_ring(mol):
    ri = mol.GetRingInfo()
    print(type(ri))
    for ring in ri.BondRings():
        if is_aromatic_ring(mol, ring):
            return True
    return False

In [77]:
ar_mols = [(i, mol) for (i, mol) in enumerate(molecules) if has_aromatic_ring(mol)]
len(ar_mols)

1428

In [81]:
out = widgets.Output()
i = 0

def picture_mols(x):
    out.clear_output()
    with out:
        print(f"showing molecules[{ar_mols[x][0]}]")
    display(out)
    mol = ar_mols[x][1]
    return mol

ibox = widgets.BoundedIntText(min=0, max=1256)
interact(picture_mols, x=ibox)

interactive(children=(BoundedIntText(value=0, description='x', max=1256), Output()), _dom_classes=('widget-int…

<function __main__.picture_mols(x)>

In [None]:
ar_smiles = [Chem.MolToSmiles(mol) for (i, mol) in ar_mols]
ar_idxs = [i for (i, mol) in ar_mols]

In [87]:
mol_frames = read("data/all_relaxed_molecules.xyz", ":")

In [89]:
mol_frames[0]

Atoms(symbols='OHSH2OCH2CHC4HCH2COH2SHO', pbc=False, cell=[32.66666928, 32.66666928, 32.66666928])

In [99]:
mol_info = []

with open("data/all_relaxed_molecules_tagged.xyz", "r") as f:
    for line in tqdm(f):
        if line.count("Lattice") > 0:
            mol_info.append(line)

mol_info[0]

78614it [00:00, 3369932.19it/s]


'Lattice="32.66666928 0.0 0.0 0.0 32.66666928 0.0 0.0 0.0 32.66666928" Properties=species:S:1:pos:R:3 dft_energy_ryd=-404.45301436 molecule_idx=0 crystal_idx=0 motif_idx="_JSON [[6, 2, 0, 12, 5, 1, 7, 17], [13, 23, 20, 19, 25, 8, 18, 24], [11, 12, 3, 15, 7, 17], [13, 14, 8, 18, 10, 22]]" motif_names="_JSON [\\"sulfonyl\\", \\"sulfonyl\\", \\"alkane\\", \\"alkane\\"]" pbc="F F F"\n'

In [106]:
tmp = [line[line.index('"') + 1 : line.index('" Properties')] for line in mol_info]
lattices = [tuple(x.split(" ")) for x in tmp]
ar_lattices = [x for (i, x) in enumerate(lattices) if i in ar_idxs]

In [107]:
ar_mol_dict = {
    "SMILES" : ar_smiles, 
    "Number" : ar_idxs,
    "Lattice" : ar_lattices,
}

In [108]:
ar_mol_df = pd.DataFrame.from_dict(ar_mol_dict)
ar_mol_df

Unnamed: 0,SMILES,Number,Lattice
0,CCc1cc(C(N)=S)ccn1,2,"(30.502558800000003, 0.0, 0.0, 0.0, 30.5025588..."
1,On1nnc2cccnc21,6,"(20.36922552, 0.0, 0.0, 0.0, 20.36922552, 0.0,..."
2,Cc1c(O)c(=O)ccn1C,7,"(24.34648104, 0.0, 0.0, 0.0, 24.34648104, 0.0,..."
3,Cc1ccc2c(c1)CN(C)CO2,8,"(24.650559360000003, 0.0, 0.0, 0.0, 24.6505593..."
4,c1ccc2cc3cc4cc5ccccc5cc4cc3cc2c1,9,"(29.09277, 0.0, 0.0, 0.0, 29.09277, 0.0, 0.0, ..."
...,...,...,...
1423,COc1cc(C(=O)O)ccc1O,3061,"(26.94702276, 0.0, 0.0, 0.0, 26.94702276, 0.0,..."
1424,COC(=O)c1cnc(C(=O)OC)cn1,3062,"(37.42015536, 0.0, 0.0, 0.0, 37.42015536, 0.0,..."
1425,c1ccc2c(c1)c1c3c4ccccc4n4c5ccccc5c(c5c6ccccc6n...,3063,"(38.86254288000001, 0.0, 0.0, 0.0, 38.86254288..."
1426,Oc1ccc(O)cc1,3064,"(31.45395456, 0.0, 0.0, 0.0, 31.45395456, 0.0,..."


In [109]:
ar_mol_df.to_csv("data/ar_molecules.tsv", sep="\t")