In [None]:
from rdkit import Chem
from glob import glob
from rdkit.Chem import rdMolTransforms
import numpy as np
import pickle


In [None]:
def sp2_indices(mol):
    """Return the indices of atoms whose local geometry looks trigonal (sp2).

    An atom is reported when it has exactly three neighbours and at least one
    of its three neighbour-centre-neighbour angles, measured on the conformer
    returned by ``mol.GetConformer()``, lies strictly between 109 and 145
    degrees.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule that carries a conformer with atomic coordinates.

    Returns
    -------
    list of int
        Sorted, duplicate-free atom indices satisfying the criterion.
    """
    sp2_index = set()
    for atom in mol.GetAtoms():
        # Angles must be measured around *this* atom, using *its* neighbours.
        neighbor_ids = [neighbor.GetIdx() for neighbor in atom.GetNeighbors()]
        if len(neighbor_ids) != 3:
            continue
        center = atom.GetIdx()
        # Fetched only when needed, so molecules without any three-neighbour
        # atom never require a conformer.
        conformer = mol.GetConformer()
        for a, b in ((0, 1), (1, 2), (2, 0)):
            angle = rdMolTransforms.GetAngleDeg(
                conformer, neighbor_ids[a], center, neighbor_ids[b]
            )
            if 109 < angle < 145:
                sp2_index.add(center)
                break  # one matching angle is enough to report the atom
    return sorted(sp2_index)

In [None]:
def connectivity_matrix(mol):
    """Build the carbon-carbon connectivity matrix of a molecule.

    Rows and columns correspond to the carbon atoms of ``mol`` in order of
    ascending atom index. An entry is -1 where the two carbons are bonded and
    0 everywhere else, including the diagonal. Non-carbon atoms are ignored.

    Parameters
    ----------
    mol : rdkit.Chem.Mol
        Molecule whose carbon skeleton is analysed.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_carbons, n_carbons)``.
    """
    carbon_ids = [
        int(atom.GetIdx()) for atom in mol.GetAtoms() if atom.GetAtomicNum() == 6
    ]
    # Atom indices count every atom in the molecule, while the matrix only has
    # one row per carbon, so translate atom index -> matrix position.
    position_of = {atom_idx: pos for pos, atom_idx in enumerate(carbon_ids)}

    conmat = np.zeros((len(carbon_ids), len(carbon_ids)))
    for atom in mol.GetAtoms():
        col = position_of.get(int(atom.GetIdx()))
        if col is None:
            continue  # not a carbon
        for nei in atom.GetNeighbors():
            row = position_of.get(int(nei.GetIdx()))
            if row is not None:
                conmat[row, col] = -1
    return conmat

In [None]:
def num_carbons(mol):
    """Count the atoms of ``mol`` whose atomic number is 6 (carbon)."""
    carbon_count = 0
    for candidate in mol.GetAtoms():
        if candidate.GetAtomicNum() == 6:
            carbon_count += 1
    return carbon_count

In [None]:
def is_fully_aromatic(mol):
    """Tell whether the number of sp2-like atoms equals the number of carbons.

    Returns True when ``sp2_indices(mol)`` has exactly as many entries as
    ``num_carbons(mol)`` reports carbon atoms, and False otherwise.
    """
    sp2_count = len(sp2_indices(mol))
    carbon_count = num_carbons(mol)
    return sp2_count == carbon_count

In [None]:
# Build the database: one record per fully aromatic molecule found among the
# PDB files in the current working directory.

# glob returns files in arbitrary, filesystem-dependent order. Sort them so the
# database order (and therefore any seeded sampling from it) is reproducible.
# The '*.pdb' pattern guarantees the 4-character suffix stripped below.
pdbs = sorted(glob('*.pdb'))

allsp2_dict = []
for item in pdbs:
    m = Chem.MolFromPDBFile(item, sanitize=False)
    if m is None:
        # RDKit signals an unreadable file by returning None.
        print('WARNING: ' + item + ' could not be parsed, thus skipped.')
        continue
    print('new molecule:  ' + item + ' nr# of atoms: ' + str(m.GetNumAtoms()))
    if is_fully_aromatic(m):
        allsp2_dict.append({'name' : item[:-4], 'conmat' : connectivity_matrix(m), 'num_carbon': num_carbons(m)})
    else:
        print('WARNING: ' + item + ' is not fully aromatic, thus skipped.')

# pickle -- the context manager makes sure the file is flushed and closed
with open("pahs.pkl", "wb") as pkl_file:
    pickle.dump(allsp2_dict, pkl_file)

In [None]:
def choose_n(myid, choices, n_molecules=10):
    """Draw a reproducible random subset of ``choices`` seeded by ``myid``.

    The same ``myid`` and the same ``choices`` (in the same order) always
    yield the same subset. A private random generator is used, so the state
    of the process-wide ``random`` module is left untouched.

    Parameters
    ----------
    myid : int, str, bytes or None
        Seed for the random generator; ``None`` seeds from system entropy.
    choices : iterable
        Population to sample from.
    n_molecules : int, optional
        Number of elements to draw without replacement (default 10).

    Returns
    -------
    list
        The ``n_molecules`` selected elements.

    Raises
    ------
    ValueError
        If ``n_molecules`` exceeds the number of available choices.
    """
    import random

    population = list(choices)
    if n_molecules > len(population):
        raise ValueError(
            'Cannot choose {0} molecules from only {1} candidates'.format(
                n_molecules, len(population)
            )
        )
    # random.Random(seed) produces the same sequence as seeding the global
    # generator with the same seed, without clobbering the global state.
    return random.Random(myid).sample(population, n_molecules)

In [None]:
def generate_dataset(myid=None, n_molecules=10, n_carbon_atoms=24, debug=False):
    """
    Pick ``n_molecules`` random molecules from the database based on your student ID.

    The records are loaded from "pahs.pkl" in the current working directory,
    restricted to those with exactly ``n_carbon_atoms`` carbon atoms, and
    sampled with ``choose_n`` seeded by ``myid``. For every selected record
    the file "<name>.mol" is read with RDKit.

    Parameters
    ----------
    myid : int or None
        Student ID used as the random seed.
    n_molecules : int
        Number of molecules to select.
    n_carbon_atoms : int
        Only molecules with this many carbon atoms are considered.
    debug : bool
        If True, print a summary of the database before selecting.

    Returns
    -------
    tuple
        ``(subset, mols, legend)``: the selected records, the molecules
        returned by ``Chem.MolFromMolFile``, and the molecule names.

    Raises
    ------
    ValueError
        If fewer than ``n_molecules`` molecules have ``n_carbon_atoms`` carbons.
    """

    # NOTE: unpickling can execute arbitrary code; only load a "pahs.pkl"
    # that comes from a trusted source (e.g. one produced by this notebook).
    with open("pahs.pkl", "rb") as pkl_file:
        allsp2_dict = pickle.load(pkl_file)
    maxidx = len(allsp2_dict)
    if debug:
        print(' | --- Total number of fully aromatic molecules in database: ' + str(maxidx))
        print(' | --- By number of carbon atoms: ')
        # number of fully conjugated molecules by number of carbon atoms
        for num in range(2,40,2):
            print('      len = ' + str(num) + ' : molecules = ' + str(len([mol for mol in allsp2_dict if mol['num_carbon'] == num])))

    print(' | --- Your molecule dataset contains molecules with {0} sp2 hybridised carbon atoms'.format(n_carbon_atoms))
    # Original author's note: only the 24- and 30-carbon groups contain more
    # than 100 molecules, hence the default of 24.
    equal_carbon_number = [elm for elm in allsp2_dict if elm['num_carbon'] == n_carbon_atoms]

    if n_molecules > len(equal_carbon_number):
        raise ValueError(
            'Requested {0} molecules but only {1} with {2} carbon atoms are available'.format(
                n_molecules, len(equal_carbon_number), n_carbon_atoms
            )
        )

    print(' | --- Based on your student ID we chose the following molecules for you: ')

    subset = choose_n(myid, equal_carbon_number, n_molecules)
    mols = []
    legend = []
    for pmol in subset:
        print('      ' + pmol['name'])
        legend.append(pmol['name'])
        mols.append(Chem.MolFromMolFile(pmol['name']+'.mol'))

    return subset, mols, legend

In [None]:
# Select 24 molecules with 20 carbon atoms each, seeded by student ID 1673606;
# debug=True additionally prints a summary of the database.
dataset, mols, legend = generate_dataset(
    myid=1673606,
    n_molecules=24,
    n_carbon_atoms=20,
    debug=True,
)

In [None]:
from rdkit import Chem
import rdkit.Chem.Draw
from rdkit.Chem import AllChem
from IPython.display import SVG

# Draw the selected molecules as a grid of 300x300 tiles, each labelled with
# its name, and display the resulting SVG as the cell output.
SVG(
    Chem.Draw.MolsToGridImage(
        mols,
        useSVG=True,
        legends=legend,
        subImgSize=(300, 300),
    )
)