
# By reading in the structure files and analysing the atomic connectivity, we can identify the ligands in each structure and how they coordinate to the metal.

In [1]:
import os
import pickle

import numpy as np
import sqlite3
import json

from ase import io, Atoms, neighborlist
from ase.build import molecule
from collections import Counter, defaultdict
from scipy import sparse
from mendeleev import element
from scipy.sparse.csgraph import connected_components


# Open (or create) the SQLite file that holds one row per extracted ligand.
con = sqlite3.connect('ligands.db')
cur = con.cursor()

# Columns: source structure id, number of metal-binding atoms, JSON list of the
# binding elements, JSON list of the binding-atom indices within the ligand,
# atomic number of the removed metal, and the ligand geometry as xyz text.
cur.execute(
    '''CREATE TABLE IF NOT EXISTS ligand_data
              (ccdc_id text, denticity integer, coordinating_elements text, coordinating_indices text,
               original_metal integer, xyz text)'''
)


<sqlite3.Cursor at 0x7ff8aad8f420>

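This attempts to grab all the CSD structures that have one of the intermediates adsorbed. The cell below first removes the stored `BOXTUD` row so that this structure can be processed again.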

In [12]:
# Remove the stored BOXTUD row (BOXTUD is listed in `to_redo` in the next cell,
# which skips any id that is still present in the table).
# NOTE(review): there is no con.commit() in this cell, so with sqlite3's default
# transaction handling the deletion stays pending until a later cell commits —
# confirm that is intended.
cur.execute("DELETE FROM ligand_data WHERE ccdc_id='BOXTUD'")

<sqlite3.Cursor at 0x7ff8aad8f420>

We need a way to read in the structures and extract data about their ligands; here we add that data to an SQLite database.

In [13]:
# Populate ligand_data: for each selected structure file, delete the metal
# centre, split what is left into bonded fragments, and store the fragment that
# binds the metal through the most atoms when it is tri-, tetra- or
# pentadentate.
structure_xyzs = os.listdir("all_relevant_xyzs/")

relevant_pentas = []
relevant_tris = []
relevant_tetras = []

# ccdc_ids that already have a row in the database; these are not re-processed.
seen_ids = [row[0] for row in cur.execute('SELECT ccdc_id FROM ligand_data')]

# Only these structures are (re)processed by this cell.
to_redo = ["KAJCED", "BOXTUD"]
xyz_suffix = ".xyz"
# Denticities for which a ligand row is written.
stored_denticities = (3, 4, 5)

for struct in structure_xyzs:
    if "tmQM_X" in struct:
        continue
    # Remove the ".xyz" suffix only. str.strip(".xyz") removes any of the
    # characters '.', 'x', 'y', 'z' from BOTH ends of the name, which can eat
    # into the identifier itself.
    tmqm_id = struct[:-len(xyz_suffix)] if struct.endswith(xyz_suffix) else struct
    if tmqm_id in seen_ids:
        continue
    if tmqm_id not in to_redo:
        continue
    print(struct)
    mol = io.read("all_relevant_xyzs/{}".format(struct))

    # Neighbour list of the full complex, built from ASE's natural cutoffs.
    cutOff = neighborlist.natural_cutoffs(mol)
    neighborList = neighborlist.NeighborList(cutOff, self_interaction=False, bothways=True)
    neighborList.update(mol)
    atomic_numbers = mol.get_atomic_numbers()
    # The atom with the largest atomic number is treated as the metal centre.
    metal_idx = np.argmax(atomic_numbers)
    metal_neighbor_indices = neighborList.get_neighbors(metal_idx)[0]
    orig_atomic_num_nns = atomic_numbers[metal_neighbor_indices]
    # Indices of the metal's neighbours after the metal has been deleted:
    # every atom stored behind the metal moves up by one position.
    indices_without_metal = sorted([x if x<metal_idx else x-1 for x in metal_neighbor_indices])

    del mol[metal_idx]
    new_atomic_num_nns = mol.get_atomic_numbers()[indices_without_metal]
    # Element-wise check of the re-indexing. Both sides are ordered by atom
    # index (the shift above preserves order). The previous form,
    # `a.all() == b.all()`, only compared two booleans and could never fail
    # on a wrong index.
    assert np.array_equal(
        new_atomic_num_nns, atomic_numbers[np.sort(metal_neighbor_indices)]
    ), "neighbour elements changed after removing the metal from {}".format(struct)

    # Connectivity of the metal-free structure and its bonded fragments.
    cutOff = neighborlist.natural_cutoffs(mol)
    neighborList = neighborlist.NeighborList(cutOff, self_interaction=False, bothways=True)
    neighborList.update(mol)
    graph = neighborList.get_connectivity_matrix(sparse=False)
    graph = graph+np.eye(graph.shape[0])
    connected_comps = connected_components(graph)
    # connected_comps[1][i] is the fragment label of atom i.
    component_labels = connected_comps[1]

    # Skip structures in which a hydrogen is bonded to more than one atom.
    # Each row sum includes the 1 added on the diagonal above, hence "> 2".
    h_connects_twice = any(
        atomic_n == 1 and sum(connectivity_vector) > 2
        for atomic_n, connectivity_vector in zip(mol.get_atomic_numbers(), graph)
    )
    if h_connects_twice:
        continue

    # Number of metal-coordinating atoms in each fragment.
    connected_count = defaultdict(int)
    # Running count of atoms seen so far in each fragment; at the moment an
    # atom is visited this is its index within its own fragment.
    connected_comp_atom_count = defaultdict(int)
    # Within-fragment indices of the coordinating atoms, per fragment.
    idx_connected = defaultdict(list)
    # Atomic numbers of the coordinating atoms, per fragment.
    atomic_nums_connected = defaultdict(list)
    count = 0
    for atom_idx, val in enumerate(component_labels):
        if atom_idx in indices_without_metal:
            idx_connected[val].append(connected_comp_atom_count[val])
            connected_count[val] += 1
            # new_atomic_num_nns is ordered by atom index, like this loop.
            atomic_nums_connected[val].append(new_atomic_num_nns[count])
            count += 1
        connected_comp_atom_count[val] += 1

    if not connected_count:
        # No atom was bonded to the metal, so there is no ligand to store.
        continue

    # Fragment with the most coordinating atoms (first one wins on a tie).
    key_of_highest_denticity = max(connected_count, key=connected_count.get)
    denticity = connected_count[key_of_highest_denticity]
    if denticity not in stored_denticities:
        continue

    # Distinct coordinating elements of that fragment, as plain ints for JSON.
    connecting_atoms = [int(x) for x in set(atomic_nums_connected[key_of_highest_denticity])]

    # Collect the atoms of the selected fragment into a stand-alone ligand.
    ligand_atomic_nums = []
    ligand_positions = []
    for atomic_num, position, label in zip(mol.numbers, mol.positions, component_labels):
        if label == key_of_highest_denticity:
            ligand_atomic_nums.append(atomic_num)
            ligand_positions.append(list(position))
    ligand = Atoms(ligand_atomic_nums, positions=ligand_positions)

    # NOTE(review): every stored structure, whatever its denticity, is added to
    # relevant_pentas, as in the original code; the following cell iterates
    # over this list. Confirm whether it should hold pentadentate hits only.
    relevant_pentas.append(struct)
    if denticity == 3:
        relevant_tris.append(struct)
    elif denticity == 4:
        relevant_tetras.append(struct)

    # Round-trip through a scratch file to obtain the ligand as xyz text.
    io.write("tmp.xyz", ligand)
    with open('tmp.xyz', 'r') as f:
        ligand_xyz_string = f.read()

    cur.execute("INSERT INTO ligand_data VALUES (:ccdc_id, :denticity, :coordinating_elements, :coordinating_indices, :original_metal, :xyz)",
               {
                   "ccdc_id": tmqm_id, "denticity": denticity,
                   "coordinating_elements": json.dumps(connecting_atoms),
                   "coordinating_indices": json.dumps(idx_connected[key_of_highest_denticity]),
                   "original_metal": int(max(atomic_numbers)), "xyz": ligand_xyz_string,
               })
    con.commit()

BOXTUD.xyz


In [None]:
# Screen the stored structures: for each one, rebuild the metal-free fragment
# graph and report whether the most strongly coordinating fragment contains a
# carbon with four neighbours of which exactly two are hydrogens.
# ok_fella counts the structures for which no such carbon is found.
ok_fella = 0
for tri_containing_struct in relevant_pentas:
    mol = io.read("all_relevant_xyzs/{}".format(tri_containing_struct))
    cutOff = neighborlist.natural_cutoffs(mol)
    neighborList = neighborlist.NeighborList(cutOff, self_interaction=False, bothways=True)
    neighborList.update(mol)
    atomic_numbers = mol.get_atomic_numbers()
    # Ignore structures with more than 100 atoms.
    if len(atomic_numbers)>100:
        continue
    # The atom with the largest atomic number is treated as the metal centre.
    metal_idx = np.argmax(atomic_numbers)
    # Keep only complexes whose heaviest atom has atomic number 24-28.
    if max(atomic_numbers) not in [24, 25, 26, 27, 28]:
        continue
    metal_neighbor_indices = neighborList.get_neighbors(metal_idx)[0]
    orig_atomic_num_nns = atomic_numbers[metal_neighbor_indices]

    # Indices of the metal's neighbours after the metal has been deleted:
    # every atom stored behind the metal moves up by one position.
    indices_without_metal = sorted([x if x<metal_idx else x-1 for x in metal_neighbor_indices])

    del mol[metal_idx]
    new_atomic_num_nns = mol.get_atomic_numbers()[indices_without_metal]
    # Element-wise check of the re-indexing. Both sides are ordered by atom
    # index. The previous form, `a.all() == b.all()`, only compared two
    # booleans and could never fail on a wrong index.
    assert np.array_equal(
        new_atomic_num_nns, atomic_numbers[np.sort(metal_neighbor_indices)]
    ), "neighbour elements changed after removing the metal from {}".format(tri_containing_struct)

    # Connectivity of the metal-free structure and its bonded fragments.
    cutOff = neighborlist.natural_cutoffs(mol)
    neighborList = neighborlist.NeighborList(cutOff, self_interaction=False, bothways=True)
    neighborList.update(mol)
    graph = neighborList.get_connectivity_matrix(sparse=False)
    graph = graph+np.eye(graph.shape[0])
    connected_comps = connected_components(graph)
    total_atomic_nums = mol.numbers

    # Number of metal-coordinating atoms in each fragment.
    connected_count = defaultdict(int)
    # Running count of atoms seen so far in each fragment; at the moment an
    # atom is visited this is its index within its own fragment.
    connected_comp_atom_count = defaultdict(int)
    # Within-fragment indices of the coordinating atoms, per fragment.
    idx_connected = defaultdict(list)
    # Atomic numbers of the coordinating atoms, per fragment.
    atomic_nums_connected = defaultdict(list)
    count = 0
    # connected_comps[1][i] is the fragment label of atom i.
    for idx, val in enumerate(connected_comps[1]):
        if idx in indices_without_metal:
            idx_connected[val].append(connected_comp_atom_count[val])
            connected_count[val] += 1
            atomic_nums_connected[val].append(new_atomic_num_nns[count])
            count += 1
        # Must run once per atom (inside the loop); it was previously dedented
        # and executed only once, which left the indices in idx_connected wrong.
        connected_comp_atom_count[val] += 1

    # Fragment with the most coordinating atoms; stays -1 if nothing was
    # bonded to the metal, in which case no atom matches below.
    most_connected = -1
    most_connected_key = -1
    for key, value in atomic_nums_connected.items():
        if len(value)>most_connected:
            most_connected = len(value)
            most_connected_key = key

    # Look for a carbon in that fragment with four neighbours, exactly two of
    # which are hydrogens.
    # NOTE(review): the printed messages speak of a carbon attached to a
    # metal-coordinating atom, but that attachment is not tested here.
    beta_found = False
    for idx, atomic_n in enumerate(total_atomic_nums):
        if atomic_n != 6 or connected_comps[1][idx] != most_connected_key:
            continue
        c_neighbor_indices = neighborList.get_neighbors(idx)[0]
        c_neighbor_atoms = [total_atomic_nums[c_neighbor_index] for c_neighbor_index in c_neighbor_indices]
        c_atom_neighbor_counter = Counter(c_neighbor_atoms)
        if c_atom_neighbor_counter[1]==2 and len(c_neighbor_atoms)==4:
            beta_found = True
            # One hit decides the outcome; no need to scan the remaining atoms.
            break

    if beta_found:
        print("beta C attached to metal co-ordinating atom in {}".format(tri_containing_struct))
    else:
        print("no beta C attached to metal co-ordinating atom for {}".format(tri_containing_struct))
        ok_fella+=1

In [22]:

# Pull every stored ligand row once, then split the columns into parallel lists.
ligand_rows = cur.execute('SELECT denticity, coordinating_elements, original_metal, xyz FROM ligand_data').fetchall()

denticities = [ligand_row[0] for ligand_row in ligand_rows]
# coordinating_elements is stored as JSON text; decode it back into a list.
coordinating_elements = [json.loads(ligand_row[1]) for ligand_row in ligand_rows]
metals = [ligand_row[2] for ligand_row in ligand_rows]
    

In [23]:
# Display the per-ligand lists of coordinating elements (atomic numbers)
# decoded from the database in the previous cell.
coordinating_elements

[[8, 6],
 [16],
 [7],
 [8, 7],
 [6, 7],
 [8, 6, 7],
 [8, 7],
 [7],
 [6],
 [5, 6],
 [14, 6],
 [6],
 [6],
 [8, 7],
 [8, 7],
 [16, 8, 6, 7],
 [7, 14, 15],
 [6, 7],
 [14, 15],
 [8, 6],
 [6, 7],
 [6],
 [14, 15],
 [8, 6],
 [6, 7],
 [7],
 [6],
 [7],
 [8, 7],
 [7],
 [7],
 [7],
 [8, 6, 7],
 [16, 6],
 [6, 7],
 [14, 6, 7],
 [6, 7],
 [7],
 [6, 15],
 [6],
 [8, 6, 7],
 [6, 7],
 [6, 15],
 [14, 15],
 [6],
 [16, 6, 7],
 [6],
 [7],
 [7],
 [6],
 [8, 6],
 [7],
 [8, 7],
 [16, 6],
 [8, 6, 7],
 [1, 7, 15],
 [15, 7],
 [6],
 [6],
 [6],
 [15],
 [6],
 [8, 6, 7],
 [8, 7],
 [7],
 [6],
 [8, 7],
 [6],
 [7],
 [16, 7],
 [7],
 [6, 7],
 [6, 7],
 [16, 8, 7],
 [7, 15],
 [8, 6, 7],
 [16, 7],
 [6],
 [16, 7],
 [6],
 [7],
 [16, 8, 6],
 [8, 7],
 [7],
 [16, 6, 7],
 [16, 15],
 [7],
 [8, 6],
 [6],
 [6],
 [6],
 [6],
 [8, 7],
 [6],
 [1, 6, 7],
 [6],
 [8, 6],
 [6],
 [8, 7],
 [6],
 [6],
 [6],
 [7],
 [7, 15],
 [8, 7],
 [6],
 [6],
 [7],
 [15],
 [7],
 [6],
 [16],
 [7],
 [6],
 [6],
 [1, 15],
 [7],
 [8, 6],
 [6],
 [7],
 [6],
 [6, 7],
 [7]