
# By reading in the structure files and analysing the atomic connectivity, we can extract information about the ligands they contain.

In [1]:
import os
import pickle

import numpy as np
import sqlite3
import json

from ase import io, Atoms, neighborlist
from ase.build import molecule
from collections import Counter, defaultdict
from scipy import sparse
from scipy.sparse.csgraph import connected_components




## We needed a way to read in structures and collect data about their ligands, so here we add them to an SQLite database. A database is used because there are many structures to read through, and we don't want to re-process xyz files that have already been stored. Suggestions for doing this in a better way would be welcome!

In [2]:

# Open (or create) the on-disk SQLite file used to cache extracted ligands,
# and grab a cursor that the following cells reuse.
con = sqlite3.connect('ligands.db')
cur = con.cursor()

# One row per stored ligand; the table is only created on the first run.
create_ligand_table_sql = '''CREATE TABLE IF NOT EXISTS ligand_data
              (ccdc_id text, denticity integer, coordinating_elements text, coordinating_indices text,
               original_metal integer, xyz text)'''
cur.execute(create_ligand_table_sql)


<sqlite3.Cursor at 0x7fd220b14b90>

In [None]:
def _structure_id(filename):
    """Return *filename* without its directory part and extension.

    ``str.strip`` removes a *set of characters* from both ends of a string,
    not a suffix, so it can eat into identifiers that begin or end with any
    of those characters. ``os.path.splitext`` removes only the extension.
    """
    return os.path.splitext(os.path.basename(filename))[0]


def _build_neighbor_list(atoms):
    """Return an updated ASE ``NeighborList`` for *atoms* using ASE's natural cutoffs."""
    cutoffs = neighborlist.natural_cutoffs(atoms)
    neighbors = neighborlist.NeighborList(cutoffs, self_interaction=False, bothways=True)
    neighbors.update(atoms)
    return neighbors


def _find_most_coordinating_ligand(mol):
    """Find the fragment of *mol* that binds the metal through the most atoms.

    The metal is taken to be the atom with the largest atomic number. It is
    removed from a copy of the structure (otherwise everything would be one
    connected component), the remainder is split into bonded fragments, and
    the fragment containing the most metal-neighbouring atoms is returned.

    Returns ``None`` when the structure should be skipped: the metal has no
    neighbours, or a hydrogen atom is bonded to more than one other atom in
    the metal-free structure. Otherwise returns a dict with keys

    * ``denticity`` - number of metal-neighbouring atoms in the fragment,
    * ``coordinating_elements`` - distinct atomic numbers of those atoms,
    * ``coordinating_indices`` - their positions *within the fragment*,
    * ``original_metal`` - atomic number of the removed metal,
    * ``ligand`` - the fragment as an ``ase.Atoms`` object.
    """
    atomic_numbers = mol.get_atomic_numbers()
    metal_idx = int(np.argmax(atomic_numbers))
    metal_neighbors = np.sort(
        _build_neighbor_list(mol).get_neighbors(metal_idx)[0]
    ).astype(int)
    if len(metal_neighbors) == 0:
        # Nothing is bonded to the metal, so there is no ligand to extract.
        return None

    # Indices of the metal's neighbours once the metal row has been deleted:
    # every atom stored after the metal moves up by one.
    coordinating = [int(i) if i < metal_idx else int(i) - 1 for i in metal_neighbors]

    fragment = mol.copy()
    del fragment[metal_idx]
    numbers = fragment.get_atomic_numbers()
    # Sanity check on the index shift: the same elements must be selected
    # before and after the deletion.
    assert np.array_equal(numbers[coordinating], atomic_numbers[metal_neighbors]), \
        "neighbour indices do not match after removing the metal"

    graph = _build_neighbor_list(fragment).get_connectivity_matrix(sparse=False)
    graph = graph + np.eye(graph.shape[0])

    # Each row sum counts the atom itself plus its bonded neighbours, so a
    # hydrogen with a sum above 2 is bonded to more than one atom; such
    # structures are skipped.
    if np.any((numbers == 1) & (graph.sum(axis=1) > 2)):
        return None

    # labels[i] is the id of the connected component that atom i belongs to.
    labels = connected_components(graph)[1]

    coordinating_set = set(coordinating)
    atoms_seen_in_component = defaultdict(int)
    # For each component: positions (within that component) of the atoms that
    # neighbour the metal, and the atomic numbers of those atoms.
    coordinating_positions = defaultdict(list)
    coordinating_numbers = defaultdict(list)
    for atom_idx, label in enumerate(labels):
        label = int(label)
        if atom_idx in coordinating_set:
            coordinating_positions[label].append(atoms_seen_in_component[label])
            coordinating_numbers[label].append(int(numbers[atom_idx]))
        atoms_seen_in_component[label] += 1

    # First component with the largest number of metal-neighbouring atoms.
    best = max(coordinating_positions, key=lambda key: len(coordinating_positions[key]))
    in_ligand = labels == best
    return {
        "denticity": len(coordinating_positions[best]),
        "coordinating_elements": [int(x) for x in set(coordinating_numbers[best])],
        "coordinating_indices": coordinating_positions[best],
        "original_metal": int(max(atomic_numbers)),
        "ligand": Atoms(numbers[in_ligand], positions=fragment.positions[in_ligand]),
    }


def _ligand_xyz_string(ligand, scratch_path="tmp.xyz"):
    """Serialise *ligand* by writing it to *scratch_path* with ASE and reading the text back."""
    io.write(scratch_path, ligand)
    with open(scratch_path, 'r') as f:
        return f.read()


structure_xyzs = os.listdir("all_relevant_xyzs/")
relevant_pentas = []
relevant_tris = []
relevant_tetras = []

# Identifiers already present in the database; empty on the very first run.
seen_ids = [row[0] for row in cur.execute('SELECT ccdc_id FROM ligand_data')]
already_stored = set(seen_ids)

# Only ligands of these denticities are stored; each has its own result list.
structures_by_denticity = {3: relevant_tris, 4: relevant_tetras, 5: relevant_pentas}

for struct in structure_xyzs:
    if "tmQM_X" in struct:
        # hacky way to avoid the big aggregate tmQM_X file
        continue
    tmqm_id = _structure_id(struct)
    if tmqm_id in already_stored:
        print('skipped {}'.format(tmqm_id))
        continue
    print(struct)
    mol = io.read("all_relevant_xyzs/{}".format(struct))

    result = _find_most_coordinating_ligand(mol)
    if result is None or result["denticity"] not in structures_by_denticity:
        continue

    structures_by_denticity[result["denticity"]].append(struct)
    cur.execute("INSERT INTO ligand_data VALUES (:ccdc_id, :denticity, :coordinating_elements, :coordinating_indices, :original_metal, :xyz)",
                {
                    "ccdc_id": tmqm_id,
                    "denticity": result["denticity"],
                    "coordinating_elements": json.dumps(result["coordinating_elements"]),
                    "coordinating_indices": json.dumps(result["coordinating_indices"]),
                    "original_metal": result["original_metal"],
                    "xyz": _ligand_xyz_string(result["ligand"]),
                })
    # Commit per structure so an interrupted run keeps what it has processed.
    con.commit()

## We can now query the database in fun ways! (Your output will probably differ; this is my result after reading through all the structures.)

In [4]:
# Tridentate ligands whose stored coordinating-element list is only
# atomic number 8, only 7, or the pair in either order.
string = """select ccdc_id, denticity, coordinating_elements, coordinating_indices, xyz from ligand_data 
where (denticity=3) and (coordinating_elements like '%[8]%' or coordinating_elements like '%[7]%' 
            or coordinating_elements like '%[8, 7]%' or coordinating_elements like '%[7, 8]%')"""

# Materialise all matching rows so they can be counted and reused.
query_res = cur.execute(string).fetchall()
n_tridentate = len(query_res)
print("Number of tridentate ligands with O/N coordinating: {}".format(n_tridentate))

Number of tridentate ligands with O/N coordinating: 5234


## This can be ignored for now. Here I was trying to find ligands with no β-carbon atoms near the metal, since such carbons should be easy to oxidise and are therefore bad for OER.

In [None]:
# Count the structures whose most-coordinating fragment contains no carbon
# bonded to exactly four atoms of which exactly two are hydrogen.
ok_fella = 0
# NOTE(review): the loop variable is named for tridentate structures but the
# list iterated is relevant_pentas - confirm which list is intended.
for tri_containing_struct in relevant_pentas:
    mol = io.read("all_relevant_xyzs/{}".format(tri_containing_struct))
    atomic_numbers = mol.get_atomic_numbers()

    # Cheap filters first: skip large structures and any structure whose
    # heaviest atom is not one of Z = 24..28.
    if len(atomic_numbers) > 100:
        continue
    if max(atomic_numbers) not in [24, 25, 26, 27, 28]:
        continue

    cutOff = neighborlist.natural_cutoffs(mol)
    neighborList = neighborlist.NeighborList(cutOff, self_interaction=False, bothways=True)
    neighborList.update(mol)

    # The metal is taken to be the atom with the largest atomic number.
    metal_idx = int(np.argmax(atomic_numbers))
    metal_neighbor_indices = np.sort(neighborList.get_neighbors(metal_idx)[0]).astype(int)

    # Indices of the metal's neighbours once the metal has been deleted:
    # every atom stored after the metal moves up by one.
    indices_without_metal = [int(x) if x < metal_idx else int(x) - 1 for x in metal_neighbor_indices]

    del mol[metal_idx]
    total_atomic_nums = mol.get_atomic_numbers()
    # Sanity check on the index shift: the same elements must be selected
    # before and after the deletion.
    assert np.array_equal(total_atomic_nums[indices_without_metal],
                          atomic_numbers[metal_neighbor_indices]), \
        "neighbour indices do not match after removing the metal"

    # Rebuild the neighbour list for the metal-free structure and split it
    # into bonded fragments (connected components).
    cutOff = neighborlist.natural_cutoffs(mol)
    neighborList = neighborlist.NeighborList(cutOff, self_interaction=False, bothways=True)
    neighborList.update(mol)
    graph = neighborList.get_connectivity_matrix(sparse=False)
    graph = graph + np.eye(graph.shape[0])
    component_of_atom = connected_components(graph)[1]

    # Number of metal-neighbouring atoms per fragment; pick the first fragment
    # with the highest count (-1 when the metal had no neighbours at all).
    coordinating_per_component = Counter(int(component_of_atom[i]) for i in indices_without_metal)
    if coordinating_per_component:
        most_connected_key = max(coordinating_per_component, key=coordinating_per_component.get)
    else:
        most_connected_key = -1

    # Look for a carbon in that fragment with exactly four neighbours, exactly
    # two of them hydrogen. Proximity to the coordinating atoms is NOT checked.
    beta_found = False
    for atom_idx, atomic_n in enumerate(total_atomic_nums):
        if atomic_n != 6 or component_of_atom[atom_idx] != most_connected_key:
            continue
        c_neighbor_atoms = [int(total_atomic_nums[j]) for j in neighborList.get_neighbors(atom_idx)[0]]
        if len(c_neighbor_atoms) == 4 and c_neighbor_atoms.count(1) == 2:
            beta_found = True
            break

    if beta_found:
        print("beta C attached to metal co-ordinating atom in {}".format(tri_containing_struct))
    else:
        print("no beta C attached to metal co-ordinating atom for {}".format(tri_containing_struct))
        ok_fella += 1