In [1]:
import glob
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# README

This notebook collects per-atom information about the RESP charge fitting. For each molecule, it writes an `overview.csv` file into the molecule's directory.

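For every atom, it calculates the Euclidean distance to the halogen and the topological distance (the number of bonds separating the atom from the halogen), and stores them alongside the standard and the sigma-hole-modified RESP charges.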

In [2]:
def parsePrepAc(prep_ac):
    """Read atoms, bonds and the reference halogen name from a PREP.AC file.

    Parameters
    ----------
    prep_ac : str
        Path of the file to parse.

    Returns
    -------
    atoms : dict
        Maps atom name -> [atom id (int), atom type (str), xyz (np.ndarray)].
        An atom name that occurs twice keeps only its last record.
    bonds : list
        One ``[name1, name2]`` pair per BOND record, in file order.
    ref_at_name : str or None
        Name of the last ATOM record whose name contains "I", "Cl" or "Br";
        None when no atom name matches.
    """
    halogen_tags = ("I", "Cl", "Br")

    atoms = {}
    bonds = []
    ref_at_name = None

    with open(prep_ac) as stream:
        for record in stream:
            fields = record.split()

            # blank lines carry no record
            if not fields:
                continue

            keyword = fields[0]

            if keyword == "ATOM":
                at_id = int(fields[1])
                at_name = fields[2]
                at_type = fields[-1]
                # coordinates are taken from fixed character columns,
                # not from the whitespace-split fields
                xyz = np.array(tuple(float(record[lo:lo + 8])
                                     for lo in (30, 38, 46)))
                atoms[at_name] = [at_id, at_type, xyz]

                # remember the halogen; a later match overrides an earlier one
                if any(tag in at_name for tag in halogen_tags):
                    ref_at_name = at_name

            elif keyword == "BOND":
                # the two bonded atom names are the last two fields
                bonds.append([fields[-2], fields[-1]])

    return atoms, bonds, ref_at_name

                    

def getNBDistances(atoms, bonds, ref_at_name):
    """Return the number-of-bonds distance of every atom from the reference atom.

    Parameters
    ----------
    atoms : iterable
        Atom names; iterating a dict keyed by atom name works as well.
    bonds : list
        Pairs of bonded atom names.
    ref_at_name : str
        Name of the atom the distances are measured to.

    Returns
    -------
    list
        One distance per atom, in the iteration order of ``atoms``.
    """
    return [findShortestNBDistance(at_name, bonds, ref_at_name)
            for at_name in atoms]


def findShortestNBDistance(atom, bonds, ref_atom):
    """Return the number of bonds on the shortest path from ``atom`` to ``ref_atom``.

    The bonds are treated as an undirected graph which is explored level by
    level (breadth-first). Atoms that were already reached are never expanded
    again, so the search stays linear in the number of bonds even in ring
    systems, and it terminates when the reference atom cannot be reached.

    Parameters
    ----------
    atom : str
        Name of the atom the search starts from.
    bonds : iterable
        Pairs ``(name1, name2)`` of bonded atom names.
    ref_atom : str
        Name of the atom to reach.

    Returns
    -------
    int
        Number of bonds separating the two atoms; 0 if they are the same atom.

    Raises
    ------
    ValueError
        If ``ref_atom`` is not connected to ``atom`` through ``bonds``
        (this includes ``ref_atom`` being None or otherwise unknown).
    """
    if atom == ref_atom:
        return 0

    # build the adjacency map once instead of scanning all bonds per atom
    neighbours = {}
    for name1, name2 in bonds:
        neighbours.setdefault(name1, set()).add(name2)
        neighbours.setdefault(name2, set()).add(name1)

    visited = {atom}
    frontier = [atom]
    dist = 0

    # an empty frontier means the whole connected component was explored
    while frontier:
        dist += 1
        next_frontier = []
        for current in frontier:
            for neighbour in neighbours.get(current, ()):
                if neighbour == ref_atom:
                    return dist
                if neighbour not in visited:
                    visited.add(neighbour)
                    next_frontier.append(neighbour)
        frontier = next_frontier

    raise ValueError(
        f"atom {ref_atom!r} is not reachable from atom {atom!r} via the given bonds"
    )

        
def getEuclideanDistances(atoms, ref_at_name):
    """Return the Euclidean distance of every atom from the reference atom.

    Parameters
    ----------
    atoms : dict
        Maps atom name -> [atom id, atom type, coordinates]; the coordinates
        must support subtraction (e.g. ``np.ndarray``).
    ref_at_name : str
        Key of the reference atom in ``atoms``.

    Returns
    -------
    list
        One distance per atom, in the iteration order of ``atoms``
        (the reference atom itself yields 0).
    """
    origin = atoms[ref_at_name][2]

    # each value is unpacked as (id, type, coordinates); only the last is used
    return [np.linalg.norm(origin - xyz) for _, _, xyz in atoms.values()]




def getChargesFromPunch(punch, n_atoms, sigma=False):
    """Read the fitted charges from a punch file.

    The first 11 lines of the file are skipped; the charge is taken from the
    fourth whitespace-separated column of each following line.

    Parameters
    ----------
    punch : str
        Path of the punch file.
    n_atoms : int
        Number of atom lines to read.
    sigma : bool, optional
        If True, one additional line is read after the atom lines
        (presumably the sigma-hole pseudo-atom; verify against the fit setup).

    Returns
    -------
    list of float
        The charges in file order.
    """
    first_row = 11
    n_rows = n_atoms + 1 if sigma else n_atoms

    with open(punch) as stream:
        rows = stream.readlines()[first_row:first_row + n_rows]

    return [float(row.split()[3]) for row in rows]


def sortAtoms(atoms):
    """Return atom names and atom types ordered by ascending atom id.

    Parameters
    ----------
    atoms : dict
        Maps atom name -> sequence whose first item is the atom id and whose
        second item is the atom type.

    Returns
    -------
    tuple of (list, list)
        The atom names and the matching atom types, both sorted by atom id.
    """
    by_id = sorted(atoms.items(), key=lambda entry: entry[1][0])
    names = [name for name, _ in by_id]
    types = [values[1] for _, values in by_id]
    return names, types



# Build one overview table per molecule and store it next to the molecule's data.
for halogen in "chlorine bromine iodine".split():

    mols = sorted(glob.glob(f"../{halogen}/ZINC*"))

    for mol in mols:

        # get info about atoms and bonds
        prep_ac = mol + "/antechamber/ANTECHAMBER_PREP.AC"
        atoms, bonds, ref_at_name = parsePrepAc(prep_ac)

        # without a reference halogen no distance is defined; skip the
        # molecule instead of searching for an atom that does not exist
        if ref_at_name is None:
            print(f"skipping {mol}: no halogen atom found in {prep_ac}")
            continue

        # order the atoms by id once, so that every per-atom column below
        # (names, types, both distances) is built in the same row order
        atoms = dict(sorted(atoms.items(), key=lambda item: item[1][0]))
        n_atoms = len(atoms)

        # number-of-bond distance from the halogen
        nb_distances = getNBDistances(atoms, bonds, ref_at_name)

        # Euclidean distances from the halogen
        distances = getEuclideanDistances(atoms, ref_at_name)

        # standard RESP charges
        # (assumes the punch file lists the atoms in atom-id order -- TODO confirm)
        punch_std = mol + "/antechamber/punch"
        qs_std = getChargesFromPunch(punch_std, n_atoms)

        # modified RESP charges including sigma-hole
        punch_mod = mol + "/mod2/punch"
        qs_mod = getChargesFromPunch(punch_mod, n_atoms, sigma=True)

        # correct sorting of atoms
        atom_names_sorted, atom_types_sorted = sortAtoms(atoms)

        # output dataframe; the last row "X" belongs to the extra charge of
        # the modified fit, which has no distances and no standard charge
        df = pd.DataFrame({"name": atom_names_sorted + ["X"],
                           "type": atom_types_sorted + ["x"],
                           "nb_distance": nb_distances + [-1],
                           "distance": distances + [-1],
                           "q_std": qs_std + [0],
                           "q_mod": qs_mod})

        # save
        df.to_csv(mol + "/overview.csv", index=False)

"done"

'done'

In [3]:
# show the overview table of the last molecule processed by the loop above
df

Unnamed: 0,name,type,nb_distance,distance,q_std,q_mod
0,C1,c3,5,6.902818,-0.22257,-0.250778
1,C2,cc,4,5.519319,0.064904,0.077989
2,C3,cd,5,5.557544,-0.114958,-0.081952
3,C4,c,4,4.513245,0.734528,0.658696
4,O1,o,5,4.96497,-0.541076,-0.533859
5,O2,os,3,3.161978,-0.393708,-0.23579
6,C5,ca,2,3.073588,0.453232,0.146991
7,C6,ca,3,4.415732,-0.13196,-0.090371
8,C7,ca,4,4.913992,-0.082927,-0.101669
9,C8,ca,3,4.39697,-0.40496,-0.358144
