In [None]:
import csv, chemiscope, ast
import numpy as np
import ase.io as aseio
from ase import Atom
from statistics import mean
from rdkit import Chem
import pandas as pd
from scipy.spatial import distance, ConvexHull, Voronoi, Delaunay
from scipy.integrate import quad
from scipy.stats import gaussian_kde

In [None]:
# Creating the coordinate file for the chemiscope
# In one pass over input_smiles.csv this cell
#   * writes the XYZ text of every usable row (column 2) to frame.xyz,
#   * records the row's SMILES string (column 1) in `smile_edit`,
#   * records the matching 'List of Zeolites' entry from zeolitesList.csv in `zeolite_list`.
df = pd.read_csv("zeolitesList.csv")

smile_edit, zeolite_list = [], []
# Both files are handled by context managers so that frame.xyz is flushed and
# closed even when a row raises (e.g. a SMILES missing from zeolitesList.csv
# makes the `[0][0]` lookup below fail with IndexError).
with open("frame.xyz", "w") as frame_edit, open('input_smiles.csv', mode='r') as file:
    csvFile = csv.reader(file)
    for lines in csvFile:
        # Skip the header row and rows that carry no coordinates
        if lines[2] != "XYZ Coordinates" and lines[2] != "":
            frame_edit.write(lines[2])
            smile_edit.append(lines[1])

            ind_row = np.where(df['SMILES'] == lines[1])[0][0] # SMILES Location (first matching row)
            # The cell holds a Python literal stored as text; literal_eval turns it back into an object
            zeolite_list.append(ast.literal_eval(df['List of Zeolites'][ind_row]))

In [None]:
# PCA result: load the comma-separated projection and report its dimensions.
# The first three columns are used further down as the pc-1 / pc-2 / pc-3 properties.
pca_result = np.loadtxt("pca4N.csv", delimiter=",")
print("PCA result : shape {}".format(pca_result.shape))

In [None]:
## TMA, TEA, TPA location
# SMILES strings to look for, and the short name printed for each of them
# (the two lists are index-aligned).
target_list = ["C[N+](C)(C)C", "CC[N+](CC)(CC)CC", "CCC[N+](CCC)(CCC)CCC"]
name_list = ["TMA", "TEA", "TPA"]

with open('input_smiles.csv', mode='r') as file:
    csvFile = csv.reader(file)
    for lines in csvFile:
        xyz_field = lines[2]

        # Header row or row without coordinates: nothing to report
        if xyz_field == "XYZ Coordinates" or xyz_field == "":
            continue

        # Only the reference cations are of interest here
        if lines[1] not in target_list:
            continue

        short_name = name_list[target_list.index(lines[1])]
        print(f"Number of the OSDA:{lines[0]}  {lines[1]}  {short_name}")

<span style="font-size:25px;font-weight:bold">Gathering structural features  (Environment)</span>

The following features are used to represent the local environments gathered by the SOAP vector. (Geometry Feature)

<span style="font-size:23px;font-weight:bold">1. Coordination Number </span>  <span style="font-size:16px;font-weight:bold;color:red">Expressed as quantized cluster</span>
<br>The coordination number is defined by the center atom type and its neighboring atoms.
In our case, the maximum is **4** and the minimum is **1**.<br>
For collecting the bonded particles, interatomic distance **1.56 Angstrom** is used. 

<span style="font-size:23px;font-weight:bold">2. Local Density  </span> 
<br>$\Large\frac{\# \;of\; particles}{Local\;Environment\;Volume}$

<span style="font-size:23px;font-weight:bold">3. Unique Interatomic Distance</span><br>
Computing the variance of every interatomic distance between the center and neighbor particles.<br>
It gives insight into the number of types of neighboring particles.

<span style="font-size:23px;font-weight:bold">4. Legendre Polynomial</span><br>
Computing the average angle formed by three particles (neighbor – center – neighbor), which represents how spread out the surrounding particles are in the environment.<br>
$\Large \cos{\theta_{Neighbor_{1}\;Center\;Neighbor_{2}}} = \omega$<br>
i.e., if the average is close to 1.0, the neighboring particles are arranged like a linear polymer.

<span style="font-size:23px;font-weight:bold">5. Different types of bonded first neighboring atoms</span>  <span style="font-size:16px;font-weight:bold;color:red">C-Centered (4 Angstrom) PC2 correlates!!</span><br>
A similar concept to the coordination number, representing the different types of first neighboring (bonded) atoms.<br>
$\large \rm{Score = \sum{(\#\;designated\;neighboring\;atom - \#\;other\;types\;of\;atom)}}$<br>


In [None]:
center_atom = "N" # Element type (Center, Neighbor)

################################################
## Coordinate number around the center atom ##
################################################
# Read frame.xyz into a list of lines; the context manager closes the handle
# right away instead of leaving it open until garbage collection.
with open("frame.xyz") as xyz_file:
    coordinate_file = xyz_file.read().splitlines()

# Cursor into `coordinate_file` and running molecule counter used by the feature loop
ea_line, label_sda = 0, 0

# NOTE(review): the value is 1.6 although the note below mentions 1.56 - confirm which is intended
dist_threshold = 1.6              # 1.56 for both (N - C) (C - C)
cutoff_r = 4.0                     # Cutoff radius (Change before)

# Accumulators filled by the feature loop. Names ending in `_avg` receive one
# value per molecule; the others receive one value per center atom.
local_d, local_d_avg = [], []        # local density

cn_dictionary, cn_dictionary_avg = [], []   # Coordination Number

interatomic_d = []  # Interatomic distance

three_b_cor = []    # Legendre Polynomial 

connect_score, connect_score_avg = [], []  # Connectivity

cluster_size = []    # Cluster Size

num_rings = [] # number of ring

avg_bondangle = [] # Average Angle

dist_matrix_gather, dist_matrix_gather_avg = [], [] # Pair Wise Distance

voronoi_volume = []  # Convex Hull Volume

In [None]:
def voronoi_cell_volume(vor, point_index):
    """Calculates the volume of a Voronoi cell given its index.

    The cell is split into tetrahedra with a Delaunay triangulation of its
    vertices and the tetrahedron volumes (|det| / 6) are summed, so the
    diagram is expected to be three-dimensional.

    Parameters
    ----------
    vor : scipy.spatial.Voronoi
        Voronoi diagram that owns the cell.
    point_index : int
        Index of the input point whose cell is measured.

    Returns
    -------
    float
        Volume of the cell, or 0.0 when the cell is unbounded (its region
        contains -1) or has no vertices.
    """
    # Indices (into vor.vertices) of the Voronoi vertices that bound the cell
    region = vor.regions[vor.point_region[point_index]]

    # Only consider finite, non-empty Voronoi cells
    if len(region) == 0 or -1 in region:
        return 0.0

    # Construct a Delaunay triangulation from the cell vertices
    cell_vertices = vor.vertices[region]
    tri = Delaunay(cell_vertices)

    # tri.simplices indexes the points handed to Delaunay (cell_vertices),
    # NOT the global vor.vertices array, so look the corners up in cell_vertices.
    tetrahedra = cell_vertices[tri.simplices]            # (n_tetrahedra, 4, 3)
    edge_vectors = tetrahedra[:, 1:] - tetrahedra[:, :1]  # (n_tetrahedra, 3, 3)

    # Sum the volumes of all tetrahedra to get the cell volume
    return np.sum(np.abs(np.linalg.det(edge_vectors)) / 6)

In [None]:
# Walk frame.xyz one molecule (one XYZ frame) at a time and, for every atom of
# type `center_atom`, compute the structural features of its local environment.
# Per-atom values are appended to the module-level lists (cn_dictionary, local_d, ...);
# per-molecule means are appended to the matching `*_avg` lists.
# NOTE: the lists and `ea_line` / `label_sda` are initialised in the setup cell,
# so that cell has to be re-run before this one is re-run.

# Volume of the cutoff sphere; constant for the whole run
sphere_volume = 4 * np.pi * (cutoff_r ** 3)/3

while ea_line < len(coordinate_file):

    atom_n = int(coordinate_file[ea_line].split()[0])

    # A frame is: one atom-count line, one comment line, then `atom_n` atom lines
    # (the loop advances by atom_n + 2 per frame). The atom lines are therefore
    # ea_line + 2 ... ea_line + 2 + atom_n - 1; stopping at ea_line + atom_n
    # silently dropped the last two atoms of every molecule.
    atom_dictionary = {}   # element symbol -> list of coordinate arrays
    for an_ in range(ea_line + 2, ea_line + 2 + atom_n):
        ann_ = coordinate_file[an_].split()

        if ann_[0] not in atom_dictionary.keys():
            atom_dictionary[ann_[0]] = []
        atom_dictionary[ann_[0]].append(np.asarray([float(an_n) for an_n in ann_[1:]])) # Coordinate


    label_sda += 1   # 1-based molecule number; smile_edit[label_sda - 1] is this molecule's SMILES
    ld_avg, cnd_avg, cs_avg, dm_avg = [], [], [], []

    # Check if the center/neighbor atom exists in the molecule
    if center_atom in atom_dictionary.keys():

        # Ring information comes from the SMILES only, so it is identical for
        # every center atom of the molecule: compute it once per molecule.
        mol = Chem.rdmolfiles.MolFromSmiles(smile_edit[label_sda - 1])
        ri = mol.GetRingInfo()
        ring_count = len(list(ri.AtomRings()))

        for elem_one in atom_dictionary[center_atom]: # Center atom

            # dist_           : bonded C neighbours (reported as coordination number)
            # dist_id         : non-H atoms inside the cutoff sphere
            # not_neigh_dist_ : bonded N neighbours
            # phos_cnt        : P neighbours within 1.9
            dist_, dist_id, not_neigh_dist_ = 0, 0, 0
            ina_d, surr_vec = [], []
            ina_d_coord = []
            phos_cnt = 0

            # The center itself is always the first point (index 0 for the Voronoi cell)
            ina_d_coord.append(elem_one)

            ## Dictionary of Surrounding Atoms
            for _e, elem_around in atom_dictionary.items():

                for ea_elem_ar in elem_around:

                    dist_cen_surr = np.linalg.norm(elem_one - ea_elem_ar)       # interatomic distance

                    if dist_cen_surr <= cutoff_r and dist_cen_surr > .001:      # Within the cutoff and not self counting

                        if _e != "H":

                            dist_id += 1   # Counting number of particle in the local element

                            if _e == "N" or _e =="C":
                                ina_d.append(dist_cen_surr) # All of the Interatomic distance

                                ina_d_coord.append(ea_elem_ar)

                            if (_e != "P") and (dist_cen_surr <= dist_threshold):  # If it is neigboring atom

                                surr_vec.append(ea_elem_ar - elem_one) # Vector
                                if _e == "C":
                                    dist_ += 1               # Coordination Number

                                elif _e == "N":
                                    not_neigh_dist_ += 1

                            elif (_e == "P") and (dist_cen_surr <= 1.9):
                                phos_cnt += 1
                                surr_vec.append(ea_elem_ar - elem_one) # Vector

            # Angle Average: cosine of every neighbour-center-neighbour angle
            # NOTE(review): both `break`s below abandon the remaining center atoms of
            # this molecule without appending features for them - confirm that the
            # per-atom lists are still meant to line up with the exported environments.
            angle_gather_avg = []
            if len(surr_vec) > 1:  # At least two particles are bonded
                if len(surr_vec) > 4:
                    print("More than 4 bonded atoms")
                    break

                for ever_surr_1 in np.arange(len(surr_vec) - 1):
                    for ever_surr_2 in np.arange(ever_surr_1 + 1, len(surr_vec)):
                        angle_gather_avg.append(np.einsum("i,i->",surr_vec[ever_surr_1], surr_vec[ever_surr_2]) / \
                            (np.linalg.norm(surr_vec[ever_surr_1]) * np.linalg.norm(surr_vec[ever_surr_2])))

            else:

                if len(surr_vec) == 0:
                    print(f"No Bonded Atoms Labled {label_sda}")
                    break

                # A single bonded neighbour defines no angle; 1.0 is used as placeholder
                angle_gather_avg.append(1.0)

            # Pair Wise Distance Matrix (condensed form) of the center and its N/C neighbours
            dis_matrix = distance.pdist(ina_d_coord)

            # Compute Voronoi diagram; with fewer than 5 points a fixed 1.0 is stored instead
            if len(ina_d_coord) >= 5:
                vor = Voronoi(np.asarray(ina_d_coord))

                cell_volumes = voronoi_cell_volume(vor, 0)

            else:
                cell_volumes = 1.0

            ## Gathering all of the features ##

            cn_dictionary.append(dist_)
            cnd_avg.append(dist_)

            local_d.append(dist_id / sphere_volume)
            ld_avg.append(dist_id / sphere_volume)

            connect_score.append(dist_ - not_neigh_dist_ - phos_cnt/2)
            cs_avg.append(dist_ - not_neigh_dist_ - phos_cnt/2)

            interatomic_d.append(np.std(ina_d))

            num_rings.append(ring_count)

            avg_bondangle.append(np.average(angle_gather_avg))

            dist_matrix_gather.append(np.median(dis_matrix))
            dm_avg.append(np.median(dis_matrix))

            voronoi_volume.append(cell_volumes)

            if dist_ > 4: # To see if the distance value is right (Coordinate)
                print(f"{label_sda}  Unrealistic Coordination Number with {dist_}")


        ## Average value of structural feature (Usage for Molecular Structure)
        ## Local Density, Coordination Number, Types of Element bonded, Pair wise distance matrix
        local_d_avg.append(np.mean(ld_avg))
        cn_dictionary_avg.append(np.mean(cnd_avg))
        connect_score_avg.append(np.mean(cs_avg))
        dist_matrix_gather_avg.append(np.mean(dm_avg))

    ea_line += atom_n + 2 # Going to next SDA molecule

print(f"Finished Running  length = {len(cn_dictionary)}")

In [None]:
frames = aseio.read("frame.xyz", ':')


def _atom_property(values, description):
    """Return a chemiscope property entry attached to atoms (target="atom")."""
    return dict(values=values, target="atom", description=description)


# One entry per map axis / colour option offered in the viewer
properties = {
    'pc-1': _atom_property(pca_result[:,0], "PCA - first component"),
    'pc-2': _atom_property(pca_result[:,1], "PCA - second component"),
    'pc-3': _atom_property(pca_result[:,2], "PCA - third component"),
    'Coordination Number': _atom_property(cn_dictionary, "CN"),
    'Local Density': _atom_property(local_d, "local_density"),
    'Variance of bonded atoms': _atom_property(connect_score, "connection score"),
    'Interatomic Distance': _atom_property(interatomic_d, "Standev Interatomic Distance"),
    'Number of Rings': _atom_property(num_rings, "Number of Rings"),
    'Average Bond Angle(H exclude)': _atom_property(avg_bondangle, "Average Bond Angle"),
    'Median Pair Wise Distance': _atom_property(dist_matrix_gather, "Mean_pairwise_dist"),
}

'''
frames: Reading XYZ file using frames = aseio.read(path, ":")
name_environment: Title of the plot
environments=defining the centered atom (Change the frames[i].numbers=="Corresponding Atom number")
'''

# 1. Check the name of the JSON file
# 2. Confirm the environment gathered center atom species

# One [frame index, atom index, cutoff] row per center atom, in frame order
center_number = Atom(center_atom).number
environment_rows = [
    [i, e, cutoff_r]
    for i, frame in enumerate(frames)
    for e in np.where(frame.numbers == center_number)[0]
]

map_settings = {
    'x': { 'property': 'pc-1' },
    'y': { 'property': 'pc-2' },
    'color': { 'property': 'pc-1'},
    'size': {'factor': 50, 'mode': 'linear', 'property': 'pc-1'},
}
structure_settings = [{
    'unitCell': False,
    'supercell': {'0': 1, '1': 1, '2': 1},
    'keepOrientation': True,
    'environments': {'center':True},
}]

chemiscope.write_input(
    "pca_Ncenter_4.json",
    frames,
    meta={'name':'N Center (4Å)'},
    properties=properties,
    settings={'map': map_settings, 'structure': structure_settings},
    environments=np.vstack(environment_rows),
)

## Gathering structural features  (Averaged - Molecule)

In [None]:
# PCA result: load the comma-separated projection of the averaged vectors.
# This rebinds `pca_result`, replacing the array loaded earlier in the notebook.
pca_result = np.loadtxt("avg_pca4N.csv", delimiter=",")
print("PCA result : shape {}".format(pca_result.shape))

In [None]:
center_atom=  "N" # Center Atom Type 

################################################
## Coordinate number around the center atom ##
################################################
# Read frame.xyz into a list of lines; the context manager closes the handle
with open("frame.xyz") as xyz_file:
    coordinate_file = xyz_file.read().splitlines()
ea_line, label_sda = 0, 0   # line cursor and 0-based molecule index

dist_threshold = 1.6              # 1.56 for both (N - C) (C - C)

# One value per molecule that contains `center_atom`.
# NOTE: `num_rings` rebinds the per-atom list of the same name built earlier.
cluster_size = []    # Cluster Size

num_rings = []      # Number of rings

num_zeolite = []    # Number of Synthesizable zeolite

while ea_line < len(coordinate_file):

    atom_n = int(coordinate_file[ea_line].split()[0])

    # A frame is: one atom-count line, one comment line, then `atom_n` atom lines
    # (the loop advances by atom_n + 2 per frame). Stopping at ea_line + atom_n
    # dropped the last two atoms of every molecule and under-counted cluster_size.
    atom_dictionary = {}   # element symbol -> list of coordinate arrays
    for an_ in range(ea_line + 2, ea_line + 2 + atom_n):
        ann_ = coordinate_file[an_].split()

        if ann_[0] not in atom_dictionary.keys():
            atom_dictionary[ann_[0]] = []
        atom_dictionary[ann_[0]].append(np.asarray([float(an_n) for an_n in ann_[1:]])) # Coordinate

    if center_atom in atom_dictionary.keys():

        # Ring count from the molecule's SMILES
        mol = Chem.rdmolfiles.MolFromSmiles(smile_edit[label_sda]) 

        ri = mol.GetRingInfo()

        num_rings.append(len(list(ri.AtomRings())))

        # Molecule size = number of non-hydrogen atoms
        atom_size = 0

        for atom_spe, atom_C in atom_dictionary.items():

            if atom_spe != "H":
                atom_size += len(atom_C)

        cluster_size.append(atom_size)

        num_zeolite.append(len(zeolite_list[label_sda]))

    label_sda += 1 
    ea_line += atom_n + 2 # Going to next SDA molecule
print(f"Finished Running : Length {len(cluster_size)}")

In [None]:
frames = aseio.read("frame.xyz", ':')

# Keep only the molecules that contain the center element. The per-molecule
# property lists are filtered with `center_atom`, so the frame filter must use
# the same element instead of a hard-coded atomic number (7), otherwise frames
# and properties fall out of step as soon as `center_atom` is changed.
center_number = Atom(center_atom).number
n_frames = []
for i in range(len(frames)):
    if np.any(frames[i].numbers == center_number):
        n_frames.append(frames[i])

## Average SOAP vector 

# Every property below carries one value per structure (target="structure")
properties = {'pc-1': dict(values=pca_result[:,0], target="structure",
                                  description="PCA - first component"),
                  'pc-2': dict(values=pca_result[:,1], target="structure",
                                  description="PCA - second component"),
                  'pc-3': dict(values=pca_result[:,2], target="structure",
                                  description="PCA - third component"),
                   'Molecule Size ': dict(values=cluster_size, target="structure",
                                  description="Sum number of atoms"),
                'Number of Ring': dict(values=num_rings, target="structure",
                                  description="Number of Rings"),
              'Number of Syntesized Zeolite': dict(values=num_zeolite, target="structure",
                                  description="num_syn_zeolite"),
               'Local Density(Avg)': dict(values=local_d_avg, target="structure",
                                  description="local_d_avg"),
              'Coordination Number(Avg)': dict(values=cn_dictionary_avg, target="structure",
                                  description="coordination number avg"),
              'Types of bonded element (Avg)': dict(values=connect_score_avg, target="structure",
                                  description="connection score avg"),
             'Distance Matrix (Avg)': dict(values=dist_matrix_gather_avg, target="structure",
                                  description="distance matrix gather avg")}    

chemiscope.write_input("pca_Ncenter_4avg.json", n_frames, 
                 meta={'name':'N-Center (4Å, Avg)'},
                properties=properties,
                settings={
                    'map': {'x': { 'property': 'pc-1' }, 'y': { 'property': 'pc-2' }, 
                            'color': { 'property': 'pc-1' },
                           'size': {'factor': 50, 'mode': 'linear', 'property': 'pc-1'} }, 
                    'structure': [{ 'unitCell': False, 'supercell': {'0': 1, '1': 1, '2': 1}}]
                })

## Correlation Analysis

In [None]:
from scipy.stats import pearsonr

# Row label -> molecule-averaged feature values, in the order they appear in the table
avg_features = {
    "Local Density (Avg)": local_d_avg,
    "Coordination Number(Avg)": cn_dictionary_avg,
    "Types of Bonded Element(Avg)": connect_score_avg,
    "Distance Matrix(Avg)": dist_matrix_gather_avg,
}

# Pearson r (rounded to 2 decimals) between each of the first two principal
# components and each averaged feature
new_dict = {}
for pca_n in np.arange(2):
    component = pca_result[:, pca_n]
    new_dict[f"PC{pca_n + 1}"] = [
        np.round(pearsonr(component, feature_values)[0], 2)
        for feature_values in avg_features.values()
    ]

# Transposed so that each principal component is a row and each feature a column
df = pd.DataFrame(new_dict, index=list(avg_features)).T
df.to_csv("avgSOAP_Ncenter_pearsonr.csv")