In [1]:
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

The goal is to build a transformation matrix that converts a bond-type count vector into an atom count vector.

$$
\begin{bmatrix}
   c_\text{(CC1, Br)} & c_\text{(CC2, Br)} & \dots & c_\text{(OS1, Br)}\\
   c_\text{(CC1, C)} & c_\text{(CC2, C)} & \dots & c_\text{(OS1, C)}\\
   \vdots & \vdots & \dots & \vdots\\
   c_\text{(CC1, S)} & c_\text{(CC2, S)} & \dots & c_\text{(OS1, S)}\\
\end{bmatrix}
\cdot
\begin{bmatrix}
   b_\text{CC1} \\
   b_\text{CC2} \\
   \vdots \\
   b_\text{OS1}
\end{bmatrix}
=
\begin{bmatrix}
   a_\text{Br} \\
   a_\text{C} \\
   \vdots \\
   a_\text{S}
\end{bmatrix}
$$

In [25]:
# Load the bond-type counts. The first CSV column holds the saved row labels;
# they are dropped in favour of a fresh 0..n-1 index.
bond_types = (
    pd.read_csv('./data/processed/csp_bond_types.csv', index_col=0)
    .reset_index(drop=True)
)

bond_types.columns

Index(['BrC1', 'BrN1', 'CC1', 'CC1.5', 'CC2', 'CC3', 'CCl1', 'CF1', 'CH1',
       'CN1', 'CN1.5', 'CN2', 'CN3', 'CO1', 'CO2', 'CS1', 'CS2', 'HN1', 'HO1',
       'NN1', 'NN2', 'NO1', 'NS1', 'NS2', 'OS1', 'OS2'],
      dtype='object')

In [26]:
bond_types.head()

Unnamed: 0,BrC1,BrN1,CC1,CC1.5,CC2,CC3,CCl1,CF1,CH1,CN1,...,CS2,HN1,HO1,NN1,NN2,NO1,NS1,NS2,OS1,OS2
0,0,0,5,6,1,0,0,0,19,9,...,0,1,0,0,0,0,0,0,0,0
1,0,0,5,6,1,0,0,0,19,9,...,0,1,0,0,0,0,0,0,0,0
2,0,0,5,6,1,0,0,0,19,9,...,0,1,0,0,0,0,0,0,0,0
3,0,0,5,6,1,0,0,0,19,9,...,0,1,0,0,0,0,0,0,0,0
4,0,0,7,0,1,0,0,0,20,9,...,0,1,1,0,0,0,0,0,0,0


In [27]:
# Element counts; the column labels (element symbols) become the row labels
# of the transformation matrix TM.
# Presumably one row per molecule, aligned with bond_types — TODO confirm.
atom_counts = pd.read_csv('./data/processed/atom_counts.csv')

atom_counts.columns

Index(['Br', 'C', 'Cl', 'F', 'H', 'N', 'O', 'S'], dtype='object')

In [28]:
# Per-element divisor used when filling TM: each coefficient is
# (bond order, doubled for CC/NN) / obj[element].
# NOTE(review): the values look like the usual valences (C 4, N 3, O and S 2,
# H and halogens 1) — confirm. Sulfur is fixed at 2, so a sulfur atom with a
# larger bond-order sum would be over-counted; this may be related to the
# non-zero S residuals seen when checking the matrix — verify.
obj = {
    'Br':1, 
    'C': 4, 
    'Cl':1, 
    'F': 1, 
    'H': 1, 
    'N': 3, 
    'O': 2, 
    'S': 2
}

In [29]:
# Transformation matrix: one row per element, one column per bond type.
# It is initialised with float zeros because the coefficients written into it
# afterwards are fractional (e.g. 0.25, 0.333). Starting from integer zeros
# would make every such assignment an incompatible-dtype write, which pandas
# has deprecated.
TM = pd.DataFrame(
    data=0.0,
    index=atom_counts.columns,
    columns=bond_types.columns,
)

TM

Unnamed: 0,BrC1,BrN1,CC1,CC1.5,CC2,CC3,CCl1,CF1,CH1,CN1,...,CS2,HN1,HO1,NN1,NN2,NO1,NS1,NS2,OS1,OS2
Br,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
H,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
S,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
def parse_bond_label(label):
    """Split a bond-type column label into element symbols and bond order.

    The label is a run of element symbols followed by the bond order, e.g.
    ``'CCl1'`` -> ``(['C', 'Cl'], 1.0)`` and ``'CN1.5'`` -> ``(['C', 'N'], 1.5)``.

    Parameters
    ----------
    label : str
        Column name of ``bond_types``.

    Returns
    -------
    tuple[list[str], float]
        The element symbols in label order and the numeric bond order.
    """
    # An element symbol is one capital letter optionally followed by one
    # lower-case letter, so 'Cl' and 'Br' are kept whole.
    symbols = re.findall(r"[A-Z][a-z]?", label)
    order = float("".join(ch for ch in label if not ch.isalpha()))
    return symbols, order


# Parse each label once (not once per element) and match whole element
# symbols: a plain substring test would treat 'C' as present in any label
# that contains 'Cl'.
for bond in bond_types.columns:
    symbols, bond_order = parse_bond_label(bond)

    for atom in atom_counts.columns:
        # Number of bond ends that belong to this element: 2 for a
        # homonuclear bond such as 'CC' or 'NN', 1 for a mixed bond,
        # 0 when the element does not take part in the bond.
        n_ends = symbols.count(atom)
        if n_ends == 0:
            continue

        TM.loc[atom, bond] = n_ends * bond_order / obj[atom]

TM.round(3)

Unnamed: 0,BrC1,BrN1,CC1,CC1.5,CC2,CC3,CCl1,CF1,CH1,CN1,...,CS2,HN1,HO1,NN1,NN2,NO1,NS1,NS2,OS1,OS2
Br,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
C,0.25,0.0,0.5,0.75,1.0,1.5,0.25,0.25,0.25,0.25,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Cl,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
N,0.0,0.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333,...,0.0,0.333,0.0,0.667,1.333,0.333,0.333,0.667,0.0,0
O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.5,1
S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5,1


In [79]:
# Reconstruct the element counts from the bond counts (one row per row of
# bond_types, one column per element) and show the rows whose carbon count
# is not reproduced after rounding to two decimals.
atom_counts_h = TM.dot(bond_types.T).T

diff = (atom_counts_h - atom_counts).round(2)

diff.loc[diff['C'] != 0]

Unnamed: 0,Br,C,Cl,F,H,N,O,S
63,0.0,1.5,0.0,0.0,0.0,0.67,0.0,2.0
340,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
341,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
342,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
343,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
357,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
358,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
574,0.0,1.0,0.0,0.0,0.0,-0.0,0.0,2.0
575,0.0,1.0,0.0,0.0,0.0,-0.0,0.0,2.0
576,0.0,1.0,0.0,0.0,0.0,-0.0,0.0,2.0


In [80]:
# Summary statistics of the energy values for row labels 58020..58034
# (label-based slice, both ends included).
# NOTE(review): the reason for choosing this particular range is not recorded
# in the notebook — TODO document it.
e = pd.read_csv('data/processed/energy.csv')

e.loc[58020:58034].describe()

Unnamed: 0,energy
count,15.0
mean,-1632.416987
std,0.154427
min,-1632.5117
25%,-1632.5014
50%,-1632.4868
75%,-1632.44515
max,-1631.9965


In [None]:
# Rows whose reconstructed element counts match the reference exactly:
# the absolute residuals summed over all elements must be 0.
residual = (TM @ bond_types.T - atom_counts.T).T
a = residual.abs().sum(axis=1) == 0
a[a]

In [None]:
len(bond_types)

# Learning the bond matrix


In [None]:
import tensorflow as tf
from tensorflow import transpose as t
from tqdm import tqdm

# w    : fixed bond-type -> element transformation (not trained)
# x    : bond counts, the only trainable tensor, initialised from bond_types
# mask : 1.0 where the original bond count is non-zero, 0.0 elsewhere, so
#        entries that start at zero contribute nothing to the prediction
# y    : target element counts (not trained)
w = tf.Variable(TM, name='w', trainable=False, dtype='float32')
x = tf.Variable(bond_types, name='x', trainable=True, dtype='float32')
mask = tf.Variable(bond_types!=0, name='mask', trainable=False, dtype='float32')
y = tf.Variable(atom_counts, name='y', trainable=False, dtype='float32')

opt = tf.keras.optimizers.Adam()
mse = tf.keras.losses.MeanSquaredError()

# Loss history, stored as plain Python floats (one per optimisation step).
h = []

for i in tqdm(range(10000)):
    # gradient() is called exactly once per step, so a regular
    # (non-persistent) tape is sufficient and frees its resources right away.
    with tf.GradientTape() as tape:
        # relu keeps the masked bond counts non-negative.
        y_h = tf.nn.relu(tf.math.multiply(x, mask)) @ t(w)
        loss = mse(y, y_h)

    h.append(float(loss))

    grads = tape.gradient(loss, [x])
    opt.apply_gradients(zip(grads, [x]))

print('done')

In [None]:
import matplotlib.pyplot as plt

# Loss curve of the optimisation; the x axis is the step index 0..len(h)-1.
fig, ax = plt.subplots()
ax.plot(h)
ax.set(title='Bond-count optimisation', xlabel='Step', ylabel='MSE loss')

# Final loss value.
h[-1]

In [None]:
# Learned bond counts as integers. Round to the nearest integer before the
# cast: astype('int32') on its own truncates, so a learned value such as
# 2.999 would be stored as 2.
output = np.rint(tf.nn.relu(tf.math.multiply(x, mask)).numpy()).astype('int32')

output

In [None]:
# Mean absolute residual per element between the counts reconstructed from
# the learned integer bond counts and the reference atom counts.
abs((TM @ output.T - atom_counts.T).T).mean(axis=0)

In [None]:
# NOTE(review): same expression as the preceding cell (mean absolute residual
# per element) — looks like a duplicate; candidate for removal.
abs((TM @ output.T - atom_counts.T).T).mean(axis=0)

In [None]:
# Wrap the learned integer bond counts in a frame that reuses the original
# bond-type column labels.
learned_bond_types = pd.DataFrame(output, columns=bond_types.columns)

learned_bond_types.head()

In [None]:
# Persist the learned bond counts without the row index.
learned_bond_types.to_csv('data/processed/learned_bond_types.csv', index=False)

## CSP


In [None]:
from funcs.csp import get_csp_bond_type_count

# NOTE(review): `train_db` is not defined in any cell visible here; it must
# already exist in the kernel before this cell runs — TODO add the cell that
# opens it so the notebook survives Restart & Run All.
df = pd.DataFrame()

for i in tqdm(range(1, 138366)):
    row = train_db.get(i)

    bond_counts = get_csp_bond_type_count(row)

    # Identity test: `== None` would dispatch to the result's own __eq__.
    if bond_counts is None:
        # Report the first id for which no bond counts were produced and stop.
        print(i)
        break

    for bond in bond_counts:
        # A bond type seen for the first time becomes a new zero-filled column.
        if bond not in df.columns:
            df[bond] = 0
        df.loc[i, bond] = bond_counts[bond]

df.head()

In [None]:
get_csp_bond_type_count(train_db.get(34))

In [None]:
# NOTE(review): this writes the same path as the following cell, but without
# the index column. The cell that loads csp_bond_types.csv reads it with
# index_col=0, which would then consume the first bond-type column as the
# index — confirm which variant is intended and drop the other.
df.to_csv('data/processed/csp_bond_types.csv', index=False)

In [None]:
# Persist the bond-type counts together with the row index as the first
# CSV column.
df.to_csv('data/processed/csp_bond_types.csv', index=True)

In [None]:
len(df)

In [4]:
# Fetch entry 34 for manual inspection.
# NOTE(review): `train_db` is not defined in the visible cells — verify where
# it is opened.
row = train_db.get(34)

# get_csp_bond_type_count(row)

In [2]:
from funcs.distance import *
from funcs.convert import *
from funcs.display import *

# display_molecule_video(row, atom_index=True, bond_type=False, equal_aspect_ratio=True)

In [5]:
# Classify the atom pair (15, 16) as a 'BrC' bond from their distance, which
# is passed through convert_to_picometers first.
# NOTE(review): the helpers come from the funcs.* star imports; their exact
# return conventions are not visible here — verify.
get_bond_type('BrC', convert_to_picometers(get_distance_matrix(row)[15][16]))

1

In [6]:
# Distance matrix of the selected entry and the bond-type matrix derived from
# it and the element symbols.
# NOTE(review): the long "<pair> <number>" printout below this cell appears
# to come from inside get_bond_type_matrix (leftover debug output?) — confirm
# and silence it in the helper.
dist_mat = get_distance_matrix(row)
bond_matrix = get_bond_type_matrix(dist_mat, row.symbols)

CC 0.0
CC 150.3715464412515
CC 255.46969433400395
CC 382.45293014307765
CC 436.76451882776763
CO 570.3229783829095
CC 694.2715835016974
CC 749.2260795983774
CO 690.205209953711
CN 887.5827973210694
CN 970.8426369361246
CC 954.2812665721674
CN 1080.2885632034722
CN 1165.917948714534
CC 1105.9715498234807
CC 381.59323151256103
BCr 543.8507997974194
CC 252.73839160175646
CH 113.23292845805807
CH 112.81893155537864
CH 111.65960808749088
CH 270.57424612836013
CH 470.8647657689687
CH 771.8468385150982
CH 743.3438339604573
CH 942.8443898488193
CH 869.834086454054
CH 1162.2962501219638
CH 267.9754567737905
CC 0.0
CC 140.3555917608663
CC 243.2252404937431
CC 286.43176411972394
CO 419.9551633777837
CC 544.2341383982871
CC 604.127349395365
CO 553.4680228349533
CN 743.1873474559223
CN 832.8585070336605
CC 826.0839789747296
CN 956.3059943548109
CN 1034.7584623472578
CC 967.7437532965179
CC 245.01527579085032
BCr 426.00875856198
CC 139.99613229565048
CH 214.83922807949645
CH 214.23773211693504
CH 21

In [20]:
bond_matrix[16][15]

0.0

In [15]:
# Manually evaluate three rejection conditions for the atom pair (15, 16)
# labelled 'BrC': too short, too long, or unknown bond name.
# NOTE(review): shortest_bond_length, longest_bond_length and type_bond_length
# are not defined in the visible cells; presumably they come from the funcs.*
# star imports — verify.
bond_length = convert_to_picometers(get_distance_matrix(row)[15][16])
bond_name = 'BrC'

print(bond_length < shortest_bond_length)
print(bond_length > longest_bond_length)
print(bond_name not in type_bond_length)

False
False
False


- increase the maximum bond length to 195
- something is wrong with sulfur bonds

In [None]:
from funcs.csp import csp_solve_bond_matrix

# Scan ids 35..139999 and stop at the first one for which the solver returns
# None, an empty collection, or a collection that contains None.
# NOTE(review): `train_db` is not defined in the visible cells — verify where
# it is opened.
for i in range(35, 140000):
    row = train_db.get(i)
    sols = csp_solve_bond_matrix(row, find_all=True)
    # Identity test for None: `== None` would dispatch to the result's __eq__.
    if sols is None or len(sols) == 0 or None in sols:
        print(i)
        break

In [None]:
len([None])