In [1]:
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

The goal is to build a transformation matrix that converts a bond-type count vector into an atom-count vector.

$$
\begin{bmatrix}
   c_\text{(CC1, Br)} & c_\text{(CC2, Br)} & \dots & c_\text{(OS1, Br)}\\
   c_\text{(CC1, C)} & c_\text{(CC2, C)} & \dots & c_\text{(OS1, C)}\\
   \vdots & \vdots & \dots & \vdots\\
   c_\text{(CC1, S)} & c_\text{(CC2, S)} & \dots & c_\text{(OS1, S)}\\
\end{bmatrix}
\cdot
\begin{bmatrix}
   b_\text{CC1} \\
   b_\text{CC2} \\
   \vdots \\
   b_\text{OS1}
\end{bmatrix}
=
\begin{bmatrix}
   a_\text{Br} \\
   a_\text{C} \\
   \vdots \\
   a_\text{S}
\end{bmatrix}
$$

In [25]:
# Load the bond-type count table. The first CSV column is read as the index
# and then dropped in favour of a fresh 0..n-1 positional index.
bond_types = (
    pd.read_csv('./data/processed/csp_bond_types.csv', index_col=0)
    .reset_index(drop=True)
)

bond_types.columns

Index(['BrC1', 'BrN1', 'CC1', 'CC1.5', 'CC2', 'CC3', 'CCl1', 'CF1', 'CH1',
       'CN1', 'CN1.5', 'CN2', 'CN3', 'CO1', 'CO2', 'CS1', 'CS2', 'HN1', 'HO1',
       'NN1', 'NN2', 'NO1', 'NS1', 'NS2', 'OS1', 'OS2'],
      dtype='object')

In [26]:
# Preview the first five rows of the bond-type counts.
bond_types.head(n=5)

Unnamed: 0,BrC1,BrN1,CC1,CC1.5,CC2,CC3,CCl1,CF1,CH1,CN1,...,CS2,HN1,HO1,NN1,NN2,NO1,NS1,NS2,OS1,OS2
0,0,0,5,6,1,0,0,0,19,9,...,0,1,0,0,0,0,0,0,0,0
1,0,0,5,6,1,0,0,0,19,9,...,0,1,0,0,0,0,0,0,0,0
2,0,0,5,6,1,0,0,0,19,9,...,0,1,0,0,0,0,0,0,0,0
3,0,0,5,6,1,0,0,0,19,9,...,0,1,0,0,0,0,0,0,0,0
4,0,0,7,0,1,0,0,0,20,9,...,0,1,1,0,0,0,0,0,0,0


In [27]:
# Load the atom-count table; its columns are the element symbols.
atom_counts_csv = './data/processed/atom_counts.csv'
atom_counts = pd.read_csv(atom_counts_csv)

atom_counts.columns

Index(['Br', 'C', 'Cl', 'F', 'H', 'N', 'O', 'S'], dtype='object')

In [28]:
# Per-element divisor used when turning bond orders into atom fractions
# (each coefficient is bond order / obj[element]).
# NOTE(review): these look like standard valences; S and N can form more
# bonds than listed here (S=O / N=O containing groups) -- confirm this is
# the intended approximation.
obj = dict(
    Br=1,
    C=4,
    Cl=1,
    F=1,
    H=1,
    N=3,
    O=2,
    S=2,
)

In [29]:
# All-zero transformation matrix: one row per element (atom_counts columns),
# one column per bond type (bond_types columns).
# Initialised with a float zero so that fractional coefficients (0.25,
# 0.333, ...) can be written into it without an int -> float upcast, which
# recent pandas versions deprecate.
TM = pd.DataFrame(
    data=0.0,
    index=atom_counts.columns,
    columns=bond_types.columns,
)

TM

Unnamed: 0,BrC1,BrN1,CC1,CC1.5,CC2,CC3,CCl1,CF1,CH1,CN1,...,CS2,HN1,HO1,NN1,NN2,NO1,NS1,NS2,OS1,OS2
Br,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Cl,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
H,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
N,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
S,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
# Fill TM so that TM.loc[atom, bond] is the fraction of one `atom` accounted
# for by a single bond of type `bond`.
#
# A bond label is a run of element symbols followed by the bond order, e.g.
# 'CCl1' or 'CC1.5'. The coefficient is
#     bond order * (number of times the element appears in the label) / obj[element]
# so a homonuclear bond such as 'CC1' or 'NN2' counts twice for its element.

# Make sure the matrix can hold fractional values regardless of the dtype it
# was created with.
TM = TM.astype(float)

for bond in bond_types.columns:
    # Split the label into element symbols (capital letter plus optional
    # lowercase letter). A plain substring test would wrongly find 'C' inside
    # 'Cl'.
    elements = re.findall(r"[A-Z][a-z]?", bond)
    bond_order = float("".join(ch for ch in bond if not ch.isalpha()))

    for atom in atom_counts.columns:
        n_ends = elements.count(atom)
        if n_ends == 0:
            # Element does not take part in this bond: leave the entry as is.
            continue

        TM.loc[atom, bond] = bond_order * n_ends / obj[atom]

TM.round(3)

Unnamed: 0,BrC1,BrN1,CC1,CC1.5,CC2,CC3,CCl1,CF1,CH1,CN1,...,CS2,HN1,HO1,NN1,NN2,NO1,NS1,NS2,OS1,OS2
Br,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
C,0.25,0.0,0.5,0.75,1.0,1.5,0.25,0.25,0.25,0.25,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Cl,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
F,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
N,0.0,0.333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333,...,0.0,0.333,0.0,0.667,1.333,0.333,0.333,0.667,0.0,0
O,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.5,1
S,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,0.5,1


In [79]:
# Predict atom counts from the bond counts (rows = structures, columns =
# elements) and compare them with the reference atom counts.
atom_counts_h = TM.dot(bond_types.T).transpose()

diff = atom_counts_h.sub(atom_counts).round(2)

# Show only the rows whose carbon count is not reproduced.
carbon_mismatch = diff['C'].ne(0)
diff.loc[carbon_mismatch]

Unnamed: 0,Br,C,Cl,F,H,N,O,S
63,0.0,1.5,0.0,0.0,0.0,0.67,0.0,2.0
340,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
341,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
342,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
343,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
357,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
358,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0
574,0.0,1.0,0.0,0.0,0.0,-0.0,0.0,2.0
575,0.0,1.0,0.0,0.0,0.0,-0.0,0.0,2.0
576,0.0,1.0,0.0,0.0,0.0,-0.0,0.0,2.0


In [80]:
# Load the energy table and summarise rows labelled 58020 through 58034
# (.loc label slicing includes both end points).
e = pd.read_csv('data/processed/energy.csv')

energy_window = e.loc[slice(58020, 58034)]
energy_window.describe()

Unnamed: 0,energy
count,15.0
mean,-1632.416987
std,0.154427
min,-1632.5117
25%,-1632.5014
50%,-1632.4868
75%,-1632.44515
max,-1631.9965


In [None]:
# Boolean mask of the rows whose atom counts are reproduced by TM for every
# element.
# Coefficients such as 1/3 are not exactly representable in floating point,
# so the summed absolute error of a correct row can be a tiny non-zero
# number; compare against a small tolerance instead of exactly 0.
abs_error = abs((TM @ bond_types.T - atom_counts.T).T).sum(axis=1)
a = abs_error < 1e-6
a[a]

In [None]:
# Number of rows in the bond-type table.
bond_types.shape[0]