In [1]:
import pandas as pd
import scipy.io
import numpy as np
from scipy.spatial.distance import pdist, squareform
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import tensorflow as tf
import time
# One seed shared by TensorFlow and NumPy so that the stochastic steps below
# (e.g. the random atom permutations) draw from a fixed, repeatable stream.
rand_state = 42
tf.set_random_seed(rand_state)
np.random.seed(rand_state)

import warnings
# Installs an 'ignore' filter for every warning category for the rest of the
# session. NOTE(review): presumably meant to hide library deprecation noise,
# but it also hides NumPy divide-by-zero RuntimeWarnings — confirm this is wanted.
warnings.simplefilter('ignore')

# Data  

I am using the QM7 dataset, a subset of GDB-13 (a database of nearly 1 billion stable and synthetically accessible organic molecules) restricted to molecules with up to 7 heavy atoms (C, N, O, and S). The 3D Cartesian coordinates of the most stable conformations and their atomization energies were determined using ab-initio density functional theory (PBE0/tier2 basis set). The dataset also provides Coulomb matrices, computed as in [Rupp et al. PRL, 2012]:  
  * $C_{i,i} = 0.5 \cdot Z_i^{2.4}$  
  * $C_{i,j} = \frac{Z_i Z_j}{|R_i - R_j|}$ for $i \neq j$  
  * $Z_i$ - nuclear charge of atom i  
  * $R_i$ - cartesian coordinates of atom i  

The data file (.mat format, we recommend using `scipy.io.loadmat` for python users) contains five arrays:  
  * "X" - (7165 x 23 x 23), Coulomb matrices  
  * "T" - (7165), atomization energies (unit: kcal/mol)  
  * "P" - (5 x 1433), cross-validation splits as used in [Montavon et al. NIPS, 2012]  
  * "Z" - (7165 x 23), atomic charges  
  * "R" - (7165 x 23 x 3), cartesian coordinates (unit: Bohr) of each atom in the molecules 

In [2]:
# Load the QM7 arrays from the .mat file (path is relative to the notebook's
# working directory).
qm7 = scipy.io.loadmat('./data/qm7/qm7.mat')

# Atomization energies as a flat 1-D target vector. ravel() flattens whatever
# 2-D layout loadmat returns without hard-coding the number of molecules.
y = qm7['T'].ravel()
R = qm7['R']  # per-molecule atom coordinates
Z = qm7['Z']  # per-molecule nuclear charges

# Number of atom slots per molecule. Defined here, right after loading, so the
# Coulomb-matrix and augmentation cells that use `num_atoms` work on a fresh
# kernel instead of relying on a variable left over from an earlier session.
num_atoms = Z.shape[1]

## Data Augmentation  

We simply do not have enough data for our convnet to be really accurate, so we augment it: rearranging the order of the atoms in a molecule and recomputing the Coulomb matrix for that ordering gives a new input with the same atomization energy. Note that the charge vector must be rearranged in exactly the same order. In theory every molecule has up to 23! permutations, which is far too many to enumerate, so we take only 10 random permutations per molecule, making the total dataset 10 times larger. 

In [65]:
def get_coulomb_matrix(R, Z, indices=None):
    """Compute the Coulomb matrix of one molecule, optionally with reordered atoms.

    Reordering the atoms changes the Coulomb matrix but not the atomization
    energy, so calling this with a random permutation of the atom indices
    (0 to n_atoms - 1) produces an additional training input for the same
    target value.

    Entries are defined as::

        C[i, i] = 0.5 * Z[i] ** 2.4
        C[i, j] = Z[i] * Z[j] / |R[i] - R[j]|      (i != j)

    Parameters
    ----------
    R : array-like, shape (n_atoms, n_dims)
        Coordinates of each atom.
    Z : array-like, shape (n_atoms,)
        Nuclear charge of each atom. Empty ("invisible") atom slots are
        expected to carry charge 0.
    indices : array-like of int, optional
        Index order applied identically to ``R`` and ``Z`` before the matrix
        is computed. ``None`` keeps the given order.

    Returns
    -------
    numpy.ndarray, shape (n_atoms, n_atoms)
        The Coulomb matrix. Entries that evaluate to NaN (0 / 0, which occurs
        between zero-charge atoms sharing the same position) are set to 0.
    """
    # The matrix size is taken from the inputs themselves rather than from a
    # notebook-level variable, so the function works for any molecule size.
    R = np.asarray(R, dtype=float)
    Z = np.asarray(Z, dtype=float)

    if indices is not None:
        # R and Z must be rearranged with the same ordering.
        R = R[indices]
        Z = Z[indices]

    dist_matrix = squareform(pdist(R))

    # Divisions by a zero distance are expected here (the diagonal, and pairs
    # of padded atoms), so silence the floating-point warnings locally.
    with np.errstate(divide='ignore', invalid='ignore'):
        CM = np.outer(Z, Z) / dist_matrix
        # Overwrite the diagonal (currently Z_i^2 / 0) with the self-term.
        np.fill_diagonal(CM, 0.5 * Z ** 2.4)

    # 0 / 0 yields NaN for invisible atoms; treat those interactions as 0.
    CM[np.isnan(CM)] = 0
    return CM

In [83]:
# Number of random atom orderings generated per molecule; the augmented
# dataset is `augment_factor` times larger than the original.
augment_factor = 10

# Take both sizes from the loaded charge array so this cell does not depend on
# a variable defined in some other (possibly deleted) cell.
num_molecules, num_atoms = Z.shape

CM_augmented = np.zeros((num_molecules * augment_factor, num_atoms, num_atoms))
# Reordering atoms leaves the atomization energy unchanged, so each target is
# simply repeated `augment_factor` times, in the same order as the matrices.
y_augmented = np.repeat(np.asarray(y, dtype=float), augment_factor)

for i in range(num_molecules):
    for j in range(augment_factor):
        # A fresh random ordering of the atom slots for every augmented copy.
        rand_indices = np.random.permutation(num_atoms)
        CM_augmented[i * augment_factor + j] = get_coulomb_matrix(R[i], Z[i], rand_indices)

In [113]:
# Sanity check: dimensions of the augmented Coulomb-matrix array.
# NOTE(review): the recorded output has a trailing axis of size 1 that the
# augmentation cell does not create — presumably a reshape was run in a cell
# not shown here; confirm on a fresh kernel (Restart & Run All).
CM_augmented.shape

(71650, 23, 23, 1)