In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import time

import numpy as np
import pandas as pd
import h5py
from sklearn.model_selection import train_test_split 

import torch
torch.set_default_dtype(torch.float64)

import e3nn
import e3nn.point.data_helpers as dh 
from training_utils import *

In [17]:
# make sure CUDA is available
print(torch.cuda.current_device())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
print(torch.cuda.is_available())
print(torch.version.cuda)
device = "cuda"
#torch.rand(10).to(device)
#torch.rand(10, device=device)

AssertionError: Torch not compiled with CUDA enabled

In [18]:
device="cpu"

In [4]:
# read stationary data
with h5py.File("acetone/acetone-b3lyp_d3bj-631gd-gas-equilibrium_geometry.hdf5", "r") as h5:
    atomic_numbers = np.array(h5.get("atomic_numbers"))
    isotropic_shieldings = np.array(h5.get("isotropic_shieldings"))
print(atomic_numbers)
print(isotropic_shieldings)

[6 8 6 1 1 1 6 1 1 1]
[ -20.1123 -337.7406  159.0655   29.4662   29.4664   29.7726  159.0733
   29.4659   29.4658   29.773 ]


In [5]:
# read perturbed data
with h5py.File("acetone/acetone-b3lyp_d3bj-631gd-gas-NMR-pcSseg_1.hdf5", "r") as h5:
    geoms_and_shieldings = np.array(h5.get("data"))
print(np.shape(geoms_and_shieldings))
geometries = geoms_and_shieldings[:,:,:3]
shieldings = geoms_and_shieldings[:,:,3]

(100000, 10, 4)


In [6]:
# normalize data on a per-site basis
# this is extreme, but let's see how well we can do with this
means = np.mean(shieldings, axis=0)
stdevs = np.std(shieldings, axis=0)
shieldings = (shieldings-means)/stdevs

In [7]:
# this function will undo scaling so we can recover the absolute shieldings from predictions
def undo_scaling(v):
    return v * stdevs + means

In [8]:
# generate one-hot features
one_hots = pd.get_dummies(atomic_numbers)
one_hots = one_hots.to_numpy()

# get some shapes
n_examples = geometries.shape[0]
n_atoms_per_example = geometries.shape[1]
n_one_hots = one_hots.shape[1]

# features are just one-hots, so repeat the same one-hots for every training example
desired_shape = (n_examples, n_atoms_per_example, n_one_hots)
features = np.zeros(desired_shape)
features[:] = one_hots              # broadcast one_hots into every row along axis 0

In [9]:
# features are one-hots for every atom,
# so this is (number of one_hots, rank zero tensor, even parity)
Rs_in = [(n_one_hots,0,1)]

# we are outputing one scalar for every atom
# so this is (one, rank zero tensor, even parity)
Rs_out = [(1,0,1)]

# maximum extent of radial basis functions in Angstroms
max_radius = 2.0

In [10]:
class DataNeighborsSelection(dh.DataNeighbors):
    def __init__(self, x, Rs_in, pos, r_max, atomic_numbers, relevant_atomic_numbers=None, self_interaction=True, **kwargs):
        if atomic_numbers is None:
            raise ValueError("atomic_numbers must be defined")
        if relevant_atomic_numbers is None:
            relevant_atomic_numbers = [1, 6]  
        relevant_atom_index = [i for i, atom_number in enumerate(atomic_numbers)
                               if atom_number in relevant_atomic_numbers]
        relevant_atom_index = torch.LongTensor(relevant_atom_index).unsqueeze(0)  # [1, N]
        
        super().__init__(x, Rs_in, pos, r_max,
                         self_interaction=self_interaction,
                         atomic_numbers=atomic_numbers, 
                         relevant_atom_index=relevant_atom_index,
                         **kwargs)

In [11]:
# save a subset of the data to a .torch file
# precompute the relevant neighbors
relevant_atomic_numbers = [1, 6]

def save_dataset(geometry_subset, shielding_subset, feature_subset, output_filename):
    dataset = []
    n_to_save = len(geometry_subset)
    print(f"Preprocessing for {output_filename}:")
    for i,(g,s,f) in enumerate(zip(geometry_subset, shielding_subset, feature_subset)):
        g,s,f = torch.tensor(g, dtype=torch.float64), torch.tensor(s, dtype=torch.float64), torch.tensor(f, dtype=torch.float64)
        data = DataNeighborsSelection(x=f, Rs_in=Rs_in, pos=g, r_max=max_radius,
                                      atomic_numbers=atomic_numbers, relevant_atomic_numbers=relevant_atomic_numbers,
                                      y=s, Rs_out=Rs_out)
        dataset.append(data)
        if (i+1) % 100 == 0 or i == n_to_save - 1:
            print(f"{i+1:10d} of {n_to_save:10d}...", end="\r", flush=True)
    print("\nWriting to disk...", end="", flush=True)
    torch.save(dataset, output_filename)
    print(f"done.  Saved {len(dataset)} records.")
    
# splits the dataset randomly into training and test sets
# train_size = # number of training examples
# test_size = # number of test examples
# prefix = filenames will start with this string
# random_state = for reproducible splits
def split_and_save(train_size, test_size, prefix, random_state):
    assert train_size + test_size <= n_examples
    # [ training_geometries, training_shieldings, training_features,
    #   testing_geometries, testing_shieldings, testing_features     ]
    splitting = train_test_split(geometries, shieldings, features,
                                 test_size = test_size, train_size = train_size,
                                 random_state = random_state, shuffle = True)
    save_dataset(*splitting[::2], f"{prefix}-train.torch")
    save_dataset(*splitting[1::2], f"{prefix}-test.torch")

In [12]:
train_size = 500
test_size = 5000
split_and_save(train_size, test_size, "acetone-split1", 1)

Preprocessing for acetone-split1-train.torch:
       500 of        500...
Writing to disk...done.  Saved 500 records.
Preprocessing for acetone-split1-test.torch:
      5000 of       5000...
Writing to disk...done.  Saved 5000 records.


In [32]:
dataset1 = torch.load("acetone-split1-train.torch")
dataset2 = torch.load("acetone-split1-test.torch")
batch_size = 5
train_dataloader = tg.data.DataListLoader(dataset1, batch_size=batch_size, shuffle=True)
test_dataloader = tg.data.DataListLoader(dataset2, batch_size=batch_size, shuffle=False)

In [14]:
# RMSE loss function for this system
# only incorporate elements of interest
# relevant_atomic_numbers = {1,6}
# def loss_function_generator(atomic_numbers):
#     indices = []
#     for i,atomic_number in enumerate(atomic_numbers):
#         if atomic_number in relevant_atomic_numbers:
#             indices.append(i)
#     indices = indices * batch_size
            
#     def loss_function(x,y):
#         error = (x-y)[indices]**2.0
#         return error.mean()
    
#     return loss_function
# loss_function = loss_function_generator(atomic_numbers)

In [33]:
# define the neural network architecture
model_kwargs = {
    'network': 'GatedConvParityNetwork', 
    'conv': 'Convolution',
    'Rs_in': Rs_in,            # shape of inputs
    'Rs_out': Rs_out,          # shape of outputs
    'mul': 5,                 # how many copies of each tensor at each layer
    'lmax': 2,                 # maximum angular momentum
    'layers': 3,               # number of layers
    'max_radius': max_radius,  # radial kernel will extend out this far
    'number_of_basis': 10,     # number of Gaussians in radial kernel
}

model_kwargs = {
    'network': 'GatedConvParityNetwork', 
    'conv': 'Convolution',
    'Rs_in': Rs_in,            # shape of inputs
    'Rs_out': Rs_out,          # shape of outputs
    'mul': 5,                 # how many copies of each tensor at each layer
    'lmax': 2,                 # maximum angular momentum
    'layers': 3,               # number of layers
    'max_radius': max_radius,  # radial kernel will extend out this far
    'number_of_basis': 10,     # number of Gaussians in radial kernel
}

model = model_from_kwargs(model_kwargs)

In [16]:
# # test the model as it now
# # n_norm is average number of convolution neighbors per atom
# model.to(device)
# start_time = time.time()
# losses = evaluate(model, test_dataloader, [loss_function], device, n_norm=5)
# end_time = time.time()
# elapsed_time = end_time - start_time
# print(f"Loss is {losses[0].data:.4f}.  Took {elapsed_time:.2f} seconds.")

In [34]:
# training parameters
learning_rate = 3e-3
opt = torch.optim.Adam(model.parameters(), learning_rate)
max_iter = 100       
n_norm = 5           # n_norm is average number of convolution neighbors per atom
n_batches = int(train_size / batch_size)

In [48]:
model.to(device)

for i in range(max_iter):
    start_time = time.time()
    loss_cum = torch.tensor(0.)
    data_good = None
    for j,data in enumerate(train_dataloader):
        if j == 2:
            break
        print(f"Iteration {i+1:5d}    batch {j+1:5d} / {n_batches:5d}", end="\r", flush=True)
        data = tg.data.Batch.from_data_list(data)
        data.to(device)
        output = model(data.x, data.edge_index, data.edge_attr, n_norm=n_norm)

        loss = ((output[data.relevant_atom_index] - data.y[data.relevant_atom_index]) ** 2).mean()
#         loss = loss_function(output, data.y)
        loss_cum += loss
        opt.zero_grad()
        loss.backward()
        opt.step()
    end_time = time.time()
    elasped_time = end_time - start_time
    print(f"Iteration {i+1:5d}    batch {j+1:5d} / {n_batches:5d}  loss = {loss_cum.data / (j+1):7.4f}    elapsed = {elasped_time:7.2f} s")
    

Iteration     1    batch     3 /   100  loss =  0.5958    elapsed =    0.57 s
Iteration     2    batch     3 /   100  loss =  0.9471    elapsed =    0.50 s
Iteration     3    batch     3 /   100  loss =  0.6032    elapsed =    0.53 s
Iteration     4    batch     3 /   100  loss =  0.4546    elapsed =    0.51 s
Iteration     5    batch     3 /   100  loss =  0.4161    elapsed =    0.51 s
Iteration     6    batch     3 /   100  loss =  0.4712    elapsed =    0.50 s
Iteration     7    batch     3 /   100  loss =  0.8175    elapsed =    0.51 s
Iteration     8    batch     3 /   100  loss =  0.6472    elapsed =    0.51 s
Iteration     9    batch     3 /   100  loss =  0.5778    elapsed =    0.51 s
Iteration    10    batch     3 /   100  loss =  0.5404    elapsed =    0.50 s
Iteration    11    batch     3 /   100  loss =  0.6549    elapsed =    0.51 s
Iteration    12    batch     3 /   100  loss =  0.5936    elapsed =    0.51 s
Iteration    13    batch     3 /   100  loss =  0.9196    elapse

KeyboardInterrupt: 

In [None]:
# # test the model as it now
# # n_norm is average number of convolution neighbors per atom
# losses = evaluate(model, test_dataloader, [loss_function], device, n_norm=5)
# print(losses)