In [1]:
%load_ext autoreload
%autoreload 2

In [28]:
import time

import numpy as np
import pandas as pd
import h5py
from sklearn.model_selection import train_test_split 

import torch
torch.set_default_dtype(torch.float64)

import e3nn
import e3nn.point.data_helpers as dh 
from training_utils import *

In [19]:
# make sure CUDA is available
print(torch.cuda.current_device())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))
print(torch.cuda.is_available())
print(torch.version.cuda)
device = "cuda"
#torch.rand(10).to(device)
#torch.rand(10, device=device)

0
1
Quadro M4000
True
10.1


In [4]:
# read stationary data
with h5py.File("acetone/acetone-b3lyp_d3bj-631gd-gas-equilibrium_geometry.hdf5", "r") as h5:
    atomic_numbers = np.array(h5.get("atomic_numbers"))
    isotropic_shieldings = np.array(h5.get("isotropic_shieldings"))
print(atomic_numbers)
print(isotropic_shieldings)

[6 8 6 1 1 1 6 1 1 1]
[ -20.1123 -337.7406  159.0655   29.4662   29.4664   29.7726  159.0733
   29.4659   29.4658   29.773 ]


In [5]:
# read perturbed data
with h5py.File("acetone/acetone-b3lyp_d3bj-631gd-gas-NMR-pcSseg_1.hdf5", "r") as h5:
    geoms_and_shieldings = np.array(h5.get("data"))
print(np.shape(geoms_and_shieldings))
geometries = geoms_and_shieldings[:,:,:3]
shieldings = geoms_and_shieldings[:,:,3]

(100000, 10, 4)


In [6]:
# normalize data on a per-site basis
# this is extreme, but let's see how well we can do with this
means = np.mean(shieldings, axis=0)
stdevs = np.std(shieldings, axis=0)
shieldings = (shieldings-means)/stdevs

In [7]:
# this function will undo scaling so we can recover the absolute shieldings from predictions
def undo_scaling(v):
    return v * stdevs + means

In [8]:
# generate one-hot features
one_hots = pd.get_dummies(atomic_numbers)
one_hots = one_hots.to_numpy()

# get some shapes
n_examples = geometries.shape[0]
n_atoms_per_example = geometries.shape[1]
n_one_hots = one_hots.shape[1]

# features are just one-hots, so repeat the same one-hots for every training example
desired_shape = (n_examples, n_atoms_per_example, n_one_hots)
features = np.zeros(desired_shape)
features[:] = one_hots              # broadcast one_hots into every row along axis 0

In [9]:
# features are one-hots for every atom,
# so this is (number of one_hots, rank zero tensor, even parity)
Rs_in = [(n_one_hots,0,1)]

# we are outputing one scalar for every atom
# so this is (one, rank zero tensor, even parity)
Rs_out = [(1,0,1)]

# maximum extent of radial basis functions in Angstroms
max_radius = 2.0

In [None]:
dh.DataNeighbors()

In [None]:
class DataNeighborsSelection(dh.DataNeighbors):
    def __init__(self, x, Rs_in, pos, r_max, atomic_numbers, relevant_atomic_numbers=None, self_interaction=True, **kwargs):
        if atomic_numbers is None:
            raise ValueError("atomic_numbers must be defined")
        if relevant_atomic_numbers is None:
            relevant_atomic_numbers = [1, 6]  
        relevant_atomic_number_index = [i for i, atom_number for enumerate(atomic_numbers) 
                                        if atom_number in relevant_atomic_numbers]
        
        super().__init__(self, x, Rs_in, pos, r_max, atomic_numbers=atomic_numbers, 
                         relevant_atomic_number_index=relevant_atomic_number_index,
                         self_interaction=self_interaction, **kwargs)

In [33]:
# save a subset of the data to a .torch file
# precompute the relevant neighbors
def save_dataset(geometry_subset, shielding_subset, feature_subset, output_filename):
    dataset = []
    n_to_save = len(geometry_subset)
    print(f"Preprocessing for {output_filename}:")
    for i,(g,s,f) in enumerate(zip(geometry_subset, shielding_subset, feature_subset)):
        g,s,f = torch.tensor(g, dtype=torch.float64), torch.tensor(s, dtype=torch.float64), torch.tensor(f, dtype=torch.float64)
        data = dh.DataNeighborsSelection(x=f, Rs_in=Rs_in, pos=g, r_max=max_radius, 
                                         atomic_numbers=atomic_numbers, relevant_atomic_numbers=relevant_atomic_numbers,
                                         y=s, Rs_out=Rs_out)
        dataset.append(data)
        if (i+1) % 100 == 0 or i == n_to_save - 1:
            print(f"{i+1:10d} of {n_to_save:10d}...", end="\r", flush=True)
    print("\nWriting to disk...", end="", flush=True)
    torch.save(dataset, output_filename)
    print(f"done.  Saved {len(dataset)} records.")
    
# splits the dataset randomly into training and test sets
# train_size = # number of training examples
# test_size = # number of test examples
# prefix = filenames will start with this string
# random_state = for reproducible splits
def split_and_save(train_size, test_size, prefix, random_state):
    assert train_size + test_size <= n_examples
    # [ training_geometries, training_shieldings, training_features,
    #   testing_geometries, testing_shieldings, testing_features     ]
    splitting = train_test_split(geometries, shieldings, features,
                                 test_size = test_size, train_size = train_size,
                                 random_state = random_state, shuffle = True)
    save_dataset(*splitting[::2], f"{prefix}-train.torch")
    save_dataset(*splitting[1::2], f"{prefix}-test.torch")

In [55]:
train_size = 500
test_size = 5000
split_and_save(train_size, test_size, "acetone-split1", 1)

Preprocessing for acetone-split1-train.torch:
       500 of        500...
Writing to disk...done.  Saved 500 records.
Preprocessing for acetone-split1-test.torch:
      5000 of       5000...
Writing to disk...done.  Saved 5000 records.


In [56]:
dataset1 = torch.load("acetone-split1-train.torch")
dataset2 = torch.load("acetone-split1-test.torch")
batch_size = 5
train_dataloader = tg.data.DataListLoader(dataset1, batch_size=batch_size, shuffle=True)
test_dataloader = tg.data.DataListLoader(dataset2, batch_size=batch_size, shuffle=False)

In [57]:
# RMSE loss function for this system
# only incorporate elements of interest
relevant_atomic_numbers = {1,6}
def loss_function_generator(atomic_numbers):
    indices = []
    for i,atomic_number in enumerate(atomic_numbers):
        if atomic_number in relevant_atomic_numbers:
            indices.append(i)
    indices = indices * batch_size
            
    def loss_function(x,y):
        error = (x-y)[indices]**2.0
        return error.mean()
    
    return loss_function
loss_function = loss_function_generator(atomic_numbers)

In [58]:
# define the neural network architecture
model_kwargs = {
    'network': 'GatedConvParityNetwork', 
    'conv': 'Convolution',
    'Rs_in': Rs_in,            # shape of inputs
    'Rs_out': Rs_out,          # shape of outputs
    'mul': 5,                 # how many copies of each tensor at each layer
    'lmax': 2,                 # maximum angular momentum
    'layers': 3,               # number of layers
    'max_radius': max_radius,  # radial kernel will extend out this far
    'number_of_basis': 10,     # number of Gaussians in radial kernel
}
model = model_from_kwargs(model_kwargs)

In [59]:
# test the model as it now
# n_norm is average number of convolution neighbors per atom
model.to(device)
start_time = time.time()
losses = evaluate(model, test_dataloader, [loss_function], device, n_norm=5)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Loss is {losses[0].data:.4f}.  Took {elapsed_time:.2f} seconds.")

Loss is 1.0489.  Took 31.72 seconds.


In [63]:
# training parameters
learning_rate = 3e-3
opt = torch.optim.Adam(model.parameters(), learning_rate)
max_iter = 100       
n_norm = 5           # n_norm is average number of convolution neighbors per atom
n_batches = int(train_size / batch_size)

In [65]:
model.to(device)

for i in range(max_iter):
    start_time = time.time()
    for j,data in enumerate(train_dataloader):
        print(f"Iteration {i+1:5d}    batch {j+1:5d} / {n_batches:5d}", end="\r", flush=True)
        data = tg.data.Batch.from_data_list(data)
        data.to(device)
        output = model(data.x, data.edge_index, data.edge_attr, n_norm=n_norm)
        loss = loss_function(output, data.y)
    end_time = time.time()
    elasped_time = end_time - start_time
    print(f"Iteration {i+1:5d}    batch {j+1:5d} / {n_batches:5d}  loss = {loss.data:7.4f}    elapsed = {elasped_time:7.2f} s")
    opt.zero_grad()
    loss.backward()
    opt.step()

Iteration     1    batch   100 /   100  loss =  1.0282    elapsed =    4.80 s
Iteration     2    batch   100 /   100  loss =  1.4504    elapsed =    4.67 s
Iteration     3    batch    30 /   100

KeyboardInterrupt: 

In [None]:
# test the model as it now
# n_norm is average number of convolution neighbors per atom
losses = evaluate(model, test_dataloader, [loss_function], device, n_norm=5)
print(losses)

In [None]:
# 

In [None]:
shielding = torch.tensor(geoms_and_shieldings[:, :, 3], dtype=torch.float64).unsqueeze(-1)
shielding_mean, shielding_std = shielding.mean(), shielding.std()
print(shielding_mean, shielding_std)

In [48]:
model = model_from_kwargs(model_kwargs)

In [49]:
# model = load_model('acetone/acetone_5000_s2_small_then_large_batch.torch', model_kwargs=model_kwargs)

In [50]:
# read perturbed data
with h5py.File("acetone/acetone-b3lyp_d3bj-631gd-gas-NMR-pcSseg_1.hdf5", "r") as h5:
    geoms_and_shieldings = np.array(h5.get("data"))
    
shielding = torch.tensor(geoms_and_shieldings[:, :, 3], dtype=torch.float64).unsqueeze(-1)
shielding_mean, shielding_std = shielding.mean(), shielding.std()
print(shielding_mean, shielding_std)

tensor(38.1470) tensor(132.1694)


In [51]:
batch_size = 5

dataset = torch.load('acetone/acetone_geo/acetone_geometric_dataset_2499.torch')
dataset_2 = torch.load('acetone/acetone_geo/acetone_geometric_dataset_4999.torch')
dataloader = tg.data.DataListLoader(dataset + dataset_2, batch_size=batch_size, shuffle=True)

batch_size_eval = 32

test_dataset = torch.load('acetone/acetone_geo/acetone_geometric_dataset_7499.torch')
test_dataloader = tg.data.DataListLoader(test_dataset, batch_size=batch_size_eval)

test_dataset_2 = torch.load('acetone/acetone_geo/acetone_geometric_dataset_12499.torch')
test_dataloader_2 = tg.data.DataListLoader(test_dataset_2, batch_size=batch_size_eval)

In [52]:
# stuff = evaluate(model, dataloader, [loss_fn_mse, loss_fn_mae], 'cuda:1', 5)
stuff = evaluate(model, test_dataloader, [loss_fn_mse, loss_fn_mae], 'cuda:1', 5)

In [53]:
print('MSE, MAE in ppm')
stuff.cpu() * torch.tensor([shielding_std ** 2, shielding_std])

MSE, MAE in ppm


tensor([17763.4309,    76.9878])

In [54]:
optimizer = torch.optim.Adam(model.parameters(), 1e-3)

In [55]:
print(shielding_std)

tensor(132.1694)


In [74]:
for results in train(model, optimizer, dataloader, test_dataloader, device="cuda:1", scale_loss=shielding_std):
    with open('trial_save_small_fresh.torch', 'wb') as f:
        results['model_kwargs'] = model_kwargs
        torch.save(results, f)

tensor(132.1694)


KeyboardInterrupt: 

In [58]:
batch_size = 32

dataset = torch.load('acetone/acetone_geo/acetone_geometric_dataset_2499.torch')
dataset_2 = torch.load('acetone/acetone_geo/acetone_geometric_dataset_4999.torch')
dataloader = tg.data.DataListLoader(dataset + dataset_2, batch_size=batch_size, shuffle=True)

batch_size_eval = 32

test_dataset = torch.load('acetone/acetone_geo/acetone_geometric_dataset_7499.torch')
test_dataloader = tg.data.DataListLoader(test_dataset, batch_size=batch_size_eval)

test_dataset_2 = torch.load('acetone/acetone_geo/acetone_geometric_dataset_12499.torch')
test_dataloader_2 = tg.data.DataListLoader(test_dataset_2, batch_size=batch_size_eval)

In [75]:
# optimizer.state_dict()
# optimizer.load_state_dict(torch.load('trial_save_small_fresh_then_large_3.torch')['optimizer_state_dict'])

In [81]:
for results in train(model, optimizer, dataloader, test_dataloader, device="cuda:1", scale_loss=shielding_std):
    with open('trial_save_small_fresh_then_large_4.torch', 'wb') as f:
        results['model_kwargs'] = model_kwargs
        results['optimizer_state_dict'] = optimizer.state_dict()
        torch.save(results, f)

tensor(132.1694)
0
0 0.023256438494325772 0.06758774281502303
1
1 0.024299281997717435 0.10878588771268155
2
2 0.06089909779102471 0.13667444588613611
3
3 0.031067039163834582 0.11077813202038372
4
4 0.020992056336723554 0.07819750832269377
6
6 0.03315900033748473 0.09038669058420604
8
8 0.04377081040214497 0.10746706677435931
10
10 0.008875907741764251 0.07398536610897587
13
13 0.015275169531678692 0.07669201796318766
16
16 0.019881993784602547 0.09188974737110242
20
20 0.003872629902721232 0.04662992818969591
24
24 0.03303993555847925 0.13614711336262067
28
28 0.020550604682463922 0.08464845198598286
33
33 0.05209700788139679 0.1399424574293314
38
38 0.01578286659632377 0.07969935404991953
43
43 0.016564069675294026 0.08187145729249747
48
48 0.012948603669968767 0.07056699742493842
53
53 0.03197495904109956 0.0990769836184005
58
58 0.03821668914527086 0.0998475685185112
63
63 0.09624980110292763 0.16876971585625913
68
68 0.018231170827573886 0.07392675479664505
73
73 0.08650278999442

In [82]:
# saved = torch.load('trial_save.torch')
# saved = torch.load('trial_save_small.torch')
# saved = torch.load('trial_save_small_fresh_then_large.torch')
# saved = torch.load('trial_save_small_fresh_then_large_2.torch')
# saved = torch.load('trial_save_small_fresh_then_large_3.torch')
saved = torch.load('trial_save_small_fresh_then_large_4.torch')

In [85]:
saved['dynamics'][-1]

{'step': 98,
 'wall': 7250.098932480905,
 'batch': {'loss': 0.051605455525687315, 'mean_abs': 0.10825157145981973},
 'test': {'loss': tensor(0.0323, device='cuda:1'),
  'mean_abs': tensor(0.0934, device='cuda:1')},
 'train': {'loss': tensor(0.0205, device='cuda:1'),
  'mean_abs': tensor(0.0809, device='cuda:1')}}