In [1]:
import numpy as np
import tensorflow as tf
from pinn import get_network
from pinn.utils import atomic_dress
from ase.collections import g2
from pinn.io import  load_numpy
from docs.notebooks.network_fns import get_traintest_sets, preprocess_traintest_sets
from ase import Atoms
from ase.calculators.lj import LennardJones
from tensorboard.plugins.hparams import api as hp

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
# Helper function: get the position given PES dimension(s)
def three_body_sample(atoms, a, r):
    """Arrange the three atoms of ``atoms`` for a given angle and distance.

    Atom 0 is put at the origin, atom 1 at the fixed point (0, 2, 0), and
    atom 2 in the y-z plane at distance ``r`` from the origin, rotated by
    ``a`` degrees away from the y axis.

    Args:
        atoms: object exposing ``set_positions``; it is updated in place.
        a: angle in degrees.
        r: distance of the third atom from the origin.

    Returns:
        The same ``atoms`` object that was passed in.
    """
    theta = a * np.pi / 180  # degrees -> radians
    origin = [0, 0, 0]
    fixed = [0, 2, 0]
    moving = [0, r*np.cos(theta), r*np.sin(theta)]
    atoms.set_positions([origin, fixed, moving])
    return atoms

In [3]:
atoms = Atoms('H3', calculator=LennardJones())

na, nr = 50, 50
arange = np.linspace(30, 180, na)  # angles in degrees
rrange = np.linspace(1, 3, nr)     # distances of the third atom from the origin

# Truth: egrid[i, j] is the energy at angle arange[i] and distance rrange[j].
# indexing='ij' gives agrid/rgrid that same (angle, distance) layout. The
# default 'xy' layout is the transpose, which only has a matching shape here
# because na == nr, so plotting egrid against it would swap the two axes.
agrid, rgrid = np.meshgrid(arange, rrange, indexing='ij')
egrid = np.zeros([na, nr])
for i in range(na):
    for j in range(nr):
        atoms = three_body_sample(atoms, arange[i], rrange[j])
        egrid[i, j] = atoms.get_potential_energy()

# Samples: draw (angle, distance) pairs from the grid values and label each
# geometry with its energy and forces. A seeded local generator keeps the
# dataset identical across "Restart & Run All" without touching NumPy's
# global random state.
SAMPLE_SEED = 0
sample_rng = np.random.default_rng(SAMPLE_SEED)

nsample = 100
asample, rsample = [], []
distsample = []
data = {'e_data': [], 'f_data': [], 'elems': [], 'coord': []}
for i in range(nsample):
    a, r = sample_rng.choice(arange), sample_rng.choice(rrange)
    atoms = three_body_sample(atoms, a, r)
    dist = atoms.get_all_distances()
    dist = dist[np.nonzero(dist)]  # keep only the non-zero entries of the distance matrix
    data['e_data'].append(atoms.get_potential_energy())
    data['f_data'].append(atoms.get_forces())
    data['coord'].append(atoms.get_positions())
    data['elems'].append(atoms.numbers)
    asample.append(a)
    rsample.append(r)
    distsample.append(dist)

### Dataset from numpy arrays

In [4]:
# Turn every per-sample list into a single array before handing it to PiNN.
data = {key: np.array(values) for key, values in data.items()}
dataset = load_numpy(data, splits={'train': 8, 'test': 2})

# The train set contains only 80 molecules, so batches are unnecessary
# (hence batch_size=1).
train_set, test_set, batch_size = get_traintest_sets(
    dataset,
    buffer_size=100,
    batch_size=1,
)

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2022-07-22 15:14:35.356853: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-07-22 15:14:35.356964: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Train with hyperparameter tuning

In [5]:
def get_and_train_network(
    hparams,
    log_root="/Users/miguelnavaharris/Project/hparams_logs/fixedbatch_1_epoch/",
    n_epochs=1,
    log_every=5,
):
    """Build the network described by ``params`` and train it with a manual loop.

    Besides its arguments, the function reads these notebook globals, so the
    cells defining them must have been run first: ``params``, ``train_set``,
    ``test_set``, ``batch_size``, ``HP_NUM_NODES``, ``HP_DEPTH`` and ``HP_LR``.

    The network and optimizer are configured from ``params`` only; ``hparams``
    is used to name the TensorBoard run directory and is logged with
    ``hp.hparams`` so runs can be compared.

    Args:
        hparams: dict keyed by ``HP_NUM_NODES``, ``HP_DEPTH`` and ``HP_LR``.
        log_root: directory under which the per-run TensorBoard directory is
            created. The default is the original author's local path; pass a
            path valid on your machine.
        n_epochs: number of passes over ``train_set``.
        log_every: print progress and write running TensorBoard scalars every
            this many training steps (step 0 only prints the initial loss).

    Returns:
        None. Results are printed and written to TensorBoard.

    Raises:
        ValueError: if ``log_every`` is smaller than 1.
    """
    import os
    import time

    from pinn.optimizers import get

    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    network = get_network(params['network'])
    # NOTE(review): the return value is discarded. If this helper returns new
    # datasets instead of acting in place, its preprocessing is lost here;
    # confirm against docs.notebooks.network_fns.
    preprocess_traintest_sets(train_set, test_set, network=network)

    # Set up the TensorBoard directory and writer, one directory per combination.
    nodes = hparams[HP_NUM_NODES]
    depth = hparams[HP_DEPTH]
    lr = hparams[HP_LR]
    run_dir = os.path.join(log_root, f"{nodes}nodes_{depth}depth_{lr}lr")
    run_dir_writer = tf.summary.create_file_writer(run_dir)

    optimizer = get(params['optimizer'])
    loss_fn = tf.keras.losses.mse
    train_loss_metric = tf.keras.metrics.MeanSquaredError()
    val_loss_metric = tf.keras.metrics.MeanSquaredError()
    train_err_metric = tf.keras.metrics.RootMeanSquaredError()
    val_err_metric = tf.keras.metrics.RootMeanSquaredError()

    def predict_per_structure(batch, training):
        """Run the network and segment-sum its output over ``ind_1[:, 0]``."""
        output = network(batch, training=training)
        ind = batch['ind_1']
        nbatch = tf.reduce_max(ind) + 1
        return tf.math.unsorted_segment_sum(output, ind[:, 0], nbatch)

    def energy_target(batch, dtype):
        """Return the reference energies, dressed (if configured) and scaled."""
        e_data = batch['e_data']
        if params['e_dress']:
            e_data -= atomic_dress(batch, params['e_dress'], dtype=dtype)
        # The scale is applied whether or not dressing is configured; it used
        # to be skipped silently when 'e_dress' was empty.
        e_data *= params['e_scale']
        return e_data

    for epoch in range(n_epochs):
        print(f'Start of epoch {epoch + 1}')
        start_time_epoch = time.time()
        last_log_elapsed = 0.0  # seconds since epoch start at the previous log

        # Iterate over the batches of the dataset.
        for step, batch in enumerate(train_set):
            with tf.GradientTape() as tape:
                pred = predict_per_structure(batch, training=True)
                e_data = energy_target(batch, pred.dtype)
                # Compute the loss value for this minibatch.
                loss_value = loss_fn(e_data, pred)

            # Differentiate the loss w.r.t. the trainable weights and apply
            # one optimizer step.
            grads = tape.gradient(loss_value, network.trainable_weights)
            optimizer.apply_gradients(zip(grads, network.trainable_weights))

            # Update the running loss and error metrics.
            train_loss_metric.update_state(e_data, pred)
            train_err_metric.update_state(e_data, pred)

            if step == 0:
                print(f"Initial loss (for one batch): {float(loss_value)}")
                print(f"Seen so far: {((step + 1) * batch_size)} molecules")
            elif step % log_every == 0:
                print(f"Training loss (for one batch) at step {step}: {float(loss_value)}")
                print(f"Seen so far: {((step + 1) * batch_size)} molecules")

                # Time since the previous log. The first value is measured
                # from the start of the epoch, so it also covers step 0.
                elapsed = time.time() - start_time_epoch
                interval = elapsed - last_log_elapsed
                last_log_elapsed = elapsed
                print(f'Training time for {log_every} batches: {interval} s')

                # Record running TensorBoard metrics.
                with run_dir_writer.as_default():
                    print('writing to tb')
                    tf.summary.scalar('Running training loss', train_loss_metric.result(), step=step)
                    tf.summary.scalar('Running training error', train_err_metric.result(), step=step)
                    tf.summary.scalar(f'Training time/{log_every} batches', interval, step=step)

        print(f'Training time for epoch {epoch + 1}: {(time.time() - start_time_epoch)} s')

        # Run a validation loop at the end of each epoch.
        print(f'Starting validation for epoch {(epoch + 1)}')
        for batch in test_set:
            val_pred = predict_per_structure(batch, training=False)
            # Use the validation prediction's own dtype; the original relied
            # on `pred` leaking out of the training loop.
            e_data = energy_target(batch, val_pred.dtype)
            val_loss_metric.update_state(e_data, val_pred)
            val_err_metric.update_state(e_data, val_pred)

        print(f"Time taken for epoch {epoch + 1}: {(time.time() - start_time_epoch)} s")

        # Display metrics at the end of each epoch.
        train_err = train_err_metric.result()
        print(f"Training err over epoch {(epoch + 1)}: {float(train_err)}")
        val_err = val_err_metric.result()
        print(f"Validation err for epoch {(epoch + 1)}: {float(val_err)}")

        # Write the epoch summary to TensorBoard, then reset the metrics.
        with run_dir_writer.as_default():
            if epoch == 0:
                # The hyperparameters only need to be recorded once per run.
                hp.hparams(hparams)
            epoch_step = epoch + 1  # equals the previous constant 1 for a single epoch
            tf.summary.scalar("training error", train_err, step=epoch_step)
            tf.summary.scalar("validation error", val_err, step=epoch_step)
            tf.summary.scalar("training loss", train_loss_metric.result(), step=epoch_step)
            tf.summary.scalar("validation loss", val_loss_metric.result(), step=epoch_step)

        train_err_metric.reset_states()
        val_err_metric.reset_states()
        train_loss_metric.reset_states()
        val_loss_metric.reset_states()

    

    

In [6]:
# Search space of the sweep: layer width, network depth and initial learning rate.
HP_NUM_NODES = hp.HParam("num nodes", hp.Discrete([32, 64, 128]))
HP_DEPTH = hp.HParam("depth", hp.Discrete([6, 8, 10, 128]))
HP_LR = hp.HParam("initial learning rate", hp.Discrete([0.03, 0.009, 0.003]))

# NOTE(review): no training call appears inside this loop, so every pass just
# overwrites `hparams` and `params`; only the last combination is still bound
# when the loop ends. Confirm whether get_and_train_network(hparams) was meant
# to be called here.
for num_nodes in HP_NUM_NODES.domain.values:
    for depth in HP_DEPTH.domain.values:
        for lr in HP_LR.domain.values:
            hparams = {HP_NUM_NODES: num_nodes, HP_DEPTH: depth, HP_LR: lr}

            params = dict(
                # Adam with an exponentially decaying learning rate; note that
                # 'clipnorm' belongs to the optimizer config, not the schedule.
                optimizer=dict(
                    class_name='Adam',
                    config=dict(
                        learning_rate=dict(
                            class_name='ExponentialDecay',
                            config=dict(
                                initial_learning_rate=lr,
                                decay_steps=10000,
                                decay_rate=0.994,
                            ),
                        ),
                        clipnorm=0.01,
                    ),
                ),
                # PiNet with the same width for every sub-network.
                network=dict(
                    name='PiNet',
                    params=dict(
                        ii_nodes=[num_nodes] * 2,
                        pi_nodes=[num_nodes] * 2,
                        pp_nodes=[num_nodes] * 2,
                        out_nodes=[num_nodes] * 2,
                        depth=depth,
                        rc=3.0,
                        atom_types=[1],
                    ),
                ),
                e_dress={},
                e_scale=1,
                e_unit=1.0,
            )

In [7]:
# Retry training when TensorFlow raises InvalidArgumentError, but give up
# after a bounded number of attempts so a persistent failure cannot loop
# forever; the final error is re-raised with its traceback.
MAX_TRAIN_ATTEMPTS = 5

error_count = 0
while True:
    try:
        get_and_train_network(hparams)
        break
    except tf.errors.InvalidArgumentError as err:
        error_count += 1
        print(f'Raised an error (attempt {error_count} of {MAX_TRAIN_ATTEMPTS}): {err}')
        if error_count >= MAX_TRAIN_ATTEMPTS:
            raise

2022-07-22 15:14:36.093229: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2022-07-22 15:14:36.093508: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Start of epoch 1
Initial loss (for one batch): 1935.3538818359375
Seen so far: 1 molecules
Training loss (for one batch) at step 5: 358396960.0
Seen so far: 6 molecules
Training time for 20 batches: 22.261831998825073 s
writing to tb
Training loss (for one batch) at step 10: 1325567744.0
Seen so far: 11 molecules
Training time for 20 batches: 16.935457944869995 s
writing to tb
Training loss (for one batch) at step 15: 495481696.0
Seen so far: 16 molecules
Training time for 20 batches: 16.96381688117981 s
writing to tb
Training loss (for one batch) at step 20: 9251945.0
Seen so far: 21 molecules
Training time for 20 batches: 16.749802112579346 s
writing to tb
Training loss (for one batch) at step 25: 16451140608.0
Seen so far: 26 molecules
Training time for 20 batches: 17.07881999015808 s
writing to tb
Training loss (for one batch) at step 30: 15802391552.0
Seen so far: 31 molecules
Training time for 20 batches: 16.8231041431427 s
writing to tb
Training loss (for one batch) at step 35: 

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


### Search 1 (5 epochs)

In [None]:
# Record of the grid used for search 1. Running this cell rebinds the HP_*
# names that were defined for the sweep earlier in the notebook.
HP_NUM_NODES = hp.HParam(
    "num nodes",
    hp.Discrete([4, 8, 16]),
)
HP_DEPTH = hp.HParam(
    "depth",
    hp.Discrete([4, 8, 16]),
)
HP_LR = hp.HParam(
    "initial learning rate",
    hp.Discrete([0.003, 0.0003, 0.00003]),
)

Best configuration (lowest validation error): 16 nodes, depth 8, initial learning rate 0.003. \
Training error: 0.70893 \
Validation error: 0.46347

### Search 2 (5 epochs)

In [None]:
# Record of the grid used for search 2. Running this cell rebinds the HP_*
# names that were defined for the sweep earlier in the notebook.
HP_NUM_NODES = hp.HParam(
    "num nodes",
    hp.Discrete([16, 32, 64]),
)
HP_DEPTH = hp.HParam(
    "depth",
    hp.Discrete([8, 32, 64]),
)
HP_LR = hp.HParam(
    "initial learning rate",
    hp.Discrete([0.03, 0.009, 0.003]),
)

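Best configuration (lowest validation error): 32 nodes, depth 8, initial learning rate 0.003. \
Training error: 2.9923 \
Validation error: 0.99355

A repeat of the search gives 16 nodes, depth 8, initial learning rate 0.003 as the best configuration. \
Training error: 1.8622 \
Validation error: 1.0966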

### Search 3 (5 epochs)

In [None]:
# Record of the grid used for search 3. Running this cell rebinds the HP_*
# names that were defined for the sweep earlier in the notebook.
HP_NUM_NODES = hp.HParam(
    "num nodes",
    hp.Discrete([32, 64, 128]),
)
HP_DEPTH = hp.HParam(
    "depth",
    hp.Discrete([6, 8, 10, 128]),
)
HP_LR = hp.HParam(
    "initial learning rate",
    hp.Discrete([0.03, 0.009, 0.003]),
)

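Best configuration (lowest validation error): 32 nodes, depth 6, initial learning rate 0.003. \
Training error: 2.7386 \
Validation error: 1.0408

A repeat of the search gives 32 nodes, depth 8, initial learning rate 0.003 as the best configuration. \
Training error: 1.0956 \
Validation error: 1.4258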