In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from tqdm import tqdm 

In [None]:
# Read the model inputs (first CSV column is used as the index) and the targets,
# converting each table straight to a NumPy array.
# NOTE(review): energy.csv is read without index_col, unlike the inputs — confirm
# the file has no index column, otherwise it becomes an extra target column.
x = pd.read_csv("./data/processed/csp_bond_types.csv", index_col=0).to_numpy()
y = pd.read_csv("./data/processed/energy.csv").to_numpy()

# Show both shapes as the cell output
x.shape, y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Two-stage split: hold out 20% of all samples for testing, then take 20% of the
# remainder as the validation set (roughly 64% / 16% / 20% train / val / test).
# The fixed random_state makes both shuffles repeatable.
x_, x_test, y_, y_test = train_test_split(
    x, y,
    test_size=0.2, random_state=42, shuffle=True,
)
x_train, x_val, y_train, y_val = train_test_split(
    x_, y_,
    test_size=0.2, random_state=42, shuffle=True,
)

In [None]:
# Print the (inputs, targets) shapes of the train, validation and test subsets, in that order
for _inputs, _targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
    print(_inputs.shape, _targets.shape)

## Hyperparameter tuning

In [None]:
def loguniform(low=.001, high=.3, size=None, rng=None):
    """Draw samples that are uniformly distributed on a logarithmic scale.

    Parameters
    ----------
    low, high : float or array-like
        Strictly positive bounds of the sampling interval.
    size : int or tuple of ints, optional
        Output shape; ``None`` returns a single value.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, the global ``np.random`` state is
        used, exactly as before this parameter existed.

    Returns
    -------
    float or numpy.ndarray
        Samples obtained by exponentiating a uniform draw between
        ``log(low)`` and ``log(high)``.

    Raises
    ------
    ValueError
        If either bound is not strictly positive (its logarithm is undefined).
    """
    if np.any(np.asarray(low) <= 0) or np.any(np.asarray(high) <= 0):
        raise ValueError("loguniform bounds must be strictly positive")
    # Both the legacy module-level API and Generator objects expose `.uniform`
    sampler = np.random if rng is None else rng
    return np.exp(sampler.uniform(np.log(low), np.log(high), size))

n = 32  # number of random (learning rate, layer size) candidates to evaluate
e = 10  # number of cross-validation folds used for every candidate

# Seed NumPy's global generator so the sampled candidates are the same on every
# Restart & Run All; without it the search cannot be reproduced.
np.random.seed(42)

lr = loguniform(size=n)                      # learning rates, log-uniform over the default range
ls = np.random.randint(1, 50, size=n) * 20   # hidden-layer sizes: multiples of 20 from 20 to 980

# One mean validation MAE per candidate is appended here by the search loop
results = []

In [None]:
%matplotlib inline

# Preview the sampled candidates: learning rate (log axis) against hidden-layer size
plt.scatter(x=lr, y=ls)
plt.gca().set_xscale('log')

In [None]:
from tensorflow.keras.layers import Dense as den
from tensorflow.keras.layers import InputLayer as inp
from sklearn.model_selection import KFold, cross_val_score

def _build_model(units, learning_rate):
    """Return a freshly initialised, compiled one-hidden-layer regression network.

    Parameters
    ----------
    units : int
        Number of neurons in the hidden ReLU layer.
    learning_rate : float
        Learning rate handed to the Adam optimizer.
    """
    net = tf.keras.Sequential([
        # InputLayer expects a shape tuple, not a bare integer
        inp((x.shape[1],)),
        den(units, activation='relu'),
        den(1)
    ])
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mse",
        # a list is accepted by every Keras version; a bare string is rejected by newer ones
        metrics=["mae"]
    )
    return net


# Start from an empty list so re-running this cell never appends to stale results
results = []

k_fold = KFold(n_splits=e)

for i in tqdm(range(n)):
    performance = []
    for train_indices, test_indices in k_fold.split(x_val):
        # Each fold gets its own untrained model. Reusing one model across folds
        # would let it train on samples that a later fold uses for validation.
        tf.keras.backend.clear_session()
        model = _build_model(ls[i], lr[i])

        h = model.fit(
            x_val[train_indices], y_val[train_indices],
            validation_data=(x_val[test_indices], y_val[test_indices]),
            epochs=1,
            verbose=0
        )
        performance.append(h.history['val_mae'][0])

    # Score of candidate i: validation MAE averaged over the folds
    results.append(sum(performance) / len(performance))

In [None]:
# Turn the per-candidate scores into an array (needed for argmin and plotting) and display them
results = np.asarray(results)
results

In [None]:
# Persist the sampled learning rates, layer sizes and their scores, one file each.
# With savetxt's default number format the integer layer sizes are written as floats.
np.savetxt(fname="lr.csv", X=lr, delimiter=",")
np.savetxt(fname="ls.csv", X=ls, delimiter=",")
np.savetxt(fname="r.csv", X=results, delimiter=",")

In [None]:
%matplotlib inline

fig, ax = plt.subplots()

# Every candidate, coloured by its cross-validated MAE
scat_plot = ax.scatter(lr, ls, c=results)
plt.colorbar(scat_plot, pad=0.1, label='MAE')

ax.set_xscale('log')

# Best candidate = lowest MAE; drawn as a dark star on a white disc
i_min = int(np.argmin(results))
ax.scatter(lr[i_min], ls[i_min], marker="o", c='w', s=76)
ax.scatter(lr[i_min], ls[i_min], marker="*", c='#440256', s=70)

# Remember the auto-scaled limits: the guide lines below start at 0 and would
# otherwise stretch the axes
xlim = ax.get_xlim()
ylim = ax.get_ylim()

# Dashed guides from the axes to the best candidate, drawn behind the markers
ax.plot([0, lr[i_min]], [ls[i_min], ls[i_min]], c='#696969', linestyle='dashed', linewidth=1, zorder=-10)
ax.plot([lr[i_min], lr[i_min]], [0, ls[i_min]], c='#696969', linestyle='dashed', linewidth=1, zorder=-10)

# Label the guides with the values actually found; hard-coded numbers would go
# stale whenever the search returns a different optimum
ax.text(lr[i_min]+.005, 10, f'{lr[i_min]:.2g}')
ax.text(0.001, ls[i_min]+10, f'{ls[i_min]}')

ax.set_xlim(xlim)
ax.set_ylim(ylim)

ax.set_xlabel('Learning rate')
ax.set_ylabel('Neurons in the\nhidden layer')

fig.savefig('hyperparameter-tuning.svg', bbox_inches='tight')

## Training the model

In [None]:
import os

work_dir = 'trained/'

# exist_ok=True keeps this cell re-runnable: plain os.mkdir raises
# FileExistsError as soon as the directories exist from an earlier run.
os.makedirs(f'{work_dir}', exist_ok=True)
os.makedirs(f'{work_dir}/checkpoint', exist_ok=True)
os.makedirs(f'{work_dir}/model', exist_ok=True)

In [None]:
from tensorflow.keras.layers import Dense as d
from tensorflow.keras.layers import InputLayer as i

tf.keras.backend.clear_session()

# Final network: one hidden ReLU layer sized by the best candidate of the search
# (index i_min), followed by a single linear output unit.
# Uses the `d` (Dense) and `i` (InputLayer) aliases imported at the top of this
# cell instead of aliases that only exist if another cell has been run.
model = tf.keras.Sequential([
    # InputLayer expects a shape tuple, not a bare integer
    i((x.shape[1],)),
    d(ls[i_min],
      activation='relu'
    ),
    d(1)
])


model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr[i_min]),
    loss="MSE",
    # a list is accepted by every Keras version; a bare string is rejected by newer ones
    metrics=["MAE"]
)

In [None]:
# Build the model, then print its layer-by-layer summary.
# NOTE(review): build() is called without an input shape; presumably redundant
# because the model already starts with an input layer — confirm.
model.build()
model.summary()

In [None]:
from tensorflow.keras.callbacks import CSVLogger

# Log per-epoch metrics to CSV; append=True adds to an existing log file
# instead of overwriting it, so repeated runs accumulate in one file.
csv_logger = CSVLogger(filename=f'{work_dir}/training.log', append=True)

# Train on the training subset only (no validation data is passed here)
model.fit(
    x=x_train,
    y=y_train,
    epochs=200,
    verbose=True,
    callbacks=[csv_logger],
)

# Persist the trained model under the working directory
model.save(f'{work_dir}/model')

## Results

In [None]:
# Reload the model saved by the training cell
model = tf.keras.models.load_model(filepath=f'{work_dir}/model')

In [None]:
# Loss and metric values on the held-out test subset
model.evaluate(x=x_test, y=y_test)

In [None]:
# Element-wise absolute error on the test subset. Despite the name `mae`,
# nothing is averaged here — this is one error value per prediction.
mae = np.abs(model.predict(x_test) - y_test)

In [None]:
# NOTE(review): read without index_col, unlike the feature table — if this file
# carries an index column it will be included in the row sums computed later.
atom_counts = pd.read_csv(filepath_or_buffer='data/processed/atom_counts.csv')

In [None]:
# Row-wise sum over all columns of the atom-count table: one total per row
total_atoms = atom_counts.sum(axis="columns")

# Display the number of rows as the cell output
total_atoms.shape

In [None]:
%matplotlib inline

# Repeat the first train/test split (same test_size and random_state) on the
# atom totals so they line up row-for-row with x_test / y_test; element [1] of
# the returned pair is the test part.
total_atoms_test = train_test_split(
    total_atoms, test_size=.2, shuffle=True, random_state=42
)[1]

# Flatten the errors to one dimension
mae = mae.ravel()

print(total_atoms_test.shape)
print(mae.shape)

# Scatter of absolute error against molecule size, restricted to errors below 5
plt.scatter(
    x=total_atoms_test[mae < 5], y=mae[mae < 5], alpha=0.1
)

In [None]:
# One row per test sample: its absolute error and its total atom count
d = pd.DataFrame(data={"MAE": mae, "Total Atoms": total_atoms_test})

In [None]:
# Snap every atom count to the nearest multiple of 5 to form bins
d['Total Atoms'] = (d['Total Atoms'] / 5).round() * 5

In [None]:
# Mean absolute error per atom-count bin, using only samples with an error below 5.
# Selecting the 'MAE' column explicitly keeps the result single-column even when
# the cell is re-run after `d` has gained further columns.
b = d.loc[d['MAE'] < 5].groupby("Total Atoms")[['MAE']].mean()

b.columns = ['mean MAE']

# Drop a 'mean MAE' column left over from an earlier run before joining;
# otherwise join() fails on the overlapping column name.
d = d.drop(columns=['mean MAE'], errors='ignore').join(b, on="Total Atoms")

In [None]:
# Deviation of each sample's error from the mean error of its atom-count bin
d['diff'] = d['MAE'].sub(d['mean MAE'])

# Per-bin means of every column, shown as the cell output
d.groupby(by="Total Atoms").mean()

In [None]:
fig, ax = plt.subplots()

# Error-bar length per bin: half the standard deviation of 'diff' among samples
# with an error below 5. Computed before the integer cast below.
# NOTE(review): named `se`, but this is std/2 rather than std/sqrt(n) — confirm
# which one was intended.
se = d.loc[d['MAE'] < 5].groupby("Total Atoms")['diff'].std() / 2

# Integer bin labels so the bar chart ticks read 5, 10, ... rather than 5.0, 10.0, ...
d['Total Atoms'] = d['Total Atoms'].astype('int')

# Bar chart of the mean error per bin (errors below 5 only) with the bars above
d.loc[d['MAE'] < 5].groupby("Total Atoms").mean().reset_index().plot.bar(
    x="Total Atoms",
    y="MAE",
    yerr=se,
    legend=False,
    ax=ax,
)

ax.set_ylabel("MAE")

fig.savefig('MAE-vs-Total-Atoms.svg', bbox_inches='tight')

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()

# Histogram of the absolute errors below 5, in 80 bins
ax.hist(mae[mae < 5], bins=80)

ax.set(ylabel='Frequency', xlabel='Absolute Error')

fig.savefig('AE-distribution.svg', bbox_inches='tight')

In [None]:
# Create a dedicated figure instead of relying on pyplot's implicit current
# figure: outside the inline backend, plt.axes() would draw onto whatever figure
# the previous cell left open and plt.savefig would save that combined figure.
fig, ax = plt.subplots()

# Distribution of molecule sizes over the whole data set, in 49 bins
total_atoms.hist(bins=49, ax=ax)


ax.set_ylabel('Frequency')
ax.set_xlabel('Atoms in a molecule');

fig.savefig('distribution-of-data.svg', bbox_inches='tight')