In [1]:
import numpy as np
import pandas as pd

from molloader import *
from ML_utils import *

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split, KFold, ParameterGrid

In [2]:
def drop_duplicates(df):
    """Keep only the first row seen for every distinct ``compound`` value.

    The number of remaining rows is printed, and the de-duplicated frame is
    returned; the frame passed in is left untouched.
    """
    is_repeat = df.duplicated(subset="compound", keep="first")
    unique_df = df[~is_repeat]
    print("Number of compounds", unique_df.shape[0])
    return unique_df

In [3]:
def drop_low_frequency_elements(df, limit):
    """Remove compounds that contain a rarely occurring element.

    An element counts as "low frequency" when it appears fewer than ``limit``
    times across all ``species`` entries of ``df`` (every occurrence is
    counted). Any row whose ``species`` holds at least one such element is
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``species`` column holding one sequence of element
        symbols per row.
    limit : int
        Minimum number of occurrences an element needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows that contain no low-frequency element. The remaining row
        count is printed as a side effect.
    """
    # Nothing to count in an empty frame; return it unchanged instead of failing.
    if df.shape[0] == 0:
        print("Number of compounds", df.shape[0])
        return df

    # One entry per (row, element) occurrence, then count occurrences per element.
    elem_counts = df.species.explode().value_counts()
    low_freq = set(elem_counts[elem_counts < limit].index)

    # Direct membership test: a row is flagged only if it really contains a
    # rare element. Comparing lengths of a de-duplicating set difference would
    # also flag rows that merely list the same element more than once.
    low_freq_mask = df.species.apply(
        lambda s: any(e in low_freq for e in s)
    ).astype(bool)

    df = df[~low_freq_mask]
    print("Number of compounds", df.shape[0])
    return df

In [4]:
def get_all_elements(df):
    """Return the sorted array of distinct element symbols in ``df.species``."""
    species_per_row = np.array(df.species)
    flat_symbols = np.hstack(species_per_row)
    return np.unique(flat_symbols)

In [5]:
class VoxelNet(nn.Module):
    """3D convolutional network that regresses one scalar per voxel grid.

    The input is a batch of cubic grids laid out as ``(N, C, D, H, W)``.
    Three un-padded 3x3x3 convolution stages (the first two followed by a
    2x max-pool) feed a small fully connected regressor that outputs a
    tensor of shape ``(N, 1)``.

    Parameters
    ----------
    in_channels : int, default 1
        Number of channels ``C`` of the input grid.
    grid_size : int, default 32
        Edge length ``D = H = W`` of the input grid. It determines the input
        width of the first linear layer; the default of 32 yields 1024.

    Raises
    ------
    ValueError
        If ``grid_size`` is too small to survive the convolution/pooling stack.
    """

    def __init__(self, in_channels=1, grid_size=32):
        super().__init__()
        # (N, C, D, H, W)
        self.conv1 = nn.Sequential(
            nn.Conv3d(in_channels, 16, 3, padding=0, stride=1),
            # SELU's only constructor argument is `inplace`; it takes no slope
            # value, so the activation is built without arguments like the others.
            nn.SELU(),
            nn.MaxPool3d(2))

        self.conv2 = nn.Sequential(
            nn.Conv3d(16, 16, 3, padding=0, stride=1),
            nn.SELU(),
            nn.MaxPool3d(2))

        self.conv3 = nn.Sequential(
            nn.Conv3d(16, 16, 3, padding=0, stride=1),
            nn.SELU())

        self.regressor = nn.Sequential(nn.Flatten(),
                                       nn.Linear(self._flat_features(grid_size), 32),
                                       nn.SELU(),
                                       nn.Linear(32, 8),
                                       nn.SELU(),
                                       nn.Linear(8, 1))

    @staticmethod
    def _flat_features(grid_size, channels=16):
        """Return the flattened feature count produced by the conv stack."""
        side = (grid_size - 2) // 2   # conv1 (kernel 3, no padding) then 2x max-pool
        side = (side - 2) // 2        # conv2 then 2x max-pool
        side = side - 2               # conv3
        if side < 1:
            raise ValueError(
                f"grid_size={grid_size} is too small for the convolution stack")
        return channels * side ** 3

    def forward(self, x):
        """Map a ``(N, C, D, H, W)`` batch to ``(N, 1)`` predictions."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.regressor(x)
        return x

In [8]:
# Base name of the dataset: the pickle is read from data/<data_name>.pkl and
# the same name prefixes every saved figure file.
data_name = "icsd_221_cp5_3_species"
# Folder prefix for the figures written by plt.savefig in the plot cells.
fig_folder = "plots/"

In [9]:
# Load the pickled dataset and keep a single row per compound.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle(f"data/{data_name}.pkl")
df = drop_duplicates(df)
# Bare expression: renders the de-duplicated frame as the cell output.
df

Number of compounds 377


Unnamed: 0,auid,aurl,compound,composition,species,natoms,spacegroup,pearson_symbol,geometry,positions_fractional,positions_cartesian,enthalpy_atom,enthalpy_formation_atom
0,aflow:c2bae074d76f81b6,aflowlib.duke.edu:AFLOWDATA/ICSD_WEB/CUB/Ag1F3...,Ag1F3Zn1,"[1, 3, 1]","[Ag, F, Zn]",5,221,cP5,"[3.979183, 3.979183, 3.979183, 90.0, 90.0, 90.0]","[[0.0, 0.0, 0.0], [0.5, 0.5, 0.0], [0.0, 0.5, ...","[[0.0, 0.0, 0.0], [1.98959, 1.98959, -0.0], [0...",-3.35725,
1,aflow:c568d79d0d0e9fc8,aflowlib.duke.edu:AFLOWDATA/ICSD_WEB/CUB/Ag1Mn...,Ag1Mn3N1,"[1, 3, 1]","[Ag, Mn, N]",5,221,cP5,"[4.265445, 4.265445, 4.265445, 90.0, 90.0, 90.0]","[[0.0, 0.0, 0.0], [0.0, 0.5, 0.5], [0.5, 0.0, ...","[[0.0, 0.0, 0.0], [0.0, 2.13272, 2.13272], [2....",-5.99960,
2,aflow:8944b3eb2c189cb7,aflowlib.duke.edu:AFLOWDATA/ICSD_WEB/CUB/Ag1Nb...,Ag1Nb1O3,"[1, 1, 3]","[Ag, Nb, O]",5,221,cP5,"[4.000268, 4.000268, 4.000268, 90.0, 90.0, 90.0]","[[0.5, 0.5, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, ...","[[2.00013, 2.00013, 2.00013], [0.0, 0.0, 0.0],...",-6.76521,
3,aflow:9da57f587b92d20f,aflowlib.duke.edu:AFLOWDATA/ICSD_WEB/CUB/Ag3I1...,Ag3I1S1,"[3, 1, 1]","[Ag, I, S]",5,221,cP5,"[5.0311, 5.0311, 5.0311, 90.0, 90.0, 90.0]","[[0.5, 0.5, 0.0], [0.0, 0.5, 0.5], [0.5, 0.0, ...","[[2.51555, 2.51555, 0.0], [-0.0, 2.51555, 2.51...",-1.61106,
4,aflow:d43abd3deaeccc51,aflowlib.duke.edu:AFLOWDATA/ICSD_WEB/CUB/Al1Bi...,Al1Bi1O3,"[1, 1, 3]","[Al, Bi, O]",5,221,cP5,"[3.793297, 3.793297, 3.793297, 90.0, 90.0, 90.0]","[[0.5, 0.5, 0.5], [0.0, 0.0, 0.0], [0.0, 0.5, ...","[[1.89665, 1.89665, 1.89665], [0.0, 0.0, 0.0],...",-6.54116,-2.28122
...,...,...,...,...,...,...,...,...,...,...,...,...,...
722,aflow:f2a5b2edf497bb0b,aflowlib.duke.edu:AFLOWDATA/ICSD_WEB/CUB/O3Sn1...,O3Sn1Ti1,"[3, 1, 1]","[O, Sn, Ti]",5,221,cP5,"[3.980776, 3.980776, 3.980776, 90.0, 90.0, 90.0]","[[0.5, 0.5, 0.0], [0.0, 0.5, 0.5], [0.5, 0.0, ...","[[1.99039, 1.99039, -0.0], [0.0, 1.99039, 1.99...",-6.92630,
723,aflow:493eb6e26780d785,aflowlib.duke.edu:AFLOWDATA/ICSD_WEB/CUB/O3Sr1...,O3Sr1Tc1,"[3, 1, 1]","[O, Sr, Tc]",5,221,cP5,"[4.045254, 4.045254, 4.045254, 90.0, 90.0, 90.0]","[[0.5, 0.0, 0.0], [0.0, 0.5, 0.0], [0.0, 0.0, ...","[[2.02263, 0.0, 0.0], [0.0, 2.02263, 0.0], [-0...",-6.91254,
725,aflow:37c7f43d3abe11ce,aflowlib.duke.edu:AFLOWDATA/ICSD_WEB/CUB/O3Sr1...,O3Sr1Ti1,"[3, 1, 1]","[O, Sr, Ti]",5,221,cP5,"[3.97291, 3.97291, 3.97291, 90.0, 90.0, 90.0]","[[0.0, 0.5, 0.5], [0.5, 0.0, 0.5], [0.5, 0.5, ...","[[0.0, 1.98645, 1.98645], [1.98645, 0.0, 1.986...",-7.47382,
751,aflow:6381b955a3963bf3,aflowlib.duke.edu:AFLOWDATA/ICSD_WEB/CUB/O3Sr1...,O3Sr1V1,"[3, 1, 1]","[O, Sr, V]",5,221,cP5,"[3.897829, 3.897829, 3.897829, 90.0, 90.0, 90.0]","[[0.5, 0.0, 0.0], [0.0, 0.5, 0.0], [0.0, 0.0, ...","[[1.94891, 0.0, -0.0], [-0.0, 1.94891, 0.0], [...",-7.35920,


In [10]:
# Distinct element symbols across all compounds; their count is used as the
# number of input channels for the network and the loaders below.
elements = get_all_elements(df)
nchannel = len(elements)
# Bare expression: shows the channel count together with the element list.
nchannel, elements

(76,
 array(['Ag', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C', 'Ca',
        'Cd', 'Ce', 'Cl', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'Er', 'Eu', 'F',
        'Fe', 'Ga', 'Gd', 'Ge', 'H', 'Hf', 'Hg', 'Ho', 'I', 'In', 'Ir',
        'K', 'La', 'Li', 'Lu', 'Mg', 'Mn', 'Mo', 'N', 'Na', 'Nb', 'Nd',
        'Ni', 'O', 'P', 'Pa', 'Pb', 'Pd', 'Pr', 'Pt', 'Pu', 'Rb', 'Rh',
        'Ru', 'S', 'Sb', 'Sc', 'Si', 'Sn', 'Sr', 'Ta', 'Tb', 'Tc', 'Th',
        'Ti', 'Tl', 'Tm', 'U', 'V', 'W', 'Y', 'Yb', 'Zn', 'Zr'],
       dtype='<U2'))

In [11]:
# Build a network with one input channel per element and display its size.
# NOTE(review): count_parameters is not defined in this notebook; it presumably
# comes from one of the star imports (ML_utils) — confirm.
net = VoxelNet(nchannel)
count_parameters(net)

79777

## Parameters

In [12]:
# sigma, L and N are forwarded unchanged to every MolLoader below.
# NOTE(review): presumably sigma is the smearing width, L the box edge length
# and N the number of grid points per axis — confirm against molloader.
sigma = 0.1
L = 12.8
N = 32
# Number of training epochs and the weight decay passed to fit().
epochs = 10
weight_decay = 0.01

# Device the model is moved to and that is handed to the loaders.
device = get_default_device()

##  Cartesian

In [None]:
# Seed, then split 80 % / 10 % / 10 % into train, validation and test frames.
seed_everything()
train_df, holdout_df = train_test_split(df, test_size=0.2)
val_df, test_df = train_test_split(holdout_df, test_size=0.5)
print(train_df.shape, val_df.shape, test_df.shape)

# Seed again right before the model is created.
seed_everything()

net = VoxelNet(nchannel).to(device)

# Settings shared by all three loaders of this section.
loader_kwargs = dict(sigma=sigma, elements=elements, L=L, N=N,
                     nchannel=nchannel, device=device, mode='cartesian')
# Random rotations and reflections are switched on for every split.
augment_kwargs = dict(rotate_randomly=True, reflect_randomly=True)

ml_train = MolLoader(train_df, batch_size=64, shuffle=True,
                     **augment_kwargs, **loader_kwargs)
ml_val = MolLoader(val_df, batch_size=128, shuffle=False,
                   **augment_kwargs, **loader_kwargs)
ml_test = MolLoader(test_df, batch_size=128, shuffle=False,
                    **augment_kwargs, **loader_kwargs)

best_net = fit(epochs, net, ml_train, ml_val, ml_test, torch.optim.Adam,
               lr=0.001, weight_decay=weight_decay, val_epochs=5)

### Train Plots

In [None]:
# Predict on the training set with random rotation/reflection switched off.
ml_pred = ml_train  # same object: reset() below reconfigures ml_train itself
ml_pred.reset(batch_size=128, shuffle=False,
              rotate_randomly=False, reflect_randomly=False)
ys, y_hats = predict(best_net, ml_pred)
plot_predictions(ys, y_hats)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

fig_path = fig_folder + data_name + "_cartesian_train_orig.pdf"
plt.savefig(fig_path)

In [None]:
# transformations: repeated prediction passes with random rotation/reflection on
seed_everything()
ml_pred = ml_train  # same object: reset() below reconfigures ml_train itself
ml_pred.reset(batch_size=128, shuffle=False,
              rotate_randomly=True, reflect_randomly=True)

n_passes = 5
ys, y_hats = predict_epochs(best_net, ml_pred, epochs=n_passes)

# One row per pass, variance taken along axis 0.
# NOTE(review): assumes predict_epochs returns the passes back to back — confirm.
per_pass = y_hats.reshape(n_passes, -1)
print("Mean Variance: ", np.mean(np.var(per_pass, axis=0)))

plot_predictions(ys, y_hats, alpha=0.01)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

fig_path = fig_folder + data_name + "_cartesian_train.pdf"
plt.savefig(fig_path)

### Test Plots

In [None]:
# Predict on the test set with random rotation/reflection switched off.
ml_pred = ml_test  # same object: reset() below reconfigures ml_test itself
ml_pred.reset(batch_size=128, shuffle=False,
              rotate_randomly=False, reflect_randomly=False)
ys, y_hats = predict(best_net, ml_pred)
plot_predictions(ys, y_hats)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

fig_path = fig_folder + data_name + "_cartesian_test_orig.pdf"
plt.savefig(fig_path)

In [None]:
# transformations: repeated prediction passes with random rotation/reflection on
seed_everything()
ml_pred = ml_test  # same object: reset() below reconfigures ml_test itself
ml_pred.reset(batch_size=128, shuffle=False,
              rotate_randomly=True, reflect_randomly=True)

n_passes = 5
ys, y_hats = predict_epochs(best_net, ml_pred, epochs=n_passes)

# One row per pass, variance taken along axis 0.
# NOTE(review): assumes predict_epochs returns the passes back to back — confirm.
per_pass = y_hats.reshape(n_passes, -1)
print("Mean Variance: ", np.mean(np.var(per_pass, axis=0)))

plot_predictions(ys, y_hats)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

fig_path = fig_folder + data_name + "_cartesian_test.pdf"
plt.savefig(fig_path)

## Spherical

In [None]:
# Seed, then split 80 % / 10 % / 10 % into train, validation and test frames.
seed_everything()
train_df, holdout_df = train_test_split(df, test_size=0.2)
val_df, test_df = train_test_split(holdout_df, test_size=0.5)
print(train_df.shape, val_df.shape, test_df.shape)

# Seed again right before the model is created.
seed_everything()

net = VoxelNet(nchannel).to(device)

# Settings shared by all three loaders of this section.
loader_kwargs = dict(sigma=sigma, elements=elements, L=L, N=N,
                     nchannel=nchannel, device=device, mode='spherical')
# Random rotations and reflections are switched on for every split.
augment_kwargs = dict(rotate_randomly=True, reflect_randomly=True)

ml_train = MolLoader(train_df, batch_size=64, shuffle=True,
                     **augment_kwargs, **loader_kwargs)
ml_val = MolLoader(val_df, batch_size=128, shuffle=False,
                   **augment_kwargs, **loader_kwargs)
ml_test = MolLoader(test_df, batch_size=128, shuffle=False,
                    **augment_kwargs, **loader_kwargs)

best_net = fit(epochs, net, ml_train, ml_val, ml_test, torch.optim.Adam,
               lr=0.001, weight_decay=weight_decay, val_epochs=5)

### Train Plots

In [None]:
# Predict on the training set with random rotation/reflection switched off.
ml_pred = ml_train  # same object: reset() below reconfigures ml_train itself
ml_pred.reset(batch_size=128, shuffle=False,
              rotate_randomly=False, reflect_randomly=False)
ys, y_hats = predict(best_net, ml_pred)
plot_predictions(ys, y_hats)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

fig_path = fig_folder + data_name + "_spherical_train_orig.pdf"
plt.savefig(fig_path)

In [None]:
# transformations: repeated prediction passes with random rotation/reflection on
seed_everything()
ml_pred = ml_train  # same object: reset() below reconfigures ml_train itself
ml_pred.reset(batch_size=128, shuffle=False,
              rotate_randomly=True, reflect_randomly=True)

n_passes = 5
ys, y_hats = predict_epochs(best_net, ml_pred, epochs=n_passes)

# One row per pass, variance taken along axis 0.
# NOTE(review): assumes predict_epochs returns the passes back to back — confirm.
per_pass = y_hats.reshape(n_passes, -1)
print("Mean Variance: ", np.mean(np.var(per_pass, axis=0)))

plot_predictions(ys, y_hats, alpha=0.01)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

fig_path = fig_folder + data_name + "_spherical_train.pdf"
plt.savefig(fig_path)

### Test Plots

In [None]:
# Predict on the test set with random rotation/reflection switched off.
ml_pred = ml_test  # same object: reset() below reconfigures ml_test itself
ml_pred.reset(batch_size=128, shuffle=False,
              rotate_randomly=False, reflect_randomly=False)
ys, y_hats = predict(best_net, ml_pred)
plot_predictions(ys, y_hats)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

fig_path = fig_folder + data_name + "_spherical_test_orig.pdf"
plt.savefig(fig_path)

In [None]:
# transformations: repeated prediction passes with random rotation/reflection on
seed_everything()
ml_pred = ml_test  # same object: reset() below reconfigures ml_test itself
ml_pred.reset(batch_size=128, shuffle=False,
              rotate_randomly=True, reflect_randomly=True)

n_passes = 5
ys, y_hats = predict_epochs(best_net, ml_pred, epochs=n_passes)

# One row per pass, variance taken along axis 0.
# NOTE(review): assumes predict_epochs returns the passes back to back — confirm.
per_pass = y_hats.reshape(n_passes, -1)
print("Mean Variance: ", np.mean(np.var(per_pass, axis=0)))

plot_predictions(ys, y_hats)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

fig_path = fig_folder + data_name + "_spherical_test.pdf"
plt.savefig(fig_path)