In [None]:
import os

import numpy as np
import pandas as pd

# NOTE: the star imports stay ahead of the torch/sklearn imports on purpose, so
# the explicit imports below keep winning over any same-named star exports.
from molloader import *
from ML_utils import *

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split, KFold, ParameterGrid

In [None]:
def drop_duplicates(df):
    """Keep only the first row for every distinct value of the "compound" column.

    Prints the number of remaining rows and returns the filtered frame; the
    input frame is not modified.
    """
    is_repeat = df.duplicated(subset="compound", keep="first")
    unique_df = df[~is_repeat]
    print("Number of compounds", len(unique_df))
    return unique_df

In [None]:
def drop_low_frequency_elements(df, limit):
    """Remove every compound that contains a rarely occurring element.

    Element occurrences are counted over all entries of ``df.species``. Any
    element seen fewer than ``limit`` times is considered low-frequency, and
    every row whose species list contains at least one such element is dropped.

    Args:
        df: DataFrame with a ``species`` column holding one list-like of
            element symbols per row.
        limit: Minimum number of occurrences an element needs to be kept.

    Returns:
        The filtered DataFrame (the input frame is not modified). The number of
        remaining rows is printed.
    """
    if df.empty:
        # Nothing to count; avoids errors from flattening an empty column.
        print("Number of compounds", df.shape[0])
        return df

    # Flatten all species lists into one Series and count each element.
    elem_counts = df.species.explode().value_counts()
    low_freq = set(elem_counts[elem_counts < limit].index)

    # Direct membership test. The previous length comparison on a
    # de-duplicated set difference also flagged rows whose species list merely
    # repeated an element.
    low_freq_mask = df.species.apply(lambda s: not low_freq.isdisjoint(s)).astype(bool)
    df = df[~low_freq_mask]
    print("Number of compounds", df.shape[0])
    return df

In [None]:
def get_all_elements(df):
    """Return the sorted, de-duplicated array of all entries in ``df.species``."""
    per_compound = np.array(df.species)
    flattened = np.hstack(per_compound)
    return np.unique(flattened)

In [None]:
class VoxelNet(nn.Module):
    """3D convolutional network that regresses one scalar from a voxel grid.

    Three unpadded 3x3x3 convolution stages with 16 output channels and SELU
    activations (the first two followed by 2x max-pooling) feed a small fully
    connected head that ends in a single output.

    Args:
        in_channels: Number of channels C of the (N, C, D, H, W) input.
        grid_size: Edge length of the cubic input grid (D = H = W). The default
            of 32 gives 1024 flattened features, matching the previously
            hard-coded size of the first linear layer.

    Raises:
        ValueError: If ``grid_size`` is too small to survive the three
            convolution/pooling stages.
    """

    def __init__(self, in_channels=1, grid_size=32):
        super().__init__()

        # Spatial edge length after each stage: a 3-wide convolution without
        # padding removes 2 voxels, MaxPool3d(2) halves (rounding down).
        edge = (grid_size - 2) // 2   # conv1 + pool
        edge = (edge - 2) // 2        # conv2 + pool
        edge = edge - 2               # conv3
        if edge < 1:
            raise ValueError(
                f"grid_size={grid_size} is too small for VoxelNet (minimum is 18)")
        flat_features = 16 * edge ** 3

        # (N, C, D, H, W)
        self.conv1 = nn.Sequential(
            nn.Conv3d(in_channels, 16, 3, padding=0, stride=1),
            # SELU takes no slope argument; the former SELU(0.3) only set
            # inplace=True by accident.
            nn.SELU(),
            nn.MaxPool3d(2))

        self.conv2 = nn.Sequential(
            nn.Conv3d(16, 16, 3, padding=0, stride=1),
            nn.SELU(),
            nn.MaxPool3d(2))

        self.conv3 = nn.Sequential(
            nn.Conv3d(16, 16, 3, padding=0, stride=1),
            nn.SELU())

        self.regressor = nn.Sequential(nn.Flatten(),
                                        nn.Linear(flat_features, 32),
                                        nn.SELU(),
                                        nn.Linear(32, 8),
                                        nn.SELU(),
                                        nn.Linear(8, 1))

    def forward(self, x):
        """Map a (N, C, D, H, W) batch to a (N, 1) tensor of predictions."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.regressor(x)
        return x

In [None]:
# Prefix used in the file names of all saved figures.
data_name = "all_oxides"
# Folder the figures are written to (kept as a string because later cells
# build paths by string concatenation).
fig_folder = "plots/"

# Create the output folder up front so that plt.savefig(...) in the plotting
# cells cannot fail with FileNotFoundError after a long training run.
os.makedirs(fig_folder, exist_ok=True)

In [None]:
# NOTE: unpickling can execute arbitrary code; only load trusted, locally
# generated files here.
df1 = pd.read_pickle("data/all_3_species_oxides.pkl")
df2 = pd.read_pickle("data/all_sup4_species_oxides.pkl")
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the rows and keeps each frame's original index, as append did.
df = pd.concat([df1, df2])
df = drop_duplicates(df)
# Drop every compound containing an element that occurs fewer than 50 times.
df = drop_low_frequency_elements(df, 50)
df

In [None]:
# Distinct entries of df.species; the network gets one input channel per entry
# (nchannel is passed to VoxelNet and MolLoader below).
elements = get_all_elements(df)
nchannel = len(elements)
# Show the channel count and the element list as the cell output.
nchannel, elements

In [None]:
# Instance built only to inspect the model; each training cell below creates
# its own fresh VoxelNet.
net = VoxelNet(nchannel)
# count_parameters comes from the star imports; presumably it reports the
# number of parameters of the model — confirm.
count_parameters(net)

## Parameters

In [None]:
# sigma, L and N are forwarded unchanged to every MolLoader below.
# NOTE(review): presumably sigma is the width used to smear atoms onto the
# grid, L the box edge length and N the number of voxels per edge — confirm
# against MolLoader. VoxelNet's default layer sizes fit a 32x32x32 grid.
sigma = 0.1
L = 12.8
N = 32
# Passed to fit(...) as the number of epochs and the optimizer weight decay.
epochs = 50
weight_decay = 0.01

# Device handed to the network (net.to) and to every MolLoader.
device = get_default_device()

## Cartesian

In [None]:
# Seed before splitting: train_test_split is called without random_state, so
# reproducibility of the split presumably relies on seed_everything() seeding
# the global RNGs — confirm.
seed_everything()
# 80 % train, then the 20 % hold-out is halved into validation and test.
train_df, holdout_df = train_test_split(df, test_size=0.2)
val_df, test_df = train_test_split(holdout_df, test_size=0.5)
print(train_df.shape, val_df.shape, test_df.shape)

# Re-seed so model construction and training start from the same RNG state
# regardless of what the split consumed.
seed_everything()

net = VoxelNet(nchannel)
net = net.to(device)

# Training loader: shuffled, batch size 64, random rotations and reflections on.
ml_train = MolLoader(train_df, sigma=sigma, elements=elements,
                     L=L, N=N, batch_size=64, nchannel=nchannel,
                     shuffle=True, rotate_randomly=True, reflect_randomly=True,
                     device=device, mode='cartesian')

# Validation loader: unshuffled, batch size 128; note that random rotations and
# reflections are still enabled here.
ml_val = MolLoader(val_df, sigma=sigma, elements=elements,
                    L=L, N=N, batch_size=128, nchannel=nchannel,
                    shuffle=False, rotate_randomly=True, reflect_randomly=True,
                    device=device, mode='cartesian')

# Test loader: same configuration as the validation loader.
ml_test = MolLoader(test_df, sigma=sigma, elements=elements,
                    L=L, N=N, batch_size=128, nchannel=nchannel,
                    shuffle=False, rotate_randomly=True, reflect_randomly=True,
                    device=device, mode='cartesian')

# NOTE(review): fit comes from the star imports; presumably it returns the best
# model found during training and val_epochs is the validation interval — confirm.
best_net = fit(epochs, net, ml_train, ml_val, ml_test, torch.optim.Adam, lr=0.001, weight_decay=weight_decay, val_epochs=5)

### Train Plots

In [None]:
# Predict on the training data with shuffling, random rotation and random
# reflection switched off. ml_pred is only another name for ml_train, so
# reset(...) acts on that same loader object.
ml_pred = ml_train
ml_pred.reset(
    batch_size=128,
    shuffle=False,
    rotate_randomly=False,
    reflect_randomly=False,
)
ys, y_hats = predict(best_net, ml_pred)
plot_predictions(ys, y_hats)

# Compute the residuals once and derive both error metrics from them.
residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

plt.savefig(fig_folder + data_name + "_cartesian_train_orig.pdf")

In [None]:
# transformations
# Predict on the training data several times with random rotations and
# reflections enabled, then report how much the predictions vary.
seed_everything()
ml_pred = ml_train
ml_pred.reset(
    batch_size=128,
    shuffle=False,
    rotate_randomly=True,
    reflect_randomly=True,
)

# Number of prediction passes; the reshape below must use the same value.
n_passes = 5
ys, y_hats = predict_epochs(best_net, ml_pred, epochs=n_passes)

# Assumes y_hats holds the passes one after another in identical sample order,
# so each column of the reshaped array is one sample — confirm against predict_epochs.
variance_over_passes = np.var(y_hats.reshape(n_passes, -1), axis=0)
print("Mean Variance: ", np.mean(variance_over_passes))

plot_predictions(ys, y_hats, alpha=0.01)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

plt.savefig(fig_folder + data_name + "_cartesian_train.pdf")

### Test Plots

In [None]:
# Predict on the test data with shuffling, random rotation and random
# reflection switched off. ml_pred is only another name for ml_test, so
# reset(...) acts on that same loader object.
ml_pred = ml_test
ml_pred.reset(
    batch_size=128,
    shuffle=False,
    rotate_randomly=False,
    reflect_randomly=False,
)
ys, y_hats = predict(best_net, ml_pred)
plot_predictions(ys, y_hats)

# Compute the residuals once and derive both error metrics from them.
residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

plt.savefig(fig_folder + data_name + "_cartesian_test_orig.pdf")

In [None]:
# transformations
# Predict on the test data several times with random rotations and
# reflections enabled, then report how much the predictions vary.
seed_everything()
ml_pred = ml_test
ml_pred.reset(
    batch_size=128,
    shuffle=False,
    rotate_randomly=True,
    reflect_randomly=True,
)

# Number of prediction passes; the reshape below must use the same value.
n_passes = 5
ys, y_hats = predict_epochs(best_net, ml_pred, epochs=n_passes)

# Assumes y_hats holds the passes one after another in identical sample order,
# so each column of the reshaped array is one sample — confirm against predict_epochs.
variance_over_passes = np.var(y_hats.reshape(n_passes, -1), axis=0)
print("Mean Variance: ", np.mean(variance_over_passes))

plot_predictions(ys, y_hats)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

plt.savefig(fig_folder + data_name + "_cartesian_test.pdf")

## Spherical

In [None]:
# Same procedure as the Cartesian run, with the loaders in 'spherical' mode.
# Seed before splitting: train_test_split is called without random_state, so
# reproducibility of the split presumably relies on seed_everything() seeding
# the global RNGs — confirm.
seed_everything()
# 80 % train, then the 20 % hold-out is halved into validation and test.
train_df, holdout_df = train_test_split(df, test_size=0.2)
val_df, test_df = train_test_split(holdout_df, test_size=0.5)
print(train_df.shape, val_df.shape, test_df.shape)

# Re-seed so model construction and training start from the same RNG state
# regardless of what the split consumed.
seed_everything()

# Fresh network; this rebinds `net` (and, below, the loaders and best_net)
# that the Cartesian section created.
net = VoxelNet(nchannel)
net = net.to(device)

# Training loader: shuffled, batch size 64, random rotations and reflections on.
ml_train = MolLoader(train_df, sigma=sigma, elements=elements,
                     L=L, N=N, batch_size=64, nchannel=nchannel,
                     shuffle=True, rotate_randomly=True, reflect_randomly=True,
                     device=device, mode='spherical')

# Validation loader: unshuffled, batch size 128; note that random rotations and
# reflections are still enabled here.
ml_val = MolLoader(val_df, sigma=sigma, elements=elements,
                    L=L, N=N, batch_size=128, nchannel=nchannel,
                    shuffle=False, rotate_randomly=True, reflect_randomly=True,
                    device=device, mode='spherical')

# Test loader: same configuration as the validation loader.
ml_test = MolLoader(test_df, sigma=sigma, elements=elements,
                    L=L, N=N, batch_size=128, nchannel=nchannel,
                    shuffle=False, rotate_randomly=True, reflect_randomly=True,
                    device=device, mode='spherical')

# NOTE(review): fit comes from the star imports; presumably it returns the best
# model found during training and val_epochs is the validation interval — confirm.
best_net = fit(epochs, net, ml_train, ml_val, ml_test, torch.optim.Adam, lr=0.001, weight_decay=weight_decay, val_epochs=5)

### Train Plots

In [None]:
# Predict on the training data with shuffling, random rotation and random
# reflection switched off. ml_pred is only another name for ml_train, so
# reset(...) acts on that same loader object.
ml_pred = ml_train
ml_pred.reset(
    batch_size=128,
    shuffle=False,
    rotate_randomly=False,
    reflect_randomly=False,
)
ys, y_hats = predict(best_net, ml_pred)
plot_predictions(ys, y_hats)

# Compute the residuals once and derive both error metrics from them.
residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

plt.savefig(fig_folder + data_name + "_spherical_train_orig.pdf")

In [None]:
# transformations
# Predict on the training data several times with random rotations and
# reflections enabled, then report how much the predictions vary.
seed_everything()
ml_pred = ml_train
ml_pred.reset(
    batch_size=128,
    shuffle=False,
    rotate_randomly=True,
    reflect_randomly=True,
)

# Number of prediction passes; the reshape below must use the same value.
n_passes = 5
ys, y_hats = predict_epochs(best_net, ml_pred, epochs=n_passes)

# Assumes y_hats holds the passes one after another in identical sample order,
# so each column of the reshaped array is one sample — confirm against predict_epochs.
variance_over_passes = np.var(y_hats.reshape(n_passes, -1), axis=0)
print("Mean Variance: ", np.mean(variance_over_passes))

plot_predictions(ys, y_hats, alpha=0.01)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

plt.savefig(fig_folder + data_name + "_spherical_train.pdf")

### Test Plots

In [None]:
# Predict on the test data with shuffling, random rotation and random
# reflection switched off. ml_pred is only another name for ml_test, so
# reset(...) acts on that same loader object.
ml_pred = ml_test
ml_pred.reset(
    batch_size=128,
    shuffle=False,
    rotate_randomly=False,
    reflect_randomly=False,
)
ys, y_hats = predict(best_net, ml_pred)
plot_predictions(ys, y_hats)

# Compute the residuals once and derive both error metrics from them.
residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

plt.savefig(fig_folder + data_name + "_spherical_test_orig.pdf")

In [None]:
# transformations
# Predict on the test data several times with random rotations and
# reflections enabled, then report how much the predictions vary.
seed_everything()
ml_pred = ml_test
ml_pred.reset(
    batch_size=128,
    shuffle=False,
    rotate_randomly=True,
    reflect_randomly=True,
)

# Number of prediction passes; the reshape below must use the same value.
n_passes = 5
ys, y_hats = predict_epochs(best_net, ml_pred, epochs=n_passes)

# Assumes y_hats holds the passes one after another in identical sample order,
# so each column of the reshaped array is one sample — confirm against predict_epochs.
variance_over_passes = np.var(y_hats.reshape(n_passes, -1), axis=0)
print("Mean Variance: ", np.mean(variance_over_passes))

plot_predictions(ys, y_hats)

residuals = ys - y_hats
print("MSE", np.mean(residuals**2))
print("MAE", np.mean(np.abs(residuals)))

plt.savefig(fig_folder + data_name + "_spherical_test.pdf")