In [None]:
import torch
import torch_geometric

import pandas as pd
import numpy as np
import rdkit
import rdkit.Chem.AllChem
from tqdm import tqdm
import pickle
from copy import deepcopy
import random
import re
import os

from models.dimenetpp import *
from datasets.dataset_3D import *

from utils import generate_conformer_dataframe, make_predictions, load_model

from multiprocessing import Pool
from functools import partial

In [None]:
# Device used for inference. The code has been designed to run on CPUs (not GPUs). If you need to
# use GPUs to screen very large libraries efficiently, please contact the authors.
device = torch.device("cpu")

# Number of worker processes handed to conformer generation. Set as high as possible (given
# machine limits) to accelerate it. os.cpu_count() returns None when the count cannot be
# determined, so fall back to a single worker instead of propagating None.
N_cpus = os.cpu_count() or 1


In [None]:
# supported descriptors, listed per molecule type
# NOTE: 'pyr_gavrish' is listed for amines but has no checkpoint entry in model_dictionary.
properties = dict(
    acid=[
        'IR_freq',
        'Sterimol_B1',
        'Sterimol_B5',
        'Sterimol_L',
        'dipole',
        'C1_Vbur',
        'C1_NBO_charge',  # only boltz
        'O3_NBO_charge',
        'H5_NBO_charge',
        'H5_NMR_shift',
    ],
    amine=[
        'NBO_LP_energy',
        'NBO_LP_occupancy',
        'NBO_charge_H_avg',
        'NBO_charge_H_min',
        'NMR_shift_H_avg',
        'Vbur',
        'pyr_agranat',
        'pyr_gavrish',
        'Sterimol_B1',
        'Sterimol_B5',
        'Sterimol_L',
        'dipole',
    ],
    sec_amine=[
        'NBO_LP_energy',
        'NBO_LP_occupancy',
        'NBO_charge_H',
        'NMR_shift_H',
        'Vbur',
        'pyr_agranat',
        'dipole',
    ],
)

# Conformer-ensemble aggregations. These spellings match the checkpoint sub-directory names;
# the keys of model_dictionary label the same four aggregations 'Boltz', 'max', 'min', 'lowE'.
property_aggregations = ['boltz', 'max', 'min', 'min_E']

In [None]:
# supported models
# Every checkpoint path has the form
#     trained_models/<folder>/<descriptor>/<aggregation dir>/model_best.pt
# and is keyed by (molecule type, descriptor level, descriptor, aggregation label).

# aggregation label used in the keys -> aggregation sub-directory used in the path
_AGGREGATION_DIRS = {'Boltz': 'boltz', 'max': 'max', 'min': 'min', 'lowE': 'min_E'}
_ALL_AGGREGATIONS = tuple(_AGGREGATION_DIRS)

# (molecule type, descriptor level, descriptor, checkpoint folder, aggregation labels)
# Rows are kept in the order in which the entries appear in model_dictionary.
_MODEL_SPECS = [
    ('acid', 'atom', 'C1_NBO_charge', 'acids', ('Boltz',)),  # only a Boltz model is listed
    ('acid', 'atom', 'C1_Vbur', 'acids', _ALL_AGGREGATIONS),
    ('acid', 'atom', 'O3_NBO_charge', 'acids', _ALL_AGGREGATIONS),
    ('acid', 'atom', 'H5_NBO_charge', 'acids', _ALL_AGGREGATIONS),
    ('acid', 'atom', 'H5_NMR_shift', 'acids', _ALL_AGGREGATIONS),
    ('acid', 'bond', 'IR_freq', 'acids', _ALL_AGGREGATIONS),
    ('acid', 'bond', 'Sterimol_B1', 'acids', _ALL_AGGREGATIONS),
    ('acid', 'bond', 'Sterimol_B5', 'acids', _ALL_AGGREGATIONS),
    ('acid', 'bond', 'Sterimol_L', 'acids', _ALL_AGGREGATIONS),
    ('acid', 'mol', 'dipole', 'acids', _ALL_AGGREGATIONS),

    ('amine', 'atom', 'NBO_LP_energy', 'combined_amines', _ALL_AGGREGATIONS),
    ('amine', 'atom', 'NBO_LP_occupancy', 'combined_amines', _ALL_AGGREGATIONS),
    ('amine', 'atom', 'NBO_charge_H_avg', 'amines', _ALL_AGGREGATIONS),
    ('amine', 'atom', 'NBO_charge_H_min', 'amines', _ALL_AGGREGATIONS),
    ('amine', 'atom', 'NMR_shift_H_avg', 'amines', _ALL_AGGREGATIONS),
    ('amine', 'atom', 'pyr_agranat', 'combined_amines', _ALL_AGGREGATIONS),
    ('amine', 'atom', 'Vbur', 'combined_amines', _ALL_AGGREGATIONS),
    ('amine', 'bond', 'Sterimol_B1', 'amines', _ALL_AGGREGATIONS),
    ('amine', 'bond', 'Sterimol_B5', 'amines', _ALL_AGGREGATIONS),
    ('amine', 'bond', 'Sterimol_L', 'amines', _ALL_AGGREGATIONS),
    ('amine', 'mol', 'dipole', 'combined_amines', _ALL_AGGREGATIONS),

    ('sec_amine', 'atom', 'NBO_LP_energy', 'combined_amines', _ALL_AGGREGATIONS),
    ('sec_amine', 'atom', 'NBO_LP_occupancy', 'combined_amines', _ALL_AGGREGATIONS),
    ('sec_amine', 'atom', 'NBO_charge_H', 'sec_amines', _ALL_AGGREGATIONS),
    ('sec_amine', 'atom', 'NMR_shift_H', 'sec_amines', _ALL_AGGREGATIONS),
    ('sec_amine', 'atom', 'Vbur', 'combined_amines', _ALL_AGGREGATIONS),
    ('sec_amine', 'atom', 'pyr_agranat', 'combined_amines', _ALL_AGGREGATIONS),
    ('sec_amine', 'mol', 'dipole', 'combined_amines', _ALL_AGGREGATIONS),
]

model_dictionary = {
    (mol_type, level, descriptor, aggregation):
        f'trained_models/{folder}/{descriptor}/{_AGGREGATION_DIRS[aggregation]}/model_best.pt'
    for mol_type, level, descriptor, folder, aggregations in _MODEL_SPECS
    for aggregation in aggregations
}


In [None]:
# Atom label (atom-level descriptors) or pair of atom labels (bond-level descriptors) associated
# with each (molecule type, descriptor level, descriptor) combination.
# Rows: (molecule type, descriptor level, atom selection, descriptors sharing that selection),
# kept in the order in which the entries appear in atom_selection_dictionary.
_ATOM_SELECTION_GROUPS = [
    ('acid', 'atom', 'C1', ['C1_NBO_charge', 'C1_Vbur']),
    ('acid', 'atom', 'O3', ['O3_NBO_charge']),
    ('acid', 'atom', 'H5', ['H5_NBO_charge', 'H5_NMR_shift']),
    ('acid', 'bond', ('C1', 'O2'), ['IR_freq']),
    ('acid', 'bond', ('C1', 'C4'), ['Sterimol_B1', 'Sterimol_B5', 'Sterimol_L']),

    ('amine', 'atom', 'N1', ['NBO_LP_energy', 'NBO_LP_occupancy']),
    ('amine', 'atom', 'H3', ['NBO_charge_H_avg', 'NBO_charge_H_min', 'NMR_shift_H_avg']),
    ('amine', 'atom', 'N1', ['pyr_gavrish', 'pyr_agranat', 'Vbur']),
    ('amine', 'bond', ('N1', 'C2'), ['Sterimol_B1', 'Sterimol_B5', 'Sterimol_L']),

    ('sec_amine', 'atom', 'N1', ['NBO_LP_energy', 'NBO_LP_occupancy', 'pyr_agranat', 'Vbur']),
    ('sec_amine', 'atom', 'H4', ['NBO_charge_H', 'NMR_shift_H']),
]

atom_selection_dictionary = {
    (mol_type, level, descriptor): selection
    for mol_type, level, selection, descriptors in _ATOM_SELECTION_GROUPS
    for descriptor in descriptors
}

In [None]:
# Load every checkpoint listed in model_dictionary once, keyed by its model-selection tuple,
# so that the prediction cells can look the models up instead of reloading them.
preloaded_models = dict()
for model_selection in tqdm(model_dictionary.keys()):
    # this can take a few seconds
    print(f'loading model: {model_selection}')
    preloaded_models[model_selection] = load_model(*model_selection, model_dictionary)

In [None]:
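# Enter your SMILES here! The cells below read them from the 'Canonical_SMILES' column of the external validation CSV files; point those cells at your own data to screen other molecules.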

In [None]:
molecule_type = 'acid'

# decrease in order to increase speed of conformer generation, at the cost of greater uncertainty in the prediction results
N_confs = 20

# table of acids to screen; the SMILES are taken from its Canonical_SMILES column
# (the list must include all amines or all acids)
acids_validation = pd.read_csv(
    'external_validation_data/AmideCoupling_External_Validation_Sets_acids.csv',
    encoding='unicode_escape',
)

# Canonicalize every SMILES with RDKit. MolFromSmiles returns None (it does not raise) for a
# string it cannot parse, and passing that None on to MolToSmiles fails with an uninformative
# error, so report the offending entries explicitly instead.
acid_mols = [rdkit.Chem.MolFromSmiles(s) for s in acids_validation.Canonical_SMILES]
unparsable_smiles = [s for s, mol in zip(acids_validation.Canonical_SMILES, acid_mols) if mol is None]
if unparsable_smiles:
    raise ValueError(
        f'RDKit could not parse {len(unparsable_smiles)} SMILES in the acid validation set: {unparsable_smiles}'
    )
smiles_list = [rdkit.Chem.MolToSmiles(mol) for mol in acid_mols]

# generate rdkit conformer ensembles
# This can be a bottleneck.
acid_dataframe = generate_conformer_dataframe(smiles_list, molecule_type, N_cpus = N_cpus, N_confs = N_confs)

In [None]:
molecule_type = 'amine'

# decrease in order to increase speed of conformer generation, at the cost of greater uncertainty in the prediction results
N_confs = 20

# table of primary amines to screen; the SMILES are taken from its Canonical_SMILES column
# (the list must include all amines or all acids)
amine_validation = pd.read_csv(
    'external_validation_data/AmideCoupling_External_Validation_Sets_primaryamines.csv',
    encoding='unicode_escape',
)

# Canonicalize every SMILES with RDKit. MolFromSmiles returns None (it does not raise) for a
# string it cannot parse, and passing that None on to MolToSmiles fails with an uninformative
# error, so report the offending entries explicitly instead.
amine_mols = [rdkit.Chem.MolFromSmiles(s) for s in amine_validation.Canonical_SMILES]
unparsable_smiles = [s for s, mol in zip(amine_validation.Canonical_SMILES, amine_mols) if mol is None]
if unparsable_smiles:
    raise ValueError(
        f'RDKit could not parse {len(unparsable_smiles)} SMILES in the primary amine validation set: {unparsable_smiles}'
    )
smiles_list = [rdkit.Chem.MolToSmiles(mol) for mol in amine_mols]

# generate rdkit conformer ensembles
# This can be a bottleneck.
amine_dataframe = generate_conformer_dataframe(smiles_list, molecule_type, N_cpus = N_cpus, N_confs = N_confs)

In [None]:
molecule_type = 'sec_amine'

# decrease in order to increase speed of conformer generation, at the cost of greater uncertainty in the prediction results
N_confs = 20

# table of secondary amines to screen; the SMILES are taken from its Canonical_SMILES column
# (the list must include all amines or all acids)
sec_amine_validation = pd.read_csv(
    'external_validation_data/AmideCoupling_External_Validation_Sets_secamines.csv',
    encoding='unicode_escape',
)

# Canonicalize every SMILES with RDKit. MolFromSmiles returns None (it does not raise) for a
# string it cannot parse, and passing that None on to MolToSmiles fails with an uninformative
# error, so report the offending entries explicitly instead.
sec_amine_mols = [rdkit.Chem.MolFromSmiles(s) for s in sec_amine_validation.Canonical_SMILES]
unparsable_smiles = [s for s, mol in zip(sec_amine_validation.Canonical_SMILES, sec_amine_mols) if mol is None]
if unparsable_smiles:
    raise ValueError(
        f'RDKit could not parse {len(unparsable_smiles)} SMILES in the secondary amine validation set: {unparsable_smiles}'
    )
smiles_list = [rdkit.Chem.MolToSmiles(mol) for mol in sec_amine_mols]

# generate rdkit conformer ensembles
# This can be a bottleneck.
sec_amine_dataframe = generate_conformer_dataframe(smiles_list, molecule_type, N_cpus = N_cpus, N_confs = N_confs)

In [None]:
# Preview the first 10 rows of the acid conformer dataframe.
acid_dataframe.head(10)

In [None]:
# Preview the first 10 rows of the primary amine conformer dataframe.
amine_dataframe.head(10)

In [None]:
# Preview the first 10 rows of the secondary amine conformer dataframe.
sec_amine_dataframe.head(10)

In [None]:
# ACIDS

molecule_type = 'acid'

# SELECT ALL MODELED PROPERTIES
acid_model_selections = [key for key in model_dictionary if key[0] == molecule_type]


# loading models and making predictions for each descriptor in model_selections

acid_results_dict = {}
model_selections = acid_model_selections
test_dataframe = acid_dataframe

for model_selection in tqdm(model_selections):

    # reuse the preloaded model when available, otherwise load it on demand
    if model_selection not in preloaded_models:
        print(f'loading model: {model_selection}') # this can take a few seconds
        model = load_model(*model_selection, model_dictionary)
    else:
        model = preloaded_models[model_selection]

    # Both flags are switched on only for the descriptors selected at the H5 atom
    # (see atom_selection_dictionary); presumably the functional hydrogen has to stay
    # explicit for those -- confirm against make_predictions.
    keep_explicit_hydrogens = (model_selection[2] in ['H5_NBO_charge', 'H5_NMR_shift'])
    remove_Hs_except_functional = keep_explicit_hydrogens

    print('    making predictions:')
    predictions = make_predictions(
        test_dataframe,
        model,
        model_selection,
        atom_selection_dictionary,
        keep_explicit_hydrogens = keep_explicit_hydrogens,
        remove_Hs_except_functional = remove_Hs_except_functional,
        num_workers = 0,
        device = device, # configured once in the setup cell instead of being hard-coded here
    )

    # saving results
    acid_results_dict[tuple(model_selection)] = predictions


# per-conformer predictions (one column per model selection, plus mol_index)
acid_results_df = pd.DataFrame(acid_results_dict)
acid_results_df[['mol_index']] = test_dataframe[['mol_index']]

# averaging per-conformer predictions to get final predictions per molecule
# The built-in groupby mean replaces .apply(lambda x: x.mean()), which relied on the grouping
# column being passed to the applied function; newer pandas releases no longer do that, which
# would drop mol_index from the result. reset_index() restores mol_index as a regular column
# (it now comes first and keeps its original dtype).
acid_results_df_mean = (
    acid_results_df
    .groupby('mol_index', sort = False)
    .mean()
    .reset_index()
)

In [None]:
# PRIMARY AMINES

molecule_type = 'amine'

# SELECT ALL MODELED PROPERTIES
amine_model_selections = [key for key in model_dictionary if key[0] == molecule_type]


# loading models and making predictions for each descriptor in model_selections

amine_results_dict = {}
model_selections = amine_model_selections
test_dataframe = amine_dataframe

for model_selection in tqdm(model_selections):

    # reuse the preloaded model when available, otherwise load it on demand
    if model_selection not in preloaded_models:
        print(f'loading model: {model_selection}') # this can take a few seconds
        model = load_model(*model_selection, model_dictionary)
    else:
        model = preloaded_models[model_selection]

    # Both flags are switched on only for the descriptors selected at the H3 atom
    # (see atom_selection_dictionary); presumably the functional hydrogens have to stay
    # explicit for those -- confirm against make_predictions.
    keep_explicit_hydrogens = (model_selection[2] in ['NBO_charge_H_avg', 'NBO_charge_H_min', 'NMR_shift_H_avg'])
    remove_Hs_except_functional = keep_explicit_hydrogens

    print('    making predictions:')
    # NOTE(review): the acid cell passes num_workers = 0 to make_predictions; here the
    # function's default is used -- confirm that the difference is intentional.
    predictions = make_predictions(
        test_dataframe,
        model,
        model_selection,
        atom_selection_dictionary,
        keep_explicit_hydrogens = keep_explicit_hydrogens,
        remove_Hs_except_functional = remove_Hs_except_functional,
        device = device, # configured once in the setup cell instead of being hard-coded here
    )

    # saving results
    amine_results_dict[tuple(model_selection)] = predictions


# per-conformer predictions (one column per model selection, plus mol_index)
amine_results_df = pd.DataFrame(amine_results_dict)
amine_results_df[['mol_index']] = test_dataframe[['mol_index']]

# averaging per-conformer predictions to get final predictions per molecule
# The built-in groupby mean replaces .apply(lambda x: x.mean()), which relied on the grouping
# column being passed to the applied function; newer pandas releases no longer do that, which
# would drop mol_index from the result. reset_index() restores mol_index as a regular column
# (it now comes first and keeps its original dtype).
amine_results_df_mean = (
    amine_results_df
    .groupby('mol_index', sort = False)
    .mean()
    .reset_index()
)

In [None]:
# SECONDARY AMINES

molecule_type = 'sec_amine'

# SELECT ALL MODELED PROPERTIES
sec_amine_model_selections = [key for key in model_dictionary if key[0] == molecule_type]


# loading models and making predictions for each descriptor in model_selections

sec_amine_results_dict = {}
model_selections = sec_amine_model_selections
test_dataframe = sec_amine_dataframe

for model_selection in tqdm(model_selections):

    # reuse the preloaded model when available, otherwise load it on demand
    if model_selection not in preloaded_models:
        print(f'loading model: {model_selection}') # this can take a few seconds
        model = load_model(*model_selection, model_dictionary)
    else:
        model = preloaded_models[model_selection]

    # Both flags are switched on only for the descriptors selected at the H4 atom
    # (see atom_selection_dictionary); presumably the functional hydrogen has to stay
    # explicit for those -- confirm against make_predictions.
    keep_explicit_hydrogens = (model_selection[2] in ['NBO_charge_H', 'NMR_shift_H'])
    remove_Hs_except_functional = keep_explicit_hydrogens

    print('    making predictions:')
    # NOTE(review): the acid cell passes num_workers = 0 to make_predictions; here the
    # function's default is used -- confirm that the difference is intentional.
    predictions = make_predictions(
        test_dataframe,
        model,
        model_selection,
        atom_selection_dictionary,
        keep_explicit_hydrogens = keep_explicit_hydrogens,
        remove_Hs_except_functional = remove_Hs_except_functional,
        device = device, # configured once in the setup cell instead of being hard-coded here
    )

    # saving results
    sec_amine_results_dict[tuple(model_selection)] = predictions


# per-conformer predictions (one column per model selection, plus mol_index)
# Stored under its own *_df name, like the acid and amine cells, so that
# sec_amine_results_dict stays a dict instead of being rebound to a DataFrame.
sec_amine_results_df = pd.DataFrame(sec_amine_results_dict)
sec_amine_results_df[['mol_index']] = test_dataframe[['mol_index']]

# averaging per-conformer predictions to get final predictions per molecule
# The built-in groupby mean replaces .apply(lambda x: x.mean()), which relied on the grouping
# column being passed to the applied function; newer pandas releases no longer do that, which
# would drop mol_index from the result. reset_index() restores mol_index as a regular column
# (it now comes first and keeps its original dtype).
sec_amine_results_df_mean = (
    sec_amine_results_df
    .groupby('mol_index', sort = False)
    .mean()
    .reset_index()
)

In [None]:
# view all predictions
# Look up each molecule's SMILES by mol_index and attach it to the per-molecule results.
acid_index_to_smiles = dict(zip(acid_dataframe['mol_index'], acid_dataframe['smiles']))
acid_results_df_mean['smiles'] = [
    acid_index_to_smiles[int(mol_index)]  # int(): mol_index may be stored as a float here
    for mol_index in acid_results_df_mean['mol_index']
]
acid_results_df_mean.head(10)

In [None]:
#acid_results_df_mean.to_csv('AmideCoupling_External_Validation_Sets_acids_results3D.csv')


In [None]:
# view all predictions
# Look up each molecule's SMILES by mol_index and attach it to the per-molecule results.
amine_index_to_smiles = dict(zip(amine_dataframe['mol_index'], amine_dataframe['smiles']))
amine_results_df_mean['smiles'] = [
    amine_index_to_smiles[int(mol_index)]  # int(): mol_index may be stored as a float here
    for mol_index in amine_results_df_mean['mol_index']
]
amine_results_df_mean.head(10)

In [None]:
#amine_results_df_mean.to_csv('AmideCoupling_External_Validation_Sets_primaryamines_results3D.csv')


In [None]:
# view all predictions
# Look up each molecule's SMILES by mol_index and attach it to the per-molecule results.
sec_amine_index_to_smiles = dict(zip(sec_amine_dataframe['mol_index'], sec_amine_dataframe['smiles']))
sec_amine_results_df_mean['smiles'] = [
    sec_amine_index_to_smiles[int(mol_index)]  # int(): mol_index may be stored as a float here
    for mol_index in sec_amine_results_df_mean['mol_index']
]
sec_amine_results_df_mean.head(10)

In [None]:
#sec_amine_results_df_mean.to_csv('AmideCoupling_External_Validation_Sets_secondaryamines_results3D.csv')
