In [12]:
import os
import sys
from NN import data, utils, create_input, datagenerator
import glob
import numpy as np

In [13]:
# Callable that builds the input dictionary for a single molecular geometry.
# `cutoff` is the distance cutoff used to create edges between atoms
# (presumably in the same length unit as the XYZ coordinates -- TODO confirm).
create_mol_dict = create_input.CreateMolDict(cutoff=6.0)

# Create datasets used for testing the model

### S66x100

In [14]:
# Build the S66x100 test datasets: one MolDictList each for subunit A,
# subunit B and the full complex, written as pickles into `dest`.
dest = "Datasets/S66x100"
source = "../Data/S66x100/"

# exist_ok=True replaces the check-then-create pattern and is safe to re-run.
os.makedirs(dest, exist_ok=True)

# NOTE(review): these energies are loaded but not used anywhere in this cell;
# they are kept because later cells may read them -- confirm or remove.
energies_dz = utils.load_np_txt_files(os.path.join(source, "Energies/B3LYP/CC-PVDZ"))
energies_5z = utils.load_np_txt_files(os.path.join(source, "Energies/B3LYP/CC-PV5Z"))

# glob returns matches in arbitrary (filesystem-dependent) order. Sort by the
# numeric complex id so the order of datapoints in the saved datasets is
# reproducible from run to run and machine to machine.
complex_folders = sorted(
    glob.glob(os.path.join(source, "Complexes", "*")),
    key=lambda folder: int(os.path.basename(folder)),
)

# The individual datapoints of the dataset are added to a data.MolDictList object.
datasets = {
    "A": data.MolDictList(),  # Subunit A datapoints
    "B": data.MolDictList(),  # Subunit B datapoints
    "complex": data.MolDictList(),  # Complex datapoints
}
for complex_folder in complex_folders:
    # Folder names are the integer ids of the complexes.
    complex_id = int(os.path.basename(complex_folder))
    for key, dataset in datasets.items():
        # Each complex folder holds A.xyz, B.xyz and complex.xyz.
        filename = os.path.join(complex_folder, f"{key}.xyz")
        atoms, xyz = utils.read_xyz_file(filename)

        # One datapoint per frame stored in the xyz file.
        for i, frame in enumerate(xyz):
            dataset.append(
                create_mol_dict(
                    atoms=atoms,
                    coordinates=frame,
                    energy=0.0,  # Placeholder; for training, the reference value that is fitted goes here.
                    index=i,  # Frame index within the xyz file
                    molecule_index=complex_id,  # Id of the complex the frame belongs to
                )
            )

# Report the size of each dataset and write it to disk.
for key, dataset in datasets.items():
    print(key, len(dataset))
    filename = os.path.join(dest, f"{key}.pickle")
    data.save_dataset(filename, dataset)

A 66
B 66
complex 6600


### Linear Hydrocarbon test dataset

In [15]:
# Build the linear hydrocarbon test dataset from the files 1.xyz ... 15.xyz
# in `source` and write it as a single pickle into `dest`.
dest = "Datasets/LinearHydrocarbons"
source = "../Data/LinearHydrocarbons/XYZ/"

# exist_ok=True replaces the check-then-create pattern and is safe to re-run.
os.makedirs(dest, exist_ok=True)

# The individual datapoints of the dataset are added to a data.MolDictList object.
dataset = data.MolDictList()
for i in range(1, 16):
    filename = os.path.join(source, f"{i}.xyz")
    atoms, xyz = utils.read_xyz_file(filename)
    dataset.append(
        create_mol_dict(
            atoms=atoms,
            coordinates=xyz[0],  # Only the first frame of each file is used
            energy=0.0,  # Placeholder; for training, the reference value that is fitted goes here.
            index=i,  # File number, used to identify the datapoint
            molecule_index=i,  # Same file number; one geometry per molecule
        )
    )

filename = os.path.join(dest, "dataset.pickle")
data.save_dataset(filename, dataset)

### PAHs

In [16]:
# Build the PAH test dataset from the files 1.xyz ... 5.xyz in `source`
# and write it as a single pickle into `dest`.
dest = "Datasets/PAHs"
source = "../Data/PAHs/XYZ/"

# exist_ok=True replaces the check-then-create pattern and is safe to re-run.
os.makedirs(dest, exist_ok=True)

# The individual datapoints of the dataset are added to a data.MolDictList object.
dataset = data.MolDictList()
for i in range(1, 6):
    filename = os.path.join(source, f"{i}.xyz")
    atoms, xyz = utils.read_xyz_file(filename)
    dataset.append(
        create_mol_dict(
            atoms=atoms,
            coordinates=xyz[0],  # Only the first frame of each file is used
            energy=0.0,  # Placeholder; for training, the reference value that is fitted goes here.
            index=i,  # File number, used to identify the datapoint
            molecule_index=i,  # Same file number; one geometry per molecule
        )
    )

filename = os.path.join(dest, "dataset.pickle")
data.save_dataset(filename, dataset)