
# Dataset loading

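This notebook walks through the following steps:
1. Loading configuration settings from a YAML file.
2. Loading molecular datasets from the configuration (dictionary).
3. Selecting a dataset and retrieving the molecules for a given training distance.
4. Inspecting the density of a retrieved molecule.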


In [10]:
import os
from pathlib import Path

import qedft
from jax_dft.datasets import _TEST_DISTANCE_X100
from qedft.config.config import Config
from qedft.data_io.dataset_loader import load_molecular_datasets_from_config

In [11]:
# Mapping from molecule name to its set of test distances (values are x100)
data_dict = _TEST_DISTANCE_X100
# Iterating a mapping yields its keys, i.e. the available molecule names
available_molecules = list(data_dict)
print(available_molecules)
print(data_dict["h2"])

['h2_plus', 'h2', 'h4', 'h2_h2']
{136, 264, 392, 520, 280, 152, 408, 536, 40, 424, 312, 56, 184, 568, 72, 200, 328, 456, 584, 344, 88, 216, 472, 120, 600, 376, 104, 232, 360, 488, 504, 248}


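The set printed above contains the held-out *test* distances for H2 (values are distances multiplied by 100, as the `_X100` suffix indicates). The training distances used for the H2 molecule are the following, none of which appear in the test set:

80, 128, 192, 240, 384, 448, 544, 592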

In [12]:
# Initialize a configuration object to manage our settings
config = Config()
# Project root: two levels above the package's __init__ file
# (qedft/__init__.py -> qedft/ -> project root)
project_path = Path(qedft.__file__).parent.parent
# Load the example configuration that ships with the test files:
# <project root>/tests/test_files/custom_config.yaml
config.load_from_yaml(project_path / 'tests' / 'test_files' / 'custom_config.yaml')
# The loaded settings are exposed as a dictionary; display it as the cell output
config_dict = config.config
config_dict

{'name': 'test_vase',
 'experiment_name': 'test',
 'network_type': 'mlp',
 'molecule_name': 'h2',
 'molecule_names': ['h2', 'h4'],
 'dataset': [42, 384],
 'dataset1': [128, 384],
 'dataset2': [160, 240],
 'rng': 0,
 'save_plot_loss': False,
 'save_every_n': 42,
 'activation': 'tanh',
 'n_neurons': 42,
 'n_layers': 42,
 'n_qubits': 42,
 'n_reupload_layers': 42,
 'use_rzz_parametrized_entanglers': False,
 'chebychev_reuploading': False,
 'add_reversed_rzz': False,
 'entangling_block_type': 'circular',
 'single_qubit_rotations': ['rx', 'rz', 'rx'],
 'use_same_parameters': False,
 'add_negative_transform': False,
 'wrap_with_self_interaction_layer': False,
 'wrap_with_global_functional': False,
 'use_correlators_in_output': False,
 'output_operators': ['Z'],
 'use_bias_in_output': False,
 'max_train_steps': 42,
 'factr': 1.0,
 'pgtol': 1e-14,
 'm': 42,
 'maxfun': 20,
 'maxiter': 2,
 'num_iterations': 20,
 'ks_iter_to_ignore': 10,
 'discount_factor': 0.9,
 'alpha': 0.5,
 'alpha_decay': 0.9,

In [13]:
# Root folder of the stored datasets. `project_path` is already a Path, so the
# `/` operator yields a Path and no further conversion is needed.
base_path = project_path / "data" / "od"
# Load the datasets described by the configuration dictionary. The loader logs
# the folder, training distances, electron count and grid shape it finds.
# NOTE(review): check_grid_centering=False presumably skips a grid-centering
# validation step - confirm against the loader's documentation.
list_datasets = load_molecular_datasets_from_config(
    config_dict,
    base_path,
    check_grid_centering=False,
)

[32m2025-04-14 16:03:57.905[0m | [1mINFO    [0m | [36mqedft.data_io.dataset_loader[0m:[36mload_molecular_datasets[0m:[36m63[0m - [1mLoading dataset for h2[0m
[32m2025-04-14 16:03:57.907[0m | [1mINFO    [0m | [36mqedft.data_io.dataset_loader[0m:[36mload_molecular_datasets[0m:[36m83[0m - [1mLoading dataset from /Users/igorsokolov/PycharmProjects/qedft/data/od/h2[0m
[32m2025-04-14 16:03:57.915[0m | [1mINFO    [0m | [36mqedft.data_io.dataset_loader[0m:[36mload_molecular_datasets[0m:[36m95[0m - [1mTraining distances: [128, 384][0m
[32m2025-04-14 16:03:57.917[0m | [1mINFO    [0m | [36mqedft.data_io.dataset_loader[0m:[36mload_molecular_datasets[0m:[36m101[0m - [1mNumber of electrons: 2[0m
[32m2025-04-14 16:03:57.918[0m | [1mINFO    [0m | [36mqedft.data_io.dataset_loader[0m:[36mload_molecular_datasets[0m:[36m102[0m - [1mGrid shape: (42,)[0m
[32m2025-04-14 16:03:57.918[0m | [1mINFO    [0m | [36mqedft.data_io.dataset_loader[0m:[3

In [14]:
# Inspect the loaded result: a list whose entries are tuples starting with a
# jax_dft Dataset followed by a KohnShamState holding its arrays.
# NOTE(review): this displays the full arrays, so the output is long.
list_datasets

[(<jax_dft.datasets.Dataset at 0x174823f40>,
  KohnShamState(density=array([[0.21648641, 0.24098534, 0.26731451, 0.29542215, 0.32520787,
          0.35651379, 0.3891152 , 0.42271087, 0.4569137 , 0.49124186,
          0.52511153, 0.5578318 , 0.58860358, 0.6165279 , 0.64068498,
          0.66112242, 0.6779955 , 0.69148193, 0.70175649, 0.70897346,
          0.71325286, 0.71467066, 0.71325285, 0.70897345, 0.70175648,
          0.69148191, 0.67799548, 0.66112238, 0.64068494, 0.61652786,
          0.58860353, 0.55783176, 0.52511149, 0.49124181, 0.45691364,
          0.42271082, 0.38911514, 0.35651372, 0.32520778, 0.29542205,
          0.26731441, 0.24098521],
         [0.38471743, 0.37550223, 0.36420572, 0.35127389, 0.33713473,
          0.32218663, 0.30679524, 0.29128636, 0.27594877, 0.26103487,
          0.24675656, 0.23329357, 0.22079191, 0.20937311, 0.19913098,
          0.19013989, 0.18245674, 0.17612344, 0.1711704 , 0.16761864,
          0.16548217, 0.16476902, 0.16548213, 0.16761918, 

In [15]:
# Select a dataset
dataset = list_datasets[0][0]  # first dataset
dataset

<jax_dft.datasets.Dataset at 0x174823f40>

In [16]:
# Element 1 of the first entry is the companion KohnShamState; its repr below
# starts with the `density` array, one row per molecule in the dataset.
all_data_for_dataset = list_datasets[0][1]  # KohnShamState object that contains the data for the dataset
all_data_for_dataset

KohnShamState(density=array([[0.21648641, 0.24098534, 0.26731451, 0.29542215, 0.32520787,
        0.35651379, 0.3891152 , 0.42271087, 0.4569137 , 0.49124186,
        0.52511153, 0.5578318 , 0.58860358, 0.6165279 , 0.64068498,
        0.66112242, 0.6779955 , 0.69148193, 0.70175649, 0.70897346,
        0.71325286, 0.71467066, 0.71325285, 0.70897345, 0.70175648,
        0.69148191, 0.67799548, 0.66112238, 0.64068494, 0.61652786,
        0.58860353, 0.55783176, 0.52511149, 0.49124181, 0.45691364,
        0.42271082, 0.38911514, 0.35651372, 0.32520778, 0.29542205,
        0.26731441, 0.24098521],
       [0.38471743, 0.37550223, 0.36420572, 0.35127389, 0.33713473,
        0.32218663, 0.30679524, 0.29128636, 0.27594877, 0.26103487,
        0.24675656, 0.23329357, 0.22079191, 0.20937311, 0.19913098,
        0.19013989, 0.18245674, 0.17612344, 0.1711704 , 0.16761864,
        0.16548217, 0.16476902, 0.16548213, 0.16761918, 0.1711719 ,
        0.17612506, 0.18245842, 0.19014198, 0.19913223, 0.209

In [17]:
# Get molecules from dataset
train_distances = [128]
grids = dataset.grids
train_set = dataset.get_molecules(train_distances)
train_set

KohnShamState(density=array([[0.21648641, 0.24098534, 0.26731451, 0.29542215, 0.32520787,
        0.35651379, 0.3891152 , 0.42271087, 0.4569137 , 0.49124186,
        0.52511153, 0.5578318 , 0.58860358, 0.6165279 , 0.64068498,
        0.66112242, 0.6779955 , 0.69148193, 0.70175649, 0.70897346,
        0.71325286, 0.71467066, 0.71325285, 0.70897345, 0.70175648,
        0.69148191, 0.67799548, 0.66112238, 0.64068494, 0.61652786,
        0.58860353, 0.55783176, 0.52511149, 0.49124181, 0.45691364,
        0.42271082, 0.38911514, 0.35651372, 0.32520778, 0.29542205,
        0.26731441, 0.24098521]]), total_energy=array([-2.06208973]), locations=array([[-0.64,  0.64]]), nuclear_charges=array([[1, 1]], dtype=int32), external_potential=array([[-1.09777433, -1.13521597, -1.17393463, -1.21397387, -1.25537872,
        -1.29819575, -1.34247315, -1.3882607 , -1.43560993, -1.48457409,
        -1.53520826, -1.58756941, -1.64171643, -1.69771024, -1.68374189,
        -1.67166761, -1.6614738 , -1.65314901

In [18]:
# Density array of the selected molecule(s); with a single requested distance
# the output below is one row of values on the grid.
train_set.density

array([[0.21648641, 0.24098534, 0.26731451, 0.29542215, 0.32520787,
        0.35651379, 0.3891152 , 0.42271087, 0.4569137 , 0.49124186,
        0.52511153, 0.5578318 , 0.58860358, 0.6165279 , 0.64068498,
        0.66112242, 0.6779955 , 0.69148193, 0.70175649, 0.70897346,
        0.71325286, 0.71467066, 0.71325285, 0.70897345, 0.70175648,
        0.69148191, 0.67799548, 0.66112238, 0.64068494, 0.61652786,
        0.58860353, 0.55783176, 0.52511149, 0.49124181, 0.45691364,
        0.42271082, 0.38911514, 0.35651372, 0.32520778, 0.29542205,
        0.26731441, 0.24098521]])

# Summary

In this notebook, we demonstrated the usage of the dataset loading functionality. We covered:

- Loading configuration settings from a YAML file.
- Loading molecular datasets from the configuration.
- Getting molecules from the dataset.
- Getting the density of a molecule.

**Note:** The training distances used for the H2 molecule (values are distances multiplied by 100) are 80, 128, 192, 240, 384, 448, 544, and 592.