Skip to content

Commit

Permalink
ENH: Export more parameters to the .hdf5 file (#152)
Browse files Browse the repository at this point in the history
* BUG: Fixed an issue where the atom counts of frozen parameters were not properly parsed
* ENH: Store whether or not a parameter is guessed
* ENH: Export more parameters to the .hdf5 file
* MAINT: Cleaned up .gitignore
  • Loading branch information
BvB93 committed Nov 13, 2020
1 parent e9918cb commit 12508f3
Show file tree
Hide file tree
Showing 8 changed files with 110 additions and 51 deletions.
3 changes: 0 additions & 3 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ coverage.xml
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

Expand All @@ -65,7 +64,6 @@ instance/

# Sphinx documentation
docs/_build/
docs_out/

# PyBuilder
target/
Expand Down Expand Up @@ -109,4 +107,3 @@ venv.bak/

# Visual Studio Code
.vscode/
MM_MD_workdir*/
42 changes: 33 additions & 9 deletions FOX/armc/param_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@
Tup2 = Tuple[Hashable, Hashable]

# All dict keys in ParamMappingABC
SeriesKeys = Literal['min', 'max', 'count', 'constant']
SeriesKeys = Literal['min', 'max', 'count', 'frozen', 'guess']
DFKeys = Literal['param', 'param_old']
ValidKeys = Union[SeriesKeys, DFKeys]

Expand All @@ -71,7 +71,8 @@ class InputMapping(_InputMapping, total=False):
min: pd.Series
max: pd.Series
count: pd.Series
constant: pd.Series
frozen: pd.Series
guess: pd.Series


class Data(TypedDict):
Expand All @@ -82,7 +83,8 @@ class Data(TypedDict):
min: pd.Series
max: pd.Series
count: pd.Series
constant: pd.Series
frozen: pd.Series
guess: pd.Series


def _parse_param(dct: MutableMapping[str, Any]) -> pd.DataFrame:
Expand Down Expand Up @@ -178,11 +180,12 @@ class ParamMappingABC(AbstractDataClass, ABC, _ParamMappingABC):
_data_dict: Data

#: Fill values for when optional keys are absent.
FILL_VALUE: ClassVar[Mapping[ValidKeys, Any]] = MappingProxyType({
'min': -np.inf,
'max': np.inf,
'count': -1,
'constant': False,
FILL_VALUE: ClassVar[Mapping[ValidKeys, np.generic]] = MappingProxyType({
'min': np.float64(-np.inf),
'max': np.float64(np.inf),
'count': np.int64(-1),
'frozen': np.False_,
'guess': np.False_,
})

_PRIVATE_ATTR = frozenset({'_net_charge'}) # type: ignore
Expand Down Expand Up @@ -485,6 +488,27 @@ def apply_constraints(self, idx: Tup3, value: float, param: int) -> Optional[Exc
""" # noqa
pass

def to_struct_array(self) -> np.ndarray:
    """Stack all :class:`~pandas.Series` in this instance into a single structured array.

    The returned array has one field per key of :attr:`FILL_VALUE` (in that order);
    the dtype of each field is the type of the matching fill value.
    Every record holds the values of all those series at a single position.

    Returns
    -------
    :class:`numpy.ndarray`
        A 1D structured array.

    """
    fill_value = type(self).FILL_VALUE

    # One ``(field name, scalar type)`` pair per series
    fields = [(name, type(default)) for name, default in fill_value.items()]
    columns = [self[name].values for name in fill_value]

    # Transpose the per-series columns into per-row records
    records = zip(*columns)
    return np.fromiter(records, dtype=np.dtype(fields))

def constraints_to_str(self) -> pd.Series:
    """Convert the constraints into a human-readable :class:`pandas.Series`.

    Every entry of :attr:`constraints` is rendered as a single string:
    the terms of a series are formatted as ``"{value}*{key}"`` and joined by ``" + "``,
    after which the series belonging to the same key are joined by ``" == "``.
    Keys without any series are mapped to an empty string.

    Returns
    -------
    :class:`pandas.Series`
        A series named ``"constraints"`` whose index uses the keys of :attr:`constraints`
        and whose level names are the first two index names of ``self['param']``.

    """
    names = list(self['param'].index.names[:2])

    # Joining an empty sequence yields ``''``, so empty constraints need no special casing
    dct = {
        key: ' == '.join(
            ' + '.join(f'{v}*{k}' for k, v in series.items()) for series in tup
        )
        for key, tup in self.constraints.items()
    }

    # Build the index explicitly: ``pd.Series({})`` would otherwise get a flat index,
    # causing the assignment of two level names to raise when there are no constraints
    index = pd.MultiIndex.from_tuples(list(dct), names=names)
    return pd.Series(list(dct.values()), index=index, name='constraints', dtype=object)


MOVE_RANGE = np.array([[
0.900, 0.905, 0.910, 0.915, 0.920, 0.925, 0.930, 0.935, 0.940,
Expand Down Expand Up @@ -523,7 +547,7 @@ def identify_move(self, param_idx: int) -> Tuple[Tup3, float, float]:
""" # noqa
# Define a random parameter
variable = ~self['constant']
variable = ~self['frozen']
random_prm: pd.Series = self['param'].loc[variable, param_idx].sample()
idx, x1 = next(random_prm.items()) # Type: Tup3, float

Expand Down
22 changes: 15 additions & 7 deletions FOX/armc/sanitization.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,6 @@ def dict_to_armc(input_dict: MainMapping) -> Tuple[MonteCarloABC, RunDict]:
# Handle psf stuff
psf_list: Optional[List[PSFContainer]] = get_psf(dct['psf'], mol_list)
run_kwargs['psf'] = psf_list
update_count(param, psf=psf_list, mol=mol_list)
_parse_ligand_alias(psf_list, prm=param)
if psf_list is not None:
mc.pes_post_process = [AtomsFromPSF.from_psf(*psf_list)]
Expand All @@ -110,14 +109,18 @@ def dict_to_armc(input_dict: MainMapping) -> Tuple[MonteCarloABC, RunDict]:
if _param_frozen is not None:
_guess_param(mc, _param_frozen, frozen=True, psf=psf_list)
_guess_param(mc, _param, frozen=False, psf=psf_list)
update_count(param, psf=psf_list, mol=mol_list)

mc.param['param'].sort_index(inplace=True)
mc.param['param_old'].sort_index(inplace=True)
mc.param['min'].sort_index(inplace=True)
mc.param['max'].sort_index(inplace=True)
mc.param['count'].sort_index(inplace=True)
mc.param['constant'].sort_index(inplace=True)
mc.param._data['constant'] = mc.param['constant'].astype(bool, copy=False)
mc.param['frozen'].sort_index(inplace=True)
mc.param['guess'].sort_index(inplace=True)
mc.param._data['count'] = mc.param['count'].astype(int, copy=False)
mc.param._data['frozen'] = mc.param['frozen'].astype(bool, copy=False)
mc.param._data['guess'] = mc.param['guess'].astype(bool, copy=False)

# Add PES evaluators
pes = get_pes(dct['pes'])
Expand Down Expand Up @@ -161,7 +164,8 @@ def _guess_param(mc: MonteCarloABC, prm: dict,
param_mapping['min'][key] = -np.inf
param_mapping['max'][key] = np.inf
param_mapping['count'][key] = 0
param_mapping['constant'][key] = frozen
param_mapping['frozen'][key] = frozen
param_mapping['guess'][key] = True
return


Expand Down Expand Up @@ -237,7 +241,7 @@ def get_param(dct: ParamMapping_) -> Tuple[ParamMapping, dict, dict]:
if _sub_prm_dict_frozen is not None:
for *_key, value in _get_prm(_sub_prm_dict_frozen):
key = tuple(_key)
data.loc[key, :] = [value, value, True, -np.inf, np.inf]
data.loc[key, :] = [value, value, True, False, -np.inf, np.inf, 0]
data.sort_index(inplace=True)

param_type = prm_dict.pop('type') # type: ignore
Expand Down Expand Up @@ -413,9 +417,11 @@ def _get_param_df(dct: Mapping[str, Any]) -> pd.DataFrame:
df.set_index(['key', 'param_type', 'atoms'], inplace=True)

df['param_old'] = df['param'].copy()
df['constant'] = False
df['frozen'] = False
df['guess'] = False
df['min'] = -np.inf
df['max'] = np.inf
df['count'] = 0
return df


Expand Down Expand Up @@ -580,7 +586,9 @@ def _parse_ligand_alias(psf_list: Optional[List[PSFContainer]], prm: ParamMappin
prm['param_old'].loc[key] = prm['param'].loc[key]
prm['min'][key] = -np.inf
prm['max'][key] = np.inf
prm['constant'][key] = True
prm['frozen'][key] = True
prm['guess'][key] = False
prm['count'][key] = 0

for lst in prm.constraints.values():
for series in lst:
Expand Down
15 changes: 15 additions & 0 deletions FOX/io/hdf5_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,21 @@ def create_hdf5(filename: PathType, armc: ARMC) -> None:
f.attrs['sub-iteration'] = -1
f.attrs['__version__'] = np.fromiter(__version__.split('.'), count=3, dtype=int)

str_dtype = h5py.string_dtype(encoding='ascii')
index: np.ndarray = armc.param['param'].index
index_dtype = np.dtype(list((k, str_dtype) for k in index.names))

dset = f.create_dataset(name='param_metadata', data=armc.param.to_struct_array())
dset.attrs['index'] = index.values.astype(index_dtype)
dset.attrs['net_charge'] = armc.param._net_charge
dset.attrs['move_range'] = armc.param.move_range

constraints = armc.param.constraints_to_str()
index2: np.ndarray = constraints.index
index2_dtype = np.dtype(list((k, str_dtype) for k in index2.names))
dset.attrs['constraints'] = constraints.values.astype(str_dtype)
dset.attrs['constraints_index'] = index2.values.astype(index2_dtype)

# Store the *index*, *column* and *name* attributes of dataframes/series in the hdf5 file
kappa = armc.iter_len // armc.sub_iter_len
idx = armc.param['param'][0].index.append(pd.MultiIndex.from_tuples([('', 'phi', '')]))
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
'pyflakes>=2.1.1',
'pytest-flake8>=1.0.5',
'pytest-pydocstyle>=2.1',
'auto-FOX-data@git+https://github.com/nlesc-nano/auto-FOX-data@1.0.0',
'auto-FOX-data@git+https://github.com/nlesc-nano/auto-FOX-data@1.1.1',
]
tests_require += docs_require

Expand Down
38 changes: 25 additions & 13 deletions tests/test_armc.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Tests for various ARMC jobs."""

from pathlib import Path
from typing import Tuple, Generator, Any, cast
from typing import Tuple, Generator, Any, cast, Container
from itertools import combinations_with_replacement

import numpy as np
Expand Down Expand Up @@ -56,6 +56,28 @@ def test_armc_guess() -> None:
np.testing.assert_allclose(param.values, ref)


def compare_hdf5(f1: h5py.Group, f2: h5py.Group, skip: Container[str] = frozenset({})) -> None:
    """Check if the two passed hdf5 files are equivalent.

    Both groups must contain the same keys.
    For every dataset whose name is not in **skip** the data and all attributes are compared:
    inexact (floating point) data is compared with :func:`numpy.testing.assert_allclose`
    and all other data with :func:`numpy.testing.assert_array_equal`.

    Parameters
    ----------
    f1 : :class:`h5py.Group`
        The first group.
    f2 : :class:`h5py.Group`
        The second group.
    skip : :class:`Container[str] <typing.Container>`
        Names of datasets that are excluded from the data and attribute comparison.

    Raises
    ------
    :exc:`AssertionError`
        Raised if the keys, datasets or attributes of the two groups do not match.

    """
    assertion.eq(f1.keys(), f2.keys())

    for k1 in f2.keys():
        if k1 in skip:
            continue
        dset1, dset2 = f1[k1], f2[k1]

        if issubclass(dset1.dtype.type, np.inexact):
            np.testing.assert_allclose(dset1[:], dset2[:], err_msg=f'dataset {k1!r}\n')
        else:
            np.testing.assert_array_equal(dset1[:], dset2[:], err_msg=f'dataset {k1!r}\n')

        # Compare attributes
        assertion.eq(dset1.attrs.keys(), dset2.attrs.keys())
        for k2 in dset1.attrs.keys():
            # Take the second value from `dset2`; comparing `dset1` against itself
            # would make this check pass unconditionally
            attr1 = dset1.attrs[k2]
            attr2 = dset2.attrs[k2]

            err_msg = f'dataset {k1!r}; attribute {k2!r}'
            # Attributes may be plain Python objects (e.g. `str`) that lack `.dtype`
            if issubclass(np.asarray(attr1).dtype.type, np.inexact):
                np.testing.assert_allclose(attr1, attr2, err_msg=err_msg)
            else:
                np.testing.assert_array_equal(attr1, attr2, err_msg=err_msg)


@delete_finally(PATH / '_ARMC')
def test_armc() -> None:
"""Test :class:`ARMC`."""
Expand All @@ -71,12 +93,7 @@ def test_armc() -> None:
hdf5 = PATH / '_ARMC' / 'armc.hdf5'
hdf5_ref = ARMC_REF / 'armc.hdf5'
with h5py.File(hdf5, 'r') as f1, h5py.File(hdf5_ref, 'r') as f2:
assertion.eq(f1.keys(), f2.keys())

skip = {'param', 'aux_error_mod'}
iterator = ((k, f1[k][:], f2[k][:]) for k in f2.keys() if k not in skip)
for k, ar1, ar2 in iterator:
np.testing.assert_allclose(ar1, ar2, err_msg=f'dataset {k!r}\n')
compare_hdf5(f1, f2, skip={'param', 'aux_error_mod'})


def swap_phi(*args: Any, n: int = 3, **kwargs: Any) -> Generator[Tuple[int, int], None, None]:
Expand All @@ -102,12 +119,7 @@ def test_armcpt() -> None:
hdf5 = PATH / '_ARMCPT' / 'armc.hdf5'
hdf5_ref = ARMCPT_REF / 'armc.hdf5'
with h5py.File(hdf5, 'r') as f1, h5py.File(hdf5_ref, 'r') as f2:
assertion.eq(f1.keys(), f2.keys())

skip = {'param', 'aux_error_mod'}
iterator = ((k, f1[k][:], f2[k][:]) for k in f2.keys() if k not in skip)
for k, ar1, ar2 in iterator:
np.testing.assert_allclose(ar1, ar2, err_msg=f'dataset {k!r}\n')
compare_hdf5(f1, f2, skip={'param', 'aux_error_mod'})


def test_param_sorting() -> None:
Expand Down
31 changes: 16 additions & 15 deletions tests/test_hdf5_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from scm.plams import Settings
from assertionlib import assertion
from nanoutils import delete_finally

import FOX
from FOX.armc import dict_to_armc
Expand All @@ -17,6 +18,7 @@
PATH: str = join('tests', 'test_files')


@delete_finally(join(PATH, 'test.hdf5'))
def test_create_hdf5():
"""Test :meth:`FOX.io.hdf5_utils.create_hdf5`."""
yaml_file = join(PATH, 'armc.yaml')
Expand All @@ -26,28 +28,27 @@ def test_create_hdf5():

ref_dict = Settings()
ref_dict.acceptance.shape = 500, 100, 1
ref_dict.acceptance.dtype = bool
ref_dict.acceptance.dtype = np.bool_
ref_dict.aux_error.shape = 500, 100, 1, 1
ref_dict.aux_error.dtype = np.float
ref_dict.aux_error.dtype = np.float64
ref_dict.aux_error_mod.shape = 500, 100, 1, 16
ref_dict.aux_error_mod.dtype = np.float
ref_dict.aux_error_mod.dtype = np.float64
ref_dict.param.shape = 500, 100, 1, 15
ref_dict.param.dtype = np.float
ref_dict.param.dtype = np.float64
ref_dict.param_metadata.shape = (15,)
ref_dict.param_metadata.dtype = np.void
ref_dict.phi.shape = 500, 1
ref_dict.phi.dtype = np.float
ref_dict.phi.dtype = np.float64
ref_dict['rdf.0'].shape = 500, 100, 241, 6
ref_dict['rdf.0'].dtype = np.float
ref_dict['rdf.0'].dtype = np.float64
ref_dict['rdf.0.ref'].shape = 241, 6
ref_dict['rdf.0.ref'].dtype = np.float
ref_dict['rdf.0.ref'].dtype = np.float64

try:
create_hdf5(hdf5_file, armc)
with h5py.File(hdf5_file, 'r') as f:
for key, value in f.items():
assertion.shape_eq(value, ref_dict[key].shape, message=key)
assertion.isinstance(value[:].item(0), ref_dict[key].dtype, message=key)
finally:
remove(hdf5_file) if isfile(hdf5_file) else None
create_hdf5(hdf5_file, armc)
with h5py.File(hdf5_file, 'r') as f:
for key, value in f.items():
assertion.shape_eq(value, ref_dict[key].shape, message=key)
assertion.isinstance(value[:].take(0), ref_dict[key].dtype, message=key)


def test_to_hdf5():
Expand Down
8 changes: 5 additions & 3 deletions tests/test_param_mapping.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""A module for testing :mod:`FOX.armc.param_mapping`."""

import warnings
import numpy as np
import pandas as pd

from assertionlib import assertion
Expand All @@ -24,7 +26,7 @@
_DF['count'] = 2 * [26, 68, 26, 52, 55]
_DF['min'] = _DF['param'] - 0.5 * abs(_DF['param'])
_DF['max'] = _DF['param'] + 0.5 * abs(_DF['param'])
_DF['constant'] = 2 * [False, False, True, False, False]
_DF['frozen'] = 2 * [False, False, True, False, False]

_CONSTRAINTS = {
('charge', 'charge'): [{'Cd': 1.0}, {'O': -4.0, 'C': -2.0, 'H': -2.0}]
Expand Down Expand Up @@ -54,14 +56,14 @@ def test_call():
assertion.isclose(value, ref, abs_tol=0.001)

try:
assert (param['param'][0] <= param['max']).all()
assertion.le(param['param'][0], param['max'], post_process=np.all)
except AssertionError as ex:
df = pd.DataFrame({'param': param['param'][0], 'max': param['max']})
msg = f"iteration{i}\n{df.round(2)}"
raise AssertionError(msg) from ex

try:
assert (param['param'][0] >= param['min']).all()
assertion.ge(param['param'][0], param['min'], post_process=np.all)
except AssertionError as ex:
df = pd.DataFrame({'param': param['param'][0], 'max': param['min']})
msg = f"iteration{i}\n{df.round(2)}"
Expand Down

0 comments on commit 12508f3

Please sign in to comment.