From 8265f640f633b94a64c9866107ee2166455bfab3 Mon Sep 17 00:00:00 2001 From: Angel Droth <67913551+angeldrothqb@users.noreply.github.com> Date: Thu, 10 Sep 2020 13:45:06 +0100 Subject: [PATCH] Release/0.8.0 (#80) * Merge back to develop * Simplifying viz.draw syntax in tutorial notebook (#46) * Add non negativity constraint in numpy lasso (#41) * Add plotting tutorial to the documentation (#47) * Unpin some requirements * Mixed type data generation (#55) Added DAG-based synthetic data generator for mixed types (binary, categorical, continuous) using a linear SEM approach. * Merge back to develop (#59) * Pytorch NOTEARS (#63) * NoTears as ScoreSolver * refactor continuous solver * adding attribute to access weight matrix * refactoring continuous solver * Adding fit_lasso method * add data_gen_continuous.py and tests (#38) * add data_gen.py * rename * wrap SM * move data_gen_continous, create test * more coverage * test fixes * move discrete sem to another file * node list dupe check test * ValueError tests * replace dag and sem functions with Ben's verions * add Ben's tests * fix fstring * to_numpy_array coverage * Ben's comments * remove unreachable ValueError for coverage * remove unused fixture * remove redundant test * remove extensions Co-Authored-By: Ben Horsburgh * docstring Co-Authored-By: Ben Horsburgh * docstring Co-Authored-By: Ben Horsburgh * docs Co-Authored-By: Ben Horsburgh * doc Co-Authored-By: Ben Horsburgh * rename file, g_dag rename to sm * add new tests for equal weights * docstring * steve docstring, leq fix * steve comments + docstrings Co-authored-by: Ben Horsburgh * Adding check input and removing some inner functions * Removing attribute original_ndarray * Aligning from pandas with new implementation * Adding tests for fit_lasso * More tests for lasso * wrapping tabu params in a dict * Aligning tests with new tabu params * Aligning from_pandas with new tabu_params * Adding fit_intercept option to _fit method * Adding scaling option * fixing lasso tests * Adding a test for fit_intercept * scaling option only with mean * Correction in lasso bounds * Fix typos * Remove duplicated bounds function * adding comments * add torch files from xunzheng * add from_numpy_torch function that works like from_numpy_lasso * lint * add requirements * add debug functionality * add visual debug test * add license * allow running as main for viz, comments * move to contrib * make multi layer work a bit better * add comment for multi layer * use polynomial dag constraint for better speed comparison * revert unnecessary changes to keep PR lean * revert unnecessary changes to keep PR lean * revert unnecessary changes to keep PR lean * fixes * refactor * Integrated tests * Checkpoint * Refactoring * Finished initial refactoring * All tests passed * Cleaning * Git add testing * Get adjacency matrix * Done cleaning * Revert change to original notears * Revert change to original structuremodel * Revert change to pylintrc * Undo deletion * Apply suggestions from Zain Co-authored-by: Zain Patel * Addressed Zain comments * Migrated from_numpy * Delete contrib test * Migrated w_threshold * Some linting * Change to None * Undo deletion * List comprehension * Refactoring scipy and remove scipy optimiser * Refactoring * Refactoring * Refactoring complete * change from np to torch tensor * More refactoring * Remove hnew equal to None * Refactor again and remove commented line * Minor change * change to params * Addressing Philip's comment * Add property * Add fc2 property weights * Change to weights * Docstring * Linting * Linting completed * Add gpu code * Add gpu to from_numpy and from_pandas * cuda 0 run out of memory * Debugging * put 5 * debugging gpu * shift to inner loop * debugging not in place * Use cada instead of to * Support both interfaces * Benchmarking gpu * Minor fix * correct import path for test * change gpu from 5 to 1 * Debugging * Debugging * Experimenting * Linting * Remove hidden layer and gpu * Linting * Testing and linting * Correct pytorch to torch * Add init zeros * Change weight threshold to 0.25 * Revert requirements.txt * Update release.md * Address coments * Corrected release.md * fc1 to adjacency Co-authored-by: Ben Horsburgh Co-authored-by: LiseDiagneQB <60981366+LiseDiagneQB@users.noreply.github.com> Co-authored-by: Casey Juanxi Li <50737712+caseyliqb@users.noreply.github.com> Co-authored-by: qbphilip Co-authored-by: Zain Patel * Pinned sphinx-auto-doc-typehints (#66) * Corrected a spelling/grammar mistake (#55) * Fix/lint (#73) * Hotfix/0.4.3 (#7) - Address broken links and grammar * Fix documentation links in README (#2) * Fix links in README * library -> libraries * Fix github link in docs * Clean up grammar and consistency in documentation (#4) * Clean up grammar and consistency in `README` files * Add esses, mostly * Reword feature description to not appear automatic * Update docs/source/05_resources/05_faq.md Co-Authored-By: Ben Horsburgh Co-authored-by: Ben Horsburgh * hotfix/0.4.3: fix broken links Co-authored-by: Zain Patel <30357972+mzjp2@users.noreply.github.com> Co-authored-by: Nikos Tsaousis Co-authored-by: Deepyaman Datta * Release/0.5.0 * Plotting now backed by pygraphviz. This allows: * More powerful layout manager * Cleaner fully customisable theme * Out-the-box styling for different node and edge types * Can now get subgraphs from StructureModel containing a specific node * Bugfix to resolve issue when fitting CPDs with some missing states in data * Minor documentation fixes and improvements * Release/0.6.0 * Release/0.7.0 (#57) * Added plottting tutorial to the documentation * Updated `viz.draw` syntax in tutorial notebooks * Bugfix on notears lasso (`from_numpy_lasso` and `from_pandas_lasso`) where the non-negativity constraint was not being set * Added DAG-based synthetic data generator for mixed types (binary, categorical, continuous) using a linear SEM approach. * Unpinned some requirements * black * pin pytorch version * pin pytorch version Co-authored-by: Ben Horsburgh Co-authored-by: Zain Patel <30357972+mzjp2@users.noreply.github.com> Co-authored-by: Nikos Tsaousis Co-authored-by: Deepyaman Datta * Structure learning regressor (#68) * initial commit (local copy-paste) * fixed minor comments * minor bugfix * impute from children inital commit * bugfixes and method option * auto thresholding * autothreshold and bugfix * make threshold removal explicit * add l1 argument * remove child imputation * feat importance fix and tabu logic * moved threshold till dag * restructure with base class * coef mask * recipe * enable bias fitting * persist bias as node attribute * allow fit_intercept * minor PR comment fixes * minor comment adjustment * test coverage and l1 clarification * recipe * minor test fixes * more tests * full test coverage * revove python 3.5/3.6 unsupported import * add normalization option * idiomatic typing * correct pylint errors * update some tests * more typeing updates * more pylint requirements * more pylint disable * python 3.5 support * try to get to work with 3.5 * full coverage and 3.5 support * remove base class to pass test * remove unneeded supression * black formatting changes * remove unused import * pytlint supression * minor reformat change * isort fix * better defensive programming * fix unittests * docstring update * do Raises docstring properly * action SWE suggestions * hotfixes * minor update * minor black formatting change * final merge checkbox * fix end of file * Data Gen root node initialisation fix (#72) * Hotfix/0.4.3 (#7) - Address broken links and grammar * Fix documentation links in README (#2) * Fix links in README * library -> libraries * Fix github link in docs * Clean up grammar and consistency in documentation (#4) * Clean up grammar and consistency in `README` files * Add esses, mostly * Reword feature description to not appear automatic * Update docs/source/05_resources/05_faq.md Co-Authored-By: Ben Horsburgh Co-authored-by: Ben Horsburgh * hotfix/0.4.3: fix broken links Co-authored-by: Zain Patel <30357972+mzjp2@users.noreply.github.com> Co-authored-by: Nikos Tsaousis Co-authored-by: Deepyaman Datta * Release/0.5.0 * Plotting now backed by pygraphviz. This allows: * More powerful layout manager * Cleaner fully customisable theme * Out-the-box styling for different node and edge types * Can now get subgraphs from StructureModel containing a specific node * Bugfix to resolve issue when fitting CPDs with some missing states in data * Minor documentation fixes and improvements * Release/0.6.0 * Release/0.7.0 (#57) * Added plottting tutorial to the documentation * Updated `viz.draw` syntax in tutorial notebooks * Bugfix on notears lasso (`from_numpy_lasso` and `from_pandas_lasso`) where the non-negativity constraint was not being set * Added DAG-based synthetic data generator for mixed types (binary, categorical, continuous) using a linear SEM approach. * Unpinned some requirements * fix for consinuous normal data * generalise across all dtypes * support fit_intercept * fixed many test errors * test logic fixes * lint test fixes * python 3.5 failure change * minor test bugfix * black * pin pytorch version * pin pytorch version * additional test parameter * black formatting * requested changes * test updates and docstring * black format change * disable too many lines * change * move recipe to tutorial folder * releaseMD changes Co-authored-by: Ben Horsburgh Co-authored-by: Zain Patel <30357972+mzjp2@users.noreply.github.com> Co-authored-by: Nikos Tsaousis Co-authored-by: Deepyaman Datta Co-authored-by: Philip Pilgerstorfer <34248114+qbphilip@users.noreply.github.com> Co-authored-by: qbphilip * [1/2] Poisson data for data gen (#61) * Hotfix/0.4.3 (#7) - Address broken links and grammar * Fix documentation links in README (#2) * Fix links in README * library -> libraries * Fix github link in docs * Clean up grammar and consistency in documentation (#4) * Clean up grammar and consistency in `README` files * Add esses, mostly * Reword feature description to not appear automatic * Update docs/source/05_resources/05_faq.md Co-Authored-By: Ben Horsburgh Co-authored-by: Ben Horsburgh * hotfix/0.4.3: fix broken links Co-authored-by: Zain Patel <30357972+mzjp2@users.noreply.github.com> Co-authored-by: Nikos Tsaousis Co-authored-by: Deepyaman Datta * Release/0.5.0 * Plotting now backed by pygraphviz. This allows: * More powerful layout manager * Cleaner fully customisable theme * Out-the-box styling for different node and edge types * Can now get subgraphs from StructureModel containing a specific node * Bugfix to resolve issue when fitting CPDs with some missing states in data * Minor documentation fixes and improvements * Release/0.6.0 * Release/0.7.0 (#57) * Added plottting tutorial to the documentation * Updated `viz.draw` syntax in tutorial notebooks * Bugfix on notears lasso (`from_numpy_lasso` and `from_pandas_lasso`) where the non-negativity constraint was not being set * Added DAG-based synthetic data generator for mixed types (binary, categorical, continuous) using a linear SEM approach. * Unpinned some requirements * refactor & docstring * remove unused helper object * add data gen to init * make test more robust * add count data and test, use logs for poisson samples for stability * fix tests * duplicate fixtures * remove unused fixtures * refactor data_generators into package with core and wrappers * move wrapper to test_wrapper * variable name change bugfix * fix tests Co-authored-by: Ben Horsburgh Co-authored-by: Zain Patel <30357972+mzjp2@users.noreply.github.com> Co-authored-by: Nikos Tsaousis Co-authored-by: Deepyaman Datta Co-authored-by: angeldrothqb * [2/2] Nonlinear Data gen (#60) * Hotfix/0.4.3 (#7) - Address broken links and grammar * Fix documentation links in README (#2) * Fix links in README * library -> libraries * Fix github link in docs * Clean up grammar and consistency in documentation (#4) * Clean up grammar and consistency in `README` files * Add esses, mostly * Reword feature description to not appear automatic * Update docs/source/05_resources/05_faq.md Co-Authored-By: Ben Horsburgh Co-authored-by: Ben Horsburgh * hotfix/0.4.3: fix broken links Co-authored-by: Zain Patel <30357972+mzjp2@users.noreply.github.com> Co-authored-by: Nikos Tsaousis Co-authored-by: Deepyaman Datta * Release/0.5.0 * Plotting now backed by pygraphviz. This allows: * More powerful layout manager * Cleaner fully customisable theme * Out-the-box styling for different node and edge types * Can now get subgraphs from StructureModel containing a specific node * Bugfix to resolve issue when fitting CPDs with some missing states in data * Minor documentation fixes and improvements * Release/0.6.0 * Release/0.7.0 (#57) * Added plottting tutorial to the documentation * Updated `viz.draw` syntax in tutorial notebooks * Bugfix on notears lasso (`from_numpy_lasso` and `from_pandas_lasso`) where the non-negativity constraint was not being set * Added DAG-based synthetic data generator for mixed types (binary, categorical, continuous) using a linear SEM approach. * Unpinned some requirements * refactor & docstring * remove unused helper object * add data gen to init * make test more robust * add count data and test, use logs for poisson samples for stability * add nonlinear * fix tests * duplicate fixtures * remove unused fixtures * refactor data_generators into package with core and wrappers * move wrapper to test_wrapper * add nonlinear to init * change order in all * change release.md * root node fix on core + count * nonlinear support to wrappers * docstring update * bugfix and reproducability fix * many tests and test updates * poiss bugfix and test fix * moar test coverage * categorical dataframe test coverage * full test coverage and linting * fix linting and fstring * black reformat * fix unused pylint argument * pytest fix * FINAL linting fix * Fix stuff (#75) CircleCI fixes Co-authored-by: Ben Horsburgh Co-authored-by: Zain Patel <30357972+mzjp2@users.noreply.github.com> Co-authored-by: Nikos Tsaousis Co-authored-by: Deepyaman Datta Co-authored-by: angeldrothqb Co-authored-by: Zain Patel * update black version (#76) * fix black * Fix/check for NA or Infinity when notears is used (#54) * update scipy version (#77) * add DYNOTEARS implementation (#50) Adds DYNOTEARS and corresponding data generator (for testing) * Pytorch NOTEARS extension - Non-Linear/Hidden Layer (#65) * NoTears as ScoreSolver * refactor continuous solver * adding attribute to access weight matrix * refactoring continuous solver * Adding fit_lasso method * add data_gen_continuous.py and tests (#38) * add data_gen.py * rename * wrap SM * move data_gen_continous, create test * more coverage * test fixes * move discrete sem to another file * node list dupe check test * ValueError tests * replace dag and sem functions with Ben's verions * add Ben's tests * fix fstring * to_numpy_array coverage * Ben's comments * remove unreachable ValueError for coverage * remove unused fixture * remove redundant test * remove extensions Co-Authored-By: Ben Horsburgh * docstring Co-Authored-By: Ben Horsburgh * docstring Co-Authored-By: Ben Horsburgh * docs Co-Authored-By: Ben Horsburgh * doc Co-Authored-By: Ben Horsburgh * rename file, g_dag rename to sm * add new tests for equal weights * docstring * steve docstring, leq fix * steve comments + docstrings Co-authored-by: Ben Horsburgh * Adding check input and removing some inner functions * Removing attribute original_ndarray * Aligning from pandas with new implementation * Adding tests for fit_lasso * More tests for lasso * wrapping tabu params in a dict * Aligning tests with new tabu params * Aligning from_pandas with new tabu_params * Adding fit_intercept option to _fit method * Adding scaling option * fixing lasso tests * Adding a test for fit_intercept * scaling option only with mean * Correction in lasso bounds * Fix typos * Remove duplicated bounds function * adding comments * add torch files from xunzheng * add from_numpy_torch function that works like from_numpy_lasso * lint * add requirements * add debug functionality * add visual debug test * add license * allow running as main for viz, comments * move to contrib * make multi layer work a bit better * add comment for multi layer * use polynomial dag constraint for better speed comparison * revert unnecessary changes to keep PR lean * revert unnecessary changes to keep PR lean * revert unnecessary changes to keep PR lean * fixes * refactor * Integrated tests * Checkpoint * Refactoring * Finished initial refactoring * All tests passed * Cleaning * Git add testing * Get adjacency matrix * Done cleaning * Revert change to original notears * Revert change to original structuremodel * Revert change to pylintrc * Undo deletion * Apply suggestions from Zain Co-authored-by: Zain Patel * Addressed Zain comments * Migrated from_numpy * Delete contrib test * Migrated w_threshold * Some linting * Change to None * Undo deletion * List comprehension * Refactoring scipy and remove scipy optimiser * Refactoring * Refactoring * Refactoring complete * change from np to torch tensor * More refactoring * Remove hnew equal to None * Refactor again and remove commented line * Minor change * change to params * Addressing Philip's comment * Add property * Add fc2 property weights * Change to weights * Docstring * Linting * Linting completed * Add gpu code * Add gpu to from_numpy and from_pandas * cuda 0 run out of memory * Debugging * put 5 * debugging gpu * shift to inner loop * debugging not in place * Use cada instead of to * Support both interfaces * Benchmarking gpu * Minor fix * correct import path for test * change gpu from 5 to 1 * Debugging * Debugging * Experimenting * Linting * Remove hidden layer and gpu * Linting * Testing and linting * Correct pytorch to torch * Add init zeros * Change weight threshold to 0.25 * Revert requirements.txt * Add hidden layer * small refactor * directional adj * minor edits * fix bias issues * breaking changes update to the interface * typo * new regressor regularisation interface * update forward method * forward(X) predictions work * working! * bugfix data normalisation * some fixes * average regularisation and adj calc at end * give credit! Co-authored-by: Philip Pilgerstorfer <34248114+qbphilip@users.noreply.github.com> * loc lin docstring update Co-authored-by: Philip Pilgerstorfer <34248114+qbphilip@users.noreply.github.com> * docstring + fc1/fc2 name updates * moar docstring updates * more minor updates * remove normalize option * plotting util * rename to DAGRegressor * rename and checks * more util functions * fix bias * fix bias with no intercept * fix linear adj * add tests * minor fix * minor fixes * extend interface to bias * differentialte coef_ and feature_imporances * seperate bias element * tests * more test coverage * nonlinear test coverage * test hotfix * more test coverage * test requirements update * more test coverage * formatting changes * final pylint change * more linting * more bestpractice structuring * more minor fixes * FINAL linting updates * actual last change * update to reg defaults, additions to the tutorial * nonlinear regularisation updates * regressor tutorial * almost finishing touches * gradient based h function! * soft clamp and coef feature importance seperation * small api update, closer to batchnorm * docstring updates * stronger soft clamping * gradient L1 rather than L2 * fcpos neg removal, gradient optim * revert back to create_graph=True for 2nd derivative * remove print and test fix * black reformatting * new black version * full test coverage * isort fix * pylint fix * first layer h(W) for speed optimization * fix batch norm system * add nonlinear test * test hotfix * black reformat * isort fix * remove X requirement from h_func * regressor tutorial final commit and black update * LayerNorm replacement Co-authored-by: Philip Pilgerstorfer <34248114+qbphilip@users.noreply.github.com> * major changes * add standardization * minort changes * fix tests * rename reg parameters * linting * test coverage, docstting * check array for infs * fix isinstance to base type * fix isort, add test coverage * new tutorial * docstring fix Co-authored-by: Zain Patel * test string match Co-authored-by: Zain Patel * assert improvement Co-authored-by: Zain Patel * SWE suggestions * minor bugfix * more test fixing Co-authored-by: Ben Horsburgh Co-authored-by: LiseDiagneQB <60981366+LiseDiagneQB@users.noreply.github.com> Co-authored-by: Casey Juanxi Li <50737712+caseyliqb@users.noreply.github.com> Co-authored-by: qbphilip Co-authored-by: Zain Patel Co-authored-by: angeldrothqb Co-authored-by: angeldrothqb <67913551+angeldrothqb@users.noreply.github.com> Co-authored-by: Philip Pilgerstorfer <34248114+qbphilip@users.noreply.github.com> * release.md, version bump, docs Co-authored-by: Ben Horsburgh Co-authored-by: GabrielAzevedoFerreiraQB <57528979+GabrielAzevedoFerreiraQB@users.noreply.github.com> Co-authored-by: Philip Pilgerstorfer <34248114+qbphilip@users.noreply.github.com> Co-authored-by: stevelersl <55385183+SteveLerQB@users.noreply.github.com> Co-authored-by: LiseDiagneQB <60981366+LiseDiagneQB@users.noreply.github.com> Co-authored-by: Casey Juanxi Li <50737712+caseyliqb@users.noreply.github.com> Co-authored-by: qbphilip Co-authored-by: Zain Patel Co-authored-by: KING-SID Co-authored-by: Zain Patel <30357972+mzjp2@users.noreply.github.com> Co-authored-by: Nikos Tsaousis Co-authored-by: Deepyaman Datta Co-authored-by: Jebq --- .circleci/config.yml | 4 + .pylintrc | 2 +- CONTRIBUTING.md | 2 +- RELEASE.md | 14 +- causalnex/__init__.py | 2 +- causalnex/inference/inference.py | 4 +- causalnex/network/network.py | 124 +-- causalnex/structure/__init__.py | 3 +- .../structure/categorical_variable_mapper.py | 6 +- .../structure/data_generators/__init__.py | 59 ++ .../core.py} | 713 +++++++++++------ .../structure/data_generators/wrappers.py | 712 +++++++++++++++++ causalnex/structure/dynotears.py | 494 ++++++++++++ causalnex/structure/notears.py | 33 + causalnex/structure/pytorch/__init__.py | 36 + causalnex/structure/pytorch/core.py | 469 +++++++++++ causalnex/structure/pytorch/nonlinear.py | 111 +++ causalnex/structure/pytorch/notears.py | 306 ++++++++ causalnex/structure/sklearn.py | 347 ++++++++ causalnex/structure/structuremodel.py | 9 + causalnex/structure/transformers.py | 290 +++++++ doc_requirements.txt | 2 +- docs/conf.py | 6 +- .../03_tutorial/regressor_tutorial.ipynb | 644 +++++++++++++++ docs/source/api_docs/index.rst | 1 + requirements.txt | 2 +- setup.py | 1 + test_requirements.txt | 10 +- tests/conftest.py | 533 ++++++++++++- tests/structure/data_generators/__init__.py | 27 + tests/structure/data_generators/test_core.py | 543 +++++++++++++ .../test_wrappers.py} | 648 +++++++-------- tests/structure/test_dynotears.py | 738 ++++++++++++++++++ tests/structure/test_nonlinear.py | 41 + tests/structure/test_notears.py | 94 ++- tests/structure/test_pytorch_notears.py | 422 ++++++++++ tests/structure/test_sklearn.py | 221 ++++++ tests/structure/test_transformers.py | 153 ++++ tools/license_and_headers.py | 7 +- 39 files changed, 7148 insertions(+), 685 deletions(-) create mode 100644 causalnex/structure/data_generators/__init__.py rename causalnex/structure/{data_generators.py => data_generators/core.py} (51%) create mode 100644 causalnex/structure/data_generators/wrappers.py create mode 100644 causalnex/structure/dynotears.py create mode 100644 causalnex/structure/pytorch/__init__.py create mode 100644 causalnex/structure/pytorch/core.py create mode 100644 causalnex/structure/pytorch/nonlinear.py create mode 100644 causalnex/structure/pytorch/notears.py create mode 100644 causalnex/structure/sklearn.py create mode 100644 causalnex/structure/transformers.py create mode 100644 docs/source/03_tutorial/regressor_tutorial.ipynb create mode 100644 tests/structure/data_generators/__init__.py create mode 100644 tests/structure/data_generators/test_core.py rename tests/structure/{test_data_generators.py => data_generators/test_wrappers.py} (63%) create mode 100644 tests/structure/test_dynotears.py create mode 100644 tests/structure/test_nonlinear.py create mode 100644 tests/structure/test_pytorch_notears.py create mode 100644 tests/structure/test_sklearn.py create mode 100644 tests/structure/test_transformers.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 05428ea..dce8c54 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -27,12 +27,16 @@ utils: echo ". /home/circleci/miniconda/etc/profile.d/conda.sh" >> $BASH_ENV echo "conda deactivate; conda activate causalnex_env" >> $BASH_ENV + # needed to control numpy multithreading code since circleci gives incorrect CPU counts + echo "export MKL_NUM_THREADS=1 && export OMP_NUM_THREADS=1 && export NUMEXPR_NUM_THREADS=1" >> $BASH_ENV + setup_requirements: &setup_requirements name: Install PIP dependencies command: | echo "Python version: $(python --version 2>&1)" pip install -r requirements.txt -U pip install -r test_requirements.txt -U + pip install ".[pytorch]" conda install -y virtualenv setup_pre_commit: &setup_pre_commit name: Install pre-commit hooks diff --git a/.pylintrc b/.pylintrc index bfb1093..4668a76 100644 --- a/.pylintrc +++ b/.pylintrc @@ -269,7 +269,7 @@ contextmanager-decorators=contextlib.contextmanager # List of members which are set dynamically and missed by pylint inference # system, and so shouldn't trigger E1101 when accessed. Python regular # expressions are accepted. -generated-members= +generated-members=torch.* # Tells whether missing members accessed in mixin class should be ignored. A # mixin class is detected if its name ends with "mixin" (case insensitive). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8b175ca..4c67432 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -16,7 +16,7 @@ The CausalNex team pledges to foster and maintain a welcoming and friendly commu We use [GitHub Issues](https://github.com/quantumblacklabs/causalnex/issues) to keep track of known bugs. We keep a close eye on them and try to make it clear when we have an internal fix in progress. Before reporting a new issue, please do your best to ensure your problem hasn't already been reported. If so, it's often better to just leave a comment on an existing issue, rather than create a new one. Old issues also can often include helpful tips and solutions to common problems. -If you are looking for help with your code in our documentation haven't helped you, please consider posting a question on [Stack Overflow](https://stackoverflow.com/questions/tagged/causalnex). If you tag it `causalnex` and `python`, more people will see it and may be able to help. We are unable to provide individual support via email. In the interest of community engagement we also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. +If you are looking for help with your code and our documentation hasn't helped you, please consider posting a question on [Stack Overflow](https://stackoverflow.com/questions/tagged/causalnex). If you tag it `causalnex` and `python`, more people will see it and may be able to help. We are unable to provide individual support via email. In the interest of community engagement we also believe that help is much more valuable if it's shared publicly, so that more people can benefit from it. If you're over on Stack Overflow and want to boost your points, take a look at the `causalnex` tag and see if you can help others out by sharing your knowledge. It's another great way to contribute. diff --git a/RELEASE.md b/RELEASE.md index 26019d1..6ea6d4b 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,9 +1,19 @@ # Upcoming release +# Release 0.8.0 + +* Add DYNOTEARS (`from_numpy_dynamic`, an algorithm for structure learning on Dynamic Bayesian Networks). +* Added Pytorch implementation for NOTEARS MLP (`pytorch.from_numpy`) which is much faster and allows nonlinear modelling. +* Added `DAGRegressor` sklearn interface using the Pytorch NOTEARS implementation. +* Add non-linear data generators for multiple data types. +* Add a count data type to the data generator using a zero-inflated Poisson. +* Set bounds/max class imbalance for binary features for the data generators. +* Bugfix to resolve issue when applying NOTEARS on data containing NaN. +* Bugfix for data_gen system. Fixes issues with root node initialization. # Release 0.7.0 -* Added plottting tutorial to the documentation +* Added plotting tutorial to the documentation * Updated `viz.draw` syntax in tutorial notebooks * Bugfix on notears lasso (`from_numpy_lasso` and `from_pandas_lasso`) where the non-negativity constraint was not being set * Added DAG-based synthetic data generator for mixed types (binary, categorical, continuous) using a linear SEM approach. @@ -42,6 +52,6 @@ The initial release of CausalNex. ## Thanks for supporting contributions CausalNex was originally designed by [Paul Beaumont](https://www.linkedin.com/in/pbeaumont/) and [Ben Horsburgh](https://www.linkedin.com/in/benhorsburgh/) to solve challenges they faced in inferencing causality in their project work. This work was later turned into a product thanks to the following contributors: -[Yetunde Dada](https://github.com/yetudada), [Wesley Leong](https://www.linkedin.com/in/wesleyleong/), [Steve Ler](https://www.linkedin.com/in/song-lim-steve-ler-380366106/), [Viktoriia Oliinyk](https://www.linkedin.com/in/victoria-oleynik/), [Roxana Pamfil](https://www.linkedin.com/in/roxana-pamfil-1192053b/), [Nisara Sriwattanaworachai](https://www.linkedin.com/in/nisara-sriwattanaworachai-795b357/) and [Nikolaos Tsaousis](https://www.linkedin.com/in/ntsaousis/). +[Yetunde Dada](https://github.com/yetudada), [Wesley Leong](https://www.linkedin.com/in/wesleyleong/), [Steve Ler](https://www.linkedin.com/in/song-lim-steve-ler-380366106/), [Viktoriia Oliinyk](https://www.linkedin.com/in/victoria-oleynik/), [Roxana Pamfil](https://www.linkedin.com/in/roxana-pamfil-1192053b/), [Nisara Sriwattanaworachai](https://www.linkedin.com/in/nisara-sriwattanaworachai-795b357/), [Nikolaos Tsaousis](https://www.linkedin.com/in/ntsaousis/), [Angel Droth](https://www.linkedin.com/in/angeldroth/), and [Zain Patel](https://www.linkedin.com/in/zain-patel/). CausalNex would also not be possible without the generous sharing from leading researches in the field of causal inference and we are grateful to everyone who advised and supported us, filed issues or helped resolve them, asked and answered questions or simply be part of inspiring discussions. diff --git a/causalnex/__init__.py b/causalnex/__init__.py index a800c17..01312ed 100644 --- a/causalnex/__init__.py +++ b/causalnex/__init__.py @@ -30,6 +30,6 @@ causalnex toolkit for causal reasoning (Bayesian Networks / Inference) """ -__version__ = "0.7.0" +__version__ = "0.8.0" __all__ = ["structure", "discretiser", "evaluation", "inference", "network", "plots"] diff --git a/causalnex/inference/inference.py b/causalnex/inference/inference.py index d653502..cdb93bb 100644 --- a/causalnex/inference/inference.py +++ b/causalnex/inference/inference.py @@ -284,9 +284,7 @@ def template() -> float: # initially there are none present, but caller will add appropriate arguments to the function # getargvalues was "inadvertently marked as deprecated in Python 3.5" # https://docs.python.org/3/library/inspect.html#inspect.getfullargspec - arg_spec = inspect.getargvalues( # pylint: disable=deprecated-method - inspect.currentframe() - ) + arg_spec = inspect.getargvalues(inspect.currentframe()) return self._cpds[arg_spec.args[0]][ # target name arg_spec.locals[arg_spec.args[0]] diff --git a/causalnex/network/network.py b/causalnex/network/network.py index 43fde1e..154e363 100644 --- a/causalnex/network/network.py +++ b/causalnex/network/network.py @@ -46,67 +46,67 @@ class BayesianNetwork: """ - Base class for Bayesian Network (BN), a probabilistic weighted DAG where nodes represent variables, - edges represent the causal relationships between variables. - - ``BayesianNetwork`` stores nodes with their possible states, edges and - conditional probability distributions (CPDs) of each node. - - ``BayesianNetwork`` is built on top of the ``StructureModel``, which is an extension of ``networkx.DiGraph`` - (see :func:`causalnex.structure.structuremodel.StructureModel`). - - In order to define the ``BayesianNetwork``, users should provide a relevant ``StructureModel``. - Once ``BayesianNetwork`` is initialised, no changes to the ``StructureModel`` can be made - and CPDs can be learned from the data. - - The learned CPDs can be then used for likelihood estimation and predictions. - - Example: - :: - >>> # Create a Bayesian Network with a manually defined DAG. - >>> from causalnex.structure import StructureModel - >>> from causalnex.network import BayesianNetwork - >>> - >>> sm = StructureModel() - >>> sm.add_edges_from([ - >>> ('rush_hour', 'traffic'), - >>> ('weather', 'traffic') - >>> ]) - >>> bn = BayesianNetwork(sm) - >>> # A created ``BayesianNetwork`` stores nodes and edges defined by the ``StructureModel`` - >>> bn.nodes - ['rush_hour', 'traffic', 'weather'] - >>> - >>> bn.edges - [('rush_hour', 'traffic'), ('weather', 'traffic')] - >>> # A ``BayesianNetwork`` doesn't store any CPDs yet - >>> bn.cpds - >>> {} - >>> - >>> # Learn the nodes' states from the data - >>> import pandas as pd - >>> data = pd.DataFrame({ - >>> 'rush_hour': [True, False, False, False, True, False, True], - >>> 'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'], - >>> 'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy'] - >>> }) - >>> bn = bn.fit_node_states(data) - >>> bn.node_states - {'rush_hour': {False, True}, 'weather': {'Bad', 'Good', 'Terrible'}, 'traffic': {'heavy', 'light'}} - >>> # Learn the CPDs from the data - >>> bn = bn.fit_cpds(data) - >>> # Use the learned CPDs to make predictions on the unseen data - >>> test_data = pd.DataFrame({ - >>> 'rush_hour': [False, False, True, True], - >>> 'weather': ['Good', 'Bad', 'Good', 'Bad'] - >>> }) - >>> bn.predict(test_data, "traffic").to_dict() - >>> {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}} - >>> bn.predict_probability(test_data, "traffic").to_dict() - {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}} - {'traffic_light': {0: 0.75, 1: 0.25, 2: 0.3333333333333333, 3: 0.3333333333333333}, - 'traffic_heavy': {0: 0.25, 1: 0.75, 2: 0.6666666666666666, 3: 0.6666666666666666}} - """ + Base class for Bayesian Network (BN), a probabilistic weighted DAG where nodes represent variables, + edges represent the causal relationships between variables. + + ``BayesianNetwork`` stores nodes with their possible states, edges and + conditional probability distributions (CPDs) of each node. + + ``BayesianNetwork`` is built on top of the ``StructureModel``, which is an extension of ``networkx.DiGraph`` + (see :func:`causalnex.structure.structuremodel.StructureModel`). + + In order to define the ``BayesianNetwork``, users should provide a relevant ``StructureModel``. + Once ``BayesianNetwork`` is initialised, no changes to the ``StructureModel`` can be made + and CPDs can be learned from the data. + + The learned CPDs can be then used for likelihood estimation and predictions. + + Example: + :: + >>> # Create a Bayesian Network with a manually defined DAG. + >>> from causalnex.structure import StructureModel + >>> from causalnex.network import BayesianNetwork + >>> + >>> sm = StructureModel() + >>> sm.add_edges_from([ + >>> ('rush_hour', 'traffic'), + >>> ('weather', 'traffic') + >>> ]) + >>> bn = BayesianNetwork(sm) + >>> # A created ``BayesianNetwork`` stores nodes and edges defined by the ``StructureModel`` + >>> bn.nodes + ['rush_hour', 'traffic', 'weather'] + >>> + >>> bn.edges + [('rush_hour', 'traffic'), ('weather', 'traffic')] + >>> # A ``BayesianNetwork`` doesn't store any CPDs yet + >>> bn.cpds + >>> {} + >>> + >>> # Learn the nodes' states from the data + >>> import pandas as pd + >>> data = pd.DataFrame({ + >>> 'rush_hour': [True, False, False, False, True, False, True], + >>> 'weather': ['Terrible', 'Good', 'Bad', 'Good', 'Bad', 'Bad', 'Good'], + >>> 'traffic': ['heavy', 'light', 'heavy', 'light', 'heavy', 'heavy', 'heavy'] + >>> }) + >>> bn = bn.fit_node_states(data) + >>> bn.node_states + {'rush_hour': {False, True}, 'weather': {'Bad', 'Good', 'Terrible'}, 'traffic': {'heavy', 'light'}} + >>> # Learn the CPDs from the data + >>> bn = bn.fit_cpds(data) + >>> # Use the learned CPDs to make predictions on the unseen data + >>> test_data = pd.DataFrame({ + >>> 'rush_hour': [False, False, True, True], + >>> 'weather': ['Good', 'Bad', 'Good', 'Bad'] + >>> }) + >>> bn.predict(test_data, "traffic").to_dict() + >>> {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}} + >>> bn.predict_probability(test_data, "traffic").to_dict() + {'traffic_prediction': {0: 'light', 1: 'heavy', 2: 'heavy', 3: 'heavy'}} + {'traffic_light': {0: 0.75, 1: 0.25, 2: 0.3333333333333333, 3: 0.3333333333333333}, + 'traffic_heavy': {0: 0.25, 1: 0.75, 2: 0.6666666666666666, 3: 0.6666666666666666}} + """ def __init__(self, structure: StructureModel): """ @@ -573,7 +573,7 @@ def _predict_probability_from_incomplete_data( cols = [] pattern = re.compile("^{node}_[0-9]+$".format(node=node)) # disabled open pylint issue (https://github.com/PyCQA/pylint/issues/2962) - for col in probability.columns: # pylint: disable=E1133 + for col in probability.columns: if pattern.match(col): cols.append(col) probability = probability[cols] diff --git a/causalnex/structure/__init__.py b/causalnex/structure/__init__.py index bfe7dfa..3651f9a 100644 --- a/causalnex/structure/__init__.py +++ b/causalnex/structure/__init__.py @@ -30,6 +30,7 @@ ``causalnex.structure`` provides functionality to define or learn structure. """ -__all__ = ["StructureModel", "notears"] +__all__ = ["StructureModel", "notears", "dynotears", "data_generators", "DAGRegressor"] +from .sklearn import DAGRegressor from .structuremodel import StructureModel diff --git a/causalnex/structure/categorical_variable_mapper.py b/causalnex/structure/categorical_variable_mapper.py index 3127b50..62cfe74 100644 --- a/causalnex/structure/categorical_variable_mapper.py +++ b/causalnex/structure/categorical_variable_mapper.py @@ -49,7 +49,7 @@ class VariableFeatureMapper: attribute ``PERMISSIBLE_TYPES``. """ - PERMISSIBLE_TYPES = {"binary", "categorical", "continuous"} + PERMISSIBLE_TYPES = {"binary", "categorical", "continuous", "count"} EXPANDABLE_TYPE = "categorical" def __init__(self, schema: Dict[Hashable, str]): @@ -81,10 +81,11 @@ def __init__(self, schema: Dict[Hashable, str]): ) cat_feature_list = list(self._cat_fte_var_dict.keys()) - # we put them together with the cont + binayr in a feature list + # we put them together with the cont + binary in a feature list self.feature_list = ( self.variable_type_dict["binary"] + self.variable_type_dict["continuous"] + + self.variable_type_dict["count"] + cat_feature_list ) @@ -98,6 +99,7 @@ def __init__(self, schema: Dict[Hashable, str]): var: [self._fte_index_dict[var]] for var in self.variable_type_dict["continuous"] + self.variable_type_dict["binary"] + + self.variable_type_dict["count"] } self.var_indices_dict.update( { diff --git a/causalnex/structure/data_generators/__init__.py b/causalnex/structure/data_generators/__init__.py new file mode 100644 index 0000000..ced528e --- /dev/null +++ b/causalnex/structure/data_generators/__init__.py @@ -0,0 +1,59 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Data generators using DAGs for benchmarking and synthetic data generation. +""" + +__all__ = [ + "generate_structure", + "nonlinear_sem_generator", + "sem_generator", + "generate_binary_data", + "generate_binary_dataframe", + "generate_categorical_dataframe", + "generate_continuous_data", + "generate_continuous_dataframe", + "generate_count_dataframe", + "gen_stationary_dyn_net_and_df", + "generate_dataframe_dynamic", + "generate_structure_dynamic", +] + +from .core import generate_structure, nonlinear_sem_generator, sem_generator +from .wrappers import ( + gen_stationary_dyn_net_and_df, + generate_binary_data, + generate_binary_dataframe, + generate_categorical_dataframe, + generate_continuous_data, + generate_continuous_dataframe, + generate_count_dataframe, + generate_dataframe_dynamic, + generate_structure_dynamic, +) diff --git a/causalnex/structure/data_generators.py b/causalnex/structure/data_generators/core.py similarity index 51% rename from causalnex/structure/data_generators.py rename to causalnex/structure/data_generators/core.py index ee30db1..384dc2e 100644 --- a/causalnex/structure/data_generators.py +++ b/causalnex/structure/data_generators/core.py @@ -32,17 +32,29 @@ Structure generator based on implementation found in: from https://github.com/xunzheng/notears git hash: 31923cb22517f7bb6420dd0b6ef23ca550702b97 """ -from typing import Dict, Hashable, List, Optional, Tuple +from typing import Dict, Hashable, List, Optional, Tuple, Union import networkx as nx import numpy as np import pandas as pd +from sklearn.gaussian_process.kernels import RBF, Kernel +from causalnex.structure import StructureModel from causalnex.structure.categorical_variable_mapper import ( VariableFeatureMapper, validate_schema, ) -from causalnex.structure.structuremodel import StructureModel + +# dict mapping distributions names to their functions +__distribution_mapper = { + "gaussian": np.random.normal, + "normal": np.random.normal, + "student-t": np.random.standard_t, + "gumbel": np.random.gumbel, + "exponential": np.random.exponential, + "probit": np.random.normal, + "logit": np.random.logistic, +} def generate_structure( @@ -105,7 +117,10 @@ def generate_structure( edge_flags = np.tril(np.ones([num_nodes, num_nodes]), k=-1) else: - raise ValueError("unknown graph type") + raise ValueError( + "Unknown graph type {t}. ".format(t=graph_type) + + "Available types are ['erdos-renyi', 'barabasi-albert', 'full']" + ) # randomly permute edges - required because we limited ourselves to lower diagonal previously perms = np.random.permutation(np.eye(num_nodes, num_nodes)) @@ -120,214 +135,24 @@ def generate_structure( return graph -def generate_continuous_data( - sm: nx.DiGraph, - n_samples: int, - distribution: str = "gaussian", - noise_scale: float = 1.0, - intercept: bool = False, - seed: int = None, -) -> np.ndarray: - """ - Simulate samples from SEM with specified type of noise. - The order of the columns on the returned array is the one provided by `sm.nodes` - - Args: - sm: A DAG in form of a networkx or StructureModel. Does not require weights. - n_samples: The number of rows/observations to sample. - distribution: The type of distribution to use for the noise - of a variable. Options: 'gaussian'/'normal' (alias), 'student-t', - 'exponential', 'gumbel'. - noise_scale: The standard deviation of the noise. - intercept: Whether to use an intercept for each feature. - seed: Random state - Returns: - x_mat: [n_samples,d_nodes] sample matrix - Raises: - ValueError: if distribution isn't gaussian/normal/student-t/exponential/gumbel - """ - df = sem_generator( - graph=sm, - default_type="continuous", - n_samples=n_samples, - distributions={"continuous": distribution}, - noise_std=noise_scale, - intercept=intercept, - seed=seed, - ) - return df[list(sm.nodes())].values - - -def generate_binary_data( - sm: nx.DiGraph, - n_samples: int, - distribution: str = "logit", - noise_scale: float = 1.0, - intercept: bool = False, - seed: int = None, -) -> np.ndarray: - """ - Simulate samples from SEM with specified type of noise. - The order of the columns on the returned array is the one provided by `sm.nodes` - - Args: - sm: A DAG in form of a networkx or StructureModel. Does not require weights. - n_samples: The number of rows/observations to sample. - distribution: The type of distribution to use for the noise - of a variable. Options: 'probit'/'normal' (alias), - 'logit' (default). - noise_scale: The standard deviation of the noise. The binary and - categorical features are created using a latent variable approach. - The noise standard deviation determines how much weight the "mean" - estimate has on the feature value. - intercept: Whether to use an intercept for the latent variable of each feature. - seed: Random state - Returns: - x_mat: [n_samples,d_nodes] sample matrix - Raises: - ValueError: if distribution isn't 'probit', 'normal', 'logit' - """ - df = sem_generator( - graph=sm, - default_type="binary", - n_samples=n_samples, - distributions={"binary": distribution}, - noise_std=noise_scale, - intercept=intercept, - seed=seed, - ) - return df[list(sm.nodes())].values - - -def generate_continuous_dataframe( - sm: nx.DiGraph, - n_samples: int, - distribution: str = "gaussian", - noise_scale: float = 1.0, - intercept: bool = False, - seed: int = None, -) -> pd.DataFrame: - """ - Generates a dataframe with samples from SEM with specified type of noise. - Args: - sm: A DAG in form of a networkx or StructureModel. Does not require weights. - n_samples: The number of rows/observations to sample. - distribution: The type of distribution to use for the noise - of a variable. Options: 'gaussian'/'normal' (alias), 'student-t', - 'exponential', 'gumbel'. - noise_scale: The standard deviation of the noise. - intercept: Whether to use an intercept for each feature. - seed: Random state - Returns: - Dataframe with the node names as column names - Raises: - ValueError: if distribution is not 'gaussian', 'normal', 'student-t', - 'exponential', 'gumbel' - """ - return sem_generator( - graph=sm, - default_type="continuous", - n_samples=n_samples, - distributions={"continuous": distribution}, - noise_std=noise_scale, - intercept=intercept, - seed=seed, - ) - - -def generate_binary_dataframe( - sm: nx.DiGraph, - n_samples: int, - distribution: str = "logit", - noise_scale: float = 1.0, - intercept: bool = False, - seed: int = None, -) -> pd.DataFrame: - """ - Generates a dataframe with samples from SEM with specified type of noise. - - Args: - sm: A DAG in form of a networkx or StructureModel. Does not require weights. - n_samples: The number of rows/observations to sample. - distribution: The type of distribution to use for the noise - of a variable. Options: 'probit'/'normal' (alias), - 'logit' (default). - noise_scale: The standard deviation of the noise. The binary and - categorical features are created using a latent variable approach. - The noise standard deviation determines how much weight the "mean" - estimate has on the feature value. - intercept: Whether to use an intercept for the latent variable of each feature. - seed: Random state - Returns: - x_mat: [n_samples,d_nodes] sample matrix - Raises: - ValueError: if distribution is not 'probit', 'normal', 'logit' - """ - return sem_generator( - graph=sm, - default_type="binary", - n_samples=n_samples, - distributions={"binary": distribution}, - noise_std=noise_scale, - intercept=intercept, - seed=seed, - ) - - -def generate_categorical_dataframe( - sm: nx.DiGraph, - n_samples: int, - distribution: str = "logit", - n_categories: int = 3, - noise_scale: float = 1.0, - intercept: bool = False, - seed: int = None, -) -> pd.DataFrame: - """ - Generates a dataframe with samples from SEM with specified type of noise. - - Args: - sm: A DAG in form of a networkx or StructureModel. Does not require weights. - n_samples: The number of rows/observations to sample. - distribution: The type of distribution to use for the noise - of a variable. Options: 'probit'/'normal' (alias), - "logit"/"gumbel" (alias). Logit is default. - n_categories: Number of categories per variable/node. - noise_scale: The standard deviation of the noise. The categorical features - are created using a latent variable approach. The noise standard - deviation determines how much weight the "mean" estimate has on - the feature value. - intercept: Whether to use an intercept for the latent variable of each feature. - seed: Random state - Returns: - x_mat: [n_samples, d_nodes] sample matrix - Raises: - ValueError: if distribution is not 'probit', 'normal', 'logit', 'gumbel' - """ - return sem_generator( - graph=sm, - default_type="categorical:{}".format(n_categories), - n_samples=n_samples, - distributions={"categorical": distribution}, - noise_std=noise_scale, - intercept=intercept, - seed=seed, - ) - - def sem_generator( graph: nx.DiGraph, schema: Optional[Dict] = None, default_type: str = "continuous", noise_std: float = 1.0, n_samples: int = 1000, - distributions: Dict[str, str] = None, + distributions: Dict[str, Union[str, float]] = None, intercept: bool = True, seed: int = None, ) -> pd.DataFrame: """ Generator for tabular data with mixed variable types from a DAG. + NOTE: the root nodes of the DAG are sampled from a distribution with noise_std=1.0 always. + This is so that increases in the noise_std are in relation to a fixed spread, and therefore + actually have an impact on the fit. Not using this method causes the noise_std to only change + the axis scaling. + Supported variable types: `'binary', 'categorical', 'continuous'`. The number of categories can be determined using a colon, e.g. `'categorical:5'` specifies a categorical feature with 5 categories. @@ -362,6 +187,7 @@ def sem_generator( ``intercept'': The type of distribution to use for the intercept. For binary/categorical: this is the mean in the latent space. Options: 'gaussian'/'normal' (alias), 'uniform' (default). + ``count``: The zero-inflation probability as a float. intercept: Whether to use an intercept for each feature. The intercept is sampled once and held constant for all rows. For binary or categorical the intercept determines the class imbalance. @@ -381,7 +207,7 @@ def sem_generator( ValueError: if distributions['categorical'] is not 'probit', 'normal', 'logit', 'gumbel'. ValueError: if distributions['weight'] is not 'normal' / 'gaussian' (alias), 'uniform'. ValueError: if distributions['intercept'] is not 'normal' / 'gaussian' (alias), 'uniform'. - + ValueError: if distributions['count'], the zero-inflation factor is not a float in [0, 1]. Example: sm = StructureModel() @@ -398,19 +224,14 @@ def sem_generator( intercept=True, ) """ - - np.random.seed(seed) - - if not nx.algorithms.is_directed_acyclic_graph(graph): - raise ValueError("Provided graph is not a DAG.") - - distributions = _set_default_distributions(distributions=distributions) - validated_schema = validate_schema( - nodes=graph.nodes(), schema=schema, default_type=default_type + distributions, var_fte_mapper, x_mat = _init_sem_data_gen( + graph=graph, + schema=schema, + n_samples=n_samples, + default_type=default_type, + distributions=distributions, + seed=seed, ) - var_fte_mapper = VariableFeatureMapper(validated_schema) - - n_columns = var_fte_mapper.n_features # get dependence based on edges in graph (not via adjacency matrix) w_mat = _create_weight_matrix( @@ -421,11 +242,13 @@ def sem_generator( intercept=intercept, ) - # pre-allocate array - x_mat = np.empty([n_samples, n_columns + 1 if intercept else n_columns]) # intercept, append ones to the feature matrix if intercept: - x_mat[:, -1] = 1 + x_mat = np.append(x_mat, np.ones(shape=(n_samples, 1)), axis=1) + intercept_idx = [x_mat.shape[1] - 1] + + # if intercept is used, the root nodes have len = 1 + root_node_len = 1 if intercept else 0 # loop over sorted features according to ancestry (no parents first) for j_node in nx.topological_sort(graph): @@ -435,7 +258,10 @@ def sem_generator( # get all parent feature indices for the variable/node parents_idx = var_fte_mapper.get_indices(list(graph.predecessors(j_node))) if intercept: - parents_idx += [n_columns] + parents_idx += intercept_idx + + # if the data is a root node, must initialise the axis separate from noise parameter + root_node = len(parents_idx) <= root_node_len # continuous variable if var_fte_mapper.is_var_of_type(j_node, "continuous"): @@ -443,6 +269,7 @@ def sem_generator( mean=x_mat[:, parents_idx].dot(w_mat[parents_idx, j_idx_list[0]]), distribution=distributions["continuous"], noise_std=noise_std, + root_node=root_node, ) # binary variable @@ -453,6 +280,15 @@ def sem_generator( ), distribution=distributions["binary"], noise_std=noise_std, + root_node=root_node, + ) + + # count variable + elif var_fte_mapper.is_var_of_type(j_node, "count"): + x_mat[:, j_idx_list[0]] = _sample_count_from_latent( + eta=x_mat[:, parents_idx].dot(w_mat[parents_idx, j_idx_list[0]]), + zero_inflation_pct=distributions["count"], + root_node=root_node, ) # categorical variable @@ -463,6 +299,7 @@ def sem_generator( ), distribution=distributions["categorical"], noise_std=noise_std, + root_node=root_node, ) return pd.DataFrame( @@ -470,67 +307,188 @@ def sem_generator( ) +def _handle_distribution_sampling( + distribution: str, + distribution_func, + noise_std: float, + size: Tuple[int], + root_node: bool, +): + # force scale to be 1 for the root node + if root_node: + noise_std = 1 + + # special sampling syntax + if distribution == "student-t": + return distribution_func(df=5, size=size) * noise_std + + # default sampling syntax + return distribution_func(scale=noise_std, size=size) + + def _add_continuous_noise( - mean: np.ndarray, distribution: str, noise_std: float, + mean: np.ndarray, + distribution: str, + noise_std: float, + root_node: bool, ) -> np.ndarray: n_samples = mean.shape[0] - # add noise to mean - if distribution in ("gaussian", "normal"): - x = mean + np.random.normal(scale=noise_std, size=n_samples) - elif distribution == "student-t": - x = mean + np.random.standard_t(df=5, size=n_samples) * noise_std - elif distribution == "exponential": - x = mean + np.random.exponential(scale=noise_std, size=n_samples) - elif distribution == "gumbel": - x = mean + np.random.gumbel(scale=noise_std, size=n_samples) - else: + # try and get the requested distribution from the mapper + distribution_func = __distribution_mapper.get(distribution, None) + if distribution_func is None: _raise_dist_error( "continuous", distribution, ["gaussian", "normal", "student-t", "exponential", "gumbel"], ) - return x + # add noise to mean + mean += _handle_distribution_sampling( + distribution=distribution, + distribution_func=distribution_func, + noise_std=noise_std, + size=(n_samples,), + root_node=root_node, + ) + + return mean def _sample_binary_from_latent( - latent_mean: np.ndarray, distribution: str, noise_std: float, + latent_mean: np.ndarray, + distribution: str, + noise_std: float, + root_node: bool, + max_imbalance: float = 0.05, ) -> np.ndarray: n_samples = latent_mean.shape[0] - # add noise to latent variable - if distribution in ("normal", "probit"): - eta = latent_mean + np.random.normal(scale=noise_std, size=n_samples) - elif distribution == "logit": - eta = latent_mean + np.random.logistic(scale=noise_std, size=n_samples) - else: + # try and get the requested distribution from the mapper + distribution_func = __distribution_mapper.get(distribution, None) + if distribution_func is None: _raise_dist_error("binary", distribution, ["logit", "probit", "normal"]) - # using a latent variable approach - return (eta > 0).astype(int) + # add noise to mean + latent_mean += _handle_distribution_sampling( + distribution=distribution, + distribution_func=distribution_func, + noise_std=noise_std, + size=(n_samples,), + root_node=root_node, + ) + + # use an alternative threshold if 0 leads to heavy imbalance + labels = (latent_mean > 0).astype(int) + share_positive = np.mean(labels) + if share_positive < max_imbalance: + return (latent_mean > np.quantile(latent_mean, max_imbalance)).astype(int) + if share_positive > (1 - max_imbalance): + return (latent_mean > np.quantile(latent_mean, 1 - max_imbalance)).astype(int) + return labels + + +def _sample_count_from_latent( + eta: np.ndarray, + root_node: bool, + zero_inflation_pct: float = 0.05, +) -> np.ndarray: + """ + Samples a zero-inflated poisson distribution. + Returns: + Samples from a Poisson distribution. + Raises: + ValueError: Unsupported zero-inflation factor. + """ + if ( + not isinstance(zero_inflation_pct, (float, int)) + or zero_inflation_pct < 0 + or zero_inflation_pct > 1 + ): + raise ValueError( + "Unsupported zero-inflation factor, distribution['count'] needs to be a float in [0, 1]" + ) + n_samples = eta.shape[0] + + # add noise manually if root node + # uniform [0, 1] makes sure that the counts are small + if root_node: + eta += np.random.uniform(size=n_samples) + + zif = np.random.uniform(size=n_samples) < zero_inflation_pct + count = _sample_poisson(expected_count=_exp_relu(eta)) + + # inflate the zeros: + count[zif] = 0 + return count + + +def _exp_relu(x): + x[x < 0] = np.exp(x[x < 0]) + return x + + +def _sample_poisson(expected_count: np.ndarray, max_count: int = 5000) -> np.ndarray: + """ + Samples from a poisson distribution using each element in ``latent_mean`` + as the Poisson parameter. + + Args: + expected_count: Event rate of the Poisson process, can be of any array + dimension. Defined on (0, infty). + max_count: Bounds the count from above. The count sample is created + with a while loop. This argument is the maximum number of loop + iterations before stopping. Default value should run on most + machines in reasonable amount of time. + Returns: + Sampled count of a Poisson distribution from the given mean. + """ + # use log for numeric stability for large count values + log_cond_intensity = -expected_count + log_intensity_budget = np.copy(log_cond_intensity) + + count = np.zeros_like(expected_count) + + log_uni = np.log(np.random.uniform(size=expected_count.shape)) + mask = log_uni >= log_intensity_budget + + while np.any(mask) and count.max() < max_count: + mask = log_uni >= log_intensity_budget + count[mask] += 1 + log_cond_intensity[mask] += np.log(expected_count[mask] / count[mask]) + log_intensity_budget[mask] = np.logaddexp( + log_intensity_budget[mask], log_cond_intensity[mask] + ) + + return count def _sample_categories_from_latent( - latent_mean: np.ndarray, distribution: str, noise_std: float, + latent_mean: np.ndarray, + distribution: str, + noise_std: float, + root_node: bool, ) -> np.ndarray: one_hot = np.empty_like(latent_mean) n_samples, n_cardinality = latent_mean.shape - if distribution in ("normal", "probit"): - latent_mean += np.random.normal( - scale=noise_std, size=(n_samples, n_cardinality) - ) - elif distribution in ("logit", "gumbel"): - latent_mean += np.random.gumbel( - scale=noise_std, size=(n_samples, n_cardinality) - ) - else: + # try and get the requested distribution from the mapper + distribution_func = __distribution_mapper.get(distribution, None) + if distribution_func is None: _raise_dist_error( "categorical", distribution, ["logit", "gumbel", "probit", "normal"] ) + # add noise to mean + latent_mean += _handle_distribution_sampling( + distribution=distribution, + distribution_func=distribution_func, + noise_std=noise_std, + size=(n_samples, n_cardinality), + root_node=root_node, + ) + x_cat = np.argmax(latent_mean, axis=1) for i in range(n_cardinality): @@ -539,13 +497,16 @@ def _sample_categories_from_latent( return one_hot -def _set_default_distributions(distributions: Dict[str, str]) -> Dict[str, str]: +def _set_default_distributions( + distributions: Dict[str, Union[str, float]] +) -> Dict[str, Union[str, float]]: default_distributions = { "continuous": "gaussian", "binary": "logit", "categorical": "logit", "weight": "uniform", "intercept": "uniform", + "count": 0.05, } if distributions is None: @@ -626,3 +587,259 @@ def _raise_dist_error(name: str, dist: str, dist_options): ", ".join(valid_dist for valid_dist in dist_options) ) ) + + +def _init_sem_data_gen( + graph: nx.DiGraph, + schema: Dict, + n_samples: int, + default_type: str, + distributions: Dict[str, str], + seed: int, +): + np.random.seed(seed) + + if not nx.algorithms.is_directed_acyclic_graph(graph): + raise ValueError("Provided graph is not a DAG.") + + distributions = _set_default_distributions(distributions=distributions) + validated_schema = validate_schema( + nodes=graph.nodes(), schema=schema, default_type=default_type + ) + var_fte_mapper = VariableFeatureMapper(validated_schema) + + # pre-allocate array + n_columns = var_fte_mapper.n_features + x_mat = np.empty([n_samples, n_columns]) + + return distributions, var_fte_mapper, x_mat + + +def nonlinear_sem_generator( + graph: nx.DiGraph, + kernel: Kernel = RBF(1), + schema: Optional[Dict] = None, + default_type: str = "continuous", + noise_std: float = 1.0, + n_samples: int = 1000, + distributions: Dict[str, str] = None, + seed: int = None, +) -> pd.DataFrame: + """ + Generator for non-linear tabular data with mixed variable types from a DAG. + + The nonlinearity can be controlled via the ``kernel``. Note that a + ``DotProduct`` is equivalent to a linear function (without mean). + + Supported variable types: `'binary', 'categorical', 'continuous'`. The number + of categories can be determined using a colon, e.g. `'categorical:5'` + specifies a categorical feature with 5 categories. + + Notation: For binary and continuous variables, a ``variable'' refers to a + ``node'', a ``feature'' refers to the one-hot column for categorical + variables and is equivalent to a binary or continuous variable. + + Args: + graph: A DAG in form of a networkx or StructureModel. + kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or + Matern(1) or any combinations thereof. The kernels are used to + create the latent variable for the binary / categorical variables + and are directly used for continuous variables. + schema: Dictionary with schema for a node/variable, if a node is missing + uses ``default_type``. Format, {node_name: variable type}. + default_type: The default data type for a node/variable not listed + in the schema, or when the schema is empty. + noise_std: The standard deviation of the noise. The binary and + categorical features are created using a latent variable approach. + The noise standard deviation determines how much weight the "mean" + estimate has on the feature value. + n_samples: The number of rows/observations to sample. + distributions: + ``continuous'': The type of distribution to use for the noise + of a continuous variable. Options: 'gaussian'/'normal' (alias) + (default), 'student-t', 'exponential', 'gumbel'. + ``binary'': The type of distribution to use for the noise + of the latent binary variable. Options: 'probit'/'normal' (alias), + 'logit' (default). + ``categorical'': The type of distribution to use for the noise + of a latent continuous feature. Options: 'probit'/'normal' (alias), + 'logit'/'gumbel' (alias) (default). + seed: Random State + + Returns: + DataFrame with generated features, uses a one-hot coding for + categorical features. + + Raises: + ValueError: if the graph is not a DAG. + ValueError: if schema variable type is not in `'binary', 'categorical', + 'continuous', 'continuous:X` (for variables with X categories). + ValueError: if distributions['continuous'] is not 'gaussian', 'normal', 'student-t', + 'exponential', 'gumbel'. + ValueError: if distributions['binary'] is not 'probit', 'normal', 'logit'. + ValueError: if distributions['categorical'] is not 'probit', 'normal', 'logit', 'gumbel'. + ValueError: if distributions['count'], the zero-inflation factor is not a float in [0, 1]. + + Example: + sm = StructureModel() + + sm.add_edges_from([('A', 'C'), ('D', 'C'), ('E', 'D')]) + + sm.add_nodes_from(['B', 'F']) + + schema = {'B': 'binary', 'C': 'categorical:5', + 'E': 'binary', 'F': 'continuous'} + + df = sem_generator(sm, schema, kernel=RBF(1), noise_scale=1, + n_samples=10000) + """ + distributions, var_fte_mapper, x_mat = _init_sem_data_gen( + graph=graph, + schema=schema, + n_samples=n_samples, + default_type=default_type, + distributions=distributions, + seed=seed, + ) + + # loop over sorted features according to ancestry (no parents first) + for j_node in nx.topological_sort(graph): + # all feature indices corresponding to the node/variable + j_idx_list = var_fte_mapper.get_indices(j_node) + + # get all parent feature indices for the variable/node + parents_idx = var_fte_mapper.get_indices(list(graph.predecessors(j_node))) + + # if the data is a root node, must initialise the axis separate from noise parameter + root_node = len(parents_idx) <= 0 + + # continuous variable + if var_fte_mapper.is_var_of_type(j_node, "continuous"): + x_mat[:, j_idx_list[0]] = _add_continuous_noise( + mean=_gp_index(x_mat[:, parents_idx], kernel), + distribution=distributions["continuous"], + noise_std=noise_std, + root_node=root_node, + ) + + # binary variable + elif var_fte_mapper.is_var_of_type(j_node, "binary"): + x_mat[:, j_idx_list[0]] = _sample_binary_from_latent( + latent_mean=_gp_index(x_mat[:, parents_idx], kernel), + distribution=distributions["binary"], + noise_std=noise_std, + root_node=root_node, + ) + + # count + if var_fte_mapper.is_var_of_type(j_node, "count"): + x_mat[:, j_idx_list[0]] = _sample_count_from_latent( + eta=_gp_index(x_mat[:, parents_idx], kernel), + zero_inflation_pct=distributions["count"], + root_node=root_node, + ) + + # categorical variable + elif var_fte_mapper.is_var_of_type(j_node, "categorical"): + x_mat[:, j_idx_list] = _sample_categories_from_latent( + latent_mean=np.concatenate( + [ + np.expand_dims(_gp_index(x_mat[:, parents_idx], kernel), axis=1) + for _ in j_idx_list + ], + axis=1, + ), + distribution=distributions["categorical"], + noise_std=noise_std, + root_node=root_node, + ) + return pd.DataFrame(x_mat, columns=var_fte_mapper.feature_list) + + +def _unconditional_sample(x, kernel): + cov_mat = kernel(x) + y = np.random.multivariate_normal(mean=np.zeros(shape=x.shape[0]), cov=cov_mat) + return y.squeeze(), cov_mat + + +def _conditional_sample( + x_new, x_old, f_old, kernel, cov_mat_old: np.ndarray = None, epsilon=0.00001 +): + + cov_mat_new = kernel(x_new) + cross_cov = kernel(x_old, x_new) + # X_no.T @ inv(X_oo): + reg_coef = np.linalg.solve( + cov_mat_old + epsilon * np.eye(x_old.shape[0]), cross_cov + ).T + + # calculate conditional mean and cov + cond_cov = (cov_mat_new - reg_coef @ cross_cov) + epsilon * np.eye(x_new.shape[0]) + cond_mean = (reg_coef @ f_old).squeeze() + + # sample + y_new = np.random.multivariate_normal(mean=cond_mean, cov=cond_cov).squeeze() + return y_new, cov_mat_new + + +def _gp_index( + x: np.ndarray, + kernel: Kernel, + max_chunk_size: int = 100, +) -> np.ndarray: + """ + Sample a Gaussian process using input data. + ``f(x) ~ GP(0, K)`` + + If the number of samples is larger than ``max_chunk_size``, the sampling is + split in sorted batches (first dimension) and sampled using a conditional + multivariate normal. + + Args: + x: + kernel: + max_chunk_size: + + Returns: + A one-dimensional numpy array with a sample of f(x) + """ + # if we dont have a parent, the input will have no columns + if x.shape[1] == 0: + return np.zeros(shape=(x.shape[0],)) + + use_batches = x.shape[0] > max_chunk_size + + if not use_batches: + y, _ = _unconditional_sample(x, kernel=kernel) + return _scale_y(y) + + # if we need batches, we sort according to the first dimension + ix_sort = np.argsort(x, axis=0)[:, 0].squeeze() + reverse_ix = np.argsort(ix_sort).squeeze() + + # split into smaller pieces + n_splits = (x.shape[0] // max_chunk_size) + 1 + x_splits = np.array_split(x[ix_sort, :], n_splits) + + outputs = [] + y, cov_mat = _unconditional_sample(x_splits[0], kernel=kernel) + outputs.append(y) + x_old = x_splits[0] + for x_subset in x_splits[1:]: + y, cov_mat = _conditional_sample( + x_new=x_subset, + x_old=x_old, + f_old=outputs[-1], + kernel=kernel, + cov_mat_old=cov_mat, + ) + outputs.append(y) + x_old = x_subset + + y_all = _scale_y(np.concatenate(outputs)) + return y_all[reverse_ix] + + +def _scale_y(y): + """Normalize variance to 1.""" + return y / y.std() diff --git a/causalnex/structure/data_generators/wrappers.py b/causalnex/structure/data_generators/wrappers.py new file mode 100644 index 0000000..94ba092 --- /dev/null +++ b/causalnex/structure/data_generators/wrappers.py @@ -0,0 +1,712 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Module of methods to sample variables of a single data type. +""" +import warnings +from typing import List, Optional, Tuple + +import networkx as nx +import numpy as np +import pandas as pd +from scipy.sparse import csr_matrix +from sklearn.gaussian_process.kernels import Kernel + +from causalnex.structure.data_generators import ( + generate_structure, + nonlinear_sem_generator, + sem_generator, +) +from causalnex.structure.structuremodel import StructureModel + + +def generate_continuous_data( + sm: nx.DiGraph, + n_samples: int, + distribution: str = "gaussian", + noise_scale: float = 1.0, + intercept: bool = False, + seed: int = None, + kernel: Optional[Kernel] = None, +) -> np.ndarray: + """ + Simulate samples from SEM with specified type of noise. + The order of the columns on the returned array is the one provided by `sm.nodes` + + Args: + sm: A DAG in form of a networkx or StructureModel. Does not require weights. + n_samples: The number of rows/observations to sample. + kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or + Matern(1) or any combinations thereof. The kernels are used to + create the latent variable for the binary / categorical variables + and are directly used for continuous variables. + distribution: The type of distribution to use for the noise + of a variable. Options: 'gaussian'/'normal' (alias), 'student-t', + 'exponential', 'gumbel'. + noise_scale: The standard deviation of the noise. + intercept: Whether to use an intercept for each feature. + seed: Random state + Returns: + x_mat: [n_samples,d_nodes] sample matrix + Raises: + ValueError: if distribution isn't gaussian/normal/student-t/exponential/gumbel + """ + if kernel is None: + df = sem_generator( + graph=sm, + default_type="continuous", + n_samples=n_samples, + distributions={"continuous": distribution}, + noise_std=noise_scale, + intercept=intercept, + seed=seed, + ) + else: + df = nonlinear_sem_generator( + graph=sm, + kernel=kernel, + default_type="continuous", + n_samples=n_samples, + distributions={"continuous": distribution}, + noise_std=noise_scale, + seed=seed, + ) + return df[list(sm.nodes())].values + + +def generate_binary_data( + sm: nx.DiGraph, + n_samples: int, + distribution: str = "logit", + noise_scale: float = 1.0, + intercept: bool = False, + seed: int = None, + kernel: Optional[Kernel] = None, +) -> np.ndarray: + """ + Simulate samples from SEM with specified type of noise. + The order of the columns on the returned array is the one provided by `sm.nodes` + + Args: + sm: A DAG in form of a networkx or StructureModel. Does not require weights. + n_samples: The number of rows/observations to sample. + kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or + Matern(1) or any combinations thereof. The kernels are used to + create the latent variable for the binary / categorical variables + and are directly used for continuous variables. + distribution: The type of distribution to use for the noise + of a variable. Options: 'probit'/'normal' (alias), + 'logit' (default). + noise_scale: The standard deviation of the noise. The binary and + categorical features are created using a latent variable approach. + The noise standard deviation determines how much weight the "mean" + estimate has on the feature value. + intercept: Whether to use an intercept for the latent variable of each feature. + seed: Random state + Returns: + x_mat: [n_samples,d_nodes] sample matrix + Raises: + ValueError: if distribution isn't 'probit', 'normal', 'logit' + """ + if kernel is None: + df = sem_generator( + graph=sm, + default_type="binary", + n_samples=n_samples, + distributions={"binary": distribution}, + noise_std=noise_scale, + intercept=intercept, + seed=seed, + ) + else: + df = nonlinear_sem_generator( + graph=sm, + kernel=kernel, + default_type="binary", + n_samples=n_samples, + distributions={"binary": distribution}, + noise_std=noise_scale, + seed=seed, + ) + return df[list(sm.nodes())].values + + +def generate_continuous_dataframe( + sm: nx.DiGraph, + n_samples: int, + distribution: str = "gaussian", + noise_scale: float = 1.0, + intercept: bool = False, + seed: int = None, + kernel: Optional[Kernel] = None, +) -> pd.DataFrame: + """ + Generates a dataframe with samples from SEM with specified type of noise. + Args: + sm: A DAG in form of a networkx or StructureModel. Does not require weights. + n_samples: The number of rows/observations to sample. + kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or + Matern(1) or any combinations thereof. The kernels are used to + create the latent variable for the binary / categorical variables + and are directly used for continuous variables. + distribution: The type of distribution to use for the noise + of a variable. Options: 'gaussian'/'normal' (alias), 'student-t', + 'exponential', 'gumbel'. + noise_scale: The standard deviation of the noise. + intercept: Whether to use an intercept for each feature. + seed: Random state + Returns: + Dataframe with the node names as column names + Raises: + ValueError: if distribution is not 'gaussian', 'normal', 'student-t', + 'exponential', 'gumbel' + """ + if kernel is None: + return sem_generator( + graph=sm, + default_type="continuous", + n_samples=n_samples, + distributions={"continuous": distribution}, + noise_std=noise_scale, + intercept=intercept, + seed=seed, + ) + + return nonlinear_sem_generator( + graph=sm, + kernel=kernel, + default_type="continuous", + n_samples=n_samples, + distributions={"continuous": distribution}, + noise_std=noise_scale, + seed=seed, + ) + + +def generate_binary_dataframe( + sm: nx.DiGraph, + n_samples: int, + distribution: str = "logit", + noise_scale: float = 1.0, + intercept: bool = False, + seed: int = None, + kernel: Optional[Kernel] = None, +) -> pd.DataFrame: + """ + Generates a dataframe with samples from SEM with specified type of noise. + + Args: + sm: A DAG in form of a networkx or StructureModel. Does not require weights. + n_samples: The number of rows/observations to sample. + kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or + Matern(1) or any combinations thereof. The kernels are used to + create the latent variable for the binary / categorical variables + and are directly used for continuous variables. + distribution: The type of distribution to use for the noise + of a variable. Options: 'probit'/'normal' (alias), + 'logit' (default). + noise_scale: The standard deviation of the noise. The binary and + categorical features are created using a latent variable approach. + The noise standard deviation determines how much weight the "mean" + estimate has on the feature value. + intercept: Whether to use an intercept for the latent variable of each feature. + seed: Random state + Returns: + x_mat: [n_samples,d_nodes] sample matrix + Raises: + ValueError: if distribution is not 'probit', 'normal', 'logit' + """ + if kernel is None: + return sem_generator( + graph=sm, + default_type="binary", + n_samples=n_samples, + distributions={"binary": distribution}, + noise_std=noise_scale, + intercept=intercept, + seed=seed, + ) + + return nonlinear_sem_generator( + graph=sm, + kernel=kernel, + default_type="binary", + n_samples=n_samples, + distributions={"binary": distribution}, + noise_std=noise_scale, + seed=seed, + ) + + +def generate_count_dataframe( + sm: nx.DiGraph, + n_samples: int, + zero_inflation_factor: float = 0.1, + intercept: bool = False, + seed: int = None, + kernel: Optional[Kernel] = None, +) -> pd.DataFrame: + """ + Generates a dataframe with samples from SEM with specified type of noise. + + Args: + sm: A DAG in form of a networkx or StructureModel. Does not require weights. + n_samples: The number of rows/observations to sample. + kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or + Matern(1) or any combinations thereof. The kernels are used to + create the latent variable for the binary / categorical variables + and are directly used for continuous variables. + zero_inflation_factor: The probability of zero inflation for count data. + intercept: Whether to use an intercept for the latent variable of each feature. + seed: Random state + Returns: + x_mat: [n_samples, d_nodes] sample matrix + Raises: + ValueError: if ``zero_inflation_factor`` is not a float in [0, 1]. + """ + + if kernel is None: + return sem_generator( + graph=sm, + default_type="count", + n_samples=n_samples, + distributions={"count": zero_inflation_factor}, + noise_std=1, # not used for poisson + intercept=intercept, + seed=seed, + ) + + return nonlinear_sem_generator( + graph=sm, + kernel=kernel, + default_type="count", + n_samples=n_samples, + distributions={"count": zero_inflation_factor}, + noise_std=1, # not used for poisson + seed=seed, + ) + + +def generate_categorical_dataframe( + sm: nx.DiGraph, + n_samples: int, + distribution: str = "logit", + n_categories: int = 3, + noise_scale: float = 1.0, + intercept: bool = False, + seed: int = None, + kernel: Optional[Kernel] = None, +) -> pd.DataFrame: + """ + Generates a dataframe with samples from SEM with specified type of noise. + + Args: + sm: A DAG in form of a networkx or StructureModel. Does not require weights. + n_samples: The number of rows/observations to sample. + kernel: A kernel from sklearn.gaussian_process.kernels like RBF(1) or + Matern(1) or any combinations thereof. The kernels are used to + create the latent variable for the binary / categorical variables + and are directly used for continuous variables. + distribution: The type of distribution to use for the noise + of a variable. Options: 'probit'/'normal' (alias), + "logit"/"gumbel" (alias). Logit is default. + n_categories: Number of categories per variable/node. + noise_scale: The standard deviation of the noise. The categorical features + are created using a latent variable approach. The noise standard + deviation determines how much weight the "mean" estimate has on + the feature value. + intercept: Whether to use an intercept for the latent variable of each feature. + seed: Random state + Returns: + x_mat: [n_samples, d_nodes] sample matrix + Raises: + ValueError: if distribution is not 'probit', 'normal', 'logit', 'gumbel' + """ + + if kernel is None: + return sem_generator( + graph=sm, + default_type="categorical:{}".format(n_categories), + n_samples=n_samples, + distributions={"categorical": distribution}, + noise_std=noise_scale, + intercept=intercept, + seed=seed, + ) + + return nonlinear_sem_generator( + graph=sm, + kernel=kernel, + default_type="categorical:{}".format(n_categories), + n_samples=n_samples, + distributions={"categorical": distribution}, + noise_std=noise_scale, + seed=seed, + ) + + +def generate_structure_dynamic( # pylint: disable=too-many-arguments + num_nodes: int, + p: int, + degree_intra: float, + degree_inter: float, + graph_type_intra: str = "erdos-renyi", + graph_type_inter: str = "erdos-renyi", + w_min_intra: float = 0.5, + w_max_intra: float = 0.5, + w_min_inter: float = 0.5, + w_max_inter: float = 0.5, + w_decay: float = 1.0, +) -> StructureModel: + """ + Generates a dynamic DAG at random. + + Args: + num_nodes: Number of nodes + p: maximum lag to be considered in the structure + degree_intra: expected degree on nodes from the current state + degree_inter: expected degree on nodes from the lagged nodes + graph_type_intra: + - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1) + - barabasi-albert: constructs a scale-free graph from an initial connected graph of (degree / 2) nodes + - full: constructs a fully-connected graph - degree has no effect + graph_type_inter: + - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1) + - full: connect all past nodes to all present nodes + w_min_intra: minimum weight for intra-slice nodes + w_max_intra: maximum weight for intra-slice nodes + w_min_inter: minimum weight for inter-slice nodes + w_max_inter: maximum weight for inter-slice nodes + w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay + + Raises: + ValueError: if graph type unknown or `num_nodes < 2` + + Returns: + StructureModel containing all simulated nodes and edges (intra- and inter-slice) + """ + sm_intra = generate_structure( + num_nodes=num_nodes, + degree=degree_intra, + graph_type=graph_type_intra, + w_min=w_min_intra, + w_max=w_max_intra, + ) + sm_inter = _generate_inter_structure( + num_nodes=num_nodes, + p=p, + degree=degree_inter, + graph_type=graph_type_inter, + w_min=w_min_inter, + w_max=w_max_inter, + w_decay=w_decay, + ) + res = StructureModel() + res.add_nodes_from(sm_inter.nodes) + res.add_nodes_from(["{var}_lag0".format(var=u) for u in sm_intra.nodes]) + res.add_weighted_edges_from(sm_inter.edges.data("weight")) + res.add_weighted_edges_from( + [ + ("{var}_lag0".format(var=u), "{var}_lag0".format(var=v), w) + for u, v, w in sm_intra.edges.data("weight") + ] + ) + return res + + +def _generate_inter_structure( + num_nodes: int, + p: int, + degree: float, + graph_type: str, + w_min: float, + w_max: float, + w_decay: float = 1.0, + neg: float = 0.5, +) -> StructureModel: + """Simulate random DAG between two time slices. + + Args: + num_nodes: number of nodes per slice + p: number of slices that influence current slice + degree: expected in-degree of current time slice + graph_type: {'erdos-renyi' 'full'} + w_min: minimum weight for inter-slice nodes + w_max: maximum weight for inter-slice nodes + w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay + neg: the proportion of edge weights expected to be negative. By default, 50% of the edges are expected + to be negative weight (`neg == 0.5`). + + Returns: + G_inter: weighted, bipartite DAG for inter-slice connections + + Raises: + ValueError: if graph type not known + """ + if w_min > w_max: + raise ValueError( + "Absolute minimum weight must be less than or equal to maximum weight: {} > {}".format( + w_min, w_max + ) + ) + + if graph_type == "erdos-renyi": + prob = degree / num_nodes + b = (np.random.rand(p * num_nodes, num_nodes) < prob).astype(float) + elif graph_type == "full": # ignore degree, only for experimental use + b = np.ones([p * num_nodes, num_nodes]) + else: + raise ValueError( + "Unknown inter-slice graph type `{n}`".format(n=graph_type) + + ". Valid types are 'erdos-renyi' and 'full'" + ) + u = [] + for i in range(p): + u_i = np.random.uniform(low=w_min, high=w_max, size=[num_nodes, num_nodes]) / ( + w_decay ** i + ) + u_i[np.random.rand(num_nodes, num_nodes) < neg] *= -1 + u.append(u_i) + + u = np.concatenate(u, axis=0) if u else np.empty(b.shape) + a = (b != 0).astype(float) * u + + df = pd.DataFrame( + a, + index=[ + "{var}_lag{l_val}".format(var=var, l_val=l_val) + for l_val in range(1, p + 1) + for var in range(num_nodes) + ], + columns=[ + "{var}_lag{l_val}".format(var=var, l_val=0) for var in range(num_nodes) + ], + ) + idxs, cols = list(df.index), list(df.columns) + for i in idxs: + df[i] = 0 + for i in cols: + df.loc[i, :] = 0 + + g_inter = StructureModel(df) + return g_inter + + +def generate_dataframe_dynamic( # pylint: disable=R0914 + g: StructureModel, + n_samples: int = 1000, + burn_in: int = 100, + sem_type: str = "linear-gauss", + noise_scale: float = 1.0, + drift: np.ndarray = None, +) -> pd.DataFrame: + """Simulate samples from dynamic SEM with specified type of noise. + Args: + g: Dynamic DAG + n_samples: number of samples + burn_in: number of samples to discard + sem_type: {linear-gauss,linear-exp,linear-gumbel} + noise_scale: scale parameter of noise distribution in linear SEM + drift: array of drift terms for each node, if None then the drift is 0 + Returns: + X: [n,d] sample matrix, row t is X_t + Y: [n,d*p] sample matrix, row t is [X_{t-1}, ..., X_{t-p}] + Raises: + ValueError: if sem_type isn't linear-gauss/linear_exp/linear-gumbel + """ + s_types = ("linear-gauss", "linear-exp", "linear-gumbel") + if sem_type not in s_types: + raise ValueError( + "unknown sem type {st}. Available types are: {sts}".format( + st=sem_type, sts=s_types + ) + ) + intra_nodes = sorted(el for el in g.nodes if "_lag0" in el) + inter_nodes = sorted(el for el in g.nodes if "_lag0" not in el) + w_mat = nx.to_numpy_array(g, nodelist=intra_nodes) + a_mat = nx.to_numpy_array(g, nodelist=intra_nodes + inter_nodes)[ + len(intra_nodes) :, : len(intra_nodes) + ] + g_intra = nx.DiGraph(w_mat) + g_inter = nx.bipartite.from_biadjacency_matrix( + csr_matrix(a_mat), create_using=nx.DiGraph + ) + d = w_mat.shape[0] + p = a_mat.shape[0] // d + total_length = n_samples + burn_in + X = np.zeros([total_length, d]) + Xlags = np.zeros([total_length, p * d]) + ordered_vertices = list(nx.topological_sort(g_intra)) + if drift is None: + drift = np.zeros(d) + for t in range(total_length): + for j in ordered_vertices: + parents = list(g_intra.predecessors(j)) + parents_prev = list(g_inter.predecessors(j + p * d)) + X[t, j] = ( + drift[j] + + X[t, parents].dot(w_mat[parents, j]) + + Xlags[t, parents_prev].dot(a_mat[parents_prev, j]) + ) + if sem_type == "linear-gauss": + X[t, j] = X[t, j] + np.random.normal(scale=noise_scale) + elif sem_type == "linear-exp": + X[t, j] = X[t, j] + np.random.exponential(scale=noise_scale) + elif sem_type == "linear-gumbel": + X[t, j] = X[t, j] + np.random.gumbel(scale=noise_scale) + + if (t + 1) < total_length: + Xlags[t + 1, :] = np.concatenate([X[t, :], Xlags[t, :]])[: d * p] + return pd.concat( + [ + pd.DataFrame(X[-n_samples:], columns=intra_nodes), + pd.DataFrame(Xlags[-n_samples:], columns=inter_nodes), + ], + axis=1, + ) + + +def gen_stationary_dyn_net_and_df( # pylint: disable=R0913, R0914 + num_nodes: int = 10, + n_samples: int = 100, + p: int = 1, + degree_intra: float = 3, + degree_inter: float = 3, + graph_type_intra: str = "erdos-renyi", + graph_type_inter: str = "erdos-renyi", + w_min_intra: float = 0.5, + w_max_intra: float = 0.5, + w_min_inter: float = 0.5, + w_max_inter: float = 0.5, + w_decay: float = 1.0, + sem_type: str = "linear-gauss", + noise_scale: float = 1, + max_data_gen_trials: int = 1000, +) -> Tuple[StructureModel, pd.DataFrame, List[str], List[str]]: + """ + Generates a dynamic structure model as well a dataframe representing a time series realisation of that model. + We do checks to verify the network is stationary, and iterate until the resulting network is stationary. + Args: + num_nodes: number of nodes in the intra-slice structure + n_samples: number of points to sample from the model, as a time series + p: lag value for the dynamic structure + degree_intra: expected degree for intra_slice nodes + degree_inter: expected degree for inter_slice nodes + graph_type_intra: + - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1) + - barabasi-albert: constructs a scale-free graph from an initial connected graph of (degree / 2) nodes + - full: constructs a fully-connected graph - degree has no effect + graph_type_inter: + - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1) + - full: connect all past nodes to all present nodesw_min_intra: + w_min_intra: minimum weight on intra-slice adjacency matrix + w_max_intra: maximum weight on intra-slice adjacency matrix + w_min_inter: minimum weight on inter-slice adjacency matrix + w_max_inter: maximum weight on inter-slice adjacency matrix + w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay + sem_type: {linear-gauss,linear-exp,linear-gumbel} + noise_scale: scale parameter of noise distribution in linear SEM + max_data_gen_trials: maximun number of attempts until obtaining a seemingly stationary model + Returns: + Tuple with: + - the model created,as a Structure model + - DataFrame representing the time series created from the model + - Intra-slice nodes names + - Inter-slice nodes names + """ + + with np.errstate(over="raise", invalid="raise"): + burn_in = max(n_samples // 10, 50) + + simulate_flag = True + g, intra_nodes, inter_nodes = None, None, None + + while simulate_flag: + max_data_gen_trials -= 1 + if max_data_gen_trials <= 0: + simulate_flag = False + + try: + simulate_graphs_flag = True + while simulate_graphs_flag: + + g = generate_structure_dynamic( + num_nodes=num_nodes, + p=p, + degree_intra=degree_intra, + degree_inter=degree_inter, + graph_type_intra=graph_type_intra, + graph_type_inter=graph_type_inter, + w_min_intra=w_min_intra, + w_max_intra=w_max_intra, + w_min_inter=w_min_inter, + w_max_inter=w_max_inter, + w_decay=w_decay, + ) + intra_nodes = sorted([el for el in g.nodes if "_lag0" in el]) + inter_nodes = sorted([el for el in g.nodes if "_lag0" not in el]) + # Exclude empty graphs from consideration unless input degree is 0 + if ( + ( + [(u, v) for u, v in g.edges if u in intra_nodes] + and [(u, v) for u, v in g.edges if u in inter_nodes] + ) + or degree_intra == 0 + or degree_inter == 0 + ): + simulate_graphs_flag = False + + # generate single time series + df = ( + generate_dataframe_dynamic( + g, + n_samples=n_samples + burn_in, + sem_type=sem_type, + noise_scale=noise_scale, + ) + .loc[burn_in:, intra_nodes + inter_nodes] + .reset_index(drop=True) + ) + + if df.isna().any(axis=None): + continue + except (OverflowError, FloatingPointError): + continue + if (df.abs().max().max() < 1e3) or (max_data_gen_trials <= 0): + simulate_flag = False + if max_data_gen_trials <= 0: + warnings.warn( + "Could not simulate data, returning constant dataframe", UserWarning + ) + + df = pd.DataFrame( + np.ones((n_samples, num_nodes * (1 + p))), + columns=intra_nodes + inter_nodes, + ) + return g, df, intra_nodes, inter_nodes diff --git a/causalnex/structure/dynotears.py b/causalnex/structure/dynotears.py new file mode 100644 index 0000000..e7d1202 --- /dev/null +++ b/causalnex/structure/dynotears.py @@ -0,0 +1,494 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tools to learn a Dynamic Bayesian Network which describe the conditional dependencies between variables in a time-series +dataset. +""" + +import warnings +from typing import Dict, List, Tuple, Union + +import numpy as np +import pandas as pd +import scipy.linalg as slin +import scipy.optimize as sopt + +from causalnex.structure import StructureModel + +from .transformers import DynamicDataTransformer + + +def from_pandas_dynamic( # pylint: disable=too-many-arguments + time_series: Union[pd.DataFrame, List[pd.DataFrame]], + p: int, + lambda_w: float = 0.1, + lambda_a: float = 0.1, + max_iter: int = 100, + h_tol: float = 1e-8, + w_threshold: float = 0.0, + tabu_edges: List[Tuple[int, int, int]] = None, + tabu_parent_nodes: List[int] = None, + tabu_child_nodes: List[int] = None, +) -> StructureModel: + """ + Learn the graph structure of a Dynamic Bayesian Network describing conditional dependencies between variables in + data. The input data is a time series or a list of realisations of a same time series. + The optimisation is to minimise a score function F(W, A) over the graph's contemporaneous (intra-slice) weighted + adjacency matrix, W, and lagged (inter-slice) weighted adjacency matrix, A, subject to the a constraint function + h(W), where h_value(W) == 0 characterises an acyclic graph. h(W) > 0 is a continuous, differentiable function that + encapsulated how acyclic the graph is (less = more acyclic). + + Based on "DYNOTEARS: Structure Learning from Time-Series Data". + https://arxiv.org/abs/2002.00498 + @inproceedings{pamfil2020dynotears, + title={DYNOTEARS: Structure Learning from Time-Series Data}, + author={Pamfil, Roxana and Sriwattanaworachai, Nisara and Desai, Shaan and Pilgerstorfer, + Philip and Georgatzis, Konstantinos and Beaumont, Paul and Aragam, Bryon}, + booktitle={International Conference on Artificial Intelligence and Statistics}, + pages={1595--1605}, + year={2020}year={2020}, + } + Args: + time_series: pd.DataFrame or List of pd.DataFrame instances. + If a list is provided each element of the list being an realisation of a time series (i.e. time series governed + by the same processes) + The columns of the data frame represent the variables in the model, and the *index represents the time index*. + Successive events, therefore, must be indexed with one integer of difference between them too. + p: Number of past interactions we allow the model to create. The state of a variable at time `t` is affected by + past variables up to a `t-p`, as well as by other variables at `t`. + lambda_w: parameter for l1 regularisation of intra-slice edges + lambda_a: parameter for l1 regularisation of inter-slice edges + max_iter: max number of dual ascent steps during optimisation. + h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0). + w_threshold: fixed threshold for absolute edge weights. + tabu_edges: list of edges(lag, from, to) not to be included in the graph. `lag == 0` implies that the edge is + forbidden in the INTRA graph (W), while lag > 0 implies an INTER-slice weight equal zero. + tabu_parent_nodes: list of nodes banned from being a parent of any other nodes. + tabu_child_nodes: list of nodes banned from being a child of any other nodes. + + Returns: + StructureModel representing the model learnt. The node names are noted as `{var}_lag{l}`, where `var` is the + original variable name as in the give in the input data frames and `l`, in 0,1,2..p is the correspondent + time lag. + """ + time_series = [time_series] if not isinstance(time_series, list) else time_series + + X, Xlags = DynamicDataTransformer(p=p).fit_transform(time_series, return_df=False) + + col_idx = {c: i for i, c in enumerate(time_series[0].columns)} + idx_col = {i: c for c, i in col_idx.items()} + + if tabu_edges: + tabu_edges = [(lag, col_idx[u], col_idx[v]) for lag, u, v in tabu_edges] + if tabu_parent_nodes: + tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes] + if tabu_child_nodes: + tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes] + + g = from_numpy_dynamic( + X, + Xlags, + lambda_w, + lambda_a, + max_iter, + h_tol, + w_threshold, + tabu_edges, + tabu_parent_nodes, + tabu_child_nodes, + ) + + sm = StructureModel() + sm.add_nodes_from( + [ + "{var}_lag{l_val}".format(var=var, l_val=l_val) + for var in col_idx.keys() + for l_val in range(p + 1) + ] + ) + sm.add_weighted_edges_from( + [ + ( + _format_name_from_pandas(idx_col, u), + _format_name_from_pandas(idx_col, v), + w, + ) + for u, v, w in g.edges.data("weight") + ], + origin="learned", + ) + + return sm + + +def _format_name_from_pandas(idx_col: Dict[int, str], from_numpy_node: str) -> str: + """ + Helper function for `from_pandas_dynamic`. converts a node from the `from_numpy_dynamic` format to the `from_pandas` + format + Args: + idx_col: map from variable to intdex + from_numpy_node: nodes in the structure model output by `from_numpy_dynamic`. + Returns: + nodes in from_pandas_dynamic format + """ + idx, lag_val = from_numpy_node.split("_lag") + return "{var}_lag{l_val}".format(var=idx_col[int(idx)], l_val=lag_val) + + +def from_numpy_dynamic( # pylint: disable=too-many-arguments + X: np.ndarray, + Xlags: np.ndarray, + lambda_w: float = 0.1, + lambda_a: float = 0.1, + max_iter: int = 100, + h_tol: float = 1e-8, + w_threshold: float = 0.0, + tabu_edges: List[Tuple[int, int, int]] = None, + tabu_parent_nodes: List[int] = None, + tabu_child_nodes: List[int] = None, +) -> StructureModel: + """ + Learn the graph structure of a Dynamic Bayesian Network describing conditional dependencies between variables in + data. The input data is time series data present in numpy arrays X and Xlags. + + The optimisation is to minimise a score function F(W, A) over the graph's contemporaneous (intra-slice) weighted + adjacency matrix, W, and lagged (inter-slice) weighted adjacency matrix, A, subject to the a constraint function + h(W), where h_value(W) == 0 characterises an acyclic graph. h(W) > 0 is a continuous, differentiable function that + encapsulated how acyclic the graph is (less = more acyclic). + + Based on "DYNOTEARS: Structure Learning from Time-Series Data". + https://arxiv.org/abs/2002.00498 + @inproceedings{pamfil2020dynotears, + title={DYNOTEARS: Structure Learning from Time-Series Data}, + author={Pamfil, Roxana and Sriwattanaworachai, Nisara and Desai, Shaan and Pilgerstorfer, + Philip and Georgatzis, Konstantinos and Beaumont, Paul and Aragam, Bryon}, + booktitle={International Conference on Artificial Intelligence and Statistics}, + pages={1595--1605}, + year={2020}year={2020}, + } + + Args: + X (np.ndarray): 2d input data, axis=1 is data columns, axis=0 is data rows. Each column represents one variable, + and each row represents x(m,t) i.e. the mth time series at time t. + Xlags (np.ndarray): shifted data of X with lag orders stacking horizontally. Xlags=[shift(X,1)|...|shift(X,p)] + lambda_w (float): l1 regularization parameter of intra-weights W + lambda_a (float): l1 regularization parameter of inter-weights A + max_iter: max number of dual ascent steps during optimisation + h_tol (float): exit if h(W) < h_tol (as opposed to strict definition of 0) + w_threshold: fixed threshold for absolute edge weights. + tabu_edges: list of edges(lag, from, to) not to be included in the graph. `lag == 0` implies that the edge is + forbidden in the INTRA graph (W), while lag > 0 implies an INTER weight equal zero. + tabu_parent_nodes: list of nodes banned from being a parent of any other nodes. + tabu_child_nodes: list of nodes banned from being a child of any other nodes. + Returns: + W (np.ndarray): d x d estimated weighted adjacency matrix of intra slices + A (np.ndarray): d x pd estimated weighted adjacency matrix of inter slices + + Raises: + ValueError: If X or Xlags does not contain data, or dimensions of X and Xlags do not conform + """ + _, d_vars = X.shape + p_orders = Xlags.shape[1] // d_vars + + bnds_w = 2 * [ + (0, 0) + if i == j + else (0, 0) + if tabu_edges is not None and (0, i, j) in tabu_edges + else (0, 0) + if tabu_parent_nodes is not None and i in tabu_parent_nodes + else (0, 0) + if tabu_child_nodes is not None and j in tabu_child_nodes + else (0, None) + for i in range(d_vars) + for j in range(d_vars) + ] + + bnds_a = [] + for k in range(1, p_orders + 1): + bnds_a.extend( + 2 + * [ + (0, 0) + if tabu_edges is not None and (k, i, j) in tabu_edges + else (0, 0) + if tabu_parent_nodes is not None and i in tabu_parent_nodes + else (0, 0) + if tabu_child_nodes is not None and j in tabu_child_nodes + else (0, None) + for i in range(d_vars) + for j in range(d_vars) + ] + ) + + bnds = bnds_w + bnds_a + w_est, a_est = _learn_dynamic_structure( + X, Xlags, bnds, lambda_w, lambda_a, max_iter, h_tol + ) + + w_est[np.abs(w_est) < w_threshold] = 0 + a_est[np.abs(a_est) < w_threshold] = 0 + sm = _matrices_to_structure_model(w_est, a_est) + return sm + + +def _matrices_to_structure_model( + w_est: np.ndarray, a_est: np.ndarray +) -> StructureModel: + """ + Converts the matrices output by dynotears (W and A) into a StructureModel + We use the following convention: + - {var}_lag{l} where l is the lag value (i.e. from how many previous timestamps the edge is coming + - if we deal with a intra_slice_node, `l == 0` + Args: + w_est: Intra-slice weight matrix + a_est: Inter-slice matrix + + Returns: + StructureModel representing the structure learnt + + """ + sm = StructureModel() + lag_cols = [ + "{var}_lag{l_val}".format(var=var, l_val=l_val) + for l_val in range(1 + (a_est.shape[0] // a_est.shape[1])) + for var in range(a_est.shape[1]) + ] + sm.add_nodes_from(lag_cols) + sm.add_edges_from( + [ + (lag_cols[i], lag_cols[j], dict(weight=w_est[i, j])) + for i in range(w_est.shape[0]) + for j in range(w_est.shape[1]) + if w_est[i, j] != 0 + ] + ) + sm.add_edges_from( + [ + (lag_cols[i + w_est.shape[0]], lag_cols[j], dict(weight=a_est[i, j])) + for i in range(a_est.shape[0]) + for j in range(a_est.shape[1]) + if a_est[i, j] != 0 + ] + ) + return sm + + +def _reshape_wa( + wa_vec: np.ndarray, d_vars: int, p_orders: int +) -> Tuple[np.ndarray, np.ndarray]: + """ + Helper function for `_learn_dynamic_structure`. Transform adjacency vector to matrix form + + Args: + wa_vec (np.ndarray): current adjacency vector with intra- and inter-slice weights + d_vars (int): number of variables in the model + p_orders (int): number of past indexes we to use + Returns: + intra- and inter-slice adjacency matrices + """ + + w_tilde = wa_vec.reshape([2 * (p_orders + 1) * d_vars, d_vars]) + w_plus = w_tilde[:d_vars, :] + w_minus = w_tilde[d_vars : 2 * d_vars, :] + w_mat = w_plus - w_minus + a_plus = ( + w_tilde[2 * d_vars :] + .reshape(2 * p_orders, d_vars ** 2)[::2] + .reshape(d_vars * p_orders, d_vars) + ) + a_minus = ( + w_tilde[2 * d_vars :] + .reshape(2 * p_orders, d_vars ** 2)[1::2] + .reshape(d_vars * p_orders, d_vars) + ) + a_mat = a_plus - a_minus + return w_mat, a_mat + + +def _learn_dynamic_structure( + X: np.ndarray, + Xlags: np.ndarray, + bnds: List[Tuple[float, float]], + lambda_w: float = 0.1, + lambda_a: float = 0.1, + max_iter: int = 100, + h_tol: float = 1e-8, +) -> Tuple[np.ndarray, np.ndarray]: + """ + Learn the graph structure of a Dynamic Bayesian Network describing conditional dependencies between data variables. + + The optimisation is to minimise a score function F(W, A) over the graph's contemporaneous (intra-slice) weighted + adjacency matrix, W, and lagged (inter-slice) weighted adjacency matrix, A, subject to the a constraint function + h(W), where h_value(W) == 0 characterises an acyclic graph. h(W) > 0 is a continuous, differentiable function that + encapsulated how acyclic the graph is (less = more acyclic). + + Based on "DYNOTEARS: Structure Learning from Time-Series Data". + https://arxiv.org/abs/2002.00498 + @inproceedings{pamfil2020dynotears, + title={DYNOTEARS: Structure Learning from Time-Series Data}, + author={Pamfil, Roxana and Sriwattanaworachai, Nisara and Desai, Shaan and Pilgerstorfer, + Philip and Georgatzis, Konstantinos and Beaumont, Paul and Aragam, Bryon}, + booktitle={International Conference on Artificial Intelligence and Statistics}, + pages={1595--1605}, + year={2020}year={2020}, + } + + Args: + X (np.ndarray): 2d input data, axis=1 is data columns, axis=0 is data rows. Each column represents one variable, + and each row represents x(m,t) i.e. the mth time series at time t. + Xlags (np.ndarray): shifted data of X with lag orders stacking horizontally. Xlags=[shift(X,1)|...|shift(X,p)] + bnds: Box constraints of L-BFGS-B to ban self-loops in W, enforce non-negativity of w_plus, w_minus, a_plus, + a_minus, and help with stationarity in A + lambda_w (float): l1 regularization parameter of intra-weights W + lambda_a (float): l1 regularization parameter of inter-weights A + max_iter (int): max number of dual ascent steps during optimisation + h_tol (float): exit if h(W) < h_tol (as opposed to strict definition of 0) + + Returns: + W (np.ndarray): d x d estimated weighted adjacency matrix of intra slices + A (np.ndarray): d x pd estimated weighted adjacency matrix of inter slices + + Raises: + ValueError: If X or Xlags does not contain data, or dimensions of X and Xlags do not conform + """ + if X.size == 0: + raise ValueError("Input data X is empty, cannot learn any structure") + if Xlags.size == 0: + raise ValueError("Input data Xlags is empty, cannot learn any structure") + if X.shape[0] != Xlags.shape[0]: + raise ValueError("Input data X and Xlags must have the same number of rows") + if Xlags.shape[1] % X.shape[1] != 0: + raise ValueError( + "Number of columns of Xlags must be a multiple of number of columns of X" + ) + + n, d_vars = X.shape + p_orders = Xlags.shape[1] // d_vars + + def _h(wa_vec: np.ndarray) -> float: + """ + Constraint function of the dynotears + + Args: + wa_vec (np.ndarray): current adjacency vector with intra- and inter-slice weights + + Returns: + float: DAGness of the intra-slice adjacency matrix W (0 == DAG, >0 == cyclic) + """ + + _w_mat, _ = _reshape_wa(wa_vec, d_vars, p_orders) + return np.trace(slin.expm(_w_mat * _w_mat)) - d_vars + + def _func(wa_vec: np.ndarray) -> float: + """ + Objective function that the dynotears tries to minimise + + Args: + wa_vec (np.ndarray): current adjacency vector with intra- and inter-slice weights + + Returns: + float: objective + """ + + _w_mat, _a_mat = _reshape_wa(wa_vec, d_vars, p_orders) + loss = ( + 0.5 + / n + * np.square( + np.linalg.norm( + X.dot(np.eye(d_vars, d_vars) - _w_mat) - Xlags.dot(_a_mat), "fro" + ) + ) + ) + _h_value = _h(wa_vec) + l1_penalty = lambda_w * (wa_vec[: 2 * d_vars ** 2].sum()) + lambda_a * ( + wa_vec[2 * d_vars ** 2 :].sum() + ) + return loss + 0.5 * rho * _h_value * _h_value + alpha * _h_value + l1_penalty + + def _grad(wa_vec: np.ndarray) -> np.ndarray: + """ + Gradient function used to compute next step in dynotears + + Args: + wa_vec (np.ndarray): current adjacency vector with intra- and inter-slice weights + + Returns: + gradient vector + """ + + _w_mat, _a_mat = _reshape_wa(wa_vec, d_vars, p_orders) + e_mat = slin.expm(_w_mat * _w_mat) + loss_grad_w = ( + -1.0 + / n + * (X.T.dot(X.dot(np.eye(d_vars, d_vars) - _w_mat) - Xlags.dot(_a_mat))) + ) + obj_grad_w = ( + loss_grad_w + + (rho * (np.trace(e_mat) - d_vars) + alpha) * e_mat.T * _w_mat * 2 + ) + obj_grad_a = ( + -1.0 + / n + * (Xlags.T.dot(X.dot(np.eye(d_vars, d_vars) - _w_mat) - Xlags.dot(_a_mat))) + ) + + grad_vec_w = np.append( + obj_grad_w, -obj_grad_w, axis=0 + ).flatten() + lambda_w * np.ones(2 * d_vars ** 2) + grad_vec_a = obj_grad_a.reshape(p_orders, d_vars ** 2) + grad_vec_a = np.hstack( + (grad_vec_a, -grad_vec_a) + ).flatten() + lambda_a * np.ones(2 * p_orders * d_vars ** 2) + return np.append(grad_vec_w, grad_vec_a, axis=0) + + # initialise matrix, weights and constraints + wa_est = np.zeros(2 * (p_orders + 1) * d_vars ** 2) + wa_new = np.zeros(2 * (p_orders + 1) * d_vars ** 2) + rho, alpha, h_value, h_new = 1.0, 0.0, np.inf, np.inf + + for n_iter in range(max_iter): + while rho < 1e20: + wa_new = sopt.minimize( + _func, wa_est, method="L-BFGS-B", jac=_grad, bounds=bnds + ).x + h_new = _h(wa_new) + if h_new > 0.25 * h_value: + rho *= 10 + else: + break + wa_est = wa_new + h_value = h_new + alpha += rho * h_value + if h_value <= h_tol: + break + if h_value > h_tol and n_iter == max_iter - 1: + warnings.warn("Failed to converge. Consider increasing max_iter.") + return _reshape_wa(wa_est, d_vars, p_orders) diff --git a/causalnex/structure/notears.py b/causalnex/structure/notears.py index 9dab187..0472cc2 100644 --- a/causalnex/structure/notears.py +++ b/causalnex/structure/notears.py @@ -102,6 +102,8 @@ def from_numpy( # n examples, d properties _, d = X.shape + _assert_all_finite(X) + bnds = [ (0, 0) if i == j @@ -162,6 +164,8 @@ def from_numpy_lasso( # n examples, d properties _, d = X.shape + _assert_all_finite(X) + bnds = [ (0, 0) if i == j @@ -550,3 +554,32 @@ def _grad(w_vec: np.ndarray) -> np.ndarray: w_new = w_est[: d ** 2].reshape([d, d]) - w_est[d ** 2 :].reshape([d, d]) w_new[np.abs(w_new) < w_threshold] = 0 return StructureModel(w_new.reshape([d, d])) + + +def _assert_all_finite(X: np.ndarray): + """Throw a ValueError if X contains NaN or Infinity. + + Based on Sklearn method to handle NaN & Infinity. + @inproceedings{sklearn_api, + author = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and + Fabian Pedregosa and Andreas Mueller and Olivier Grisel and + Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort + and Jaques Grobler and Robert Layton and Jake VanderPlas and + Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux}, + title = {{API} design for machine learning software: experiences from the scikit-learn + project}, + booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning}, + year = {2013}, + pages = {108--122}, + } + + Args: + X: Array to validate + + Raises: + ValueError: If X contains NaN or Infinity + """ + + msg_err = "Input contains NaN, infinity or a value too large for {!r}." + if not np.isfinite(X).all(): + raise ValueError(msg_err.format(X.dtype)) diff --git a/causalnex/structure/pytorch/__init__.py b/causalnex/structure/pytorch/__init__.py new file mode 100644 index 0000000..a17bd56 --- /dev/null +++ b/causalnex/structure/pytorch/__init__.py @@ -0,0 +1,36 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +``causalnex.structure.pytorch`` provides functionality to define or learn structure using pytorch. +""" + +__all__ = ["from_numpy", "from_pandas", "NotearsMLP"] + +from .core import NotearsMLP +from .notears import from_numpy, from_pandas diff --git a/causalnex/structure/pytorch/core.py b/causalnex/structure/pytorch/core.py new file mode 100644 index 0000000..58564d9 --- /dev/null +++ b/causalnex/structure/pytorch/core.py @@ -0,0 +1,469 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is modified from this git repo: https://github.com/xunzheng/notears + +@inproceedings{zheng2020learning, + author = {Zheng, Xun and Dan, Chen and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.}, + booktitle = {International Conference on Artificial Intelligence and Statistics}, + title = {{Learning sparse nonparametric DAGs}}, + year = {2020} +} +""" +import logging +from typing import Iterable, List, Tuple, Union + +import numpy as np +import scipy.optimize as sopt +import torch +import torch.nn as nn +from sklearn.base import BaseEstimator + +from .nonlinear import LocallyConnected + + +class NotearsMLP(nn.Module, BaseEstimator): + """ + Class for NOTEARS MLP (Multi-layer Perceptron) model. + The model weights consist of dag_layer and loc_lin_layer weights respectively. + dag_layer weight is the weight of the first fully connected layer which determines the causal structure. + loc_lin_layer weights are the weight of hidden layers after the first fully connected layer + """ + + def __init__( + self, + n_features: int, + use_bias: bool = False, + hidden_layer_units: Iterable[int] = (0,), + bounds: List[Tuple[int, int]] = None, + lasso_beta: float = 0.0, + ridge_beta: float = 0.0, + nonlinear_clamp: float = 1e-2, + ): + """ + Constructor for NOTEARS MLP class. + + Args: + n_features: number of input features + use_bias: True to add the intercept to the model + hidden_layer_units: An iterable where its length determine the number of layers used, + and the numbers determine the number of nodes used for the layer in order. + bounds: bound constraint for each parameter. + lasso_beta: Constant that multiplies the lasso term (l1 regularisation). + It only applies to dag_layer weight. + ridge_beta: Constant that multiplies the ridge term (l2 regularisation). + It applies to both dag_layer and loc_lin_layer weights. + nonlinear_clamp: Value used to soft clamp the nonlinear layer normalisation. + Prevents the weights from being scaled above 1/nonlinear_clamp. + """ + super().__init__() + self.device = torch.device("cpu") + self.lasso_beta = lasso_beta + self.ridge_beta = ridge_beta + self.nonlinear_clamp = nonlinear_clamp + + # cast to list for later concat. + self.dims = ( + [n_features] + list(hidden_layer_units) + [1] + if hidden_layer_units[0] + else [n_features, 1] + ) + + # dag_layer: initial linear layer + self.dag_layer = nn.Linear( + self.dims[0], self.dims[0] * self.dims[1], bias=use_bias + ).float() + nn.init.zeros_(self.dag_layer.weight) + if use_bias: + nn.init.zeros_(self.dag_layer.bias) + + # loc_lin_layer: local linear layers + layers = [ + LocallyConnected( + self.dims[0], input_features, output_features, bias=use_bias + ).float() + for input_features, output_features in zip(self.dims[1:-1], self.dims[2:]) + ] + self._loc_lin_layer_weights = nn.ModuleList(layers) + for layer in layers: + layer.reset_parameters() + + # set the bounds as an attribute on the weights object + self.dag_layer.weight.bounds = bounds + # type the adjacency matrix + self.adj = None + self.adj_mean_effect = None + + @property + def _logger(self): + return logging.getLogger(self.__class__.__name__) + + @property + def dag_layer_bias(self) -> Union[torch.Tensor, None]: + """ + dag_layer bias is the bias of the first fully connected layer which determines the causal structure. + Returns: + dag_layer bias if use_bias is True, otherwise None + """ + return self.dag_layer.bias + + @property + def dag_layer_weight(self) -> torch.Tensor: + """ + dag_layer weight is the weight of the first fully connected layer which determines the causal structure. + Returns: + dag_layer weight + """ + return self.dag_layer.weight + + @property + def loc_lin_layer_weights(self) -> torch.Tensor: + """ + loc_lin_layer weights are the weight of hidden layers after the first fully connected layer. + Returns: + loc_lin_layer weights + """ + return self._loc_lin_layer_weights + + # pylint: disable=arguments-differ + def forward(self, x: torch.Tensor) -> torch.Tensor: # [n, d] -> [n, d] + """ + Feed forward calculation for the model. + + Args: + x: input torch tensor + + Returns: + output tensor from the model + """ + x = self.dag_layer(x) # [n, d * m1] + x = x.view(-1, self.dims[0], self.dims[1]) # [n, d, m1] + for layer in self.loc_lin_layer_weights: + x = torch.sigmoid(x) # [n, d, m1] + # soft clamp the denominator to prevent divide by zero and prevent very large weight increases + x = (x - x.mean(dim=0).detach()) / torch.sqrt( + (self.nonlinear_clamp + x.var(dim=0).detach()) + ) + + x = layer(x) # [n, d, m2] + x = x.squeeze(dim=2) # [n, d] + return x + + @property + def bias(self) -> Union[np.ndarray, None]: + """ + Get the vector of feature biases + + Returns: + bias vector if use_bias is True, otherwise None + """ + bias = self.dag_layer_bias + return bias if bias is None else bias.cpu().detach().numpy() + + def fit( + self, + x: np.ndarray, + max_iter: int = 100, + h_tol: float = 1e-8, + rho_max: float = 1e16, + ): + """ + Fit NOTEARS MLP model using the input data x + Args: + x: 2d numpy array input data, axis=0 is data rows, axis=1 is data columns. Data must be row oriented. + max_iter: max number of dual ascent steps during optimisation. + h_tol: exit if h(w) < h_tol (as opposed to strict definition of 0). + rho_max: to be updated + """ + rho, alpha, h = 1.0, 0.0, np.inf + X_torch = torch.from_numpy(x).float().to(self.device) + + for n_iter in range(max_iter): + rho, alpha, h = self._dual_ascent_step(X_torch, rho, alpha, h, rho_max) + if h <= h_tol or rho >= rho_max: + break + if n_iter == max_iter - 1 and h > h_tol: + self._logger.warning( + "Failed to converge. Consider increasing max_iter." + ) + + # calculate the adjacency matrix after the fitting is finished + self.adj = ( + self._calculate_adj(X_torch, mean_effect=False).cpu().detach().numpy() + ) + self.adj_mean_effect = ( + self._calculate_adj(X_torch, mean_effect=True).cpu().detach().numpy() + ) + + # pylint: disable=too-many-locals + def _dual_ascent_step( + self, X: torch.Tensor, rho: float, alpha: float, h: float, rho_max: float + ) -> Tuple[float, float, float]: + """ + Perform one step of dual ascent in augmented Lagrangian. + + Args: + X: input tensor data. + rho: max number of dual ascent steps during optimisation. + alpha: exit if h(w) < h_tol (as opposed to strict definition of 0). + h: DAGness of the adjacency matrix + rho_max: to be updated + + Returns: + rho, alpha and h + """ + + def _get_flat_grad(params: List[torch.Tensor]) -> np.ndarray: + """ + Get flatten gradient vector from the parameters of the model + + Args: + params: parameters of the model + + Returns: + flatten gradient vector in numpy form + """ + views = [ + p.data.new(p.data.numel()).zero_() + if p.grad is None + else p.grad.data.to_dense().view(-1) + if p.grad.data.is_sparse + else p.grad.data.view(-1) + for p in params + ] + return torch.cat(views, 0).cpu().detach().numpy() + + def _get_flat_bounds( + params: List[torch.Tensor], + ) -> List[Tuple[Union[None, float]]]: + """ + Get bound constraint for each parameter in flatten vector form from the parameters of the model + + Args: + params: parameters of the model + + Returns: + flatten vector of bound constraints for each parameter in numpy form + """ + bounds = [] + for p in params: + try: + b = p.bounds + except AttributeError: + b = [(None, None)] * p.numel() + bounds += b + return bounds + + def _get_flat_params(params: List[torch.Tensor]) -> np.ndarray: + """ + Get parameters in flatten vector from the parameters of the model + + Args: + params: parameters of the model + + Returns: + flatten parameters vector in numpy form + """ + views = [ + p.data.to_dense().view(-1) if p.data.is_sparse else p.data.view(-1) + for p in params + ] + return torch.cat(views, 0).cpu().detach().numpy() + + def _update_params_from_flat( + params: List[torch.Tensor], flat_params: np.ndarray + ): + """ + Update parameters of the model from the parameters in the form of flatten vector + + Args: + params: parameters of the model + flat_params: parameters in the form of flatten vector + """ + offset = 0 + flat_params_torch = torch.from_numpy(flat_params).to( + torch.get_default_dtype() + ) + for p in params: + n_params = p.numel() + # view_as to avoid deprecated pointwise semantics + p.data = flat_params_torch[offset : offset + n_params].view_as(p.data) + offset += n_params + + def _func(flat_params: np.ndarray) -> Tuple[float, np.ndarray]: + """ + Objective function that the NOTEARS algorithm tries to minimise. + + Args: + flat_params: parameters to be optimised to minimise the objective function + + Returns: + Loss and gradient + """ + _update_params_from_flat(params, flat_params) + optimizer.zero_grad() + + n_features = X.shape[1] + + X_hat = self(X) + h_val = self._h_func() + + loss = (0.5 / X.shape[0]) * torch.sum((X_hat - X) ** 2) + lagrange_penalty = 0.5 * rho * h_val * h_val + alpha * h_val + # NOTE: both the l2 and l1 regularization are NOT applied to the bias parameters + l2_reg = 0.5 * self.ridge_beta * self._l2_reg(n_features) + l1_reg = self.lasso_beta * self._l1_reg(n_features) + + primal_obj = loss + lagrange_penalty + l2_reg + l1_reg + primal_obj.backward() + loss = primal_obj.item() + + flat_grad = _get_flat_grad(params) + return loss, flat_grad.astype("float64") + + optimizer = torch.optim.Optimizer(self.parameters(), dict()) + params = optimizer.param_groups[0]["params"] + + flat_params = _get_flat_params(params) + bounds = _get_flat_bounds(params) + + while rho < rho_max: + # Magic + sol = sopt.minimize( + _func, + flat_params, + method="L-BFGS-B", + jac=True, + bounds=bounds, + ) + + _update_params_from_flat(params, sol.x) + h_new = self._h_func().item() + if h_new > 0.25 * h: + rho *= 10 + else: + break + alpha += rho * h_new + return rho, alpha, h_new + + def _h_func(self) -> torch.Tensor: + """ + Constraint function of the NOTEARS algorithm. + Constrain 2-norm-squared of dag_layer weights of the model along m1 dim to be a DAG + + Returns: + DAGness of the adjacency matrix + """ + d = self.dims[0] + d_torch = torch.tensor(d).to(self.device) # pylint: disable=not-callable + + # only consider the dag_layer for h(W) for compute efficiency + dag_layer_weight = self.dag_layer_weight.view(d, -1, d) # [j, m1, i] + square_weight_mat = torch.sum( + dag_layer_weight * dag_layer_weight, dim=1 + ).t() # [i, j] + + # h = trace_expm(a) - d # (Zheng et al. 2018) + characteristic_poly_mat = ( + torch.eye(d).to(self.device) + square_weight_mat / d_torch + ) # (Yu et al. 2019) + polynomial_mat = torch.matrix_power(characteristic_poly_mat, d - 1) + h = (polynomial_mat.t() * characteristic_poly_mat).sum() - d + return h + + def _l1_reg(self, n_features: int) -> torch.Tensor: + """ + Take average l1 of all weight parameters of the model. + NOTE: regularisation needs to be scaled up by the number of features + because the loss scales with feature number. + + Returns: + l1 regularisation term. + """ + return torch.mean(torch.abs(self.dag_layer_weight)) * n_features + + def _l2_reg(self, n_features: int) -> torch.Tensor: + """ + Take average 2-norm-squared of all weight parameters of the model. + NOTE: regularisation needs to be scaled up by the number of features + because the loss scales with feature number. + + Returns: + l2 regularisation term. + """ + reg = 0.0 + reg += torch.sum(self.dag_layer_weight ** 2) + for layer in self.loc_lin_layer_weights: + reg += torch.sum(layer.weight ** 2) + + # calculate the total number of elements used in the above sums + n_elements = self.dag_layer_weight.numel() + for layer in self.loc_lin_layer_weights: + n_elements = n_elements + layer.weight.numel() + return reg / n_elements * n_features + + def _calculate_adj(self, X: torch.Tensor, mean_effect: bool) -> torch.Tensor: + """ + Calculate the adjacency matrix. + + For the linear case, this is just dag_layer_weight. + For the nonlinear case, approximate the relationship using the gradient of X_hat wrt X. + """ + + # for the linear case, save compute by just returning the dag_layer weights + if len(self.dims) <= 2: + adj = ( + self.dag_layer_weight.T + if mean_effect + else torch.abs(self.dag_layer_weight.T) + ) + return adj + + _, n_features = X.shape + # get the data X and reconstruction X_hat + X = X.clone().requires_grad_() + X_hat = self(X).sum(dim=0) # shape = (n_features,) + + adj = [] + # iterate over sums of reconstructed features + for j in range(n_features): + + # calculate the gradient of X_hat wrt X + ddx = torch.autograd.grad(X_hat[j], X, create_graph=True)[0] + + if mean_effect: + # get the average effect + adj.append(ddx.mean(axis=0).unsqueeze(0)) + else: + # otherwise, use the average L1 of the gradient as the W + adj.append(torch.abs(ddx).mean(dim=0).unsqueeze(0)) + adj = torch.cat(adj, dim=0) + + # transpose to get the adjacency matrix + return adj.T diff --git a/causalnex/structure/pytorch/nonlinear.py b/causalnex/structure/pytorch/nonlinear.py new file mode 100644 index 0000000..e3da828 --- /dev/null +++ b/causalnex/structure/pytorch/nonlinear.py @@ -0,0 +1,111 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This code is modified from this git repo: https://github.com/xunzheng/notears +@inproceedings{zheng2020learning, + author = {Zheng, Xun and Dan, Chen and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.}, + booktitle = {International Conference on Artificial Intelligence and Statistics}, + title = {{Learning sparse nonparametric DAGs}}, + year = {2020} +} +""" +import math + +import torch +import torch.nn as nn + + +class LocallyConnected(nn.Module): + """ + Local linear layer, i.e. Conv1dLocal() with filter size 1. + """ + + def __init__( + self, + num_linear: int, + input_features: int, + output_features: int, + bias: bool = True, + ): + """ + Create local linear layers. + Transformations of the feature are independent of each other, + each feature is expanded to several hidden units. + + Args: + num_linear: num of local linear layers. + input_features: m1. + output_features: m2. + bias: whether to include bias or not. + """ + super().__init__() + self.num_linear = num_linear + self.input_features = input_features + self.output_features = output_features + + self.weight = nn.Parameter( + torch.Tensor(num_linear, input_features, output_features) + ) + if bias: + self.bias = nn.Parameter(torch.Tensor(num_linear, output_features)) + else: + # You should always register all possible parameters, but the + # optional ones can be None if you want. + self.register_parameter("bias", None) + + self.reset_parameters() + + @torch.no_grad() + def reset_parameters(self): + """ + Reset parameters + """ + k = 1.0 / self.input_features + bound = math.sqrt(k) + nn.init.uniform_(self.weight, -bound, bound) + if self.bias is not None: + nn.init.uniform_(self.bias, -bound, bound) + + # pylint: disable=arguments-differ + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + Forward output calculation # [n, d, 1, m2] = [n, d, 1, m1] @ [1, d, m1, m2] + + Args: + x: torch tensor + + Returns: + output calculation + """ + out = torch.matmul(x.unsqueeze(dim=2), self.weight.unsqueeze(dim=0)) + out = out.squeeze(dim=2) + if self.bias is not None: + # [n, d, m2] += [d, m2] + out += self.bias + return out diff --git a/causalnex/structure/pytorch/notears.py b/causalnex/structure/pytorch/notears.py new file mode 100644 index 0000000..dd88ca6 --- /dev/null +++ b/causalnex/structure/pytorch/notears.py @@ -0,0 +1,306 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tools to learn a ``StructureModel`` which describes the conditional dependencies between variables in a dataset. +""" + +import logging +from copy import deepcopy +from typing import Iterable, List, Tuple + +import numpy as np +import pandas as pd +from sklearn.utils import check_array + +from causalnex.structure.pytorch.core import NotearsMLP +from causalnex.structure.structuremodel import StructureModel + +__all__ = ["from_numpy", "from_pandas"] + + +# pylint: disable=too-many-locals +# pylint: disable=too-many-arguments +def from_numpy( + X: np.ndarray, + lasso_beta: float = 0.0, + ridge_beta: float = 0.0, + use_bias: bool = False, + hidden_layer_units: Iterable[int] = None, + w_threshold: float = None, + max_iter: int = 100, + tabu_edges: List[Tuple[int, int]] = None, + tabu_parent_nodes: List[int] = None, + tabu_child_nodes: List[int] = None, + **kwargs +) -> StructureModel: + """ + Learn the `StructureModel`, the graph structure with lasso regularisation + describing conditional dependencies between variables in data presented as a numpy array. + + Based on DAGs with NO TEARS. + @inproceedings{zheng2018dags, + author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}}, + year = {2018}, + codebase = {https://github.com/xunzheng/notears} + } + + Args: + X: 2d input data, axis=0 is data rows, axis=1 is data columns. Data must be row oriented. + + lasso_beta: Constant that multiplies the lasso term (l1 regularisation). + NOTE when using nonlinearities, the l1 loss only applies to the dag_layer. + + use_bias: Whether to fit a bias parameter in the NOTEARS algorithm. + + ridge_beta: Constant that multiplies the ridge term (l2 regularisation). + When using nonlinear layers use of this parameter is recommended. + + hidden_layer_units: An iterable where its length determine the number of layers used, + and the numbers determine the number of nodes used for the layer in order. + + w_threshold: fixed threshold for absolute edge weights. + + max_iter: max number of dual ascent steps during optimisation. + + tabu_edges: list of edges(from, to) not to be included in the graph. + + tabu_parent_nodes: list of nodes banned from being a parent of any other nodes. + + tabu_child_nodes: list of nodes banned from being a child of any other nodes. + + **kwargs: additional arguments for NOTEARS MLP model + + Returns: + StructureModel: a graph of conditional dependencies between data variables. + + Raises: + ValueError: If X does not contain data. + """ + # n examples, d properties + if not X.size: + raise ValueError("Input data X is empty, cannot learn any structure") + logging.info("Learning structure using 'NOTEARS' optimisation.") + # Check array for NaN or inf values + check_array(X) + + _, d = X.shape + + # if None or empty, convert into a list with single item + if hidden_layer_units is None: + hidden_layer_units = [0] + elif isinstance(hidden_layer_units, list) and not hidden_layer_units: + hidden_layer_units = [0] + + # if no hidden layer units, still take 1 iteration step with bounds + hidden_layer_bnds = hidden_layer_units[0] if hidden_layer_units[0] else 1 + + # Flip i and j because Pytorch flattens the vector in another direction + bnds = [ + (0, 0) + if i == j + else (0, 0) + if tabu_edges is not None and (i, j) in tabu_edges + else (0, 0) + if tabu_parent_nodes is not None and i in tabu_parent_nodes + else (0, 0) + if tabu_child_nodes is not None and j in tabu_child_nodes + else (None, None) + for j in range(d) + for _ in range(hidden_layer_bnds) + for i in range(d) + ] + + model = NotearsMLP( + n_features=d, + hidden_layer_units=hidden_layer_units, + lasso_beta=lasso_beta, + ridge_beta=ridge_beta, + bounds=bnds, + use_bias=use_bias, + **kwargs + ) + + model.fit(X, max_iter=max_iter) + sm = StructureModel(model.adj) + if w_threshold: + sm.remove_edges_below_threshold(w_threshold) + + mean_effect = model.adj_mean_effect + # extract the mean effect and add as edge attribute + for u, v, edge_dict in sm.edges.data(True): + sm.add_edge( + u, + v, + origin="learned", + weight=edge_dict["weight"], + mean_effect=mean_effect[u, v], + ) + + # set bias as node attribute + bias = model.bias + for node in sm.nodes(): + value = None + if bias is not None: + value = bias[node] + sm.nodes[node]["bias"] = value + + # preserve the structure_learner as a graph attribute + sm.graph["structure_learner"] = model + + return sm + + +# pylint: disable=too-many-locals +# pylint: disable=too-many-arguments +def from_pandas( + X: pd.DataFrame, + lasso_beta: float = 0.0, + ridge_beta: float = 0.0, + hidden_layer_units: Iterable[int] = None, + max_iter: int = 100, + w_threshold: float = None, + tabu_edges: List[Tuple[str, str]] = None, + tabu_parent_nodes: List[str] = None, + tabu_child_nodes: List[str] = None, + use_bias: bool = False, + **kwargs +) -> StructureModel: + """ + Learn the `StructureModel`, the graph structure describing conditional dependencies between variables + in data presented as a pandas dataframe. + + The optimisation is to minimise a score function :math:`F(W)` over the graph's + weighted adjacency matrix, :math:`W`, subject to the a constraint function :math:`h(W)`, + where :math:`h(W) == 0` characterises an acyclic graph. + :math:`h(W) > 0` is a continuous, differentiable function that encapsulated how acyclic the graph is + (less == more acyclic). + Full details of this approach to structure learning are provided in the publication: + + Based on DAGs with NO TEARS. + @inproceedings{zheng2018dags, + author = {Zheng, Xun and Aragam, Bryon and Ravikumar, Pradeep and Xing, Eric P.}, + booktitle = {Advances in Neural Information Processing Systems}, + title = {{DAGs with NO TEARS: Continuous Optimization for Structure Learning}}, + year = {2018}, + codebase = {https://github.com/xunzheng/notears} + } + + Args: + X: 2d input data, axis=0 is data rows, axis=1 is data columns. Data must be row oriented. + + lasso_beta: Constant that multiplies the lasso term (l1 regularisation). + NOTE when using nonlinearities, the l1 loss only applies to the dag_layer. + + use_bias: Whether to fit a bias parameter in the NOTEARS algorithm. + + ridge_beta: Constant that multiplies the ridge term (l2 regularisation). + When using nonlinear layers use of this parameter is recommended. + + hidden_layer_units: An iterable where its length determine the number of layers used, + and the numbers determine the number of nodes used for the layer in order. + + w_threshold: fixed threshold for absolute edge weights. + + max_iter: max number of dual ascent steps during optimisation. + + tabu_edges: list of edges(from, to) not to be included in the graph. + + tabu_parent_nodes: list of nodes banned from being a parent of any other nodes. + + tabu_child_nodes: list of nodes banned from being a child of any other nodes. + + **kwargs: additional arguments for NOTEARS MLP model + + Returns: + StructureModel: graph of conditional dependencies between data variables. + + Raises: + ValueError: If X does not contain data. + """ + + data = deepcopy(X) + + non_numeric_cols = data.select_dtypes(exclude="number").columns + + if len(non_numeric_cols) > 0: + raise ValueError( + "All columns must have numeric data. " + "Consider mapping the following columns to int {non_numeric_cols}".format( + non_numeric_cols=non_numeric_cols + ) + ) + + col_idx = {c: i for i, c in enumerate(data.columns)} + idx_col = {i: c for c, i in col_idx.items()} + + if tabu_edges: + tabu_edges = [(col_idx[u], col_idx[v]) for u, v in tabu_edges] + if tabu_parent_nodes: + tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes] + if tabu_child_nodes: + tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes] + + g = from_numpy( + X=data.values, + lasso_beta=lasso_beta, + ridge_beta=ridge_beta, + use_bias=use_bias, + hidden_layer_units=hidden_layer_units, + w_threshold=w_threshold, + max_iter=max_iter, + tabu_edges=tabu_edges, + tabu_parent_nodes=tabu_parent_nodes, + tabu_child_nodes=tabu_child_nodes, + **kwargs + ) + + sm = StructureModel() + sm.add_nodes_from(data.columns) + + # recover the edge weights from g + for u, v, edge_dict in g.edges.data(True): + sm.add_edge( + idx_col[u], + idx_col[v], + origin="learned", + weight=edge_dict["weight"], + mean_effect=edge_dict["mean_effect"], + ) + + # retrieve dtype information from graph attribute + for key, val in g.graph.items(): + sm.graph[key] = val + + # recover the node biases from g + for node in g.nodes(data=True): + node_name = idx_col[node[0]] + sm.nodes[node_name]["bias"] = node[1]["bias"] + + return sm diff --git a/causalnex/structure/sklearn.py b/causalnex/structure/sklearn.py new file mode 100644 index 0000000..cabe896 --- /dev/null +++ b/causalnex/structure/sklearn.py @@ -0,0 +1,347 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module contains the implementation of ``DAGRegressor``. + +``DAGRegressor`` is a class which wraps the StructureModel in an sklearn interface for regression. +""" + +import copy +import warnings +from typing import Iterable, List, Union + +import numpy as np +import pandas as pd +import torch +from sklearn.base import BaseEstimator, RegressorMixin +from sklearn.preprocessing import StandardScaler +from sklearn.utils.validation import check_is_fitted, check_X_y + +from causalnex.plots import EDGE_STYLE, NODE_STYLE, plot_structure +from causalnex.structure.pytorch import notears + + +class DAGRegressor( + BaseEstimator, RegressorMixin +): # pylint: disable=too-many-instance-attributes + """ + Regressor wrapper of the StructureModel. + Implements the sklearn .fit and .predict interface. + Currently only supports linear NOTEARS fitting by the DAG. + + Example: + :: + >>> from causalnex.sklearn import DAGRegressor + >>> + >>> smr = DAGRegressor(threshold=0.1) + >>> smr.fit(X_train, y_train) + >>> + >>> y_preds = smr.predict(X_test) + >>> type(y_preds) + np.ndarray + >>> + >>> type(smr.feature_importances_) + np.ndarray + :: + + Attributes: + feature_importances_ (np.ndarray): An array of edge weights corresponding + positionally to the feature X. + """ + + # pylint: disable=too-many-arguments + def __init__( + self, + alpha: float = 0.0, + beta: float = 0.0, + fit_intercept: bool = True, + hidden_layer_units: Iterable[int] = None, + threshold: float = 0.0, + tabu_edges: List = None, + tabu_parent_nodes: List = None, + tabu_child_nodes: List = None, + dependent_target: bool = True, + enforce_dag: bool = False, + standardize: bool = False, + **kwargs + ): + """ + Args: + alpha: l1 loss weighting. When using nonlinear layers this is only applied + to the first layer. + + beta: l2 loss weighting. Applied across all layers. Reccomended to use this + when fitting nonlinearities. + + fit_intercept: Whether to fit an intercept in the structure model + equation. Use this if variables are offset. + + hidden_layer_units: An iterable where its length determine the number of layers used, + and the numbers determine the number of nodes used for the layer in order. + + threshold: The thresholding to apply to the DAG weights. + If 0.0, does not apply any threshold. + + tabu_edges: Tabu edges passed directly to the NOTEARS algorithm. + + tabu_parent_nodes: Tabu nodes passed directly to the NOTEARS algorithm. + + tabu_child_nodes: Tabu nodes passed directly to the NOTEARS algorithm. + + dependent_target: If True, constrains NOTEARS so that y can only + be dependent (i.e. cannot have children) and imputes from parent nodes. + + enforce_dag: If True, thresholds the graph until it is a DAG. + NOTE a properly trained model should be a DAG, and failure + indicates other issues. Use of this is only recommended if + features have similar units, otherwise comparing edge weight + magnitude has limited meaning. + + standardize: Whether to standardize the X and y variables before fitting. + The L-BFGS algorithm used to fit the underlying NOTEARS works best on data + all of the same scale so this parameter is reccomended. + + kwargs: Extra arguments passed to the NOTEARS from_pandas function. + + Raises: + TypeError: if alpha is not numeric. + TypeError: if beta is not numeric. + TypeError: if fit_intercept is not a bool. + TypeError: if threshold is not numeric. + """ + + # core causalnex parameters + self.alpha = alpha + self.beta = beta + self.fit_intercept = fit_intercept + self.hidden_layer_units = hidden_layer_units + self.threshold = threshold + self.tabu_edges = tabu_edges + self.tabu_parent_nodes = tabu_parent_nodes + self.tabu_child_nodes = tabu_child_nodes + self.kwargs = kwargs + + if not isinstance(alpha, (int, float)): + raise TypeError("alpha should be numeric") + if not isinstance(beta, (int, float)): + raise TypeError("beta should be numeric") + if not isinstance(fit_intercept, bool): + raise TypeError("fit_intercept should be a bool") + if not isinstance(threshold, (int, float)): + raise TypeError("threshold should be numeric") + + # sklearn wrapper paramters + self.dependent_target = dependent_target + self.enforce_dag = enforce_dag + self.standardize = standardize + + def fit( + self, X: Union[pd.DataFrame, np.ndarray], y: Union[pd.Series, np.ndarray] + ) -> "DAGRegressor": + """ + Fits the sm model using the concat of X and y. + """ + + # defensive X, y checks + check_X_y(X, y, y_numeric=True) + + # force as DataFrame and Series (for later calculations) + X = pd.DataFrame(X) + y = pd.Series(y) + # force name so that name != None (causes errors in notears) + y.name = y.name or "__target" + + if self.standardize: + self.ss_X = StandardScaler() + self.ss_y = StandardScaler() + X = pd.DataFrame(self.ss_X.fit_transform(X), columns=X.columns) + y = pd.Series( + self.ss_y.fit_transform(y.values.reshape(-1, 1)).reshape(-1), + name=y.name, + ) + + # preserve the feature and target colnames + self._features = tuple(X.columns) + self._target = y.name + + # concat X and y along column axis + X = pd.concat([X, y], axis=1) + + # make copy to prevent mutability + tabu_parent_nodes = copy.deepcopy(self.tabu_parent_nodes) + if self.dependent_target: + if tabu_parent_nodes is None: + tabu_parent_nodes = [self._target] + elif self._target not in tabu_parent_nodes: + tabu_parent_nodes.append(self._target) + + # fit the structured model + self.graph_ = notears.from_pandas( + X, + lasso_beta=self.alpha, + ridge_beta=self.beta, + hidden_layer_units=self.hidden_layer_units, + w_threshold=self.threshold, + tabu_edges=self.tabu_edges, + tabu_parent_nodes=tabu_parent_nodes, + tabu_child_nodes=self.tabu_child_nodes, + use_bias=self.fit_intercept, + **self.kwargs + ) + + # keep thresholding until the DAG constraint is enforced + if self.enforce_dag: + self.graph_.threshold_till_dag() + + return self + + def _predict_from_parents(self, X: Union[pd.DataFrame, np.ndarray]): + + # extract the base solver + structure_learner = self.graph_.graph["structure_learner"] + + # convert the predict data to pytorch tensor + X = torch.from_numpy(X).float().to(structure_learner.device) + # need to concat y onto X so that the dimensions are the same + y = torch.zeros(X.shape[0], 1).float().to(structure_learner.device) + X = torch.cat([X, y], dim=1) + + # perform forward reconstruction + X_hat = structure_learner(X) + + # FUTURE NOTE: with dtypes the projection from latent -> dtype goes here + + # extract the desired y column, return as array + y_pred = X_hat[:, -1] + return y_pred.cpu().detach().numpy() + + def predict(self, X: Union[pd.DataFrame, np.ndarray]) -> np.ndarray: + """ + Get the predictions of the structured model. + This is done by multiplying the edge weights with the feature i.e. X @ W + """ + # force convert to ndarray + X = np.asarray(X) + if self.standardize: + X = self.ss_X.transform(X) + + # check that the model has been fit + check_is_fitted(self, "graph_") + + y_pred = np.asarray(self._predict_from_parents(X)) + if self.standardize: + y_pred = self.ss_y.inverse_transform(y_pred.reshape(-1, 1)).reshape(-1) + return y_pred + + def get_edges_to_node(self, name: str, data: str = "weight") -> pd.Series: + """ + Get the edges to a specific node. + Args: + name: The name of the node which to get weights towards. + + data: The edge parameter to get. Default is "weight" to return + the adjacency matrix. Set to "mean_effect" to return the + signed average effect of features on the target node. + + Returns: + The specified edge data. + """ + check_is_fitted(self, "graph_") + + # build base data series + edges = pd.Series(index=self._features) + + # iterate over all edges + for (i, j, w) in self.graph_.edges(data=data): + # for edges directed towards the "name" node + if j == name: + # insert the weight into the series + edges[i] = w + + # fill edges not present in the iteration with zeros + edges = edges.fillna(0) + + return edges + + @property + def feature_importances_(self) -> np.ndarray: + """ + Unsigned importances of the features wrt to the target. + NOTE: these are used as the graph adjacency matrix. + Returns: + the L2 relationship between nodes. + """ + return self.get_edges_to_node(self._target).values + + @property + def coef_(self) -> np.ndarray: + """ + Signed relationship between features and the target. + For this linear case this equivalent to linear regression coefficients. + Returns: + the mean effect relationship between nodes. + """ + return self.get_edges_to_node(self._target, data="mean_effect").values + + @property + def intercept_(self) -> float: + """ The bias term from the target node """ + bias = self.graph_.nodes[self._target]["bias"] + return 0.0 if bias is None else float(bias) + + def plot_dag(self, enforce_dag: bool = False, filename: str = "./graph.png"): + """ Util function used to plot the fitted graph """ + + try: + # pylint: disable=import-outside-toplevel + from IPython.display import Image + except ImportError as e: + raise ImportError( + "DAGRegressor.plot_dag method requires IPython installed." + ) from e + + check_is_fitted(self, "graph_") + + graph = copy.deepcopy(self.graph_) + if enforce_dag: + graph.threshold_till_dag() + + # silence annoying plotting warning + warnings.filterwarnings("ignore") + + viz = plot_structure( + graph, + graph_attributes={"scale": "0.5"}, + all_node_attributes=NODE_STYLE.WEAK, + all_edge_attributes=EDGE_STYLE.WEAK, + ) + viz.draw(filename) + + # reset warnings to always show + warnings.simplefilter("always") + return Image(filename) diff --git a/causalnex/structure/structuremodel.py b/causalnex/structure/structuremodel.py index 259bf99..4d84fe9 100644 --- a/causalnex/structure/structuremodel.py +++ b/causalnex/structure/structuremodel.py @@ -292,3 +292,12 @@ def get_target_subgraph(self, node: Hashable) -> "StructureModel": return subgraph raise NodeNotFound("Node {node} not found in the graph.".format(node=node)) + + def threshold_till_dag(self): + """ + Remove edges with smallest weight until the graph is a DAG. + Not recommended if the weights have different units. + """ + while not nx.algorithms.is_directed_acyclic_graph(self): + i, j, _ = min(self.edges(data="weight"), key=lambda x: abs(x[2])) + self.remove_edge(i, j) diff --git a/causalnex/structure/transformers.py b/causalnex/structure/transformers.py new file mode 100644 index 0000000..09d5aa8 --- /dev/null +++ b/causalnex/structure/transformers.py @@ -0,0 +1,290 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Collection of sklearn style transformers designed to assist with causal structure learning. +""" + +from copy import deepcopy +from typing import List, Tuple, Union + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.exceptions import NotFittedError + + +class DynamicDataTransformer(BaseEstimator, TransformerMixin): + """ + Format a time series dataframe or list of dataframes into the a format that matches the structure learned by + `from_pandas_dynamic`. This is done to allow for bayesian network probability fitting. + + Example of utilisation: + >>> ddt = DynamicDataTransformer(p=p).fit(time_series, return_df=False) + >>> X, Xlags = ddt.transform(time_series) + + >>> ddt = DynamicDataTransformer(p=p).fit(time_series, return_df=True) + >>> df = ddt.transform(time_series) + """ + + def __init__(self, p: int): + """ + Initialise Transformer + Args: + p: Number of past interactions we allow the model to create. The state of a variable at time `t` is + affected by the variables at the time stamp + the variables at `t-1`, `t-2`,... `t-p`. + """ + self.p = p + self.columns = None + self.return_df = None + + def fit( + self, + time_series: Union[pd.DataFrame, List[pd.DataFrame]], + return_df: bool = True, + ) -> "DynamicDataTransformer": + """ + Fits the time series. This consists memorizing: + - Column names and column positions + - whether a dataframe or a tuple of arrays should be returned by `transform` (details below) + Args: + time_series: pd.DataFrame or List of pd.DataFrame instances. + If a list is provided each element of the list being an realisation of a time series + (i.e. time series governed by the same processes) + The columns of the data frame represent the variables in the model, and the *index represents + the time index*. + Successive events, therefore, must be indexed with one integer of difference between them too. + + return_df: Whether the `transform` method should return a pandas.DataFrame or a tuple with (X,Xlags) + (Details on the documentation of the `transform` method) + + Returns: + self + + """ + time_series = time_series if isinstance(time_series, list) else [time_series] + self._check_input_from_pandas(time_series) + self.columns = list(time_series[0].columns) + self.return_df = return_df + return self + + def transform( + self, time_series: Union[pd.DataFrame, List[pd.DataFrame]] + ) -> Union[pd.DataFrame, Tuple[np.ndarray, np.ndarray]]: + """ + Applies transformation to format the dataframe properly + Args: + time_series: time_series: pd.DataFrame or List of pd.DataFrame instances. Details on `fit` documentation + + Returns: + - If `self.return_df=True`, returns a pandas.DataFrame on the following format: + + A_lag0 B_lag0 C_lag0 ... A_lag1 B_lag1 C_lag1 ... A_lag`p` B_lag`p` C_lag`p` + X X X X X X X X X + X X X X X X X X X + X X X X X X X X X + `lag0` denotes the current variable state and lag`k` denotes the states `k` time stamps in the past. + + - If `self.return_df=False`, returns a tuple of two numpy.ndarrayy: X and Xlags + X (np.ndarray): 2d input data, axis=1 is data columns, axis=0 is data rows. + Each column represents one variable, + and each row represents x(m,t) i.e. the mth time series at time t. + Xlags (np.ndarray): + Shifted data of X with lag orders stacking horizontally. Xlags=[shift(X,1)|...|shift(X,p)] + Raises: + NotFittedError: if `transform` called before `fit` + """ + if self.columns is None: + raise NotFittedError( + "This DynamicDataTransformer is not fitted yet. " + "Call `fit` before using this method" + ) + + time_series = time_series if isinstance(time_series, list) else [time_series] + + self._check_input_from_pandas(time_series) + + time_series = [t[self.columns] for t in time_series] + ts_realisations = self._cut_dataframes_on_discontinuity_points(time_series) + X, Xlags = self._convert_realisations_into_dynotears_format( + ts_realisations, self.p + ) + + if self.return_df: + res = self._concat_lags(X, Xlags) + return res + return X, Xlags + + def _concat_lags(self, X: np.ndarray, Xlags: np.ndarray) -> pd.DataFrame: + df_x = pd.DataFrame( + X, columns=["{col}_lag0".format(col=col) for col in self.columns] + ) + df_xlags = pd.DataFrame( + Xlags, + columns=[ + "{col}_lag{l_}".format(col=col, l_=l_) + for l_ in range(1, self.p + 1) + for col in self.columns + ], + ) + return pd.concat([df_x, df_xlags], axis=1) + + def _check_input_from_pandas(self, time_series: List[pd.DataFrame]): + """ + Check if the input of function `from_pandas_dynamic` is valid + Args: + time_series: List of pd.DataFrame instances. + each element of the list being an realisation of a same time series + + Raises: + ValueError: if empty list of time_series is provided + ValueError: if dataframes contain non numeric data + TypeError: if elements provided are not pandas dataframes + ValueError: if dataframes contain different columns + ValueError: if dataframes index is not in increasing order + TypeError: if dataframes index are not index + """ + if not time_series: + raise ValueError( + "Provided empty list of time_series. At least one DataFrame must be provided" + ) + + df = deepcopy(time_series[0]) + + for t in time_series: + if not isinstance(t, pd.DataFrame): + raise TypeError( + "Time series entries must be instances of `pd.DataFrame`" + ) + + non_numeric_cols = t.select_dtypes(exclude="number").columns + + if not non_numeric_cols.empty: + raise ValueError( + "All columns must have numeric data. Consider mapping the " + "following columns to int: {non_numeric_cols}".format( + non_numeric_cols=list(non_numeric_cols) + ) + ) + + if (not np.all(df.columns == t.columns)) or ( + not np.all(df.dtypes == t.dtypes) + ): + raise ValueError("All inputs must have the same columns and same types") + + if not np.all(t.index == t.index.sort_values()): + raise ValueError( + "Index for dataframe must be provided in increasing order" + ) + + if t.index.dtype != int: + raise TypeError("Index must be integers") + + if self.columns is not None: + missing_cols = [c for c in self.columns if c not in t.columns] + if missing_cols: + raise ValueError( + "We should provide all necessary columns in the time series." + " Columns not provided: {col}".format(col=missing_cols) + ) + + @staticmethod + def _cut_dataframes_on_discontinuity_points( + time_series: List[pd.DataFrame], + ) -> List[np.ndarray]: + """ + Helper function for `from_pandas_dynamic` + Receive a list of dataframes. For each dataframe, cut the points of discontinuity as two different dataframes. + Discontinuities are determined by the indexes. + + For Example: + If the following is a dataframe: + index variable_1 variable_2 + 1 X X + 2 X X + 3 X X + 4 X X + 8 X X <- discontinuity point + 9 X X + 10 X X + + We cut this dataset in two: + + index variable_1 variable_2 + 1 X X + 2 X X + 3 X X + 4 X X + + and: + index variable_1 variable_2 + 8 X X + 9 X X + 10 X X + + + Args: + time_series: list of dataframes representing various realisations of a same time series + + Returns: + List of np.ndarrays representing the pieces of the input datasets with no discontinuity + + """ + time_series_realisations = [] + for t in time_series: + cutting_points = np.where(np.diff(t.index) > 1)[0] + cutting_points = [0] + list(cutting_points + 1) + [len(t)] + for start, end in zip(cutting_points[:-1], cutting_points[1:]): + time_series_realisations.append(t.iloc[start:end, :].values) + return time_series_realisations + + @staticmethod + def _convert_realisations_into_dynotears_format( + realisations: List[np.ndarray], p: int + ) -> Tuple[np.ndarray, np.ndarray]: + """ + Given a list of realisations of a time series, convert it to the format received by the dynotears algorithm. + Each realisation on `realisations` is a realisation of the time series, + where the time dimension is represented by the rows. + - The higher the row, the higher the time index + - The data is complete, meaning that the difference between two time stamps is equal one + Args: + realisations: a list of realisations of a time series + p: the number of lagged columns to create + + Returns: + X and Y as in the SVAR model and DYNOTEARS paper. I.e. X being representing X(m,t) and Y the concatenated + differences [X(m,t-1) | X(m,t-2) | ... | X(m,t-p)] + """ + X = np.concatenate([realisation[p:] for realisation in realisations], axis=0) + y_lag_list = [ + np.concatenate([realisation[p - i - 1 : -i - 1] for i in range(p)], axis=1) + for realisation in realisations + ] + y_lag = np.concatenate(y_lag_list, axis=0) + + return X, y_lag diff --git a/doc_requirements.txt b/doc_requirements.txt index f60e9a6..744f4c3 100644 --- a/doc_requirements.txt +++ b/doc_requirements.txt @@ -5,7 +5,7 @@ nbsphinx==0.4.2 nbstripout==0.3.3 patchy>=1.5, <2.0 recommonmark==0.5.0 -sphinx-autodoc-typehints>=1.6.0, < 2.0 +sphinx-autodoc-typehints>=1.6.0, < 1.11.0 sphinx-markdown-tables==0.0.9 sphinx>=1.8.4, <2.0 sphinx_copybutton==0.2.5 diff --git a/docs/conf.py b/docs/conf.py index 506e091..750a508 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -390,7 +390,9 @@ def _prepare_build_dir(app, config): shutil.rmtree(str(build_root / "api_docs")) shutil.rmtree(str(build_out), ignore_errors=True) copy_tree(str(build_root / "css"), str(build_out / "_static" / "css")) - copy_tree(str(build_root / "04_user_guide/images"), str(build_out / "04_user_guide")) + copy_tree( + str(build_root / "04_user_guide/images"), str(build_out / "04_user_guide") + ) shutil.rmtree(str(build_root / "css")) @@ -407,7 +409,7 @@ def setup(app): app.add_stylesheet("css/causalnex.css") # when using nbsphinx, to allow mathjax render properly - app.config._raw_config.pop('mathjax_config') + app.config._raw_config.pop("mathjax_config") def fix_module_paths(): diff --git a/docs/source/03_tutorial/regressor_tutorial.ipynb b/docs/source/03_tutorial/regressor_tutorial.ipynb new file mode 100644 index 0000000..648653c --- /dev/null +++ b/docs/source/03_tutorial/regressor_tutorial.ipynb @@ -0,0 +1,644 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import os\n", + "import sys\n", + "module_path = os.path.abspath(os.path.join(\"../../..\"))\n", + "if module_path not in sys.path:\n", + " sys.path.append(module_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Contents\n", + "\n", + "This notebook walks through using the DAGRegressor model.\n", + "\n", + "The material covered here is as follows:\n", + "- Linear Interface\n", + "- Nonlinear Interface" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "___\n", + "## Real Data (boston housing)\n", + "\n", + "This section demonstrates the performance of the algorithm on a real-world dataset. The main things to note in this section are:\n", + "- The scale sensitivity of the algorithm\n", + "- Interpretability of nonlinear `.coef_`\n", + "\n", + "The boston housing dataset is a classic benchmark regression task. The objective is to predict a set of house prices given a small set of features.\n", + "\n", + "The meaning of the set of avaliable features is shown below." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ".. _boston_dataset:\n", + "\n", + "Boston house prices dataset\n", + "---------------------------\n", + "\n", + "**Data Set Characteristics:** \n", + "\n", + " :Number of Instances: 506 \n", + "\n", + " :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n", + "\n", + " :Attribute Information (in order):\n", + " - CRIM per capita crime rate by town\n", + " - ZN proportion of residential land zoned for lots over 25,000 sq.ft.\n", + " - INDUS proportion of non-retail business acres per town\n", + " - CHAS Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n", + " - NOX nitric oxides concentration (parts per 10 million)\n", + " - RM average number of rooms per dwelling\n", + " - AGE proportion of owner-occupied units built prior to 1940\n", + " - DIS weighted distances to five Boston employment centres\n", + " - RAD index of accessibility to radial highways\n", + " - TAX full-value property-tax rate per $10,000\n", + " - PTRATIO pupil-teacher ratio by town\n", + " - B 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town\n", + " - LSTAT % lower status of the population\n", + " - MEDV Median value of owner-occupied homes in $1000's\n", + "\n", + " :Missing Attribute Values: None\n", + "\n", + " :Creator: Harrison, D. and Rubinfeld, D.L.\n", + "\n", + "This is a copy of UCI ML housing dataset.\n", + "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/\n", + "\n", + "\n", + "This dataset was taken from the StatLib library which is maintained at Carnegie Mellon University.\n", + "\n", + "The Boston house-price data of Harrison, D. and Rubinfeld, D.L. 'Hedonic\n", + "prices and the demand for clean air', J. Environ. Economics & Management,\n", + "vol.5, 81-102, 1978. Used in Belsley, Kuh & Welsch, 'Regression diagnostics\n", + "...', Wiley, 1980. N.B. Various transformations are used in the table on\n", + "pages 244-261 of the latter.\n", + "\n", + "The Boston house-price data has been used in many machine learning papers that address regression\n", + "problems. \n", + " \n", + ".. topic:: References\n", + "\n", + " - Belsley, Kuh & Welsch, 'Regression diagnostics: Identifying Influential Data and Sources of Collinearity', Wiley, 1980. 244-261.\n", + " - Quinlan,R. (1993). Combining Instance-Based and Model-Based Learning. In Proceedings on the Tenth International Conference of Machine Learning, 236-243, University of Massachusetts, Amherst. Morgan Kaufmann.\n", + "\n" + ] + } + ], + "source": [ + "from sklearn.datasets import load_boston\n", + "print(load_boston(return_X_y=False)[\"DESCR\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Lets initially benchmark the performance of an `ElasticNetCV` fitted across the entire dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MEAN R2: 0.700\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import ElasticNetCV\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "from sklearn.datasets import load_boston\n", + "X, y = load_boston(return_X_y=True)\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "ss = StandardScaler()\n", + "X = ss.fit_transform(X)\n", + "y = (y - y.mean()) / y.std()\n", + "\n", + "\n", + "reg = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], fit_intercept=True)\n", + "\n", + "from sklearn.model_selection import KFold\n", + "scores = cross_val_score(reg, X, y, cv=KFold(shuffle=True, random_state=42))\n", + "print(f'MEAN R2: {np.mean(scores).mean():.3f}')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Linear DAGRegressor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The DAGRegressor has several parameters which can be used to better fit a more complicated noisy DAG:\n", + "- `alpha`: The l1 (lasso) regularisation parameter. Increasing this creates a sparser DAG.\n", + "- `beta`: The l2 (ridge) regularisation parameter.\n", + "It was decided to use `alpha` and `beta` rather than `alpha` and `l1_ratio` like in sklearn elasticnet to uncouple the parameters during optimisation.\n", + "\n", + "There are several parameters which are also of interest which have good defaults, but we highlight here:\n", + "- `dependent_target`: This forces the target variable y to be only a child node. This is important for performance because in some cases `X -> y` is indistinguishable from `y -> X`. Enabling this (default enabled) ensures that the regressor performance at least matches linear regression. The trade-off is that the learned structure might be less accurate if y does cause other features.\n", + "- `enforce_dag`: This thresholds the learned structure model until the system is a DAG. This is useful for removing the small straggler connections which enables the DAG to be visualised easier. It does not impact performance, because the regressor still uses those connections under the hood." + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MEAN R2: 0.706\n", + "CRIM 0.000000\n", + "ZN 0.000000\n", + "INDUS 0.000000\n", + "CHAS 0.000000\n", + "NOX 0.000000\n", + "RM 0.310324\n", + "AGE 0.000000\n", + "DIS -0.225455\n", + "RAD 0.000000\n", + "TAX 0.000000\n", + "PTRATIO 0.000000\n", + "B 0.000000\n", + "LSTAT -0.372211\n", + "dtype: float64\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from causalnex.structure.sklearn import DAGRegressor\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "from sklearn.datasets import load_boston\n", + "X, y = load_boston(return_X_y=True)\n", + "\n", + "from sklearn.preprocessing import StandardScaler\n", + "ss = StandardScaler()\n", + "X = ss.fit_transform(X)\n", + "y = (y - y.mean()) / y.std()\n", + "\n", + "reg = DAGRegressor(\n", + " alpha=0.1,\n", + " beta=0.9,\n", + " fit_intercept=True,\n", + " hidden_layer_units=None,\n", + " dependent_target=True,\n", + " enforce_dag=True,\n", + " )\n", + "\n", + "from sklearn.model_selection import KFold\n", + "scores = cross_val_score(reg, X, y, cv=KFold(shuffle=True, random_state=42))\n", + "print(f'MEAN R2: {np.mean(scores).mean():.3f}')\n", + "\n", + "X_pd = pd.DataFrame(X, columns=load_boston(return_X_y=False)[\"feature_names\"])\n", + "y_pd = pd.Series(y, name=\"price\")\n", + "reg.fit(X_pd, y_pd)\n", + "print(pd.Series(reg.coef_, index=load_boston(return_X_y=False)[\"feature_names\"]))\n", + "reg.plot_dag(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### NonLinear DAGRegressor" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Specifying a nonlinear model is extremely simple, only a single parameter needs to be altered: `hidden_layer_units`\n", + "\n", + "`hidden_layer_units` takes _any_ **iterable** of **integers**: \n", + "- The value specifies the number of perceptrons to use in each nonlinear MLP layer:\n", + "- The number of elements in the iterable determines the number of hidden layers. \n", + "The more layers and more perceptrons per layer, the more complicated the function which can be fit. The trade off is a greater tendency to overfit, and slower fitting.\n", + "\n", + "A good default starting argument is ~[5]. This is unlikely to overfit, and usually demonstrates immidiately whether the DAG has nonlinear components.\n", + "\n", + "The setting of the `alpha` and `beta` parameters is very important.\n", + "Typically `beta` is more important than `alpha` when using nonlinear layers. This is because l2 is applied across all layers, whereas l1 is only applied to the first layer.\n", + "A good starting point is `~beta=0.5`.\n", + "\n", + "**NOTE it is very important to scale your data!**\n", + "\n", + "The nonlinear layers contain sigmoid nonlinearities which can become saturated with unscaled data. Also, unscaled data means that regularisation parameters do not impact weights across features equally.\n", + "\n", + "For convnenience, setting `standardize=True` scales both the X and y data during fit. It also inverse transforms the y on predict similar to the sklearn `TransformedTargetRegressor`." + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MEAN R2: 0.833\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from causalnex.structure.sklearn import DAGRegressor\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "from sklearn.datasets import load_boston\n", + "X, y = load_boston(return_X_y=True)\n", + "\n", + "reg = DAGRegressor(threshold=0.0,\n", + " alpha=0.0,\n", + " beta=0.5,\n", + " fit_intercept=True,\n", + " hidden_layer_units=[5],\n", + " standardize=True,\n", + " )\n", + "\n", + "from sklearn.model_selection import KFold\n", + "scores = cross_val_score(reg, X, y, cv=KFold(shuffle=True, random_state=42))\n", + "print(f'MEAN R2: {np.mean(scores).mean():.3f}')\n", + "\n", + "X_pd = pd.DataFrame(X, columns=load_boston(return_X_y=False)[\"feature_names\"])\n", + "y_pd = pd.Series(y, name=\"price\")\n", + "reg.fit(X_pd, y_pd)\n", + "\n", + "reg.plot_dag(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interpereting the Nonlinear DAG\n", + "\n", + "For nonlinear analysis, understanding the impact of one feature on another is not as simple as taking the mean effect as in the linear case.\n", + "Instead, a combination of `reg.coef_` and `reg.feature_importances` should be used:\n", + "\n", + "- `reg.coef_` provides the mean **directional** effect of all the features on the target. This gives average directional information, but can be misleading in terms of magnitude if the feature has a positive _and_ negative effect on the target.\n", + "\n", + "- `reg.feature_importances_` provides the mean **magnitude** effect of the features on the target. These values will be _strictly larger_ than the `reg.coef_` because there are no cancellation effects due to sign differences. \n", + "\n", + "The magnitude difference between the `reg.coef_` and `reg.feature_importances_` values can give insight into the _degree of directional variability_ of the parameter:\n", + "- A large difference means that the parameter has **large positive and negative effects** on the target. \n", + "- A zero difference means that the parameter always has the same directional impact on the target." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MEAN R2: 0.859\n", + "MEAN EFFECT DIRECTIONAL:\n", + "CRIM -0.455048\n", + "ZN -0.081971\n", + "INDUS 0.031664\n", + "CHAS 0.033811\n", + "NOX -0.226108\n", + "RM 0.328084\n", + "AGE -0.160502\n", + "DIS -0.479593\n", + "RAD 0.265122\n", + "TAX -0.230414\n", + "PTRATIO -0.089173\n", + "B 0.098137\n", + "LSTAT -0.344907\n", + "dtype: float64\n", + "MEAN EFFECT MAGNITUDE:\n", + "CRIM 0.455280\n", + "ZN 0.083515\n", + "INDUS 0.109172\n", + "CHAS 0.048846\n", + "NOX 0.226108\n", + "RM 0.364239\n", + "AGE 0.176076\n", + "DIS 0.479593\n", + "RAD 0.265122\n", + "TAX 0.238738\n", + "PTRATIO 0.118665\n", + "B 0.150867\n", + "LSTAT 0.347829\n", + "dtype: float64\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from causalnex.structure.sklearn import DAGRegressor\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "from sklearn.datasets import load_boston\n", + "X, y = load_boston(return_X_y=True)\n", + "\n", + "reg = DAGRegressor( alpha=0.0,\n", + " beta=1.0,\n", + " fit_intercept=True,\n", + " hidden_layer_units=[8, 8, 8],\n", + " standardize=True,\n", + " )\n", + "\n", + "from sklearn.model_selection import KFold\n", + "scores = cross_val_score(reg, X_pd.values, y_pd.values, cv=KFold(shuffle=True, random_state=42))\n", + "print(f'MEAN R2: {np.mean(scores).mean():.3f}')\n", + "\n", + "reg.fit(X_pd, y_pd)\n", + "print(\"MEAN EFFECT DIRECTIONAL:\")\n", + "print(pd.Series(reg.coef_, index=load_boston(return_X_y=False)[\"feature_names\"]))\n", + "print(\"MEAN EFFECT MAGNITUDE:\")\n", + "print(pd.Series(reg.feature_importances_, index=load_boston(return_X_y=False)[\"feature_names\"]))\n", + "\n", + "reg.plot_dag(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The `reg.get_edges_to_node` method allows for analysis of other edges in the graph easily.\n", + "\n", + "Passing in `data=\"weight\"` returns the mean effect magnitude of the variables on the requested node. It is equivalent to the `reg.feature_importances` return for the target node.\n", + "\n", + "Passing in `data=\"mean_effect\"` returns the mean directional effect.\n", + "\n", + "Below is a good example of a large difference between the magnitude and directional effects: \n", + "- The feature RAD has overall a large effect on the presence of NOX. \n", + "- However, the _directional_ effect of this feature is highly variable, which leads the mean_effect to be an order of magnitude smaller than the mean effect magnitude!" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CRIM 0.663028\n", + "ZN 0.000000\n", + "INDUS 0.000000\n", + "CHAS 0.000000\n", + "NOX 0.000000\n", + "RM 0.000000\n", + "AGE 0.000000\n", + "DIS 0.723618\n", + "RAD 0.429126\n", + "TAX 0.858995\n", + "PTRATIO 0.000000\n", + "B 0.000000\n", + "LSTAT 0.000000\n", + "dtype: float64" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vals = reg.get_edges_to_node(\"NOX\", data=\"weight\").copy()\n", + "vals[vals.abs() < 0.01] = 0\n", + "vals" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "CRIM 0.644507\n", + "ZN 0.000000\n", + "INDUS 0.000000\n", + "CHAS 0.000000\n", + "NOX 0.000000\n", + "RM 0.000000\n", + "AGE 0.000000\n", + "DIS -0.615096\n", + "RAD 0.063520\n", + "TAX -0.746946\n", + "PTRATIO 0.000000\n", + "B 0.000000\n", + "LSTAT 0.000000\n", + "dtype: float64" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vals = reg.get_edges_to_node(\"NOX\", data=\"mean_effect\")\n", + "vals[vals.abs() < 0.01] = 0\n", + "vals" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dependent Target\n", + "\n", + "Setting the `dependent_target=False` has an impact on performance as shown below, but can give better insight into the overall nonlinear structure of the data.\n", + "\n", + "This is effectively the same as fitting causalnex on the data using from_pandas, but using the sklearn interface provides a set of useful convenience functions not present in the base causalnex implementation." + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MEAN R2: 0.838\n", + "MEAN EFFECT DIRECTIONAL:\n", + "CRIM -0.152426\n", + "ZN 0.006836\n", + "INDUS 0.024601\n", + "CHAS 0.030994\n", + "NOX -0.182232\n", + "RM 0.273697\n", + "AGE -0.071418\n", + "DIS -0.296711\n", + "RAD 0.111622\n", + "TAX -0.223531\n", + "PTRATIO -0.151172\n", + "B 0.071595\n", + "LSTAT -0.412829\n", + "dtype: float64\n", + "MEAN EFFECT MAGNITUDE:\n", + "CRIM 0.152426\n", + "ZN 0.006899\n", + "INDUS 0.053876\n", + "CHAS 0.048723\n", + "NOX 0.182232\n", + "RM 0.283586\n", + "AGE 0.089832\n", + "DIS 0.296711\n", + "RAD 0.123725\n", + "TAX 0.228080\n", + "PTRATIO 0.151172\n", + "B 0.071595\n", + "LSTAT 0.413056\n", + "dtype: float64\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from causalnex.structure.sklearn import DAGRegressor\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "from sklearn.datasets import load_boston\n", + "X, y = load_boston(return_X_y=True)\n", + "\n", + "reg = DAGRegressor( alpha=0.0,\n", + " beta=1.0,\n", + " fit_intercept=True,\n", + " hidden_layer_units=[5],\n", + " standardize=True,\n", + " dependent_target=True,\n", + " )\n", + "\n", + "from sklearn.model_selection import KFold\n", + "scores = cross_val_score(reg, X_pd.values, y_pd.values, cv=KFold(shuffle=True, random_state=42))\n", + "print(f'MEAN R2: {np.mean(scores).mean():.3f}')\n", + "\n", + "reg.fit(X_pd, y_pd)\n", + "print(\"MEAN EFFECT DIRECTIONAL:\")\n", + "print(pd.Series(reg.coef_, index=load_boston(return_X_y=False)[\"feature_names\"]))\n", + "print(\"MEAN EFFECT MAGNITUDE:\")\n", + "print(pd.Series(reg.feature_importances_, index=load_boston(return_X_y=False)[\"feature_names\"]))\n", + "\n", + "reg.plot_dag(True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/source/api_docs/index.rst b/docs/source/api_docs/index.rst index d3e3c07..61e55e4 100644 --- a/docs/source/api_docs/index.rst +++ b/docs/source/api_docs/index.rst @@ -61,6 +61,7 @@ Welcome to CausalNex's API docs and tutorials! 03_tutorial/03_tutorial.md 03_tutorial/plotting_tutorial.md + 03_tutorial/regressor_tutorial.md .. toctree:: :maxdepth: 2 diff --git a/requirements.txt b/requirements.txt index d01734f..7d32c47 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,5 +4,5 @@ pandas>=0.24.0, <1.0 pgmpy==0.1.6 prettytable>=0.7.2, <0.8 scikit-learn>=0.20.2, <0.23.0, !=0.22.2.post1 -scipy>=1.2.0, <1.3 +scipy>=1.2.0, <1.6 wrapt>=1.11.0, <1.12 diff --git a/setup.py b/setup.py index 7ee5ef9..04fae47 100644 --- a/setup.py +++ b/setup.py @@ -58,6 +58,7 @@ "plot": [ "pygraphviz>=1.5, <2.0", ], + "pytorch": ["torch>=1.4.0, <2.0"], } extras_require["all"] = sorted(chain.from_iterable(extras_require.values())) diff --git a/test_requirements.txt b/test_requirements.txt index 087dc41..2124d06 100644 --- a/test_requirements.txt +++ b/test_requirements.txt @@ -1,10 +1,12 @@ -r requirements.txt -flake8>=3.5,<4.0 +flake8>=3.5, <4.0 +ipython>=7.0, <7.17 isort>=4.3.16, <5.0 -mock>=2.0.0,<3.0 +mock>=2.0.0, <3.0 pre-commit>=1.17.0, <2.0.0 pygraphviz>=1.5, <2.0 pylint>=2.5.2, <3.0 pytest-cov>=2.5, <3.0 -pytest-mock>=1.7.1,<2.0 -pytest>=4.3.0,<5.0 +pytest-mock>=1.7.1, <2.0 +pytest>=4.3.0, <5.0 +torch>=1.4.0, <1.6 diff --git a/tests/conftest.py b/tests/conftest.py index 3da7a2b..307495c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,6 +38,8 @@ from causalnex.structure.notears import from_pandas +# Ignoring limit of 1000 lines per module, since this module contains test sets. +# pylint: disable=C0302 @pytest.fixture def train_model() -> StructureModel: """ @@ -264,7 +266,6 @@ def train_data_discrete_cpds_k2(train_data_discrete) -> Dict[str, np.ndarray]: def create_cpds(data, pc=0): - df = data.copy(deep=True) # type: pd.DataFrame df_vals = {col: list(df[col].unique()) for col in df.columns} @@ -344,7 +345,6 @@ def create_cpds(data, pc=0): @pytest.fixture def train_data_idx_marginals(train_data_idx_cpds): - return create_marginals( train_data_idx_cpds, { @@ -359,7 +359,6 @@ def train_data_idx_marginals(train_data_idx_cpds): @pytest.fixture def train_data_discrete_marginals(train_data_discrete_cpds): - return create_marginals( train_data_discrete_cpds, { @@ -489,6 +488,534 @@ def bn(train_data_idx, train_data_discrete) -> BayesianNetwork: ).fit_node_states_and_cpds(train_data_discrete) +@pytest.fixture() +def data_dynotears_p1() -> Dict[str, np.ndarray]: + """ + Training data for testing Dynamic Bayesian Networks. Return a time series with 50 time points, with 5 columns + This data was simulated with te following configurations + Configurations: + - data points 50, + - num. variables: 5, + - p (lag amount): 1, + - graph type (intra-slice graph): 'erdos-renyi', + - graph type (inter-slice graph): 'erdos-renyi', + - SEM type: 'linear-gauss', + - weight range, intra-slice graph: (0.5, 2.0), + - weight range, inter-slice graph: (0.3, 0.5), + - expected degree, inter-slice graph: 3, + - noise scale (gaussian noise): 1.0, + - w decay: 1.1 + Returns: + dictionary with keys W (intra-weights), A (inter-weights), X and Y (inputs of from_numpy_dynamic) + """ + data = { + "W": np.array( + [ + [0.0, -0.55, 0.0, 1.48, 0.0], + [0.0, 0.0, -0.99, 0.0, 0.0], + [0.0, 0.0, 0.0, -1.13, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, -1.91, -0.64, -1.31, 0.0], + ] + ), + "A": np.array( + [ + [-0.35, -0.32, -0.33, 0.37, 0.0], + [-0.41, -0.42, -0.36, 0.33, -0.35], + [0.46, 0.0, 0.44, -0.36, -0.38], + [0.0, 0.0, -0.45, 0.0, 0.43], + [0.31, 0.4, 0.0, 0.0, -0.44], + ] + ), + "X": np.array( + [ + [-8.7, -3.1, -5.1, 2.8, -2.0], + [0.3, -5.8, 2.9, -12.6, 5.1], + [4.8, 17.4, -4.2, 17.6, -6.8], + [-12.5, -20.6, -1.9, -18.5, 8.3], + [14.4, 14.1, 7.7, 6.8, -3.0], + [-8.1, 0.1, -8.6, 9.0, -3.3], + [-3.3, -11.4, 1.3, -18.4, 8.0], + [9.4, 14.4, 3.7, 11.4, -6.5], + [-9.7, -7.0, -3.5, -3.2, -0.0], + [1.7, -2.7, 5.8, -14.8, 4.5], + [3.9, 17.2, -2.4, 19.2, -9.8], + [-12.5, -22.2, 1.2, -23.2, 7.8], + [17.2, 18.2, 9.2, 11.6, -6.4], + [-12.2, -7.9, -2.8, -2.6, -1.2], + [5.8, -4.9, 9.0, -12.6, 3.9], + [6.0, 13.5, 2.3, 15.1, -8.4], + [-10.1, -12.8, -1.4, -13.4, 4.5], + [9.7, 10.3, 5.5, 7.1, -3.5], + [-7.2, -1.4, -4.5, 0.2, -1.3], + [1.8, -3.7, 1.6, -4.8, 3.2], + [2.6, 6.7, -2.1, 9.7, -2.7], + [-4.6, -7.6, -1.7, -4.4, 2.6], + [4.6, -0.1, 4.5, -4.1, 1.9], + [3.3, 5.1, -0.7, 11.0, -4.1], + [-4.3, -12.7, -0.7, -9.5, 5.9], + [8.0, 6.0, 5.6, 0.8, -1.1], + [-2.4, 4.9, -3.9, 11.3, -4.7], + [-4.6, -12.0, -0.3, -13.4, 6.5], + [8.1, 15.9, -0.8, 16.0, -6.5], + [-11.5, -14.8, -5.0, -7.5, 4.4], + [10.0, 3.3, 7.1, -2.8, 1.7], + [-0.7, 3.8, -1.8, 9.2, -4.0], + [-1.6, -13.7, 5.3, -13.0, 4.7], + [11.0, 8.2, 7.7, 6.7, -4.0], + [-4.0, -2.4, -3.0, 1.8, -1.7], + [1.1, -4.8, 2.9, -5.9, 3.0], + [3.4, 7.5, -1.5, 9.0, -2.4], + [-6.2, -3.4, -4.8, -2.2, 1.0], + [0.8, -0.3, 1.8, -2.5, 1.0], + [0.4, 1.7, -1.6, 1.9, -0.5], + [-2.3, -4.5, 2.9, -7.4, 2.3], + [3.9, 9.3, 2.4, 4.1, -3.8], + [-4.8, -0.6, -3.6, 3.0, -2.1], + [-1.8, -7.5, 2.9, -13.2, 4.7], + [5.6, 19.0, -3.7, 18.1, -7.8], + [-13.5, -19.3, -2.7, -17.7, 7.5], + [15.5, 10.6, 9.9, 5.7, -3.1], + [-6.8, 3.3, -6.8, 9.6, -4.8], + [-5.2, -15.3, 3.9, -21.6, 7.9], + [11.1, 22.0, -0.3, 20.7, -8.4], + ] + ), + "Y": np.array( + [ + [10.8, 14.6, 5.8, 8.1, -4.5], + [-8.7, -3.1, -5.1, 2.8, -2.0], + [0.3, -5.8, 2.9, -12.6, 5.1], + [4.8, 17.4, -4.2, 17.6, -6.8], + [-12.5, -20.6, -1.9, -18.5, 8.3], + [14.4, 14.1, 7.7, 6.8, -3.0], + [-8.1, 0.1, -8.6, 9.0, -3.3], + [-3.3, -11.4, 1.3, -18.4, 8.0], + [9.4, 14.4, 3.7, 11.4, -6.5], + [-9.7, -7.0, -3.5, -3.2, -0.0], + [1.7, -2.7, 5.8, -14.8, 4.5], + [3.9, 17.2, -2.4, 19.2, -9.8], + [-12.5, -22.2, 1.2, -23.2, 7.8], + [17.2, 18.2, 9.2, 11.6, -6.4], + [-12.2, -7.9, -2.8, -2.6, -1.2], + [5.8, -4.9, 9.0, -12.6, 3.9], + [6.0, 13.5, 2.3, 15.1, -8.4], + [-10.1, -12.8, -1.4, -13.4, 4.5], + [9.7, 10.3, 5.5, 7.1, -3.5], + [-7.2, -1.4, -4.5, 0.2, -1.3], + [1.8, -3.7, 1.6, -4.8, 3.2], + [2.6, 6.7, -2.1, 9.7, -2.7], + [-4.6, -7.6, -1.7, -4.4, 2.6], + [4.6, -0.1, 4.5, -4.1, 1.9], + [3.3, 5.1, -0.7, 11.0, -4.1], + [-4.3, -12.7, -0.7, -9.5, 5.9], + [8.0, 6.0, 5.6, 0.8, -1.1], + [-2.4, 4.9, -3.9, 11.3, -4.7], + [-4.6, -12.0, -0.3, -13.4, 6.5], + [8.1, 15.9, -0.8, 16.0, -6.5], + [-11.5, -14.8, -5.0, -7.5, 4.4], + [10.0, 3.3, 7.1, -2.8, 1.7], + [-0.7, 3.8, -1.8, 9.2, -4.0], + [-1.6, -13.7, 5.3, -13.0, 4.7], + [11.0, 8.2, 7.7, 6.7, -4.0], + [-4.0, -2.4, -3.0, 1.8, -1.7], + [1.1, -4.8, 2.9, -5.9, 3.0], + [3.4, 7.5, -1.5, 9.0, -2.4], + [-6.2, -3.4, -4.8, -2.2, 1.0], + [0.8, -0.3, 1.8, -2.5, 1.0], + [0.4, 1.7, -1.6, 1.9, -0.5], + [-2.3, -4.5, 2.9, -7.4, 2.3], + [3.9, 9.3, 2.4, 4.1, -3.8], + [-4.8, -0.6, -3.6, 3.0, -2.1], + [-1.8, -7.5, 2.9, -13.2, 4.7], + [5.6, 19.0, -3.7, 18.1, -7.8], + [-13.5, -19.3, -2.7, -17.7, 7.5], + [15.5, 10.6, 9.9, 5.7, -3.1], + [-6.8, 3.3, -6.8, 9.6, -4.8], + [-5.2, -15.3, 3.9, -21.6, 7.9], + ] + ), + } + return data + + +@pytest.fixture() +def data_dynotears_p2() -> Dict[str, np.ndarray]: + """ + Training data for testing Dynamic Bayesian Networks. Return a time series with 50 time points, with 5 columns + This data was simulated with te following configurations + Configurations: + - data points 50, + - num. variables: 5, + - p (lag amount): 2, + - graph type (intra-slice graph): 'erdos-renyi', + - graph type (inter-slice graph): 'erdos-renyi', + - SEM type: 'linear-gauss', + - weight range, intra-slice graph: (0.5, 2.0), + - weight range, inter-slice graph: (0.3, 0.5), + - expected degree, inter-slice graph: 3, + - noise scale (gaussian noise): 1.0, + - w decay: 1.1 + Returns: + dictionary with keys W (intra-weights) ,A (inter-weights), X and Y (inputs of from_numpy_dynamic) + """ + data = { + "W": np.array( + [ + [0.0, 0.0, 0.0, 0.0, -1.08], + [1.16, 0.0, 0.0, -0.81, 0.89], + [-1.83, 0.58, 0.0, 0.61, -1.31], + [1.03, 0.0, 0.0, 0.0, -0.97], + [0.0, 0.0, 0.0, 0.0, 0.0], + ] + ), + "A": np.array( + [ + [0.31, 0.0, 0.0, 0.32, 0.0], + [0.0, 0.36, 0.0, 0.0, 0.5], + [-0.43, 0.0, 0.0, -0.49, -0.39], + [0.39, 0.0, 0.39, 0.0, 0.38], + [0.0, 0.37, 0.0, 0.43, 0.0], + [0.0, -0.37, 0.34, 0.0, 0.0], + [-0.3, 0.0, -0.42, 0.0, 0.45], + [0.0, -0.34, 0.0, 0.0, 0.0], + [0.0, 0.31, 0.0, 0.29, 0.36], + [-0.43, 0.37, 0.0, 0.0, -0.34], + ] + ), + "X": np.array( + [ + [3.1, 0.9, 1.6, 2.9, -6.5], + [-5.6, -4.1, 3.9, 4.7, -3.4], + [-6.3, -3.5, 2.2, 1.4, 0.4], + [-0.3, 0.2, -0.5, -1.5, 2.0], + [2.4, 1.6, -0.9, -0.7, -0.9], + [0.6, -0.1, 0.2, 1.5, -2.7], + [-2.4, -1.6, 2.2, 1.3, -1.6], + [-4.6, 0.5, 1.4, -2.6, 6.9], + [-1.3, -0.1, -1.3, -1.7, 3.7], + [0.5, 4.7, -2.0, -4.1, 6.8], + [8.8, 4.9, -2.7, -1.1, -0.3], + [3.8, 3.1, -0.4, 0.1, 1.5], + [2.0, -0.9, 0.1, 3.9, -1.6], + [-3.8, -0.9, 2.7, 1.3, 2.4], + [-4.4, 3.2, 2.1, -3.3, 9.9], + [0.2, 4.8, -1.6, -3.7, 9.7], + [5.1, 2.8, -6.4, -1.5, 3.7], + [8.1, 4.3, -2.4, -0.1, -1.6], + [2.1, 0.9, 1.5, 2.9, -4.1], + [-4.1, -3.9, 0.2, 1.1, 4.1], + [0.3, -3.0, 0.2, 3.0, -4.7], + [-1.2, 2.8, 2.7, -0.9, -0.9], + [-0.8, 1.8, 1.7, -0.7, 2.4], + [-0.2, -2.2, -1.8, 2.6, -0.3], + [-0.9, 2.1, 0.8, -2.3, 5.9], + [1.5, 3.6, 0.4, -0.1, -0.1], + [-3.0, 1.3, -1.6, -3.2, 9.5], + [6.8, -0.1, -3.5, 2.5, -3.2], + [3.5, 1.8, -1.7, -2.1, 0.5], + [5.1, 1.0, 1.6, 3.6, -6.5], + [-2.8, -3.0, 1.4, 1.8, -3.5], + [-5.9, -4.8, 3.2, 4.0, -3.6], + [-6.7, -2.2, 1.3, -2.7, 4.8], + [0.4, 0.0, -1.8, -0.9, 0.4], + [3.2, 1.7, -2.9, -3.6, 3.1], + [5.7, 0.9, -2.9, -0.3, -1.4], + [3.4, -2.0, -0.5, 3.5, -9.1], + [-4.3, -4.4, 2.9, 2.6, -5.2], + [-9.7, -6.4, 3.4, 1.9, -1.4], + [-5.2, -1.4, 1.6, -2.3, 1.0], + [-1.1, 1.1, -1.6, -3.5, 4.1], + [0.9, 2.0, -1.6, -2.6, 3.3], + [4.3, 4.8, -0.7, -0.7, 0.3], + [4.9, -0.4, -1.3, 1.8, -3.9], + [1.3, -2.9, -0.8, 2.0, -1.3], + [-0.1, -0.8, 2.9, 3.1, -6.7], + [-5.8, 0.5, 4.0, -1.2, 2.6], + [-1.8, -2.4, -2.0, -2.4, 3.5], + [3.3, 0.7, -3.0, -0.6, 0.0], + [4.5, -0.1, 0.4, 2.9, -10.8], + ] + ), + "Y": np.array( + [ + [6.1, 0.5, -2.0, 0.9, -3.0, 2.9, 1.5, -0.9, -0.2, 0.8], + [3.1, 0.9, 1.6, 2.9, -6.5, 6.1, 0.5, -2.0, 0.9, -3.0], + [-5.6, -4.1, 3.9, 4.7, -3.4, 3.1, 0.9, 1.6, 2.9, -6.5], + [-6.3, -3.5, 2.2, 1.4, 0.4, -5.6, -4.1, 3.9, 4.7, -3.4], + [-0.3, 0.2, -0.5, -1.5, 2.0, -6.3, -3.5, 2.2, 1.4, 0.4], + [2.4, 1.6, -0.9, -0.7, -0.9, -0.3, 0.2, -0.5, -1.5, 2.0], + [0.6, -0.1, 0.2, 1.5, -2.7, 2.4, 1.6, -0.9, -0.7, -0.9], + [-2.4, -1.6, 2.2, 1.3, -1.6, 0.6, -0.1, 0.2, 1.5, -2.7], + [-4.6, 0.5, 1.4, -2.6, 6.9, -2.4, -1.6, 2.2, 1.3, -1.6], + [-1.3, -0.1, -1.3, -1.7, 3.7, -4.6, 0.5, 1.4, -2.6, 6.9], + [0.5, 4.7, -2.0, -4.1, 6.8, -1.3, -0.1, -1.3, -1.7, 3.7], + [8.8, 4.9, -2.7, -1.1, -0.3, 0.5, 4.7, -2.0, -4.1, 6.8], + [3.8, 3.1, -0.4, 0.1, 1.5, 8.8, 4.9, -2.7, -1.1, -0.3], + [2.0, -0.9, 0.1, 3.9, -1.6, 3.8, 3.1, -0.4, 0.1, 1.5], + [-3.8, -0.9, 2.7, 1.3, 2.4, 2.0, -0.9, 0.1, 3.9, -1.6], + [-4.4, 3.2, 2.1, -3.3, 9.9, -3.8, -0.9, 2.7, 1.3, 2.4], + [0.2, 4.8, -1.6, -3.7, 9.7, -4.4, 3.2, 2.1, -3.3, 9.9], + [5.1, 2.8, -6.4, -1.5, 3.7, 0.2, 4.8, -1.6, -3.7, 9.7], + [8.1, 4.3, -2.4, -0.1, -1.6, 5.1, 2.8, -6.4, -1.5, 3.7], + [2.1, 0.9, 1.5, 2.9, -4.1, 8.1, 4.3, -2.4, -0.1, -1.6], + [-4.1, -3.9, 0.2, 1.1, 4.1, 2.1, 0.9, 1.5, 2.9, -4.1], + [0.3, -3.0, 0.2, 3.0, -4.7, -4.1, -3.9, 0.2, 1.1, 4.1], + [-1.2, 2.8, 2.7, -0.9, -0.9, 0.3, -3.0, 0.2, 3.0, -4.7], + [-0.8, 1.8, 1.7, -0.7, 2.4, -1.2, 2.8, 2.7, -0.9, -0.9], + [-0.2, -2.2, -1.8, 2.6, -0.3, -0.8, 1.8, 1.7, -0.7, 2.4], + [-0.9, 2.1, 0.8, -2.3, 5.9, -0.2, -2.2, -1.8, 2.6, -0.3], + [1.5, 3.6, 0.4, -0.1, -0.1, -0.9, 2.1, 0.8, -2.3, 5.9], + [-3.0, 1.3, -1.6, -3.2, 9.5, 1.5, 3.6, 0.4, -0.1, -0.1], + [6.8, -0.1, -3.5, 2.5, -3.2, -3.0, 1.3, -1.6, -3.2, 9.5], + [3.5, 1.8, -1.7, -2.1, 0.5, 6.8, -0.1, -3.5, 2.5, -3.2], + [5.1, 1.0, 1.6, 3.6, -6.5, 3.5, 1.8, -1.7, -2.1, 0.5], + [-2.8, -3.0, 1.4, 1.8, -3.5, 5.1, 1.0, 1.6, 3.6, -6.5], + [-5.9, -4.8, 3.2, 4.0, -3.6, -2.8, -3.0, 1.4, 1.8, -3.5], + [-6.7, -2.2, 1.3, -2.7, 4.8, -5.9, -4.8, 3.2, 4.0, -3.6], + [0.4, 0.0, -1.8, -0.9, 0.4, -6.7, -2.2, 1.3, -2.7, 4.8], + [3.2, 1.7, -2.9, -3.6, 3.1, 0.4, 0.0, -1.8, -0.9, 0.4], + [5.7, 0.9, -2.9, -0.3, -1.4, 3.2, 1.7, -2.9, -3.6, 3.1], + [3.4, -2.0, -0.5, 3.5, -9.1, 5.7, 0.9, -2.9, -0.3, -1.4], + [-4.3, -4.4, 2.9, 2.6, -5.2, 3.4, -2.0, -0.5, 3.5, -9.1], + [-9.7, -6.4, 3.4, 1.9, -1.4, -4.3, -4.4, 2.9, 2.6, -5.2], + [-5.2, -1.4, 1.6, -2.3, 1.0, -9.7, -6.4, 3.4, 1.9, -1.4], + [-1.1, 1.1, -1.6, -3.5, 4.1, -5.2, -1.4, 1.6, -2.3, 1.0], + [0.9, 2.0, -1.6, -2.6, 3.3, -1.1, 1.1, -1.6, -3.5, 4.1], + [4.3, 4.8, -0.7, -0.7, 0.3, 0.9, 2.0, -1.6, -2.6, 3.3], + [4.9, -0.4, -1.3, 1.8, -3.9, 4.3, 4.8, -0.7, -0.7, 0.3], + [1.3, -2.9, -0.8, 2.0, -1.3, 4.9, -0.4, -1.3, 1.8, -3.9], + [-0.1, -0.8, 2.9, 3.1, -6.7, 1.3, -2.9, -0.8, 2.0, -1.3], + [-5.8, 0.5, 4.0, -1.2, 2.6, -0.1, -0.8, 2.9, 3.1, -6.7], + [-1.8, -2.4, -2.0, -2.4, 3.5, -5.8, 0.5, 4.0, -1.2, 2.6], + [3.3, 0.7, -3.0, -0.6, 0.0, -1.8, -2.4, -2.0, -2.4, 3.5], + ] + ), + } + return data + + +@pytest.fixture() +def data_dynotears_p3() -> Dict[str, np.ndarray]: + """ + Training data for testing Dynamic Bayesian Networks. Return a time series with 50 time points, with 5 columns + This data was simulated with te following configurations. + Configurations: + - data points 50, + - num. variables: 5, + - p (lag amount): 3, + - graph type (intra-slice graph): 'erdos-renyi', + - graph type (inter-slice graph): 'erdos-renyi', + - SEM type: 'linear-gauss', + - weight range, intra-slice graph: (0.5, 2.0), + - weight range, inter-slice graph: (0.3, 0.5), + - expected degree, inter-slice graph: 3, + - noise scale (gaussian noise): 1.0, + - w decay: 1.1 + Returns: + dictionary with keys W (intra-weights), A (inter-weights), X and Y (inputs of from_numpy_dynamic) + """ + data = { + "W": np.array( + [ + [0.0, 0.0, -1.18, 0.0, -0.92], + [0.0, 0.0, -1.71, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, -1.15], + [0.0, 0.8, 0.0, 0.0, -1.65], + [0.0, 0.0, 0.0, 0.0, 0.0], + ] + ), + "A": np.array( + [ + [-0.42, -0.4, 0.35, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [-0.38, 0.0, -0.48, -0.48, 0.44], + [0.0, -0.36, -0.31, 0.35, 0.0], + [0.0, 0.0, -0.35, 0.0, -0.45], + [0.27, 0.0, 0.0, -0.28, -0.43], + [-0.44, 0.35, 0.0, 0.0, -0.44], + [0.0, 0.0, 0.0, 0.0, 0.0], + [-0.42, -0.39, -0.27, 0.3, -0.29], + [0.0, -0.43, 0.0, -0.41, 0.4], + [0.0, 0.32, 0.38, -0.37, 0.37], + [0.0, 0.37, 0.0, 0.0, 0.41], + [0.0, 0.34, 0.0, 0.27, -0.28], + [-0.31, 0.0, 0.0, -0.34, 0.0], + [0.27, -0.38, 0.0, 0.37, 0.36], + ] + ), + "X": np.array( + [ + [-21.5, 26.1, -56.8, -15.4, 82.4], + [14.4, 20.3, -51.3, -18.8, -4.6], + [16.5, -17.0, 48.2, 18.7, -100.2], + [5.1, -38.8, 70.5, 5.1, 8.6], + [-19.4, 20.5, -47.8, -4.3, 86.9], + [9.8, 28.7, -69.5, -18.3, 8.2], + [9.5, -7.1, 42.4, 13.3, -107.0], + [2.8, -42.5, 83.7, 1.5, 3.1], + [-25.7, 18.6, -42.4, -8.5, 112.6], + [11.5, 34.3, -94.2, -20.5, 33.3], + [19.0, -9.7, 39.5, 18.2, -137.5], + [7.9, -60.4, 122.3, 8.1, -33.3], + [-32.5, 6.0, -19.4, -11.0, 130.2], + [3.5, 48.0, -127.1, -33.7, 81.8], + [28.4, 7.6, 1.0, 16.3, -146.2], + [20.0, -71.8, 153.1, 17.9, -93.3], + [-34.3, -13.0, 20.5, -4.4, 130.2], + [-9.4, 58.4, -150.8, -41.9, 141.6], + [26.7, 32.9, -49.8, 7.9, -134.1], + [34.3, -74.1, 161.6, 26.1, -152.1], + [-34.2, -35.1, 75.5, 8.0, 105.1], + [-21.9, 54.5, -151.3, -49.8, 199.9], + [19.2, 54.9, -98.1, -8.1, -95.3], + [47.4, -72.0, 158.9, 22.4, -192.6], + [-25.7, -60.3, 129.5, 21.9, 54.6], + [-26.9, 42.2, -136.6, -48.4, 243.1], + [12.6, 79.6, -153.1, -11.6, -57.4], + [54.1, -66.7, 153.3, 17.7, -241.5], + [-26.1, -79.5, 185.0, 25.9, 19.5], + [-34.8, 36.3, -133.8, -54.2, 311.0], + [10.6, 105.3, -219.7, -19.1, -15.3], + [73.9, -58.5, 138.7, 25.1, -307.4], + [-17.7, -107.1, 257.4, 47.9, -58.4], + [-51.3, 17.0, -93.4, -59.1, 354.7], + [-13.4, 140.4, -287.4, -43.0, 83.7], + [85.8, -28.4, 71.1, 11.6, -310.8], + [4.5, -130.3, 310.9, 68.7, -171.8], + [-51.1, -25.3, -12.3, -46.8, 361.3], + [-34.2, 158.2, -341.9, -48.7, 191.1], + [92.6, 7.7, -9.7, -2.9, -314.2], + [20.2, -143.1, 363.2, 83.7, -294.6], + [-54.6, -69.6, 82.3, -36.4, 356.7], + [-60.4, 176.2, -390.3, -61.8, 319.4], + [97.2, 53.8, -116.7, -18.9, -289.0], + [46.0, -152.8, 400.1, 103.7, -440.8], + [-52.1, -124.2, 206.7, -16.7, 307.3], + [-93.8, 183.7, -415.9, -74.5, 465.6], + [92.4, 118.4, -265.0, -47.7, -202.0], + [76.5, -140.2, 396.2, 116.8, -592.8], + [-36.6, -195.8, 366.6, 8.3, 202.5], + ] + ), + "Y_1": np.array( + [ + [-4.9, -31.2, 50.6, -5.4, 42.7, 11.0, -26.3, 66.7, 21.5, -82.3], + [-21.5, 26.1, -56.8, -15.4, 82.4, -4.9, -31.2, 50.6, -5.4, 42.7], + [14.4, 20.3, -51.3, -18.8, -4.6, -21.5, 26.1, -56.8, -15.4, 82.4], + [16.5, -17.0, 48.2, 18.7, -100.2, 14.4, 20.3, -51.3, -18.8, -4.6], + [5.1, -38.8, 70.5, 5.1, 8.6, 16.5, -17.0, 48.2, 18.7, -100.2], + [-19.4, 20.5, -47.8, -4.3, 86.9, 5.1, -38.8, 70.5, 5.1, 8.6], + [9.8, 28.7, -69.5, -18.3, 8.2, -19.4, 20.5, -47.8, -4.3, 86.9], + [9.5, -7.1, 42.4, 13.3, -107.0, 9.8, 28.7, -69.5, -18.3, 8.2], + [2.8, -42.5, 83.7, 1.5, 3.1, 9.5, -7.1, 42.4, 13.3, -107.0], + [-25.7, 18.6, -42.4, -8.5, 112.6, 2.8, -42.5, 83.7, 1.5, 3.1], + [11.5, 34.3, -94.2, -20.5, 33.3, -25.7, 18.6, -42.4, -8.5, 112.6], + [19.0, -9.7, 39.5, 18.2, -137.5, 11.5, 34.3, -94.2, -20.5, 33.3], + [7.9, -60.4, 122.3, 8.1, -33.3, 19.0, -9.7, 39.5, 18.2, -137.5], + [-32.5, 6.0, -19.4, -11.0, 130.2, 7.9, -60.4, 122.3, 8.1, -33.3], + [3.5, 48.0, -127.1, -33.7, 81.8, -32.5, 6.0, -19.4, -11.0, 130.2], + [28.4, 7.6, 1.0, 16.3, -146.2, 3.5, 48.0, -127.1, -33.7, 81.8], + [20.0, -71.8, 153.1, 17.9, -93.3, 28.4, 7.6, 1.0, 16.3, -146.2], + [-34.3, -13.0, 20.5, -4.4, 130.2, 20.0, -71.8, 153.1, 17.9, -93.3], + [-9.4, 58.4, -150.8, -41.9, 141.6, -34.3, -13.0, 20.5, -4.4, 130.2], + [26.7, 32.9, -49.8, 7.9, -134.1, -9.4, 58.4, -150.8, -41.9, 141.6], + [34.3, -74.1, 161.6, 26.1, -152.1, 26.7, 32.9, -49.8, 7.9, -134.1], + [-34.2, -35.1, 75.5, 8.0, 105.1, 34.3, -74.1, 161.6, 26.1, -152.1], + [-21.9, 54.5, -151.3, -49.8, 199.9, -34.2, -35.1, 75.5, 8.0, 105.1], + [19.2, 54.9, -98.1, -8.1, -95.3, -21.9, 54.5, -151.3, -49.8, 199.9], + [47.4, -72.0, 158.9, 22.4, -192.6, 19.2, 54.9, -98.1, -8.1, -95.3], + [-25.7, -60.3, 129.5, 21.9, 54.6, 47.4, -72.0, 158.9, 22.4, -192.6], + [-26.9, 42.2, -136.6, -48.4, 243.1, -25.7, -60.3, 129.5, 21.9, 54.6], + [12.6, 79.6, -153.1, -11.6, -57.4, -26.9, 42.2, -136.6, -48.4, 243.1], + [54.1, -66.7, 153.3, 17.7, -241.5, 12.6, 79.6, -153.1, -11.6, -57.4], + [-26.1, -79.5, 185.0, 25.9, 19.5, 54.1, -66.7, 153.3, 17.7, -241.5], + [-34.8, 36.3, -133.8, -54.2, 311.0, -26.1, -79.5, 185.0, 25.9, 19.5], + [10.6, 105.3, -219.7, -19.1, -15.3, -34.8, 36.3, -133.8, -54.2, 311.0], + [73.9, -58.5, 138.7, 25.1, -307.4, 10.6, 105.3, -219.7, -19.1, -15.3], + [-17.7, -107.1, 257.4, 47.9, -58.4, 73.9, -58.5, 138.7, 25.1, -307.4], + [-51.3, 17.0, -93.4, -59.1, 354.7, -17.7, -107.1, 257.4, 47.9, -58.4], + [-13.4, 140.4, -287.4, -43.0, 83.7, -51.3, 17.0, -93.4, -59.1, 354.7], + [85.8, -28.4, 71.1, 11.6, -310.8, -13.4, 140.4, -287.4, -43.0, 83.7], + [4.5, -130.3, 310.9, 68.7, -171.8, 85.8, -28.4, 71.1, 11.6, -310.8], + [-51.1, -25.3, -12.3, -46.8, 361.3, 4.5, -130.3, 310.9, 68.7, -171.8], + [-34.2, 158.2, -341.9, -48.7, 191.1, -51.1, -25.3, -12.3, -46.8, 361.3], + [92.6, 7.7, -9.7, -2.9, -314.2, -34.2, 158.2, -341.9, -48.7, 191.1], + [20.2, -143.1, 363.2, 83.7, -294.6, 92.6, 7.7, -9.7, -2.9, -314.2], + [-54.6, -69.6, 82.3, -36.4, 356.7, 20.2, -143.1, 363.2, 83.7, -294.6], + [-60.4, 176.2, -390.3, -61.8, 319.4, -54.6, -69.6, 82.3, -36.4, 356.7], + [97.2, 53.8, -116.7, -18.9, -289.0, -60.4, 176.2, -390.3, -61.8, 319.4], + [46.0, -152.8, 400.1, 103.7, -440.8, 97.2, 53.8, -116.7, -18.9, -289.0], + [-52.1, -124.2, 206.7, -16.7, 307.3, 46, -152.8, 400.1, 103.7, -440.8], + [-93.8, 183.7, -415.9, -74.5, 465.6, -52, -124.2, 206.7, -16.7, 307.3], + [92.4, 118.4, -265.0, -47.7, -202, -93.8, 183.7, -415.9, -74.5, 465.6], + [76.5, -140.2, 396.2, 116.8, -592.8, 92.4, 118.4, -265, -47.7, -202], + ] + ), + "Y_2": np.array( + [ + [22.4, 6.9, -22.7, -6.3, -40.2], + [11.0, -26.3, 66.7, 21.5, -82.3], + [-4.9, -31.2, 50.6, -5.4, 42.7], + [-21.5, 26.1, -56.8, -15.4, 82.4], + [14.4, 20.3, -51.3, -18.8, -4.6], + [16.5, -17.0, 48.2, 18.7, -100.2], + [5.1, -38.8, 70.5, 5.1, 8.6], + [-19.4, 20.5, -47.8, -4.3, 86.9], + [9.8, 28.7, -69.5, -18.3, 8.2], + [9.5, -7.1, 42.4, 13.3, -107.0], + [2.8, -42.5, 83.7, 1.5, 3.1], + [-25.7, 18.6, -42.4, -8.5, 112.6], + [11.5, 34.3, -94.2, -20.5, 33.3], + [19.0, -9.7, 39.5, 18.2, -137.5], + [7.9, -60.4, 122.3, 8.1, -33.3], + [-32.5, 6.0, -19.4, -11.0, 130.2], + [3.5, 48.0, -127.1, -33.7, 81.8], + [28.4, 7.6, 1.0, 16.3, -146.2], + [20.0, -71.8, 153.1, 17.9, -93.3], + [-34.3, -13.0, 20.5, -4.4, 130.2], + [-9.4, 58.4, -150.8, -41.9, 141.6], + [26.7, 32.9, -49.8, 7.9, -134.1], + [34.3, -74.1, 161.6, 26.1, -152.1], + [-34.2, -35.1, 75.5, 8.0, 105.1], + [-21.9, 54.5, -151.3, -49.8, 199.9], + [19.2, 54.9, -98.1, -8.1, -95.3], + [47.4, -72.0, 158.9, 22.4, -192.6], + [-25.7, -60.3, 129.5, 21.9, 54.6], + [-26.9, 42.2, -136.6, -48.4, 243.1], + [12.6, 79.6, -153.1, -11.6, -57.4], + [54.1, -66.7, 153.3, 17.7, -241.5], + [-26.1, -79.5, 185.0, 25.9, 19.5], + [-34.8, 36.3, -133.8, -54.2, 311.0], + [10.6, 105.3, -219.7, -19.1, -15.3], + [73.9, -58.5, 138.7, 25.1, -307.4], + [-17.7, -107.1, 257.4, 47.9, -58.4], + [-51.3, 17.0, -93.4, -59.1, 354.7], + [-13.4, 140.4, -287.4, -43.0, 83.7], + [85.8, -28.4, 71.1, 11.6, -310.8], + [4.5, -130.3, 310.9, 68.7, -171.8], + [-51.1, -25.3, -12.3, -46.8, 361.3], + [-34.2, 158.2, -341.9, -48.7, 191.1], + [92.6, 7.7, -9.7, -2.9, -314.2], + [20.2, -143.1, 363.2, 83.7, -294.6], + [-54.6, -69.6, 82.3, -36.4, 356.7], + [-60.4, 176.2, -390.3, -61.8, 319.4], + [97.2, 53.8, -116.7, -18.9, -289.0], + [46.0, -152.8, 400.1, 103.7, -440.8], + [-52.1, -124.2, 206.7, -16.7, 307.3], + [-93.8, 183.7, -415.9, -74.5, 465.6], + ] + ), + } + + data["Y"] = np.array( + [list(y1) + list(y2) for y1, y2 in zip(data["Y_1"], data["Y_2"])] + ) + del data["Y_1"] + del data["Y_2"] + return data + + @pytest.fixture def adjacency_mat_num_stability() -> np.ndarray: """ diff --git a/tests/structure/data_generators/__init__.py b/tests/structure/data_generators/__init__.py new file mode 100644 index 0000000..5da8261 --- /dev/null +++ b/tests/structure/data_generators/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/structure/data_generators/test_core.py b/tests/structure/data_generators/test_core.py new file mode 100644 index 0000000..e1dad0d --- /dev/null +++ b/tests/structure/data_generators/test_core.py @@ -0,0 +1,543 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +import operator +from typing import Hashable, Tuple, Union + +import networkx as nx +import numpy as np +import pandas as pd +import pytest +from networkx.algorithms.dag import is_directed_acyclic_graph +from sklearn.gaussian_process.kernels import RBF + +from causalnex.structure.data_generators.core import ( + _sample_binary_from_latent, + _sample_count_from_latent, + _sample_poisson, + generate_structure, + nonlinear_sem_generator, + sem_generator, +) +from causalnex.structure.structuremodel import StructureModel + + +@pytest.fixture +def graph(): + graph = StructureModel() + edges = [(n, n + 1, 1) for n in range(5)] + graph.add_weighted_edges_from(edges) + return graph + + +@pytest.fixture +def schema(): + # use the default schema for 3 + schema = { + 0: "binary", + 1: "categorical:3", + 2: "binary", + 4: "continuous", + 5: "categorical:5", + } + return schema + + +class TestGenerateStructure: + @pytest.mark.parametrize("graph_type", ["erdos-renyi", "barabasi-albert", "full"]) + def test_is_dag_graph_type(self, graph_type): + """ Tests that the generated graph is a dag for all graph types. """ + degree, d_nodes = 4, 10 + sm = generate_structure(d_nodes, degree, graph_type) + assert is_directed_acyclic_graph(sm) + + @pytest.mark.parametrize("num_nodes,degree", [(5, 2), (10, 3), (15, 5)]) + def test_is_dag_nodes_degrees(self, num_nodes, degree): + """Tests that generated graph is dag for different numbers of nodes and degrees""" + sm = generate_structure(num_nodes, degree) + assert nx.is_directed_acyclic_graph(sm) + + def test_bad_graph_type(self): + """ Test that a value other than "erdos-renyi", "barabasi-albert", "full" throws ValueError """ + graph_type = "invalid" + degree, d_nodes = 4, 10 + with pytest.raises( + ValueError, + match="Unknown graph type invalid. Available types" + r" are \['erdos-renyi', 'barabasi-albert', 'full'\]", + ): + generate_structure(d_nodes, degree, graph_type) + + @pytest.mark.parametrize("num_nodes,degree", [(5, 2), (10, 3), (15, 5)]) + def test_expected_num_nodes(self, num_nodes, degree): + """ Test that generated structure has expected number of nodes = num_nodes """ + sm = generate_structure(num_nodes, degree) + assert len(sm.nodes) == num_nodes + + @pytest.mark.parametrize( + "num_nodes,degree,w_range", + [(5, 2, (1, 2)), (10, 3, (100, 200)), (15, 5, (1.0, 1.0))], + ) + def test_weight_range(self, num_nodes, degree, w_range): + """ Test that w_range is respected in output """ + w_min = w_range[0] + w_max = w_range[1] + sm = generate_structure(num_nodes, degree, w_min=w_min, w_max=w_max) + assert all(abs(sm[u][v]["weight"]) >= w_min for u, v in sm.edges) + assert all(abs(sm[u][v]["weight"]) <= w_max for u, v in sm.edges) + + @pytest.mark.parametrize("num_nodes", [-1, 0, 1]) + def test_num_nodes_exception(self, num_nodes): + """ Check a single node graph can't be generated """ + with pytest.raises(ValueError, match="DAG must have at least 2 nodes"): + generate_structure(num_nodes, 1) + + def test_min_max_weights_exception(self): + """ Check that w_range is valid """ + with pytest.raises( + ValueError, + match="Absolute minimum weight must be less than or equal to maximum weight", + ): + generate_structure(4, 1, w_min=0.5, w_max=0) + + def test_min_max_weights_equal(self): + """ If w_range (w, w) has w=w, check abs value of all weights respect this """ + w = 1 + sm = generate_structure(4, 1, w_min=w, w_max=w) + w_mat = nx.to_numpy_array(sm) + assert np.all((w_mat == 0) | (w_mat == w) | (w_mat == -w)) + + def test_erdos_renyi_degree_increases_edges(self): + """ Erdos-Renyi degree increases edges """ + edge_counts = [ + max( + [ + len(generate_structure(100, degree, "erdos-renyi").edges) + for _ in range(10) + ] + ) + for degree in [10, 90] + ] + + assert edge_counts == sorted(edge_counts) + + def test_barabasi_albert_degree_increases_edges(self): + """ Barabasi-Albert degree increases edges """ + edge_counts = [ + max( + [ + len(generate_structure(100, degree, "barabasi-albert").edges) + for _ in range(10) + ] + ) + for degree in [10, 90] + ] + + assert edge_counts == sorted(edge_counts) + + def test_full_network(self): + """ Fully connected network has expected edge counts """ + sm = generate_structure(40, degree=0, graph_type="full") + + assert len(sm.edges) == (40 * 39) / 2 + + +class TestMixedDataGen: + def test_run(self, graph, schema): + df = sem_generator( + graph=graph, + schema=schema, + default_type="continuous", + noise_std=1.0, + n_samples=1000, + intercept=False, + seed=12, + ) + + # test binary: + assert df[0].nunique() == 2 + assert df[2].nunique() == 2 + + # test categorical: + for col in ["1_{}".format(i) for i in range(3)]: + assert df[col].nunique() == 2 + assert len([x for x in df.columns if isinstance(x, str) and "1_" in x]) == 3 + + for col in ["5_{}".format(i) for i in range(5)]: + assert df[col].nunique() == 2 + assert len([x for x in df.columns if isinstance(x, str) and "5_" in x]) == 5 + + # test continuous + assert df[3].nunique() == 1000 + assert df[4].nunique() == 1000 + + def test_graph_not_a_dag(self): + graph = StructureModel() + graph.add_edges_from([(0, 1), (1, 2), (2, 0)]) + + with pytest.raises(ValueError, match="Provided graph is not a DAG"): + _ = sem_generator(graph=graph, seed=42) + + def test_not_permissible_type(self, graph): + schema = { + 0: "unknown data type", + } + with pytest.raises(ValueError, match="Unknown data type"): + _ = sem_generator( + graph=graph, + schema=schema, + default_type="continuous", + noise_std=1.0, + n_samples=1000, + intercept=False, + seed=12, + ) + + def test_missing_cardinality(self, graph): + schema = { + 0: "categorical", + 1: "categorical:3", + 5: "categorical:5", + } + with pytest.raises(ValueError, match="Missing cardinality for categorical"): + _ = sem_generator( + graph=graph, + schema=schema, + default_type="continuous", + noise_std=1.0, + n_samples=1000, + intercept=False, + seed=12, + ) + + def test_missing_default_type(self, graph): + with pytest.raises(ValueError, match="Unknown default data type"): + _ = sem_generator( + graph=graph, + schema=schema, + default_type="unknown", + noise_std=1.0, + n_samples=1000, + intercept=False, + seed=12, + ) + + def test_incorrect_weight_dist(self): + sm = StructureModel() + nodes = list(str(x) for x in range(6)) + np.random.shuffle(nodes) + sm.add_nodes_from(nodes) + + sm.add_weighted_edges_from([("0", "1", None), ("2", "4", None)]) + + with pytest.raises(ValueError, match="Unknown weight distribution"): + _ = sem_generator( + graph=sm, + schema=None, + default_type="continuous", + distributions={"weight": "unknown"}, + noise_std=2.0, + n_samples=1000, + intercept=False, + seed=10, + ) + + def test_incorrect_intercept_dist(self, graph): + with pytest.raises(ValueError, match="Unknown intercept distribution"): + _ = sem_generator( + graph=graph, + schema=None, + default_type="continuous", + distributions={"intercept": "unknown"}, + noise_std=2.0, + n_samples=10, + intercept=True, + seed=10, + ) + + # Seed 20 is an unlucky seed and fails the assertion. All other seeds tested + # pass the assertion. Similar issue to the categorical intercept test? + @pytest.mark.parametrize("seed", (10, 17)) + @pytest.mark.parametrize( + "n_categories", + ( + 2, + 5, + ), + ) + @pytest.mark.parametrize("weight_distribution", ["uniform", "gaussian"]) + @pytest.mark.parametrize("intercept_distribution", ["uniform", "gaussian"]) + def test_mixed_type_independence( + self, seed, n_categories, weight_distribution, intercept_distribution + ): + """ + Test whether the relation is accurate, implicitly tests sequence of + nodes. + """ + np.random.seed(seed) + + sm = StructureModel() + nodes = list(str(x) for x in range(6)) + np.random.shuffle(nodes) + sm.add_nodes_from(nodes) + # binary -> categorical + sm.add_weighted_edges_from([("0", "1", 10)]) + # binary -> continuous + sm.add_weighted_edges_from([("2", "4", None)]) + # binary -> count + sm.add_weighted_edges_from([("2", "6", 100)]) + + schema = { + "0": "binary", + "1": "categorical:{}".format(n_categories), + "2": "binary", + "4": "continuous", + "5": "categorical:{}".format(n_categories), + "6": "count", + } + + df = sem_generator( + graph=sm, + schema=schema, + default_type="continuous", + distributions={ + "weight": weight_distribution, + "intercept": intercept_distribution, + "count": 0.05, + }, + noise_std=2, + n_samples=100000, + intercept=True, + seed=seed, + ) + + atol = 0.05 # 5% difference bewteen joint & factored! + # 1. dependent links + # 0 -> 1 (we look at the class with the highest deviation from uniform + # to avoid small values) + c, _ = max( + [ + (c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories)) + for c in range(n_categories) + ], + key=operator.itemgetter(1), + ) + joint_proba, factored_proba = calculate_proba(df, "0", "1_{}".format(c)) + assert not np.isclose(joint_proba, factored_proba, rtol=0, atol=atol) + # 2 -> 4 + assert not np.isclose( + df["4"].mean(), df["4"][df["2"] == 1].mean(), rtol=0, atol=atol + ) + # binary on count + assert not np.isclose( + df.loc[df["2"] == 0, "6"].mean(), + df.loc[df["2"] == 1, "6"].mean(), + rtol=0, + atol=atol, + ) + + tol = 0.15 # relative tolerance of +- 15% of the + # 2. independent links + # categorical + c, _ = max( + [ + (c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories)) + for c in range(n_categories) + ], + key=operator.itemgetter(1), + ) + joint_proba, factored_proba = calculate_proba(df, "0", "5_{}".format(c)) + assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0) + + # binary + joint_proba, factored_proba = calculate_proba(df, "0", "2") + assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0) + + # categorical + c, _ = max( + [ + (c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories)) + for c in range(n_categories) + ], + key=operator.itemgetter(1), + ) + d, _ = max( + [ + (d, np.abs(df["5_{}".format(d)].mean() - 1 / n_categories)) + for d in range(n_categories) + ], + key=operator.itemgetter(1), + ) + joint_proba, factored_proba = calculate_proba( + df, "1_{}".format(d), "5_{}".format(c) + ) + assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0) + + # continuous + # for gaussian distributions, zero variance is equivalent to independence + assert np.isclose(df[["3", "4"]].corr().values[0, 1], 0, atol=tol) + + +def calculate_proba( + df: Union[pd.DataFrame, np.ndarray], col_0: Hashable, col_1: Hashable +) -> Tuple[float, float]: + if isinstance(df, pd.DataFrame): + marginal_0 = df[col_0].mean() + marginal_1 = df[col_1].mean() + joint_proba = (df[col_0] * df[col_1]).mean() + else: + marginal_0 = df[:, col_0].mean() + marginal_1 = df[:, col_1].mean() + joint_proba = (df[:, col_0] * df[:, col_1]).mean() + + factored_proba = marginal_0 * marginal_1 + return joint_proba, factored_proba + + +@pytest.mark.parametrize("distribution", ["probit", "logit"]) +@pytest.mark.parametrize("max_imbalance", [0.01, 0.05, 0.1, 0.5]) +def test_sample_binary_from_latent_imbalance(max_imbalance, distribution): + """ + Tests max imbalance argument to sample the binary variable. + This way we are guaranteed to always have some positives/negatives. + """ + # corner case: + eta = np.ones(1000) * 1000 + + sample = _sample_binary_from_latent( + latent_mean=eta, + distribution=distribution, + noise_std=0.1, + root_node=False, + max_imbalance=max_imbalance, + ) + tol = 0.01 + assert np.isclose(sample.mean(), max_imbalance, atol=0, rtol=tol) + + +@pytest.mark.parametrize("poisson_lambda", [0.1, 1, 10, 100]) +def test_sample_poisson(poisson_lambda): + """ + We test whether the first two moments match a Poisson distribution + """ + sample = _sample_poisson(np.ones(shape=10000) * poisson_lambda) + tol = 0.05 + assert np.isclose(sample.mean(), poisson_lambda, atol=0, rtol=tol) + assert np.isclose(sample.var(), poisson_lambda, atol=0, rtol=tol) + + +@pytest.mark.parametrize("poisson_lambda", [-0.5, 0.1, 1, 10, 100]) +@pytest.mark.parametrize("zero_inflation_pct", [0.0, 0.01, 0.1, 0.5, 1.0]) +def test_sample_count_from_latent_zero_inflation(poisson_lambda, zero_inflation_pct): + """ + We test whether the zero-inflation is functional using the first two moments. + """ + sample = _sample_count_from_latent( + np.ones(shape=10000) * poisson_lambda, + zero_inflation_pct=zero_inflation_pct, + root_node=False, + ) + if poisson_lambda < 0: + poisson_lambda = np.exp(poisson_lambda) + + tol = 0.1 + assert np.isclose( + sample.mean(), (1 - zero_inflation_pct) * poisson_lambda, atol=0, rtol=tol + ) + assert np.isclose( + sample.var(), + (1 + zero_inflation_pct * poisson_lambda) + * (1 - zero_inflation_pct) + * poisson_lambda, + atol=0, + rtol=tol, + ) + + +class TestCountGenerator: + @pytest.mark.parametrize( + "zero_inflation_pct", [int(0), 0.0, 0.01, 0.1, 0.5, 1.0, int(1)] + ) + def test_only_count(self, graph, zero_inflation_pct): + df = sem_generator( + graph, + default_type="count", + n_samples=1000, + distributions={"count": zero_inflation_pct}, + seed=43, + ) + # count puts a lower bound on the output: + assert np.all(df.min() >= 0) + + # zero inflation puts a lower bound on the zero-share + assert np.all((df == 0).mean() >= zero_inflation_pct) + + # How to test dependence/independence for Poisson? + + @pytest.mark.parametrize("wrong_count_zif", ["text", (0.1,), {0.1}, -0.1, 1.01]) + def test_zif_value_error(self, graph, wrong_count_zif): + """ + Test if ValueError raised for unsupported Zero-Inflation Factor for the + count data type. + """ + with pytest.raises(ValueError, match="Unsupported zero-inflation factor"): + sem_generator( + graph, + default_type="count", + distributions={"count": wrong_count_zif}, + seed=42, + ) + + +class TestNonlinearGenerator: + def test_run(self, graph, schema): + df = nonlinear_sem_generator( + graph=graph, + schema=schema, + kernel=RBF(1), + default_type="continuous", + noise_std=1.0, + n_samples=1000, + seed=13, + ) + + # test binary: + assert df[0].nunique() == 2 + assert df[2].nunique() == 2 + + # test categorical: + for col in ["1_{}".format(i) for i in range(3)]: + assert df[col].nunique() == 2 + assert len([x for x in df.columns if isinstance(x, str) and "1_" in x]) == 3 + + for col in ["5_{}".format(i) for i in range(5)]: + assert df[col].nunique() == 2 + assert len([x for x in df.columns if isinstance(x, str) and "5_" in x]) == 5 + + # test continuous + assert df[3].nunique() == 1000 + assert df[4].nunique() == 1000 diff --git a/tests/structure/test_data_generators.py b/tests/structure/data_generators/test_wrappers.py similarity index 63% rename from tests/structure/test_data_generators.py rename to tests/structure/data_generators/test_wrappers.py index 807a0ea..486db8c 100644 --- a/tests/structure/test_data_generators.py +++ b/tests/structure/data_generators/test_wrappers.py @@ -1,3 +1,4 @@ +# pylint: disable=too-many-lines # Copyright 2019-2020 QuantumBlack Visual Analytics Limited # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,28 +26,30 @@ # # See the License for the specific language governing permissions and # limitations under the License. -import operator +import re import string from itertools import product -from typing import Hashable, Tuple, Union -import networkx as nx import numpy as np import pandas as pd import pytest -from networkx.algorithms.dag import is_directed_acyclic_graph from scipy.stats import anderson, stats +from sklearn.gaussian_process.kernels import RBF +from causalnex.structure import StructureModel from causalnex.structure.data_generators import ( + gen_stationary_dyn_net_and_df, generate_binary_data, generate_binary_dataframe, generate_categorical_dataframe, generate_continuous_data, generate_continuous_dataframe, + generate_count_dataframe, + generate_dataframe_dynamic, generate_structure, - sem_generator, + generate_structure_dynamic, ) -from causalnex.structure.structuremodel import StructureModel +from tests.structure.data_generators.test_core import calculate_proba @pytest.fixture @@ -58,19 +61,6 @@ def graph(): @pytest.fixture -def schema(): - # use the default schema for 3 - schema = { - 0: "binary", - 1: "categorical:3", - 2: "binary", - 4: "continuous", - 5: "categorical:5", - } - return schema - - -@pytest.fixture() def graph_gen(): def generator(num_nodes, seed, weight=None): np.random.seed(seed) @@ -89,102 +79,6 @@ def generator(num_nodes, seed, weight=None): return generator -class TestGenerateStructure: - @pytest.mark.parametrize("graph_type", ["erdos-renyi", "barabasi-albert", "full"]) - def test_is_dag_graph_type(self, graph_type): - """ Tests that the generated graph is a dag for all graph types. """ - degree, d_nodes = 4, 10 - sm = generate_structure(d_nodes, degree, graph_type) - assert is_directed_acyclic_graph(sm) - - @pytest.mark.parametrize("num_nodes,degree", [(5, 2), (10, 3), (15, 5)]) - def test_is_dag_nodes_degrees(self, num_nodes, degree): - """ Tests that generated graph is dag for different numbers of nodes and degrees - """ - sm = generate_structure(num_nodes, degree) - assert nx.is_directed_acyclic_graph(sm) - - def test_bad_graph_type(self): - """ Test that a value other than "erdos-renyi", "barabasi-albert", "full" throws ValueError """ - graph_type = "invalid" - degree, d_nodes = 4, 10 - with pytest.raises(ValueError, match="unknown graph type"): - generate_structure(d_nodes, degree, graph_type) - - @pytest.mark.parametrize("num_nodes,degree", [(5, 2), (10, 3), (15, 5)]) - def test_expected_num_nodes(self, num_nodes, degree): - """ Test that generated structure has expected number of nodes = num_nodes """ - sm = generate_structure(num_nodes, degree) - assert len(sm.nodes) == num_nodes - - @pytest.mark.parametrize( - "num_nodes,degree,w_range", - [(5, 2, (1, 2)), (10, 3, (100, 200)), (15, 5, (1.0, 1.0))], - ) - def test_weight_range(self, num_nodes, degree, w_range): - """ Test that w_range is respected in output """ - w_min = w_range[0] - w_max = w_range[1] - sm = generate_structure(num_nodes, degree, w_min=w_min, w_max=w_max) - assert all(abs(sm[u][v]["weight"]) >= w_min for u, v in sm.edges) - assert all(abs(sm[u][v]["weight"]) <= w_max for u, v in sm.edges) - - @pytest.mark.parametrize("num_nodes", [-1, 0, 1]) - def test_num_nodes_exception(self, num_nodes): - """ Check a single node graph can't be generated """ - with pytest.raises(ValueError, match="DAG must have at least 2 nodes"): - generate_structure(num_nodes, 1) - - def test_min_max_weights_exception(self): - """ Check that w_range is valid """ - with pytest.raises( - ValueError, - match="Absolute minimum weight must be less than or equal to maximum weight", - ): - generate_structure(4, 1, w_min=0.5, w_max=0) - - def test_min_max_weights_equal(self): - """ If w_range (w, w) has w=w, check abs value of all weights respect this """ - w = 1 - sm = generate_structure(4, 1, w_min=w, w_max=w) - w_mat = nx.to_numpy_array(sm) - assert np.all((w_mat == 0) | (w_mat == w) | (w_mat == -w)) - - def test_erdos_renyi_degree_increases_edges(self): - """ Erdos-Renyi degree increases edges """ - edge_counts = [ - max( - [ - len(generate_structure(100, degree, "erdos-renyi").edges) - for _ in range(10) - ] - ) - for degree in [10, 90] - ] - - assert edge_counts == sorted(edge_counts) - - def test_barabasi_albert_degree_increases_edges(self): - """ Barabasi-Albert degree increases edges """ - edge_counts = [ - max( - [ - len(generate_structure(100, degree, "barabasi-albert").edges) - for _ in range(10) - ] - ) - for degree in [10, 90] - ] - - assert edge_counts == sorted(edge_counts) - - def test_full_network(self): - """ Fully connected network has expected edge counts """ - sm = generate_structure(40, degree=0, graph_type="full") - - assert len(sm.edges) == (40 * 39) / 2 - - class TestGenerateContinuousData: @pytest.mark.parametrize( "distribution", ["gaussian", "normal", "student-t", "exponential", "gumbel"] @@ -197,8 +91,8 @@ def test_returns_ndarray(self, distribution): assert isinstance(ndarray, np.ndarray) def test_bad_distribution_type(self): - """ Test that invalid sem-type other than "gaussian", "normal", "student-t", - "exponential", "gumbel" is not accepted """ + """Test that invalid sem-type other than "gaussian", "normal", "student-t", + "exponential", "gumbel" is not accepted""" graph_type, degree, d_nodes = "erdos-renyi", 4, 10 sm = generate_structure(d_nodes, degree, graph_type) with pytest.raises(ValueError, match="Unknown continuous distribution"): @@ -271,7 +165,8 @@ def test_linear_gumbel_parent_dist(self, graph): @pytest.mark.parametrize( "distribution", ["gaussian", "normal", "student-t", "exponential", "gumbel"] ) - def test_intercept(self, distribution): + @pytest.mark.parametrize("noise_scale", [0.0, 0.1]) + def test_intercept(self, distribution, noise_scale): graph = StructureModel() graph.add_node("123") @@ -279,7 +174,7 @@ def test_intercept(self, distribution): graph, n_samples=100000, distribution=distribution, - noise_scale=0, + noise_scale=noise_scale, seed=10, intercept=False, ) @@ -287,12 +182,12 @@ def test_intercept(self, distribution): graph, n_samples=100000, distribution=distribution, - noise_scale=0, + noise_scale=noise_scale, seed=10, intercept=True, ) assert not np.isclose(data_noint[:, 0].mean(), data_intercept[:, 0].mean()) - assert np.isclose(data_noint[:, 0].std(), data_intercept[:, 0].std()) + assert np.isclose(data_noint[:, 0].std(), data_intercept[:, 0].std(), rtol=0.01) @pytest.mark.parametrize("num_nodes", (10, 20, 30)) @pytest.mark.parametrize("seed", (10, 20, 30)) @@ -344,7 +239,8 @@ def test_order_is_correct(self, graph_gen, num_nodes, seed): @pytest.mark.parametrize("noise_std", [0.1, 1, 2]) @pytest.mark.parametrize("intercept", [True, False]) @pytest.mark.parametrize("seed", [10, 12]) - def test_dataframe(self, graph, distribution, noise_std, intercept, seed): + @pytest.mark.parametrize("kernel", [None, RBF(1)]) + def test_dataframe(self, graph, distribution, noise_std, intercept, seed, kernel): """ Tests equivalence of dataframe wrapper """ @@ -355,6 +251,7 @@ def test_dataframe(self, graph, distribution, noise_std, intercept, seed): noise_scale=noise_std, seed=seed, intercept=intercept, + kernel=kernel, ) df = generate_continuous_dataframe( graph, @@ -363,6 +260,7 @@ def test_dataframe(self, graph, distribution, noise_std, intercept, seed): noise_scale=noise_std, seed=seed, intercept=intercept, + kernel=kernel, ) assert np.array_equal(data, df[list(graph.nodes())].values) @@ -432,15 +330,26 @@ def test_intercept_probability_logit(self, graph, distribution): assert not np.isclose(mean_prob, 0.5, atol=0.05) @pytest.mark.parametrize("distribution", ["logit", "probit", "normal"]) - def test_intercept(self, distribution): + @pytest.mark.parametrize("noise_scale", [0.0, 0.1]) + def test_intercept(self, distribution, noise_scale): graph = StructureModel() graph.add_node("123") data_noint = generate_binary_data( - graph, 100000, distribution, noise_scale=0, seed=10, intercept=False + graph, + 100000, + distribution, + noise_scale=noise_scale, + seed=10, + intercept=False, ) data_intercept = generate_binary_data( - graph, 100000, distribution, noise_scale=0, seed=10, intercept=True + graph, + 100000, + distribution, + noise_scale=noise_scale, + seed=10, + intercept=True, ) assert not np.isclose(data_noint[:, 0].mean(), data_intercept[:, 0].mean()) @@ -489,7 +398,8 @@ def test_order_is_correct(self, graph_gen, num_nodes, seed): @pytest.mark.parametrize("noise_std", [0.1, 1, 2]) @pytest.mark.parametrize("intercept", [True, False]) @pytest.mark.parametrize("seed", [10, 12]) - def test_dataframe(self, graph, distribution, noise_std, intercept, seed): + @pytest.mark.parametrize("kernel", [None, RBF(1)]) + def test_dataframe(self, graph, distribution, noise_std, intercept, seed, kernel): """ Tests equivalence of dataframe wrapper """ @@ -500,6 +410,7 @@ def test_dataframe(self, graph, distribution, noise_std, intercept, seed): noise_scale=noise_std, seed=seed, intercept=intercept, + kernel=kernel, ) df = generate_binary_dataframe( graph, @@ -508,6 +419,7 @@ def test_dataframe(self, graph, distribution, noise_std, intercept, seed): noise_scale=noise_std, seed=seed, intercept=intercept, + kernel=kernel, ) assert np.array_equal(data, df[list(graph.nodes())].values) @@ -607,6 +519,51 @@ def test_baseline_probability(self, graph, distribution, n_categories): # without intercept, the probabilities should be fairly uniform assert np.allclose(data.mean(axis=0), 1 / n_categories, atol=0.01, rtol=0) + @pytest.mark.parametrize("distribution", ["logit", "probit", "normal", "gumbel"]) + @pytest.mark.parametrize("noise_std", [0.1, 1, 2]) + @pytest.mark.parametrize("intercept", [True, False]) + @pytest.mark.parametrize("seed", [10, 42]) + @pytest.mark.parametrize("kernel", [None, RBF(1)]) + @pytest.mark.parametrize( + "n_categories", + ( + 2, + 10, + ), + ) + def test_dataframe( + self, graph, distribution, noise_std, intercept, seed, kernel, n_categories + ): + """ + Tests equivalence of dataframe wrapper + """ + data = generate_categorical_dataframe( + graph, + 100, + distribution, + noise_scale=noise_std, + seed=seed, + intercept=intercept, + kernel=kernel, + n_categories=n_categories, + ) + df = generate_categorical_dataframe( + graph, + 100, + distribution, + noise_scale=noise_std, + seed=seed, + intercept=intercept, + kernel=kernel, + n_categories=n_categories, + ) + + cols = [] + for node in graph.nodes(): + for cat in range(n_categories): + cols.append("{}_{}".format(node, cat)) + assert np.array_equal(data, df[cols].values) + @pytest.mark.parametrize( "distribution,n_categories", list(product(["logit", "probit", "normal", "gumbel"], [3, 5, 7])), @@ -626,9 +583,16 @@ def test_intercept_probability(self, graph, distribution, n_categories): ) assert not np.allclose(data.mean(axis=0), 1 / n_categories, atol=0.01, rtol=0) - @pytest.mark.parametrize("n_categories", (2, 10,)) + @pytest.mark.parametrize( + "n_categories", + ( + 2, + 10, + ), + ) @pytest.mark.parametrize("distribution", ["probit", "logit"]) - def test_intercept(self, distribution, n_categories): + @pytest.mark.parametrize("noise_scale", [0.0, 0.1]) + def test_intercept(self, distribution, n_categories, noise_scale): graph = StructureModel() graph.add_node("A") @@ -636,7 +600,7 @@ def test_intercept(self, distribution, n_categories): graph, 100000, distribution, - noise_scale=0.1, + noise_scale=noise_scale, n_categories=n_categories, seed=10, intercept=False, @@ -645,21 +609,28 @@ def test_intercept(self, distribution, n_categories): graph, 100000, distribution, - noise_scale=0.1, + noise_scale=noise_scale, n_categories=n_categories, seed=10, intercept=True, ) - assert np.all( - ~np.isclose( - data_intercept.mean(axis=0), data_noint.mean(axis=0), atol=0.05, rtol=0 - ) - ) + # NOTE: as n_categories increases, the probability that at least one category with + # intercept=True will be the same as intercept=False -> 1.0 + num_similar = np.isclose( + data_intercept.mean(axis=0), data_noint.mean(axis=0), atol=0.05, rtol=0 + ).sum() + assert num_similar < n_categories / 2 @pytest.mark.parametrize("num_nodes", (3, 6)) @pytest.mark.parametrize("seed", (10, 20)) - @pytest.mark.parametrize("n_categories", (2, 6,)) + @pytest.mark.parametrize( + "n_categories", + ( + 2, + 6, + ), + ) @pytest.mark.parametrize("distribution", ["probit", "logit"]) def test_independence(self, graph_gen, seed, num_nodes, n_categories, distribution): """ @@ -692,235 +663,212 @@ def test_independence(self, graph_gen, seed, num_nodes, n_categories, distributi assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0) -class TestMixedDataGen: - def test_run(self, graph, schema): - df = sem_generator( - graph=graph, - schema=schema, - default_type="continuous", - noise_std=1.0, - n_samples=1000, - intercept=False, - seed=12, +class TestGenerateCountData: + def test_zero_lambda(self): + """ + A wrong initialisation could lead to counts always being zero if they dont + have parents. + """ + graph = StructureModel() + graph.add_nodes_from(list(range(20))) + df = generate_count_dataframe(graph, 10000) + assert not np.any(df.mean() == 0) + + @pytest.mark.parametrize("intercept", [True, False]) + @pytest.mark.parametrize("seed", [10, 12]) + @pytest.mark.parametrize("kernel", [None, RBF(1)]) + @pytest.mark.parametrize( + "zero_inflation_factor", [int(0), 0.0, 0.01, 0.1, 0.5, 1.0, int(1)] + ) + def test_dataframe(self, graph, intercept, seed, kernel, zero_inflation_factor): + """ + Tests equivalence of dataframe wrapper + """ + data = generate_count_dataframe( + graph, + 100, + zero_inflation_factor=zero_inflation_factor, + seed=seed, + intercept=intercept, + kernel=kernel, + ) + df = generate_count_dataframe( + graph, + 100, + zero_inflation_factor=zero_inflation_factor, + seed=seed, + intercept=intercept, + kernel=kernel, ) - # test binary: - assert df[0].nunique() == 2 - assert df[0].nunique() == 2 + assert np.array_equal(data, df[list(graph.nodes())].values) - # test categorical: - for col in ["1_{}".format(i) for i in range(3)]: - assert df[col].nunique() == 2 - assert len([x for x in df.columns if isinstance(x, str) and "1_" in x]) == 3 - for col in ["5_{}".format(i) for i in range(5)]: - assert df[col].nunique() == 2 - assert len([x for x in df.columns if isinstance(x, str) and "5_" in x]) == 5 +class TestGenerateStructureDynamic: + @pytest.mark.parametrize("num_nodes", (10, 20)) + @pytest.mark.parametrize("p", [1, 10]) + @pytest.mark.parametrize("degree_intra, degree_inter", [(3, 0), (0, 3), (1, 1)]) + def test_all_nodes_in_structure(self, num_nodes, p, degree_intra, degree_inter): + """both intra- and iter-slice nodes should be in the structure""" + g = generate_structure_dynamic(num_nodes, p, degree_intra, degree_inter) + assert np.all( + [ + "{var}_lag{l_val}".format(var=var, l_val=l_val) in g.nodes + for l_val in range(p + 1) + for var in range(num_nodes) + ] + ) + + def test_naming_nodes(self): + """Nodes should have the format {var}_lag{l}""" + g = generate_structure_dynamic(5, 3, 3, 4) + pattern = re.compile(r"[0-5]_lag[0-3]") + for node in g.nodes: + match = pattern.match(node) + assert match and (match.group() == node) + + def test_degree_zero_implies_no_edges(self): + """If the degree is zero, zero edges are generated. + We test this is true for intra edges (ending in 'lag0') and inter edges + """ + g = generate_structure_dynamic(15, 3, 0, 4) # No intra edges + lags = [(u.split("_lag")[1], v.split("_lag")[1]) for u, v in g.edges] + assert np.all([el[0] != "0" for el in lags]) + g = generate_structure_dynamic(15, 3, 4, 0) + lags = [(u.split("_lag")[1], v.split("_lag")[1]) for u, v in g.edges] + assert np.all([el == ("0", "0") for el in lags]) # only Intra edges + g = generate_structure_dynamic(15, 3, 0, 0) # no edges + assert len(g.edges) == 0 + + def test_edges_have_weights(self): + """all edges must have weight values as floats or int""" + g = generate_structure_dynamic(10, 3, 4, 4) # No intra edges + ws = [w for _, _, w in g.edges(data="weight")] + assert np.all([isinstance(w, (float, int)) for w in ws]) + + def test_raise_error_if_wrong_graph_type(self): + """if the graph_type chosen is not among the options available, raise error""" + with pytest.raises( + ValueError, + match=r"Unknown graph type some_type\. " + r"Available types are \['erdos-renyi', 'barabasi-albert', 'full'\]", + ): + generate_structure_dynamic(10, 10, 10, 10, graph_type_intra="some_type") + with pytest.raises( + ValueError, + match=r"Unknown inter-slice graph type `some_type`\. " + "Valid types are 'erdos-renyi' and 'full'", + ): + generate_structure_dynamic(10, 10, 10, 10, graph_type_inter="some_type") - # test continuous - assert df[3].nunique() == 1000 - assert df[4].nunique() == 1000 + def test_raise_error_if_min_greater_than_max(self): + """if min > max,raise error""" + with pytest.raises( + ValueError, + match="Absolute minimum weight must be " + r"less than or equal to maximum weight\: 3 \> 2", + ): + generate_structure_dynamic(10, 10, 10, 10, w_min_inter=3, w_max_inter=2) - def test_graph_not_a_dag(self): - graph = StructureModel() - graph.add_edges_from([(0, 1), (1, 2), (2, 0)]) - - with pytest.raises(ValueError, match="Provided graph is not a DAG"): - _ = sem_generator(graph=graph) - - def test_not_permissible_type(self, graph): - schema = { - 0: "unknown data type", - } - with pytest.raises(ValueError, match="Unknown data type"): - _ = sem_generator( - graph=graph, - schema=schema, - default_type="continuous", - noise_std=1.0, - n_samples=1000, - intercept=False, - seed=12, - ) + @pytest.mark.parametrize("num_nodes", (10, 20)) + @pytest.mark.parametrize("p", [1, 10]) + def test_full_graph_type(self, num_nodes, p): + """all the connections from past variables to current variables should be there if using `full` graph_type""" + g = generate_structure_dynamic(num_nodes, p, 4, 4, graph_type_inter="full") + lagged_edges = sorted((u, v) for u, v in g.edges if int(u.split("_lag")[1]) > 0) + assert lagged_edges == sorted( + ("{v}_lag{l_}".format(v=v_f, l_=l_), "{v}_lag0".format(v=v_t)) + for l_ in range(1, p + 1) + for v_f in range(num_nodes) # var from + for v_t in range(num_nodes) # var to + ) - def test_missing_cardinality(self, graph): - schema = { - 0: "categorical", - 1: "categorical:3", - 5: "categorical:5", - } - with pytest.raises(ValueError, match="Missing cardinality for categorical"): - _ = sem_generator( - graph=graph, - schema=schema, - default_type="continuous", - noise_std=1.0, - n_samples=1000, - intercept=False, - seed=12, - ) - def test_missing_default_type(self, graph): - with pytest.raises(ValueError, match="Unknown default data type"): - _ = sem_generator( - graph=graph, - schema=schema, - default_type="unknown", - noise_std=1.0, - n_samples=1000, - intercept=False, - seed=12, - ) +class TestGenerateDataframeDynamic: + @pytest.mark.parametrize( + "sem_type", ["linear-gauss", "linear-exp", "linear-gumbel"] + ) + def test_returns_dateframe(self, sem_type): + """ Return value is an ndarray - test over all sem_types """ + graph_type, degree, d_nodes = "erdos-renyi", 4, 10 + sm = generate_structure_dynamic(d_nodes, 2, degree, degree, graph_type) + data = generate_dataframe_dynamic(sm, sem_type=sem_type, n_samples=10) + assert isinstance(data, pd.DataFrame) - def test_incorrect_weight_dist(self): - sm = StructureModel() - nodes = list(str(x) for x in range(6)) - np.random.shuffle(nodes) - sm.add_nodes_from(nodes) + def test_bad_sem_type(self): + """ Test that invalid sem-type other than "linear-gauss", "linear-exp", "linear-gumbel" is not accepted """ + graph_type, degree, d_nodes = "erdos-renyi", 4, 10 + sm = generate_structure_dynamic(d_nodes, 2, degree, degree, graph_type) + with pytest.raises( + ValueError, + match="unknown sem type invalid. Available types are:" + r" \('linear-gauss', 'linear-exp', 'linear-gumbel'\)", + ): + generate_dataframe_dynamic(sm, sem_type="invalid", n_samples=10) - sm.add_weighted_edges_from([("0", "1", None), ("2", "4", None)]) - - with pytest.raises(ValueError, match="Unknown weight distribution"): - _ = sem_generator( - graph=sm, - schema=None, - default_type="continuous", - distributions={"weight": "unknown"}, - noise_std=2.0, - n_samples=1000, - intercept=False, - seed=10, - ) + @pytest.mark.parametrize("p", [0, 1, 2]) + def test_labels_correct(self, p): + graph_type, degree, d_nodes = "erdos-renyi", 4, 10 + sm = generate_structure_dynamic(d_nodes, p, degree, degree, graph_type) + data = generate_dataframe_dynamic(sm, sem_type="linear-gauss", n_samples=10) + intra_nodes = sorted([el for el in sm.nodes if "_lag0" in el]) + inter_nodes = sorted([el for el in sm.nodes if "_lag0" not in el]) + assert sorted(data.columns) == sorted(list(inter_nodes) + list(intra_nodes)) - def test_incorrect_intercept_dist(self, graph): - with pytest.raises(ValueError, match="Unknown intercept distribution"): - _ = sem_generator( - graph=graph, - schema=None, - default_type="continuous", - distributions={"intercept": "unknown"}, - noise_std=2.0, - n_samples=10, - intercept=True, - seed=10, + +class TestGenerateStationaryDynamicStructureAndSamples: + def test_wta(self): + with pytest.warns( + UserWarning, match="Could not simulate data, returning constant dataframe" + ): + gen_stationary_dyn_net_and_df( + w_min_inter=1, w_max_inter=2, max_data_gen_trials=2 ) - # def test_mixed_type_independence(self): - @pytest.mark.parametrize("seed", (10, 20)) - @pytest.mark.parametrize("n_categories", (2, 5,)) - @pytest.mark.parametrize("weight_distribution", ["uniform", "gaussian"]) - @pytest.mark.parametrize("intercept_distribution", ["uniform", "gaussian"]) - def test_mixed_type_independence( - self, seed, n_categories, weight_distribution, intercept_distribution - ): - """ - Test whether the relation is accurate, implicitly tests sequence of - nodes. - """ + @pytest.mark.parametrize("seed", [2, 3, 5]) + def test_seems_stationary(self, seed): np.random.seed(seed) - - sm = StructureModel() - nodes = list(str(x) for x in range(6)) - np.random.shuffle(nodes) - sm.add_nodes_from(nodes) - # binary -> categorical - sm.add_weighted_edges_from([("0", "1", 10)]) - # binary -> continuous - sm.add_weighted_edges_from([("2", "4", None)]) - - schema = { - "0": "binary", - "1": "categorical:{}".format(n_categories), - "2": "binary", - "4": "continuous", - "5": "categorical:{}".format(n_categories), - } - - df = sem_generator( - graph=sm, - schema=schema, - default_type="continuous", - distributions={ - "weight": weight_distribution, - "intercept": intercept_distribution, - }, - noise_std=2, - n_samples=100000, - intercept=True, - seed=seed, + _, df, _, _ = gen_stationary_dyn_net_and_df( + w_min_inter=0.1, w_max_inter=0.2, max_data_gen_trials=2 ) + assert np.all(df.max() - df.min() < 10) - atol = 0.05 # 5% difference bewteen joint & factored! - # 1. dependent links - # 0 -> 1 (we look at the class with the highest deviation from uniform - # to avoid small values) - c, _ = max( - [ - (c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories)) - for c in range(n_categories) - ], - key=operator.itemgetter(1), - ) - joint_proba, factored_proba = calculate_proba(df, "0", "1_{}".format(c)) - assert not np.isclose(joint_proba, factored_proba, rtol=0, atol=atol) - # 2 -> 4 - assert not np.isclose( - df["4"].mean(), df["4"][df["2"] == 1].mean(), rtol=0, atol=atol - ) - - tol = 0.15 # relative tolerance of +- 15% of the - # 2. independent links - # categorical - c, _ = max( - [ - (c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories)) - for c in range(n_categories) - ], - key=operator.itemgetter(1), - ) - joint_proba, factored_proba = calculate_proba(df, "0", "5_{}".format(c)) - assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0) + def test_error_if_wmin_less_wmax(self): + with pytest.raises( + ValueError, + match="Absolute minimum weight must be less than or equal to maximum weight: 2 > 1", + ): + gen_stationary_dyn_net_and_df( + w_min_inter=2, w_max_inter=1, max_data_gen_trials=2 + ) - # binary - joint_proba, factored_proba = calculate_proba(df, "0", "2") - assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0) + def test_dense_networks(self): + """dense network are more likely to be non stationary. we check that the simulator is still able to provide a + stationary time-deries in that case. - # categorical - c, _ = max( - [ - (c, np.abs(df["1_{}".format(c)].mean() - 1 / n_categories)) - for c in range(n_categories) - ], - key=operator.itemgetter(1), - ) - d, _ = max( - [ - (d, np.abs(df["5_{}".format(d)].mean() - 1 / n_categories)) - for d in range(n_categories) - ], - key=operator.itemgetter(1), - ) - joint_proba, factored_proba = calculate_proba( - df, "1_{}".format(d), "5_{}".format(c) - ) - assert np.isclose(joint_proba, factored_proba, rtol=tol, atol=0) - - # continuous - # for gaussian distributions, zero variance is equivalent to independence - assert np.isclose(df[["3", "4"]].corr().values[0, 1], 0, atol=tol) - - -def calculate_proba( - df: Union[pd.DataFrame, np.ndarray], col_0: Hashable, col_1: Hashable -) -> Tuple[float, float]: - if isinstance(df, pd.DataFrame): - marginal_0 = df[col_0].mean() - marginal_1 = df[col_1].mean() - joint_proba = (df[col_0] * df[col_1]).mean() - else: - marginal_0 = df[:, col_0].mean() - marginal_1 = df[:, col_1].mean() - joint_proba = (df[:, col_0] * df[:, col_1]).mean() - - factored_proba = marginal_0 * marginal_1 - return joint_proba, factored_proba + If df contain only ones it means that the generator failed to obtain a stationary structure""" + np.random.seed(4) + _, df, _, _ = gen_stationary_dyn_net_and_df( + n_samples=1000, + p=1, + w_min_inter=0.2, + w_max_inter=0.5, + max_data_gen_trials=10, + degree_intra=4, + degree_inter=7, + ) + assert np.any(np.ones(df.shape) != df) + + def test_fail_to_find_stationary_network(self): + """if fails to find suitable network, returns dataset of ones""" + np.random.seed(5) + _, df, _, _ = gen_stationary_dyn_net_and_df( + n_samples=1000, + p=1, + w_min_inter=0.6, + w_max_inter=0.6, + max_data_gen_trials=20, + degree_intra=4, + degree_inter=7, + ) + assert np.any(np.ones(df.shape) == df) diff --git a/tests/structure/test_dynotears.py b/tests/structure/test_dynotears.py new file mode 100644 index 0000000..bfb8ceb --- /dev/null +++ b/tests/structure/test_dynotears.py @@ -0,0 +1,738 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. +import re + +import networkx as nx +import numpy as np +import pandas as pd +import pytest + +from causalnex.structure.dynotears import from_numpy_dynamic, from_pandas_dynamic + + +class TestFromNumpyDynotears: + """Test behaviour of the learn_dynamic_structure of `from_numpy_dynamic`""" + + def test_empty_data_raises_error(self): + """ + Providing an empty data set should result in a Value Error explaining that data must not be empty. + This error is useful to catch and handle gracefully, because otherwise the user would experience + misleading division by zero, or unpacking errors. + """ + + with pytest.raises( + ValueError, match="Input data X is empty, cannot learn any structure" + ): + from_numpy_dynamic(np.empty([0, 5]), np.zeros([5, 5])) + with pytest.raises( + ValueError, match="Input data Xlags is empty, cannot learn any structure" + ): + from_numpy_dynamic(np.zeros([5, 5]), np.empty([0, 5])) + + def test_nrows_data_mismatch_raises_error(self): + """ + Providing input data and lagged data with different number of rows should result in a Value Error. + """ + + with pytest.raises( + ValueError, match="Input data X and Xlags must have the same number of rows" + ): + from_numpy_dynamic(np.zeros([5, 5]), np.zeros([6, 5])) + + def test_ncols_lagged_data_not_multiple_raises_error(self): + """ + Number of columns of lagged data is not a multiple of those of input data should result in a Value Error. + """ + + with pytest.raises( + ValueError, + match="Number of columns of Xlags must be a multiple of number of columns of X", + ): + from_numpy_dynamic(np.zeros([5, 5]), np.zeros([5, 6])) + + def test_single_iter_gets_converged_fail_warnings(self, data_dynotears_p1): + """ + With a single iteration on this dataset, learn_structure fails to converge and should give warnings. + """ + + with pytest.warns( + UserWarning, match=r"Failed to converge\. Consider increasing max_iter." + ): + from_numpy_dynamic( + data_dynotears_p1["X"], data_dynotears_p1["Y"], max_iter=1 + ) + + def test_naming_nodes(self, data_dynotears_p3): + """ + Nodes should have the format {var}_lag{l} + """ + sm = from_numpy_dynamic(data_dynotears_p3["X"], data_dynotears_p3["Y"]) + pattern = re.compile(r"[0-5]_lag[0-3]") + + for node in sm.nodes: + match = pattern.match(node) + assert match + assert match.group() == node + + def test_inter_edges(self, data_dynotears_p3): + """ + inter-slice edges must be {var}_lag{l} -> {var'}_lag0 , l > 0 + """ + + sm = from_numpy_dynamic(data_dynotears_p3["X"], data_dynotears_p3["Y"]) + + for start, end in sm.edges: + if int(start[-1]) > 0: + assert int(end[-1]) == 0 + + def test_expected_structure_learned_p1(self, data_dynotears_p1): + """ + Given a small data set with p=1, find all the intra-slice edges and the majority of the inter-slice ones + """ + + sm = from_numpy_dynamic( + data_dynotears_p1["X"], data_dynotears_p1["Y"], w_threshold=0.2 + ) + w_edges = [ + ("{i}_lag0".format(i=i), "{j}_lag0".format(j=j)) + for i in range(5) + for j in range(5) + if data_dynotears_p1["W"][i, j] != 0 + ] + a_edges = [ + ("{i_1}_lag{i_2}".format(i_1=i % 5, i_2=1 + i // 5), "{j}_lag0".format(j=j)) + for i in range(5) + for j in range(5) + if data_dynotears_p1["A"][i, j] != 0 + ] + + edges_in_sm_and_a = [el for el in sm.edges if el in a_edges] + sm_inter_edges = [el for el in sm.edges if "lag0" not in el[0]] + + assert sorted([el for el in sm.edges if "lag0" in el[0]]) == sorted(w_edges) + assert len(edges_in_sm_and_a) / len(a_edges) > 0.6 + assert len(edges_in_sm_and_a) / len(sm_inter_edges) > 0.9 + + def test_expected_structure_learned_p2(self, data_dynotears_p2): + """ + Given a small data set with p=2, all the intra-slice must be correct, and 90%+ found. + the majority of the inter edges must be found too + """ + + sm = from_numpy_dynamic( + data_dynotears_p2["X"], data_dynotears_p2["Y"], w_threshold=0.25 + ) + w_edges = [ + ("{i}_lag0".format(i=i), "{j}_lag0".format(j=j)) + for i in range(5) + for j in range(5) + if data_dynotears_p2["W"][i, j] != 0 + ] + a_edges = [ + ("{i_1}_lag{i_2}".format(i_1=i % 5, i_2=1 + i // 5), "{j}_lag0".format(j=j)) + for i in range(5) + for j in range(5) + if data_dynotears_p2["A"][i, j] != 0 + ] + + edges_in_sm_and_a = [el for el in sm.edges if el in a_edges] + sm_inter_edges = [el for el in sm.edges if "lag0" not in el[0]] + sm_intra_edges = [el for el in sm.edges if "lag0" in el[0]] + + assert len([el for el in sm_intra_edges if el not in w_edges]) == 0 + assert ( + len([el for el in w_edges if el not in sm_intra_edges]) / len(w_edges) + <= 1.0 + ) + assert len(edges_in_sm_and_a) / len(a_edges) > 0.5 + assert len(edges_in_sm_and_a) / len(sm_inter_edges) > 0.5 + + def test_tabu_parents(self, data_dynotears_p2): + """ + If tabu relationships are set, the corresponding edges must not exist + """ + + sm = from_numpy_dynamic( + data_dynotears_p2["X"], + data_dynotears_p2["Y"], + tabu_parent_nodes=[1], + ) + assert not [el for el in sm.edges if "1_lag" in el[0]] + + def test_tabu_children(self, data_dynotears_p2): + """ + If tabu relationships are set, the corresponding edges must not exist + """ + sm = from_numpy_dynamic( + data_dynotears_p2["X"], + data_dynotears_p2["Y"], + tabu_child_nodes=[4], + ) + assert not ([el for el in sm.edges if "4_lag" in el[1]]) + + sm = from_numpy_dynamic( + data_dynotears_p2["X"], + data_dynotears_p2["Y"], + tabu_child_nodes=[1], + ) + assert not ([el for el in sm.edges if "1_lag" in el[1]]) + + def test_tabu_edges(self, data_dynotears_p2): + """ + Tabu edges must not be in the edges learnt + """ + sm = from_numpy_dynamic( + data_dynotears_p2["X"], + data_dynotears_p2["Y"], + tabu_edges=[(0, 2, 4), (0, 0, 3), (1, 1, 4), (1, 3, 4)], + ) + + assert ("2_lag0", "4_lag0") not in sm.edges + assert ("0_lag0", "3_lag0") not in sm.edges + assert ("1_lag1", "4_lag0") not in sm.edges + assert ("3_lag1", "4_lag0") not in sm.edges + + def test_multiple_tabu(self, data_dynotears_p2): + """ + If tabu relationships are set, the corresponding edges must not exist + """ + sm = from_numpy_dynamic( + data_dynotears_p2["X"], + data_dynotears_p2["Y"], + tabu_edges=[(0, 1, 4), (0, 0, 3), (1, 1, 4), (1, 3, 4)], + tabu_child_nodes=[0, 1], + tabu_parent_nodes=[3], + ) + + assert ("1_lag0", "4_lag0") not in sm.edges + assert ("0_lag0", "3_lag0") not in sm.edges + assert ("1_lag1", "4_lag0") not in sm.edges + assert ("3_lag1", "4_lag0") not in sm.edges + assert not ([el for el in sm.edges if "0_lag" in el[1]]) + assert not ([el for el in sm.edges if "1_lag" in el[1]]) + assert not ([el for el in sm.edges if "3_lag" in el[0]]) + + def test_all_columns_in_structure(self, data_dynotears_p2): + """Every columns that is in the data should become a node in the learned structure""" + sm = from_numpy_dynamic( + data_dynotears_p2["X"], + data_dynotears_p2["Y"], + ) + assert sorted(sm.nodes) == [ + "{var}_lag{l_val}".format(var=var, l_val=l_val) + for var in range(5) + for l_val in range(3) + ] + + def test_isolated_nodes_exist(self, data_dynotears_p2): + """Isolated nodes should still be in the learned structure""" + sm = from_numpy_dynamic( + data_dynotears_p2["X"], data_dynotears_p2["Y"], w_threshold=1 + ) + assert len(sm.edges) == 2 + assert len(sm.nodes) == 15 + + def test_edges_contain_weight(self, data_dynotears_p2): + """Edges must contain the 'weight' from the adjacent table """ + sm = from_numpy_dynamic(data_dynotears_p2["X"], data_dynotears_p2["Y"]) + assert np.all( + [ + isinstance(w, (float, int, np.number)) + for u, v, w in sm.edges(data="weight") + ] + ) + + def test_certain_relationships_get_near_certain_weight(self): + """If a == b always, ther should be an edge a->b or b->a with coefficient close to one """ + + np.random.seed(17) + data = pd.DataFrame( + [[np.sqrt(el), np.sqrt(el)] for el in np.random.choice(100, size=500)], + columns=["a", "b"], + ) + sm = from_numpy_dynamic(data.values[1:], data.values[:-1], w_threshold=0.1) + edge = ( + sm.get_edge_data("1_lag0", "0_lag0") or sm.get_edge_data("0_lag0", "1_lag0") + )["weight"] + + assert 0.99 < edge <= 1.01 + + def test_inverse_relationships_get_negative_weight(self): + """If a == -b always, ther should be an edge a->b or b->a with coefficient close to minus one """ + + np.random.seed(17) + data = pd.DataFrame( + [[el, -el] for el in np.random.choice(100, size=500)], columns=["a", "b"] + ) + sm = from_numpy_dynamic(data.values[1:], data.values[:-1], w_threshold=0.1) + edge = ( + sm.get_edge_data("1_lag0", "0_lag0") or sm.get_edge_data("0_lag0", "1_lag0") + )["weight"] + assert -1.01 < edge <= -0.99 + + def test_no_cycles(self, data_dynotears_p2): + """ + The learned structure should be acyclic + """ + + sm = from_numpy_dynamic( + data_dynotears_p2["X"], data_dynotears_p2["Y"], w_threshold=0.05 + ) + assert nx.algorithms.is_directed_acyclic_graph(sm) + + def test_tabu_edges_on_non_existing_edges_do_nothing(self, data_dynotears_p2): + """If tabu edges do not exist in the original unconstrained network then nothing changes""" + sm = from_numpy_dynamic( + data_dynotears_p2["X"], data_dynotears_p2["Y"], w_threshold=0.2 + ) + + sm_2 = from_numpy_dynamic( + data_dynotears_p2["X"], + data_dynotears_p2["Y"], + w_threshold=0.2, + tabu_edges=[(0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3)], + ) + assert set(sm_2.edges) == set(sm.edges) + + +class TestFromPandasDynotears: + """Test behaviour of the learn_dynamic_structure of `from_pandas_dynamic`""" + + def test_empty_data_raises_error(self): + """ + Providing an empty data set should result in a Value Error explaining that data must not be empty. + This error is useful to catch and handle gracefully, because otherwise the user would experience + misleading division by zero, or unpacking errors. + """ + with pytest.raises( + ValueError, match="Input data X is empty, cannot learn any structure" + ): + from_pandas_dynamic(pd.DataFrame(np.empty([2, 5])), p=2) + + with pytest.raises( + ValueError, match="Input data X is empty, cannot learn any structure" + ): + from_pandas_dynamic(pd.DataFrame(np.empty([1, 5])), p=1) + + with pytest.raises( + ValueError, match="Input data X is empty, cannot learn any structure" + ): + from_pandas_dynamic(pd.DataFrame(np.empty([0, 5])), p=1) + + def test_single_iter_gets_converged_fail_warnings(self, data_dynotears_p1): + """ + With a single iteration on this dataset, learn_structure fails to converge and should give warnings. + """ + + with pytest.warns( + UserWarning, match=r"Failed to converge\. Consider increasing max_iter." + ): + from_pandas_dynamic(pd.DataFrame(data_dynotears_p1["X"]), p=1, max_iter=1) + + def test_naming_nodes(self, data_dynotears_p3): + """ + Nodes should have the format {var}_lag{l} + """ + sm = from_pandas_dynamic( + pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]), + p=3, + ) + pattern = re.compile(r"[abcde]_lag[0-3]") + + for node in sm.nodes: + match = pattern.match(node) + assert match + assert match.group() == node + + def test_inter_edges(self, data_dynotears_p3): + """ + inter-slice edges must be {var}_lag{l} -> {var'}_lag0 , l > 0 + """ + + sm = from_pandas_dynamic( + pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]), + p=3, + ) + for start, end in sm.edges: + if int(start[-1]) > 0: + assert int(end[-1]) == 0 + + def test_expected_structure_learned_p1(self, data_dynotears_p1): + """ + Given a small data set with p=1, find all the intra-slice edges and the majority of the inter-slice ones + """ + df = pd.DataFrame(data_dynotears_p1["X"], columns=["a", "b", "c", "d", "e"]) + df.loc[-1, :] = data_dynotears_p1["Y"][0, :] + df = df.sort_index() + + sm = from_pandas_dynamic( + df, + p=1, + w_threshold=0.2, + ) + map_ = dict(zip(range(5), ["a", "b", "c", "d", "e"])) + w_edges = [ + ("{i}_lag0".format(i=map_[i]), "{j}_lag0".format(j=map_[j])) + for i in range(5) + for j in range(5) + if data_dynotears_p1["W"][i, j] != 0 + ] + a_edges = [ + ( + "{i_1}_lag{i_2}".format(i_1=map_[i % 5], i_2=1 + i // 5), + "{j}_lag0".format(j=map_[j]), + ) + for i in range(5) + for j in range(5) + if data_dynotears_p1["A"][i, j] != 0 + ] + + edges_in_sm_and_a = [el for el in sm.edges if el in a_edges] + sm_inter_edges = [el for el in sm.edges if "lag0" not in el[0]] + assert sorted(el for el in sm.edges if "lag0" in el[0]) == sorted(w_edges) + assert len(edges_in_sm_and_a) / len(a_edges) > 0.6 + assert len(edges_in_sm_and_a) / len(sm_inter_edges) > 0.9 + + def test_expected_structure_learned_p2(self, data_dynotears_p2): + """ + Given a small data set with p=2, all the intra-slice must be correct, and 90%+ found. + the majority of the inter edges must be found too + """ + df = pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"]) + df.loc[-1, :] = data_dynotears_p2["Y"][0, :5] + df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10] + + df = df.sort_index() + + sm = from_pandas_dynamic( + df, + p=2, + w_threshold=0.25, + ) + map_ = dict(zip(range(5), ["a", "b", "c", "d", "e"])) + w_edges = [ + ("{i}_lag0".format(i=map_[i]), "{j}_lag0".format(j=map_[j])) + for i in range(5) + for j in range(5) + if data_dynotears_p2["W"][i, j] != 0 + ] + a_edges = [ + ( + "{i_1}_lag{i_2}".format(i_1=map_[i % 5], i_2=1 + i // 5), + "{j}_lag0".format(j=map_[j]), + ) + for i in range(5) + for j in range(5) + if data_dynotears_p2["A"][i, j] != 0 + ] + + edges_in_sm_and_a = [el for el in sm.edges if el in a_edges] + sm_inter_edges = [el for el in sm.edges if "lag0" not in el[0]] + sm_intra_edges = [el for el in sm.edges if "lag0" in el[0]] + + assert len([el for el in sm_intra_edges if el not in w_edges]) == 0 + assert ( + len([el for el in w_edges if el not in sm_intra_edges]) / len(w_edges) + <= 1.0 + ) + assert len(edges_in_sm_and_a) / len(a_edges) > 0.5 + assert len(edges_in_sm_and_a) / len(sm_inter_edges) > 0.5 + + def test_tabu_parents(self, data_dynotears_p3): + """ + If tabu relationships are set, the corresponding edges must not exist + """ + + sm = from_pandas_dynamic( + pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]), + p=3, + tabu_parent_nodes=["a"], + ) + assert not [el for el in sm.edges if "a_lag" in el[0]] + + def test_tabu_children(self, data_dynotears_p3): + """ + If tabu relationships are set, the corresponding edges must not exist + """ + sm = from_pandas_dynamic( + pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]), + p=3, + tabu_child_nodes=["c", "d"], + ) + assert not ([el for el in sm.edges if "c_lag" in el[1]]) + assert not ([el for el in sm.edges if "d_lag" in el[1]]) + sm = from_pandas_dynamic( + pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]), + p=3, + tabu_child_nodes=["a", "b"], + ) + assert not ([el for el in sm.edges if "a_lag" in el[1]]) + assert not ([el for el in sm.edges if "b_lag" in el[1]]) + + def test_tabu_edges(self, data_dynotears_p3): + """ + Tabu edges must not be in the edges learnt + """ + sm = from_pandas_dynamic( + pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]), + p=3, + tabu_edges=[(0, "c", "e"), (0, "a", "d"), (1, "b", "e"), (1, "d", "e")], + ) + + assert ("c_lag0", "e_lag0") not in sm.edges + assert ("a_lag0", "d_lag0") not in sm.edges + assert ("b_lag1", "e_lag0") not in sm.edges + assert ("d_lag1", "e_lag0") not in sm.edges + + def test_multiple_tabu(self, data_dynotears_p3): + """ + If tabu relationships are set, the corresponding edges must not exist + """ + sm = from_pandas_dynamic( + pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]), + p=3, + tabu_edges=[(0, "a", "e"), (0, "a", "d"), (1, "b", "e"), (1, "d", "e")], + tabu_child_nodes=["a", "b"], + tabu_parent_nodes=["d"], + ) + + assert ("a_lag0", "e_lag0") not in sm.edges + assert ("a_lag0", "d_lag0") not in sm.edges + assert ("b_lag1", "e_lag0") not in sm.edges + assert ("d_lag1", "e_lag0") not in sm.edges + assert not ([el for el in sm.edges if "a_lag" in el[1]]) + assert not ([el for el in sm.edges if "b_lag" in el[1]]) + assert not ([el for el in sm.edges if "d_lag" in el[0]]) + + def test_all_columns_in_structure(self, data_dynotears_p2): + """Every columns that is in the data should become a node in the learned structure""" + sm = from_pandas_dynamic( + pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"]), + p=2, + w_threshold=0.4, + ) + assert sorted(sm.nodes) == [ + "{var}_lag{l_val}".format(var=var, l_val=l_val) + for var in ["a", "b", "c", "d", "e"] + for l_val in range(3) + ] + + def test_isolated_nodes_exist(self, data_dynotears_p2): + """Isolated nodes should still be in the learned structure""" + df = pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"]) + df.loc[-1, :] = data_dynotears_p2["Y"][0, :5] + df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10] + df = df.sort_index() + + sm = from_pandas_dynamic(df, p=2, w_threshold=1) + assert len(sm.edges) == 2 + assert len(sm.nodes) == 15 + + def test_edges_contain_weight(self, data_dynotears_p3): + """Edges must contain the 'weight' from the adjacent table """ + sm = from_pandas_dynamic( + pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]), + p=3, + ) + assert np.all( + [ + isinstance(w, (float, int, np.number)) + for u, v, w in sm.edges(data="weight") + ] + ) + + def test_certain_relationships_get_near_certain_weight(self): + """If a == b always, ther should be an edge a->b or b->a with coefficient close to one """ + + np.random.seed(17) + data = pd.DataFrame( + [[np.sqrt(el), np.sqrt(el)] for el in np.random.choice(100, size=500)], + columns=["a", "b"], + ) + sm = from_pandas_dynamic(data, p=1, w_threshold=0.1) + edge = ( + sm.get_edge_data("b_lag0", "a_lag0") or sm.get_edge_data("a_lag0", "b_lag0") + )["weight"] + + assert 0.99 < edge <= 1.01 + + def test_inverse_relationships_get_negative_weight(self): + """If a == -b always, there should be an edge a->b or b->a with coefficient close to minus one """ + + np.random.seed(17) + data = pd.DataFrame( + [[el, -el] for el in np.random.choice(100, size=500)], columns=["a", "b"] + ) + sm = from_pandas_dynamic(data, p=1, w_threshold=0.1) + edge = ( + sm.get_edge_data("b_lag0", "a_lag0") or sm.get_edge_data("a_lag0", "b_lag0") + )["weight"] + assert -1.01 < edge <= -0.99 + + def test_no_cycles(self, data_dynotears_p2): + """ + The learned structure should be acyclic + """ + sm = from_pandas_dynamic( + pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"]), + p=2, + w_threshold=0.05, + ) + assert nx.algorithms.is_directed_acyclic_graph(sm) + + def test_tabu_edges_on_non_existing_edges_do_nothing(self, data_dynotears_p2): + """If tabu edges do not exist in the original unconstrained network then nothing changes""" + df = pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"]) + df.loc[-1, :] = data_dynotears_p2["Y"][0, :5] + df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10] + df = df.sort_index() + + sm = from_pandas_dynamic( + df, + p=2, + w_threshold=0.2, + ) + sm_2 = from_pandas_dynamic( + df, + p=2, + w_threshold=0.2, + tabu_edges=[(0, "a", "a"), (0, "a", "b"), (0, "a", "c"), (0, "a", "d")], + ) + assert set(sm_2.edges) == set(sm.edges) + + def test_list_of_dfs_as_input(self, data_dynotears_p2): + """ + the result when given a list of dataframes should be the same as a single dataframe. + Also, stacking two dataframes should give the same result as well + """ + df = pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"]) + df.loc[-1, :] = data_dynotears_p2["Y"][0, :5] + df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10] + + df = df.sort_index() + df_ = df.copy() + df_.index = range(100, 152) + df = pd.concat([df, df_]) + sm = from_pandas_dynamic(df, p=2, w_threshold=0.05) + sm_1 = from_pandas_dynamic([df], p=2, w_threshold=0.05) + sm_2 = from_pandas_dynamic([df, df], p=2, w_threshold=0.05) + + assert list(sm_2.edges) == list(sm_1.edges) + assert list(sm.edges) == list(sm_1.edges) + + weights = np.array([w for _, _, w in sm.edges(data="weight")]) + weights_1 = np.array([w for _, _, w in sm_1.edges(data="weight")]) + weights_2 = np.array([w for _, _, w in sm_2.edges(data="weight")]) + assert np.max(np.abs(weights - weights_1)) < 0.001 + assert np.max(np.abs(weights - weights_2)) < 0.001 + + def test_discondinuity(self): + """ + The result when having a point of discontinuity must be the same as if we cut the df in two (on the discont. + point) and provide the two datasets as input. + + This is because, inside, the algorithm cuts the dfs into continuous datasets + """ + np.random.seed(12) + df = pd.DataFrame(np.random.random([100, 5]), columns=["a", "b", "c", "d", "e"]) + df_2 = pd.DataFrame( + np.random.random([100, 5]), + columns=["a", "b", "c", "d", "e"], + index=np.arange(200, 300), + ) + + sm = from_pandas_dynamic(pd.concat([df, df_2], axis=0), p=2, w_threshold=0.05) + sm_1 = from_pandas_dynamic([df, df_2], p=2, w_threshold=0.05) + + assert [(u, v, round(w, 3)) for u, v, w in sm_1.edges(data="weight")] == [ + (u, v, round(w, 3)) for u, v, w in sm.edges(data="weight") + ] + + def test_incorrect_input_format(self): + with pytest.raises( + ValueError, + match="Provided empty list of time_series." + " At least one DataFrame must be provided", + ): + from_pandas_dynamic([], 1) + + with pytest.raises( + ValueError, + match=r"All columns must have numeric data\. " + r"Consider mapping the following columns to int: \['a'\]", + ): + from_pandas_dynamic(pd.DataFrame([["1"]], columns=["a"]), 1) + + with pytest.raises( + TypeError, + match="Time series entries must be instances of `pd.DataFrame`", + ): + from_pandas_dynamic([np.array([1, 2])], 1) + + with pytest.raises( + ValueError, + match="Index for dataframe must be provided in increasing order", + ): + df = pd.DataFrame(np.random.random([5, 5]), index=[3, 1, 2, 5, 0]) + from_pandas_dynamic(df, 1) + + with pytest.raises( + ValueError, + match="All inputs must have the same columns and same types", + ): + df = pd.DataFrame( + np.random.random([5, 5]), + columns=["a", "b", "c", "d", "e"], + ) + df_2 = pd.DataFrame( + np.random.random([5, 5]), + columns=["a", "b", "c", "d", "f"], + ) + from_pandas_dynamic([df, df_2], 1) + + with pytest.raises( + ValueError, + match="All inputs must have the same columns and same types", + ): + df = pd.DataFrame( + np.random.random([5, 5]), + columns=["a", "b", "c", "d", "e"], + ) + df_2 = pd.DataFrame( + np.random.random([5, 5]), + columns=["a", "b", "c", "d", "e"], + ) + df_2["a"] = df_2["a"].astype(int) + from_pandas_dynamic([df, df_2], 1) + + with pytest.raises( + TypeError, + match="Index must be integers", + ): + df = pd.DataFrame(np.random.random([5, 5]), index=[0, 1, 2, 3.0, 4]) + from_pandas_dynamic(df, 1) diff --git a/tests/structure/test_nonlinear.py b/tests/structure/test_nonlinear.py new file mode 100644 index 0000000..6572943 --- /dev/null +++ b/tests/structure/test_nonlinear.py @@ -0,0 +1,41 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch.nn as nn + +from causalnex.structure.pytorch.nonlinear import LocallyConnected + + +class TestLocallyConnected: + def test_bias_true(self): + lc = LocallyConnected(1, 1, 1, bias=True) + assert isinstance(lc.bias, nn.Parameter) + + def test_bias_false(self): + lc = LocallyConnected(1, 1, 1, bias=False) + assert lc.bias is None diff --git a/tests/structure/test_notears.py b/tests/structure/test_notears.py index b978093..a07aa79 100644 --- a/tests/structure/test_notears.py +++ b/tests/structure/test_notears.py @@ -80,6 +80,29 @@ def test_non_numeric_data_raises_error(self): with pytest.raises(ValueError, match="All columns must have numeric data.*"): from_pandas(pd.DataFrame(data=["x"], columns=["a"])) + def test_array_with_nan_raises_error(self): + """ + Providing a data set including nan should result in a Value Error explaining that data contains nan. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_pandas(pd.DataFrame(data=[np.nan, 0], columns=["a"])) + + def test_array_with_inf_raises_error(self): + """ + Providing a data set including infinite values should result in a Value Error explaining that data + contains infinite values. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_pandas(pd.DataFrame(data=[np.inf, 0], columns=["a"])) + def test_single_iter_gets_converged_fail_warnings(self, train_data_idx): """ With a single iteration on this dataset, learn_structure fails to converge and should give warnings. @@ -206,6 +229,29 @@ def test_non_numeric_data_raises_error(self): with pytest.raises(ValueError, match="All columns must have numeric data.*"): from_pandas_lasso(pd.DataFrame(data=["x"], columns=["a"]), 0.1) + def test_array_with_nan_raises_error(self): + """ + Providing a data set including nan should result in a Value Error explaining that data contains nan. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_pandas_lasso(pd.DataFrame(data=[np.nan, 0], columns=["a"]), 0.1) + + def test_array_with_inf_raises_error(self): + """ + Providing a data set including infinite values should result in a Value Error explaining that data + contains infinite values. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_pandas_lasso(pd.DataFrame(data=[np.inf, 0], columns=["a"]), 0.1) + def test_single_iter_gets_converged_fail_warnings(self, train_data_idx): """ With a single iteration on this dataset, learn_structure fails to converge and should give warnings. @@ -369,6 +415,29 @@ def test_empty_data_raises_error(self): with pytest.raises(ValueError): from_numpy(np.empty([0, 5])) + def test_array_with_nan_raises_error(self): + """ + Providing a data set including nan should result in a Value Error explaining that data contains nan. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_numpy(np.array([[0, np.nan]])) + + def test_array_with_inf_raises_error(self): + """ + Providing a data set including infinite values should result in a Value Error explaining that data + contains infinite values. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_numpy(np.array([[0, np.inf]])) + def test_single_iter_gets_converged_fail_warnings(self, train_data_idx): """ With a single iteration on this dataset, learn_structure fails to converge and should give warnings. @@ -489,6 +558,29 @@ def test_empty_data_raises_error(self): with pytest.raises(ValueError): from_numpy_lasso(np.empty([0, 5]), 0.1) + def test_array_with_nan_raises_error(self): + """ + Providing a data set including nan should result in a Value Error explaining that data contains nan. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_numpy_lasso(np.array([[3, np.nan]]), 0.1) + + def test_array_with_inf_raises_error(self): + """ + Providing a data set including infinite values should result in a Value Error explaining that data + contains infinite values. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_numpy_lasso(np.array([[3, np.inf]]), 0.1) + def test_single_iter_gets_converged_fail_warnings(self, train_data_idx): """ With a single iteration on this dataset, learn_structure fails to converge and should give warnings. @@ -609,7 +701,7 @@ def test_f1score_generated(self, adjacency_mat_num_stability): ) train_model = StructureModel(df.values) X = generate_continuous_dataframe( - StructureModel(df), 50, noise_scale=1, seed=20 + StructureModel(df), 100, noise_scale=1, seed=20 ) g = from_numpy_lasso(X[["a", "b", "c", "d", "e"]].values, 0.1, w_threshold=0.1) right_edges = train_model.edges diff --git a/tests/structure/test_pytorch_notears.py b/tests/structure/test_pytorch_notears.py new file mode 100644 index 0000000..6ec5f8d --- /dev/null +++ b/tests/structure/test_pytorch_notears.py @@ -0,0 +1,422 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + +import networkx as nx +import numpy as np +import pandas as pd +import pytest +import scipy.optimize as sopt +from mock import patch + +from causalnex.structure import StructureModel +from causalnex.structure.data_generators import generate_continuous_dataframe +from causalnex.structure.pytorch.notears import from_numpy, from_pandas + + +class TestFromPandas: + """Test behaviour of the from_pandas method""" + + def test_all_columns_in_structure(self, train_data_idx): + """Every columns that is in the data should become a node in the learned structure""" + + g = from_pandas(train_data_idx) + assert len(g.nodes) == len(train_data_idx.columns) + + def test_isolated_nodes_exist(self, train_data_idx): + """Isolated nodes should still be in the learned structure""" + + g = from_pandas(train_data_idx, w_threshold=1.0) + assert len(g.nodes) == len(train_data_idx.columns) + + def test_expected_structure_learned(self, train_data_idx, train_model): + """Given a small data set that can be examined by hand, the structure should be deterministic""" + + g = from_pandas(train_data_idx, w_threshold=0.25) + assert set(g.edges) == set(train_model.edges) + + def test_empty_data_raises_error(self): + """ + Providing an empty data set should result in a Value Error explaining that data must not be empty. + This error is useful to catch and handle gracefully, because otherwise the user would experience + misleading division by zero, or unpacking errors. + """ + + with pytest.raises(ValueError): + from_pandas(pd.DataFrame(data=[], columns=["a"])) + + def test_non_numeric_data_raises_error(self): + """Only numeric data frames should be supported""" + + with pytest.raises(ValueError, match="All columns must have numeric data.*"): + from_pandas(pd.DataFrame(data=["x"], columns=["a"])) + + def test_single_iter_gets_converged_fail_warnings(self, caplog, train_data_idx): + """ + With a single iteration on this dataset, learn_structure fails to converge and should give warnings. + """ + + with caplog.at_level(logging.WARNING): + from_numpy(train_data_idx.values, max_iter=1) + assert "Failed to converge. Consider increasing max_iter." in caplog.text + + def test_certain_relationships_get_near_certain_weight(self): + """If observations reliably show a==b and !a==!b then the relationship from a->b should be certain""" + + data = pd.DataFrame([[0, 1] for _ in range(10)], columns=["a", "b"]) + g = from_pandas(data) + assert all( + [ + 0.99 <= weight <= 1 + for u, v, weight in g.edges(data="weight") + if u == 0 and v == 1 + ] + ) + + def test_inverse_relationships_get_negative_weight(self): + """If observations indicate a==!b and b==!a then the weight of the relationship from a-> should be negative""" + + data = pd.DataFrame([[0, 1] for _ in range(10)], columns=["a", "b"]) + data.append(pd.DataFrame([[1, 0] for _ in range(10)], columns=["a", "b"])) + g = from_pandas(data) + assert all( + [ + weight < 0 + for u, v, weight in g.edges(data="mean_effect") + if u == 0 and v == 1 + ] + ) + + def test_no_cycles(self, train_data_idx): + """ + The learned structure should be acyclic + """ + + g = from_pandas(train_data_idx, w_threshold=0.25) + assert nx.algorithms.is_directed_acyclic_graph(g) + + def test_tabu_expected_edges(self, train_data_idx): + """Tabu edges should not exist in the network""" + + tabu_e = [("d", "a"), ("b", "c")] + g = from_pandas(train_data_idx, tabu_edges=tabu_e) + assert [e not in g.edges for e in tabu_e] + + def test_tabu_expected_parent_nodes(self, train_data_idx): + """Tabu parent nodes should not have any outgoing edges""" + + tabu_p = ["a", "d", "b"] + g = from_pandas(train_data_idx, tabu_parent_nodes=tabu_p) + assert [p not in [e[0] for e in g.edges] for p in tabu_p] + + def test_tabu_expected_child_nodes(self, train_data_idx): + """Tabu child nodes should not have any ingoing edges""" + + tabu_c = ["a", "d", "b"] + g = from_pandas(train_data_idx, tabu_child_nodes=tabu_c) + assert [c not in [e[1] for e in g.edges] for c in tabu_c] + + def test_multiple_tabu(self, train_data_idx): + """Any edge related to tabu edges/parent nodes/child nodes should not exist in the network""" + + tabu_e = [("d", "a"), ("b", "c")] + tabu_p = ["b"] + tabu_c = ["a", "d"] + g = from_pandas( + train_data_idx, + tabu_edges=tabu_e, + tabu_parent_nodes=tabu_p, + tabu_child_nodes=tabu_c, + ) + assert [e not in g.edges for e in tabu_e] + assert [p not in [e[0] for e in g.edges] for p in tabu_p] + assert [c not in [e[1] for e in g.edges] for c in tabu_c] + + def test_sparsity(self, train_data_idx): + """Structure learnt from larger lambda should be sparser than smaller lambda""" + + g1 = from_pandas(train_data_idx, lasso_beta=10.0, w_threshold=0.25) + g2 = from_pandas(train_data_idx, lasso_beta=1e-6, w_threshold=0.25) + assert len(g1.edges) < len(g2.edges) + + def test_sparsity_against_without_reg(self, train_data_idx): + """Structure learnt from regularisation should be sparser than the one without""" + + g1 = from_pandas(train_data_idx, lasso_beta=10.0, w_threshold=0.25) + g2 = from_pandas(train_data_idx, w_threshold=0.25) + assert len(g1.edges) < len(g2.edges) + + def test_f1_score_fixed(self, train_data_idx, train_model): + """Structure learnt from regularisation should have very high f1 score relative to the ground truth""" + g = from_pandas(train_data_idx, lasso_beta=0.01, w_threshold=0.25) + + n_predictions_made = len(g.edges) + n_correct_predictions = len(set(g.edges).intersection(set(train_model.edges))) + n_relevant_predictions = len(train_model.edges) + + precision = n_correct_predictions / n_predictions_made + recall = n_correct_predictions / n_relevant_predictions + f1_score = 2 * (precision * recall) / (precision + recall) + + assert f1_score > 0.8 + + def test_f1score_generated(self, adjacency_mat_num_stability): + """Structure learnt from regularisation should have very high f1 score relative to the ground truth""" + df = pd.DataFrame( + adjacency_mat_num_stability, + columns=["a", "b", "c", "d", "e"], + index=["a", "b", "c", "d", "e"], + ) + train_model = StructureModel(df) + X = generate_continuous_dataframe(train_model, 50, noise_scale=1, seed=1) + g = from_pandas(X, lasso_beta=0.1, w_threshold=0.25) + right_edges = train_model.edges + + n_predictions_made = len(g.edges) + n_correct_predictions = len(set(g.edges).intersection(set(right_edges))) + n_relevant_predictions = len(right_edges) + + precision = n_correct_predictions / n_predictions_made + recall = n_correct_predictions / n_relevant_predictions + f1_score = 2 * (precision * recall) / (precision + recall) + + assert f1_score > 0.85 + + @pytest.mark.parametrize("data", [[np.nan, 0], [np.inf, 0]]) + def test_check_array(self, data): + """ + Providing a data set including nan or inf should result in a Value Error explaining that data contains nan. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_pandas(pd.DataFrame(data=data, columns=["a"])) + + +class TestFromNumpy: + """Test behaviour of the from_numpy_lasso method""" + + def test_all_columns_in_structure(self, train_data_idx): + """Every columns that is in the data should become a node in the learned structure""" + + g = from_numpy(train_data_idx.values) + assert (len(g.nodes)) == len(train_data_idx.columns) + + def test_isolated_nodes_exist(self, train_data_idx): + """Isolated nodes should still be in the learned structure""" + + g = from_numpy(train_data_idx.values, w_threshold=1.0) + assert len(g.nodes) == len(train_data_idx.columns) + + def test_expected_structure_learned(self, train_data_idx, train_model_idx): + """Given a small data set that can be examined by hand, the structure should be deterministic""" + + g = from_numpy(train_data_idx.values, w_threshold=0.25) + assert set(g.edges) == set(train_model_idx.edges) + + def test_empty_data_raises_error(self): + """ + Providing an empty data set should result in a Value Error explaining that data must not be empty. + This error is useful to catch and handle gracefully, because otherwise the user would experience + misleading division by zero, or unpacking errors. + """ + + with pytest.raises(ValueError): + from_numpy(np.empty([0, 5])) + + def test_single_iter_gets_converged_fail_warnings(self, caplog, train_data_idx): + """ + With a single iteration on this dataset, learn_structure fails to converge and should give warnings. + """ + with caplog.at_level(logging.WARNING): + from_numpy(train_data_idx.values, max_iter=1) + assert "Failed to converge. Consider increasing max_iter." in caplog.text + + def test_certain_relationships_get_near_certain_weight(self): + """If observations reliably show a==b and !a==!b then the relationship from a->b should be certain""" + + data = pd.DataFrame([[1, 2] for _ in range(10)], columns=["a", "b"]) + g = from_numpy(data.values, w_threshold=0.25) + assert set(g.edges) == {(0, 1)} + assert 1.9 <= g.get_edge_data(0, 1)["weight"] <= 2 + + def test_inverse_relationships_get_negative_weight(self): + """If observations indicate a==!b and b==!a then the weight of the relationship from a-> should be negative""" + + data = pd.DataFrame([[1, -2] for _ in range(10)], columns=["a", "b"]) + data.append(pd.DataFrame([[-1, 2] for _ in range(10)], columns=["a", "b"])) + g = from_numpy(data.values, w_threshold=0.25) + assert set(g.edges) == {(0, 1)} + assert -2 <= g.get_edge_data(0, 1)["mean_effect"] <= -1.9 + + def test_no_cycles(self, train_data_idx): + """ + The learned structure should be acyclic + """ + + g = from_numpy(train_data_idx.values, w_threshold=0.25) + assert nx.algorithms.is_directed_acyclic_graph(g) + + def test_tabu_expected_edges(self, train_data_idx): + """Tabu edges should not exist in the network""" + + tabu_e = [(3, 0), (1, 2)] + g = from_numpy(train_data_idx.values, tabu_edges=tabu_e) + assert [e not in g.edges for e in tabu_e] + + def test_tabu_expected_parent_nodes(self, train_data_idx): + """Tabu parent nodes should not have any outgoing edges""" + + tabu_p = [0, 3, 1] + g = from_numpy(train_data_idx.values, tabu_parent_nodes=tabu_p) + assert [p not in [e[0] for e in g.edges] for p in tabu_p] + + def test_tabu_expected_child_nodes(self, train_data_idx): + """Tabu child nodes should not have any ingoing edges""" + + tabu_c = [0, 3, 1] + g = from_numpy(train_data_idx.values, tabu_child_nodes=tabu_c) + assert [c not in [e[1] for e in g.edges] for c in tabu_c] + + def test_multiple_tabu(self, train_data_idx): + """Any edge related to tabu edges/parent nodes/child nodes should not exist in the network""" + + tabu_e = [(3, 0), (1, 2)] + tabu_p = [1] + tabu_c = [0, 3] + g = from_numpy( + train_data_idx.values, + tabu_edges=tabu_e, + tabu_parent_nodes=tabu_p, + tabu_child_nodes=tabu_c, + ) + assert [e not in g.edges for e in tabu_e] + assert [p not in [e[0] for e in g.edges] for p in tabu_p] + assert [c not in [e[1] for e in g.edges] for c in tabu_c] + + def test_sparsity(self, train_data_idx): + """Structure learnt from larger lambda should be sparser than smaller lambda""" + + g1 = from_numpy(train_data_idx.values, lasso_beta=10.0, w_threshold=0.25) + g2 = from_numpy(train_data_idx.values, lasso_beta=1e-6, w_threshold=0.25) + assert len(g1.edges) < len(g2.edges) + + def test_sparsity_against_without_reg(self, train_data_idx): + """Structure learnt from regularisation should be sparser than the one without""" + + g1 = from_numpy(train_data_idx.values, lasso_beta=10.0, w_threshold=0.25) + g2 = from_numpy(train_data_idx.values, w_threshold=0.25) + assert len(g1.edges) < len(g2.edges) + + def test_f1_score_fixed(self, train_data_idx, train_model_idx): + """Structure learnt from regularisation should have very high f1 score relative to the ground truth""" + g = from_numpy(train_data_idx.values, lasso_beta=0.01, w_threshold=0.25) + + n_predictions_made = len(g.edges) + n_correct_predictions = len( + set(g.edges).intersection(set(train_model_idx.edges)) + ) + n_relevant_predictions = len(train_model_idx.edges) + + precision = n_correct_predictions / n_predictions_made + recall = n_correct_predictions / n_relevant_predictions + f1_score = 2 * (precision * recall) / (precision + recall) + + assert f1_score > 0.8 + + def test_f1score_generated(self, adjacency_mat_num_stability): + """Structure learnt from regularisation should have very high f1 score relative to the ground truth""" + df = pd.DataFrame( + adjacency_mat_num_stability, + columns=["a", "b", "c", "d", "e"], + index=["a", "b", "c", "d", "e"], + ) + train_model = StructureModel(df.values) + X = generate_continuous_dataframe(StructureModel(df), 50, noise_scale=1, seed=1) + g = from_numpy( + X[["a", "b", "c", "d", "e"]].values, lasso_beta=0.1, w_threshold=0.25 + ) + right_edges = train_model.edges + + n_predictions_made = len(g.edges) + n_correct_predictions = len(set(g.edges).intersection(set(right_edges))) + n_relevant_predictions = len(right_edges) + precision = n_correct_predictions / n_predictions_made + recall = n_correct_predictions / n_relevant_predictions + f1_score = 2 * (precision * recall) / (precision + recall) + + assert f1_score > 0.85 + + def test_non_negativity_constraint(self, train_data_idx): + """ + The optimisation in notears lasso involves reshaping the initial similarity matrix + into two strictly positive matrixes (w+ and w-) and imposing a non negativity constraint + to the solver. We test here if these two contraints are imposed. + + We check if: + (1) bounds impose non negativity constraint + (2) initial guess obeys non negativity constraint + (3) most importantly: output of sopt obeys the constraint + """ + # using `wraps` to **spy** on the function + with patch( + "causalnex.structure.pytorch.core.sopt.minimize", + wraps=sopt.minimize, + ) as mocked: + from_numpy(train_data_idx.values, lasso_beta=0.1, w_threshold=0.25) + # We iterate over each time `sopt.minimize` was called + for called_arguments in list(mocked.call_args_list): + # These are the arguments with which the `sopt.minimize` was called + func_ = called_arguments[0][0] # positional arg + w_est = called_arguments[0][1] # positional arg + keyword_args = called_arguments[1] + + # check 1: + assert [ + (len(el) == 2) and (el[0] == 0) for el in keyword_args["bounds"] + ] + # check 2: + assert [el >= 0 for el in w_est] + # check 3 + sol = sopt.minimize(func_, w_est, **keyword_args) + assert [el >= 0 for el in sol.x] + + @pytest.mark.parametrize("data", [[np.nan, 0], [np.inf, 0]]) + def test_check_array(self, data): + """ + Providing a data set including nan or inf should result in a Value Error explaining that data contains nan. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_numpy(np.array([data])) diff --git a/tests/structure/test_sklearn.py b/tests/structure/test_sklearn.py new file mode 100644 index 0000000..bbb2a97 --- /dev/null +++ b/tests/structure/test_sklearn.py @@ -0,0 +1,221 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +import networkx as nx +import numpy as np +import pandas as pd +import pytest +from IPython.display import Image +from mock import patch +from sklearn.exceptions import NotFittedError +from sklearn.gaussian_process.kernels import RBF +from sklearn.model_selection import KFold, cross_val_score + +from causalnex.structure import data_generators as dg +from causalnex.structure.sklearn import DAGRegressor + + +class TestStructureModel: + @pytest.mark.parametrize( + "val, msg, error", + [ + ({"alpha": "0.0"}, "alpha should be numeric", TypeError), + ({"beta": "0.0"}, "beta should be numeric", TypeError), + ({"fit_intercept": 0}, "fit_intercept should be a bool", TypeError), + ({"threshold": "0.0"}, "threshold should be numeric", TypeError), + ], + ) + def test_input_type_assertion(self, val, msg, error): + with pytest.raises(error, match=msg): + DAGRegressor(**val) + + def test_pandas_fit(self): + reg = DAGRegressor() + X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + X, y = pd.DataFrame(X), pd.Series(y) + reg.fit(X, y) + + def test_numpy_fit(self): + reg = DAGRegressor() + X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + reg.fit(X, y) + + def test_predict_type(self): + reg = DAGRegressor() + X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + reg.fit(X, y) + assert isinstance(reg.predict(X), np.ndarray) + reg = DAGRegressor() + X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + X, y = pd.DataFrame(X), pd.Series(y) + reg.fit(X, y) + assert isinstance(reg.predict(X), np.ndarray) + + def test_notfitted_error(self): + reg = DAGRegressor() + X = np.random.normal(size=(100, 2)) + with pytest.raises(NotFittedError): + reg.predict(X) + + @pytest.mark.parametrize("hidden_layer_units", [None, [2], [2, 2]]) + def test_coef(self, hidden_layer_units): + reg = DAGRegressor(hidden_layer_units=hidden_layer_units) + X, y = ( + pd.DataFrame(np.random.normal(size=(100, 1))), + pd.Series(np.random.normal(size=(100,))), + ) + X["true_feat"] = y * -3 + reg.fit(X, y) + assert isinstance(reg.coef_, np.ndarray) + coef_ = pd.Series(reg.coef_, index=X.columns) + # assert that the sign of the coefficient is correct for both nonlinear and linear cases + assert coef_["true_feat"] < 0 + + @pytest.mark.parametrize("hidden_layer_units", [None, [2], [2, 2]]) + def test_feature_importances(self, hidden_layer_units): + reg = DAGRegressor(hidden_layer_units=hidden_layer_units) + X, y = ( + pd.DataFrame(np.random.normal(size=(100, 1))), + pd.Series(np.random.normal(size=(100,))), + ) + X["true_feat"] = y * -3 + reg.fit(X, y) + assert isinstance(reg.feature_importances_, np.ndarray) + coef_ = pd.Series(reg.feature_importances_, index=X.columns) + # assert that the sign of the coefficient is positive for both nonlinear and linear cases + assert coef_["true_feat"] > 0 + + def test_tabu_parent_nodes(self): + X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + X, y = pd.DataFrame(X), pd.Series(y, name="test") + + reg = DAGRegressor(dependent_target=True, tabu_parent_nodes=["test"]) + assert "test" in reg.tabu_parent_nodes + + reg = DAGRegressor(dependent_target=True, tabu_parent_nodes=[]) + reg.fit(X, y) + assert "test" not in reg.tabu_parent_nodes + + @pytest.mark.parametrize( + "fit_intercept, equals_zero", [(True, False), (False, True)] + ) + def test_intercept(self, fit_intercept, equals_zero): + reg = DAGRegressor(fit_intercept=fit_intercept) + X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + X, y = pd.DataFrame(X), pd.Series(y) + reg.fit(X, y) + # intercept should return zero when fit_intercept == False + assert (reg.intercept_ == 0) is equals_zero + assert isinstance(reg.intercept_, float) + + @pytest.mark.parametrize("enforce_dag", [True, False]) + def test_plot_dag(self, enforce_dag): + reg = DAGRegressor() + X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + reg.fit(X, y) + image = reg.plot_dag(enforce_dag=enforce_dag) + assert isinstance(image, Image) + + def test_plot_dag_importerror(self): + with patch.dict("sys.modules", {"IPython.display": None}): + reg = DAGRegressor() + X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + reg.fit(X, y) + + with pytest.raises( + ImportError, + match=r"DAGRegressor\.plot_dag method requires IPython installed\.", + ): + reg.plot_dag() + + @pytest.mark.parametrize( + "hidden_layer_units", [None, [], [0], [1], (0,), (1,), [1, 1], (1, 1)] + ) + def test_hidden_layer_units(self, hidden_layer_units): + reg = DAGRegressor(hidden_layer_units=hidden_layer_units) + X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + reg.fit(X, y) + + def test_enforce_dag(self): + reg = DAGRegressor(enforce_dag=True) + X, y = np.random.normal(size=(100, 2)), np.random.normal(size=(100,)) + X, y = pd.DataFrame(X), pd.Series(y) + reg.fit(X, y) + assert nx.algorithms.is_directed_acyclic_graph(reg.graph_) + + @pytest.mark.parametrize("standardize", [True, False]) + def test_nonlinear_performance(self, standardize): + np.random.seed(42) + sm = dg.generate_structure(num_nodes=10, degree=3) + sm.threshold_till_dag() + data = dg.generate_continuous_dataframe( + sm, n_samples=1000, intercept=True, seed=42, noise_scale=0.1, kernel=RBF(1) + ) + node = 1 + y = data.iloc[:, node] + X = data.drop(node, axis=1) + + reg = DAGRegressor( + alpha=0.0, + l1_ratio=0.0, + fit_intercept=True, + dependent_target=True, + enforce_dag=False, + hidden_layer_units=[0], + standardize=standardize, + ) + linear_score = cross_val_score( + reg, X, y, cv=KFold(shuffle=True, random_state=42) + ).mean() + + reg = DAGRegressor( + alpha=0.1, + l1_ratio=1.0, + fit_intercept=True, + enforce_dag=False, + hidden_layer_units=[2], + standardize=standardize, + ) + small_nl_score = cross_val_score( + reg, X, y, cv=KFold(shuffle=True, random_state=42) + ).mean() + + reg = DAGRegressor( + alpha=0.1, + l1_ratio=1.0, + fit_intercept=True, + enforce_dag=False, + hidden_layer_units=[4], + standardize=standardize, + ) + medium_nl_score = cross_val_score( + reg, X, y, cv=KFold(shuffle=True, random_state=42) + ).mean() + + assert small_nl_score > linear_score + assert medium_nl_score > small_nl_score diff --git a/tests/structure/test_transformers.py b/tests/structure/test_transformers.py new file mode 100644 index 0000000..4831a05 --- /dev/null +++ b/tests/structure/test_transformers.py @@ -0,0 +1,153 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. + +import re + +import numpy as np +import pandas as pd +import pytest +from sklearn.exceptions import NotFittedError + +from causalnex.structure.transformers import DynamicDataTransformer + + +class TestDynamicDataTransformer: + def test_naming_nodes(self, data_dynotears_p3): + """ + Nodes should have the format {var}_lag{l} + """ + df = pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]) + df_dyno = DynamicDataTransformer(p=3).fit_transform(df) + + pattern = re.compile(r"[abcde]_lag[0-3]") + for node in df_dyno.columns: + match = pattern.match(node) + assert match + assert match.group() == node + + def test_all_nodes_in_df(self, data_dynotears_p3): + """ + Nodes should have the format {var}_lag{l} + """ + df = pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]) + df_dyno = DynamicDataTransformer(p=3).fit_transform(df) + + assert list(df_dyno.columns) == [ + el + "_lag" + str(i) for i in range(4) for el in ["a", "b", "c", "d", "e"] + ] + + def test_incorrect_input_format(self): + with pytest.raises( + ValueError, + match="Provided empty list of time_series." + " At least one DataFrame must be provided", + ): + DynamicDataTransformer(p=3).fit_transform([]) + + with pytest.raises( + ValueError, + match=r"All columns must have numeric data\. " + r"Consider mapping the following columns to int: \['a'\]", + ): + DynamicDataTransformer(p=1).fit_transform( + pd.DataFrame([["1"]], columns=["a"]) + ) + + with pytest.raises( + TypeError, + match="Time series entries must be instances of `pd.DataFrame`", + ): + DynamicDataTransformer(p=1).fit_transform([np.array([1, 2])]) + + with pytest.raises( + ValueError, + match="Index for dataframe must be provided in increasing order", + ): + df = pd.DataFrame(np.random.random([5, 5]), index=[3, 1, 2, 5, 0]) + DynamicDataTransformer(p=1).fit_transform(df) + + with pytest.raises( + ValueError, + match="All inputs must have the same columns and same types", + ): + df = pd.DataFrame( + np.random.random([5, 5]), + columns=["a", "b", "c", "d", "e"], + ) + df_2 = pd.DataFrame( + np.random.random([5, 5]), + columns=["a", "b", "c", "d", "f"], + ) + DynamicDataTransformer(p=1).fit_transform([df, df_2]) + + with pytest.raises( + ValueError, + match="All inputs must have the same columns and same types", + ): + cols = ["a", "b", "c", "d", "e"] + df = pd.DataFrame(np.random.random([5, 5]), columns=cols) + df_2 = pd.DataFrame(np.random.random([5, 5]), columns=cols) + df_2["a"] = df_2["a"].astype(int) + DynamicDataTransformer(p=1).fit_transform([df, df_2]) + + with pytest.raises( + TypeError, + match="Index must be integers", + ): + df = pd.DataFrame(np.random.random([5, 5]), index=[0, 1, 2, 3.0, 4]) + DynamicDataTransformer(p=1).fit_transform(df) + + def test_not_fitted_transform(self): + """if transform called before fit: raise error""" + with pytest.raises( + NotFittedError, + match=r"This DynamicDataTransformer is not fitted yet\." + " Call `fit` before using this method", + ): + df = pd.DataFrame(np.random.random([5, 5])) + DynamicDataTransformer(p=1).transform(df) + + def test_transform_wrong_input(self): + """If transform df does not have all necessaty columns, raise error""" + with pytest.raises( + ValueError, + match="We should provide all necessary columns in " + r"the time series\. Columns not provided: \[2, 3\]", + ): + df = pd.DataFrame(np.random.random([5, 5])) + ddt = DynamicDataTransformer(p=1).fit(df) + ddt.transform(df.drop([2, 3], axis=1)) + + def test_return_df_true_equivalent_to_false(self): + """Check that the df from `return_df=true` is + equivalent the result if `return_df=false`""" + df = pd.DataFrame(np.random.random([50, 10])) + df_dyno = DynamicDataTransformer(p=3).fit_transform(df, return_df=True) + X, Xlags = DynamicDataTransformer(p=3).fit_transform(df, return_df=False) + assert np.all(df_dyno.values[:, :10] == X) + assert np.all(df_dyno.values[:, 10:] == Xlags) diff --git a/tools/license_and_headers.py b/tools/license_and_headers.py index 3842454..bee6300 100644 --- a/tools/license_and_headers.py +++ b/tools/license_and_headers.py @@ -66,8 +66,11 @@ def files_at_path(path: str): - return [fn for fn in glob.glob(path + '/**/*.py', recursive=True) - if not ('ebaybbn' in fn or 'structure/notears.py' in fn)] + return [ + fn + for fn in glob.glob(path + "/**/*.py", recursive=True) + if not ("ebaybbn" in fn or "structure/notears.py" in fn) + ] def files_missing_substring(file_names, substring):