def _assert_all_finite(X: np.ndarray) -> None:
    """Raise a ValueError if X contains NaN or Infinity.

    Based on Sklearn method to handle NaN & Infinity.
        @inproceedings{sklearn_api,
        author    = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and
                     Fabian Pedregosa and Andreas Mueller and Olivier Grisel and
                     Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort
                     and Jaques Grobler and Robert Layton and Jake VanderPlas and
                     Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux},
        title     = {{API} design for machine learning software: experiences from the scikit-learn
                     project},
        booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning},
        year      = {2013},
        pages     = {108--122},
        }

    Args:
        X: Array (or array-like) to validate.

    Raises:
        ValueError: If X contains NaN or Infinity.
    """
    # Coerce array-likes (nested lists, tuples, ...) up front so that the
    # ``dtype`` used in the error message always exists; without this a plain
    # list containing NaN would fail with AttributeError instead of ValueError.
    # ndarray inputs (and subclasses) are passed through unchanged.
    X = np.asanyarray(X)

    msg_err = "Input contains NaN, infinity or a value too large for {!r}."
    if not np.isfinite(X).all():
        raise ValueError(msg_err.format(X.dtype))
+ """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_pandas(pd.DataFrame(data=[np.inf, 0], columns=["a"])) + def test_single_iter_gets_converged_fail_warnings(self, train_data_idx): """ With a single iteration on this dataset, learn_structure fails to converge and should give warnings. @@ -206,6 +229,29 @@ def test_non_numeric_data_raises_error(self): with pytest.raises(ValueError, match="All columns must have numeric data.*"): from_pandas_lasso(pd.DataFrame(data=["x"], columns=["a"]), 0.1) + def test_array_with_nan_raises_error(self): + """ + Providing a data set including nan should result in a Value Error explaining that data contains nan. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_pandas_lasso(pd.DataFrame(data=[np.nan, 0], columns=["a"]), 0.1) + + def test_array_with_inf_raises_error(self): + """ + Providing a data set including infinite values should result in a Value Error explaining that data + contains infinite values. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_pandas_lasso(pd.DataFrame(data=[np.inf, 0], columns=["a"]), 0.1) + def test_single_iter_gets_converged_fail_warnings(self, train_data_idx): """ With a single iteration on this dataset, learn_structure fails to converge and should give warnings. @@ -369,6 +415,29 @@ def test_empty_data_raises_error(self): with pytest.raises(ValueError): from_numpy(np.empty([0, 5])) + def test_array_with_nan_raises_error(self): + """ + Providing a data set including nan should result in a Value Error explaining that data contains nan. 
+ This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_numpy(np.array([[0, np.nan]])) + + def test_array_with_inf_raises_error(self): + """ + Providing a data set including infinite values should result in a Value Error explaining that data + contains infinite values. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_numpy(np.array([[0, np.inf]])) + def test_single_iter_gets_converged_fail_warnings(self, train_data_idx): """ With a single iteration on this dataset, learn_structure fails to converge and should give warnings. @@ -489,6 +558,29 @@ def test_empty_data_raises_error(self): with pytest.raises(ValueError): from_numpy_lasso(np.empty([0, 5]), 0.1) + def test_array_with_nan_raises_error(self): + """ + Providing a data set including nan should result in a Value Error explaining that data contains nan. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. + """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_numpy_lasso(np.array([[3, np.nan]]), 0.1) + + def test_array_with_inf_raises_error(self): + """ + Providing a data set including infinite values should result in a Value Error explaining that data + contains infinite values. + This error is useful to catch and handle gracefully, because otherwise the user would have empty structures. 
+ """ + with pytest.raises( + ValueError, + match="Input contains NaN, infinity or a value too large for dtype*", + ): + from_numpy_lasso(np.array([[3, np.inf]]), 0.1) + def test_single_iter_gets_converged_fail_warnings(self, train_data_idx): """ With a single iteration on this dataset, learn_structure fails to converge and should give warnings.