Skip to content

Commit

Permalink
Fix/check for NA or Infinity when notears is used (#54)
Browse files Browse the repository at this point in the history
  • Loading branch information
Jebq committed Sep 8, 2020
1 parent 3096676 commit 07eb5ed
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 0 deletions.
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* Set bounds/max class imbalance for binary features for the data generators
* Add non-linear data generators for multiple data types
* Added Pytorch implementation for NOTEARS MLP which is much faster (Only supporting linear structure learning for now)
* Bugfix to resolve issue when applying notears on data containing NaN or infinite values (a ValueError is now raised)
* Added StructureRegressor sklearn interface using the Pytorch NOTEARS implementation.
* Hotfix for data_gen system. Fixes issues with root node initialization.

Expand Down
33 changes: 33 additions & 0 deletions causalnex/structure/notears.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ def from_numpy(
# n examples, d properties
_, d = X.shape

_assert_all_finite(X)

bnds = [
(0, 0)
if i == j
Expand Down Expand Up @@ -162,6 +164,8 @@ def from_numpy_lasso(
# n examples, d properties
_, d = X.shape

_assert_all_finite(X)

bnds = [
(0, 0)
if i == j
Expand Down Expand Up @@ -550,3 +554,32 @@ def _grad(w_vec: np.ndarray) -> np.ndarray:
w_new = w_est[: d ** 2].reshape([d, d]) - w_est[d ** 2 :].reshape([d, d])
w_new[np.abs(w_new) < w_threshold] = 0
return StructureModel(w_new.reshape([d, d]))


def _assert_all_finite(X: np.ndarray):
"""Throw a ValueError if X contains NaN or Infinity.
Based on Sklearn method to handle NaN & Infinity.
@inproceedings{sklearn_api,
author = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and
Fabian Pedregosa and Andreas Mueller and Olivier Grisel and
Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort
and Jaques Grobler and Robert Layton and Jake VanderPlas and
Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux},
title = {{API} design for machine learning software: experiences from the scikit-learn
project},
booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning},
year = {2013},
pages = {108--122},
}
Args:
X: Array to validate
Raises:
ValueError: If X contains NaN or Infinity
"""

msg_err = "Input contains NaN, infinity or a value too large for {!r}."
if not np.isfinite(X).all():
raise ValueError(msg_err.format(X.dtype))
92 changes: 92 additions & 0 deletions tests/structure/test_notears.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,29 @@ def test_non_numeric_data_raises_error(self):
with pytest.raises(ValueError, match="All columns must have numeric data.*"):
from_pandas(pd.DataFrame(data=["x"], columns=["a"]))

def test_array_with_nan_raises_error(self):
    """
    Providing a data set including NaN should result in a ValueError explaining that the data contains NaN.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so no trailing wildcard is needed;
    # the previous "dtype*" only quantified the final "e" (also matching "dtyp").
    with pytest.raises(
        ValueError,
        match="Input contains NaN, infinity or a value too large for dtype",
    ):
        from_pandas(pd.DataFrame(data=[np.nan, 0], columns=["a"]))

def test_array_with_inf_raises_error(self):
    """
    Providing a data set including infinite values should result in a ValueError explaining that the data
    contains infinite values.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so no trailing wildcard is needed;
    # the previous "dtype*" only quantified the final "e" (also matching "dtyp").
    with pytest.raises(
        ValueError,
        match="Input contains NaN, infinity or a value too large for dtype",
    ):
        from_pandas(pd.DataFrame(data=[np.inf, 0], columns=["a"]))

def test_single_iter_gets_converged_fail_warnings(self, train_data_idx):
"""
With a single iteration on this dataset, learn_structure fails to converge and should give warnings.
Expand Down Expand Up @@ -206,6 +229,29 @@ def test_non_numeric_data_raises_error(self):
with pytest.raises(ValueError, match="All columns must have numeric data.*"):
from_pandas_lasso(pd.DataFrame(data=["x"], columns=["a"]), 0.1)

def test_array_with_nan_raises_error(self):
    """
    Providing a data set including NaN should result in a ValueError explaining that the data contains NaN.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so no trailing wildcard is needed;
    # the previous "dtype*" only quantified the final "e" (also matching "dtyp").
    with pytest.raises(
        ValueError,
        match="Input contains NaN, infinity or a value too large for dtype",
    ):
        from_pandas_lasso(pd.DataFrame(data=[np.nan, 0], columns=["a"]), 0.1)

def test_array_with_inf_raises_error(self):
    """
    Providing a data set including infinite values should result in a ValueError explaining that the data
    contains infinite values.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so no trailing wildcard is needed;
    # the previous "dtype*" only quantified the final "e" (also matching "dtyp").
    with pytest.raises(
        ValueError,
        match="Input contains NaN, infinity or a value too large for dtype",
    ):
        from_pandas_lasso(pd.DataFrame(data=[np.inf, 0], columns=["a"]), 0.1)

def test_single_iter_gets_converged_fail_warnings(self, train_data_idx):
"""
With a single iteration on this dataset, learn_structure fails to converge and should give warnings.
Expand Down Expand Up @@ -369,6 +415,29 @@ def test_empty_data_raises_error(self):
with pytest.raises(ValueError):
from_numpy(np.empty([0, 5]))

def test_array_with_nan_raises_error(self):
    """
    Providing a data set including NaN should result in a ValueError explaining that the data contains NaN.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so no trailing wildcard is needed;
    # the previous "dtype*" only quantified the final "e" (also matching "dtyp").
    with pytest.raises(
        ValueError,
        match="Input contains NaN, infinity or a value too large for dtype",
    ):
        from_numpy(np.array([[0, np.nan]]))

def test_array_with_inf_raises_error(self):
    """
    Providing a data set including infinite values should result in a ValueError explaining that the data
    contains infinite values.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so no trailing wildcard is needed;
    # the previous "dtype*" only quantified the final "e" (also matching "dtyp").
    with pytest.raises(
        ValueError,
        match="Input contains NaN, infinity or a value too large for dtype",
    ):
        from_numpy(np.array([[0, np.inf]]))

def test_single_iter_gets_converged_fail_warnings(self, train_data_idx):
"""
With a single iteration on this dataset, learn_structure fails to converge and should give warnings.
Expand Down Expand Up @@ -489,6 +558,29 @@ def test_empty_data_raises_error(self):
with pytest.raises(ValueError):
from_numpy_lasso(np.empty([0, 5]), 0.1)

def test_array_with_nan_raises_error(self):
    """
    Providing a data set including NaN should result in a ValueError explaining that the data contains NaN.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so no trailing wildcard is needed;
    # the previous "dtype*" only quantified the final "e" (also matching "dtyp").
    with pytest.raises(
        ValueError,
        match="Input contains NaN, infinity or a value too large for dtype",
    ):
        from_numpy_lasso(np.array([[3, np.nan]]), 0.1)

def test_array_with_inf_raises_error(self):
    """
    Providing a data set including infinite values should result in a ValueError explaining that the data
    contains infinite values.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so no trailing wildcard is needed;
    # the previous "dtype*" only quantified the final "e" (also matching "dtyp").
    with pytest.raises(
        ValueError,
        match="Input contains NaN, infinity or a value too large for dtype",
    ):
        from_numpy_lasso(np.array([[3, np.inf]]), 0.1)

def test_single_iter_gets_converged_fail_warnings(self, train_data_idx):
"""
With a single iteration on this dataset, learn_structure fails to converge and should give warnings.
Expand Down

0 comments on commit 07eb5ed

Please sign in to comment.