Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/check for NA or Infinity when notears is used #54

Merged
merged 10 commits into from
Sep 8, 2020
33 changes: 33 additions & 0 deletions causalnex/structure/notears.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ def from_numpy(
# n examples, d properties
_, d = X.shape

_assert_all_finite(X)

bnds = [
(0, 0)
if i == j
Expand Down Expand Up @@ -162,6 +164,8 @@ def from_numpy_lasso(
# n examples, d properties
_, d = X.shape

_assert_all_finite(X)

bnds = [
(0, 0)
if i == j
Expand Down Expand Up @@ -550,3 +554,32 @@ def _grad(w_vec: np.ndarray) -> np.ndarray:
w_new = w_est[: d ** 2].reshape([d, d]) - w_est[d ** 2 :].reshape([d, d])
w_new[np.abs(w_new) < w_threshold] = 0
return StructureModel(w_new.reshape([d, d]))


def _assert_all_finite(X: np.ndarray):
"""Throw a ValueError if X contains NaN or Infinity.

Based on Sklearn method to handle NaN & Infinity.
@inproceedings{sklearn_api,
author = {Lars Buitinck and Gilles Louppe and Mathieu Blondel and
Fabian Pedregosa and Andreas Mueller and Olivier Grisel and
Vlad Niculae and Peter Prettenhofer and Alexandre Gramfort
and Jaques Grobler and Robert Layton and Jake VanderPlas and
Arnaud Joly and Brian Holt and Ga{\"{e}}l Varoquaux},
title = {{API} design for machine learning software: experiences from the scikit-learn
project},
booktitle = {ECML PKDD Workshop: Languages for Data Mining and Machine Learning},
year = {2013},
pages = {108--122},
}

Args:
X: Array to validate

Raises:
ValueError: If X contains NaN or Infinity
"""

msg_err = "Input contains NaN, infinity or a value too large for {!r}."
if not np.isfinite(X).all():
raise ValueError(msg_err.format(X.dtype))
92 changes: 92 additions & 0 deletions tests/structure/test_notears.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,29 @@ def test_non_numeric_data_raises_error(self):
with pytest.raises(ValueError, match="All columns must have numeric data.*"):
from_pandas(pd.DataFrame(data=["x"], columns=["a"]))

def test_array_with_nan_raises_error(self):
    """
    Providing a data set including NaN should result in a ValueError explaining that the data contains NaN.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so regex metacharacters in the
    # message (parentheses, final period) are escaped; only the dtype name
    # inside the parentheses is left as a wildcard.
    with pytest.raises(
        ValueError,
        match=r"Input contains NaN, infinity or a value too large for dtype\(.+\)\.",
    ):
        from_pandas(pd.DataFrame(data=[np.nan, 0], columns=["a"]))

def test_array_with_inf_raises_error(self):
    """
    Providing a data set including infinite values should result in a ValueError explaining that the data
    contains infinite values.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so regex metacharacters in the
    # message (parentheses, final period) are escaped; only the dtype name
    # inside the parentheses is left as a wildcard.
    with pytest.raises(
        ValueError,
        match=r"Input contains NaN, infinity or a value too large for dtype\(.+\)\.",
    ):
        from_pandas(pd.DataFrame(data=[np.inf, 0], columns=["a"]))

def test_single_iter_gets_converged_fail_warnings(self, train_data_idx):
"""
With a single iteration on this dataset, learn_structure fails to converge and should give warnings.
Expand Down Expand Up @@ -206,6 +229,29 @@ def test_non_numeric_data_raises_error(self):
with pytest.raises(ValueError, match="All columns must have numeric data.*"):
from_pandas_lasso(pd.DataFrame(data=["x"], columns=["a"]), 0.1)

def test_array_with_nan_raises_error(self):
    """
    Providing a data set including NaN should result in a ValueError explaining that the data contains NaN.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so regex metacharacters in the
    # message (parentheses, final period) are escaped; only the dtype name
    # inside the parentheses is left as a wildcard.
    with pytest.raises(
        ValueError,
        match=r"Input contains NaN, infinity or a value too large for dtype\(.+\)\.",
    ):
        from_pandas_lasso(pd.DataFrame(data=[np.nan, 0], columns=["a"]), 0.1)

def test_array_with_inf_raises_error(self):
    """
    Providing a data set including infinite values should result in a ValueError explaining that the data
    contains infinite values.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so regex metacharacters in the
    # message (parentheses, final period) are escaped; only the dtype name
    # inside the parentheses is left as a wildcard.
    with pytest.raises(
        ValueError,
        match=r"Input contains NaN, infinity or a value too large for dtype\(.+\)\.",
    ):
        from_pandas_lasso(pd.DataFrame(data=[np.inf, 0], columns=["a"]), 0.1)

def test_single_iter_gets_converged_fail_warnings(self, train_data_idx):
"""
With a single iteration on this dataset, learn_structure fails to converge and should give warnings.
Expand Down Expand Up @@ -369,6 +415,29 @@ def test_empty_data_raises_error(self):
with pytest.raises(ValueError):
from_numpy(np.empty([0, 5]))

def test_array_with_nan_raises_error(self):
    """
    Providing a data set including NaN should result in a ValueError explaining that the data contains NaN.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so regex metacharacters in the
    # message (parentheses, final period) are escaped; only the dtype name
    # inside the parentheses is left as a wildcard.
    with pytest.raises(
        ValueError,
        match=r"Input contains NaN, infinity or a value too large for dtype\(.+\)\.",
    ):
        from_numpy(np.array([[0, np.nan]]))

def test_array_with_inf_raises_error(self):
    """
    Providing a data set including infinite values should result in a ValueError explaining that the data
    contains infinite values.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so regex metacharacters in the
    # message (parentheses, final period) are escaped; only the dtype name
    # inside the parentheses is left as a wildcard.
    with pytest.raises(
        ValueError,
        match=r"Input contains NaN, infinity or a value too large for dtype\(.+\)\.",
    ):
        from_numpy(np.array([[0, np.inf]]))

def test_single_iter_gets_converged_fail_warnings(self, train_data_idx):
"""
With a single iteration on this dataset, learn_structure fails to converge and should give warnings.
Expand Down Expand Up @@ -489,6 +558,29 @@ def test_empty_data_raises_error(self):
with pytest.raises(ValueError):
from_numpy_lasso(np.empty([0, 5]), 0.1)

def test_array_with_nan_raises_error(self):
    """
    Providing a data set including NaN should result in a ValueError explaining that the data contains NaN.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so regex metacharacters in the
    # message (parentheses, final period) are escaped; only the dtype name
    # inside the parentheses is left as a wildcard.
    with pytest.raises(
        ValueError,
        match=r"Input contains NaN, infinity or a value too large for dtype\(.+\)\.",
    ):
        from_numpy_lasso(np.array([[3, np.nan]]), 0.1)

def test_array_with_inf_raises_error(self):
    """
    Providing a data set including infinite values should result in a ValueError explaining that the data
    contains infinite values.
    This error is useful to catch and handle gracefully, because otherwise the user would have empty structures.
    """
    # pytest applies `match` with re.search, so regex metacharacters in the
    # message (parentheses, final period) are escaped; only the dtype name
    # inside the parentheses is left as a wildcard.
    with pytest.raises(
        ValueError,
        match=r"Input contains NaN, infinity or a value too large for dtype\(.+\)\.",
    ):
        from_numpy_lasso(np.array([[3, np.inf]]), 0.1)

def test_single_iter_gets_converged_fail_warnings(self, train_data_idx):
"""
With a single iteration on this dataset, learn_structure fails to converge and should give warnings.
Expand Down