From 3707474ff06f0fa214144bf9506eb25b90a91267 Mon Sep 17 00:00:00 2001 From: Jeremy Steward Date: Thu, 8 Jun 2017 03:51:48 -0600 Subject: [PATCH] [MRG+1] Fixes issue #3367 -> MCD fails on data with singular covariance matrix (#8328) * Adds failing test for issue 3367 * Adds extra comments to describe what I'm testing Basically in this instance I discovered this bug independently from \#3367 because I was trying to use principal components to estimate plane normals / geometry of points in 3D space. When you have a set of points that specify a perfect plane though (or in the case of wanting to use MCD, you have a subset of your points that specify a perfect plane), then the code fails because the covariance matrix is singular. However, if your covariance matrix is singular, you've already found the set of points with the lowest determinant. As per Rousseeuw & Van Driessen 1999, at this point you can stop searching. The code did stop searching, however, it raised a ValueError on singular matrices for no reason. So the correct fix should be to remove that. * Fixes issue 3367 This should work with the test case provided. * Adds missing argument to test * Style corrections to pass flake runner Implements the style corrections as mentioned in pull request #8328 * Adds entry for PR #8328 to what's new * Adds review changes from @jnothman --- doc/whats_new.rst | 7 ++++ sklearn/covariance/robust_covariance.py | 18 +++++----- .../tests/test_robust_covariance.py | 34 +++++++++++++++++++ 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 32663b199833f..2122373628ea5 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -187,6 +187,12 @@ Enhancements Bug fixes ......... + + - Fixed a bug in :class:`sklearn.covariance.MinCovDet` where inputting data + that produced a singular covariance matrix would cause the helper method + `_c_step` to throw an exception. 
+ :issue:`3367` by :user:`Jeremy Steward ` + - Fixed a bug where :class:`sklearn.ensemble.IsolationForest` uses an an incorrect formula for the average path length :issue:`8549` by `Peter Wang `_. @@ -203,6 +209,7 @@ Bug fixes - Fixed a bug where :func:`sklearn.model_selection.BaseSearchCV.inverse_transform` returns self.best_estimator_.transform() instead of self.best_estimator_.inverse_transform() :issue:`8344` by :user:`Akshay Gupta ` + - Fixed same issue in :func:`sklearn.grid_search.BaseSearchCV.inverse_transform` :issue:`8846` by :user:`Rasmus Eriksson ` diff --git a/sklearn/covariance/robust_covariance.py b/sklearn/covariance/robust_covariance.py index 6115e6ada4e2c..428cb05928440 100644 --- a/sklearn/covariance/robust_covariance.py +++ b/sklearn/covariance/robust_covariance.py @@ -96,6 +96,7 @@ def _c_step(X, n_support, random_state, remaining_iterations=30, initial_estimates=None, verbose=False, cov_computation_method=empirical_covariance): n_samples, n_features = X.shape + dist = np.inf # Initialisation support = np.zeros(n_samples, dtype=bool) @@ -119,8 +120,14 @@ def _c_step(X, n_support, random_state, remaining_iterations=30, # Iterative procedure for Minimum Covariance Determinant computation det = fast_logdet(covariance) + # If the data already has singular covariance, calculate the precision, + # as the loop below will not be entered. 
+ if np.isinf(det): + precision = pinvh(covariance) + previous_det = np.inf - while (det < previous_det) and (remaining_iterations > 0): + while (det < previous_det and remaining_iterations > 0 + and not np.isinf(det)): # save old estimates values previous_location = location previous_covariance = covariance @@ -142,14 +149,9 @@ def _c_step(X, n_support, random_state, remaining_iterations=30, previous_dist = dist dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1) - # Catch computation errors + # Check if best fit already found (det => 0, logdet => -inf) if np.isinf(det): - raise ValueError( - "Singular covariance matrix. " - "Please check that the covariance matrix corresponding " - "to the dataset is full rank and that MinCovDet is used with " - "Gaussian-distributed data (or at least data drawn from a " - "unimodal, symmetric distribution.") + results = location, covariance, det, support, dist # Check convergence if np.allclose(det, previous_det): # c_step procedure converged diff --git a/sklearn/covariance/tests/test_robust_covariance.py b/sklearn/covariance/tests/test_robust_covariance.py index 27e423b410210..b34db354177d4 100644 --- a/sklearn/covariance/tests/test_robust_covariance.py +++ b/sklearn/covariance/tests/test_robust_covariance.py @@ -4,6 +4,8 @@ # # License: BSD 3 clause +import itertools + import numpy as np from sklearn.utils.testing import assert_almost_equal @@ -92,6 +94,38 @@ def test_mcd_issue1127(): mcd.fit(X) +def test_mcd_issue3367(): + # Check that MCD completes when the covariance matrix is singular + # i.e. 
one of the rows and columns is all zeros + rand_gen = np.random.RandomState(0) + + # Think of these as the values for X and Y -> 10 values between -5 and 5 + data_values = np.linspace(-5, 5, 10).tolist() + # Get the cartesian product of all possible coordinate pairs from above set + data = np.array(list(itertools.product(data_values, data_values))) + + # Add a third column that's all zeros to make our data a set of points + # within a plane, which means that the covariance matrix will be singular + data = np.hstack((data, np.zeros((data.shape[0], 1)))) + + # The below line of code should not raise an exception, even though the + # covariance matrix is singular. As a further test, since we have points in XYZ, the + # principal components (Eigenvectors) of these directly relate to the + # geometry of the points. Since it's a plane, we should be able to test + # that the Eigenvector that corresponds to the smallest Eigenvalue is the + # plane normal, specifically [0, 0, 1], since everything is in the XY plane + # (as I've set it up above). To do this one would start by: + # + # evals, evecs = np.linalg.eigh(mcd_fit.covariance_) + # normal = evecs[:, np.argmin(evals)] + # + # After which we need to assert that our `normal` is equal to [0, 0, 1]. + # Do note that there is floating point error associated with this, so it's + # best to subtract the two and then compare some small tolerance (e.g. + # 1e-12). + MinCovDet(random_state=rand_gen).fit(data) + + def test_outlier_detection(): rnd = np.random.RandomState(0) X = rnd.randn(100, 10)