From 3707474ff06f0fa214144bf9506eb25b90a91267 Mon Sep 17 00:00:00 2001 From: Jeremy Steward Date: Thu, 8 Jun 2017 03:51:48 -0600 Subject: [PATCH] [MRG+1] Fixes issue #3367 -> MCD fails on data with singular covariance matrix (#8328) * Adds failing test for issue 3367 * Adds extra comments to describe what I'm testing Basically in this instance I discovered this bug independently from \#3367 because I was trying to use principal components to estimate plane normals / geometry of points in 3D space. When you have a set of points that specify a perfect plane though (or in the case of wanting to use MCD, you have a subset of your points that specify a perfect plane), then the code fails because the covariance matrix is singular. However, if your covariance matrix is singular, you've already found the set of points with the lowest determinant. As per Rousseeuw & Van Driessen 1999, at this point you can stop searching. The code did stop searching, however, it raised a ValueError on singular matrices for no reason. So the correct fix should be to remove that. * Fixes issue 3367 This should work with the test case provided. * Adds missing argument to test * Style corrections to pass flake runner Implements the style corrections as mentioned in pull request #8328 * Adds entry for PR #8328 to what's new * Adds review changes from @jnothman --- doc/whats_new.rst | 7 ++++ sklearn/covariance/robust_covariance.py | 18 +++++----- .../tests/test_robust_covariance.py | 34 +++++++++++++++++++ 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/doc/whats_new.rst b/doc/whats_new.rst index 32663b199833f..2122373628ea5 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -187,6 +187,12 @@ Enhancements Bug fixes ......... + + - Fixed a bug in :class:`sklearn.covariance.MinCovDet` where inputting data + that produced a singular covariance matrix would cause the helper method + `_c_step` to throw an exception. 
+ :issue:`3367` by :user:`Jeremy Steward ` + - Fixed a bug where :class:`sklearn.ensemble.IsolationForest` uses an an incorrect formula for the average path length :issue:`8549` by `Peter Wang `_. @@ -203,6 +209,7 @@ Bug fixes - Fixed a bug where :func:`sklearn.model_selection.BaseSearchCV.inverse_transform` returns self.best_estimator_.transform() instead of self.best_estimator_.inverse_transform() :issue:`8344` by :user:`Akshay Gupta ` + - Fixed same issue in :func:`sklearn.grid_search.BaseSearchCV.inverse_transform` :issue:`8846` by :user:`Rasmus Eriksson ` diff --git a/sklearn/covariance/robust_covariance.py b/sklearn/covariance/robust_covariance.py index 6115e6ada4e2c..428cb05928440 100644 --- a/sklearn/covariance/robust_covariance.py +++ b/sklearn/covariance/robust_covariance.py @@ -96,6 +96,7 @@ def _c_step(X, n_support, random_state, remaining_iterations=30, initial_estimates=None, verbose=False, cov_computation_method=empirical_covariance): n_samples, n_features = X.shape + dist = np.inf # Initialisation support = np.zeros(n_samples, dtype=bool) @@ -119,8 +120,14 @@ def _c_step(X, n_support, random_state, remaining_iterations=30, # Iterative procedure for Minimum Covariance Determinant computation det = fast_logdet(covariance) + # If the data already has singular covariance, calculate the precision, + # as the loop below will not be entered. 
+ if np.isinf(det): + precision = pinvh(covariance) + previous_det = np.inf - while (det < previous_det) and (remaining_iterations > 0): + while (det < previous_det and remaining_iterations > 0 + and not np.isinf(det)): # save old estimates values previous_location = location previous_covariance = covariance @@ -142,14 +149,9 @@ def _c_step(X, n_support, random_state, remaining_iterations=30, previous_dist = dist dist = (np.dot(X - location, precision) * (X - location)).sum(axis=1) - # Catch computation errors + # Check if best fit already found (det => 0, logdet => -inf) if np.isinf(det): - raise ValueError( - "Singular covariance matrix. " - "Please check that the covariance matrix corresponding " - "to the dataset is full rank and that MinCovDet is used with " - "Gaussian-distributed data (or at least data drawn from a " - "unimodal, symmetric distribution.") + results = location, covariance, det, support, dist # Check convergence if np.allclose(det, previous_det): # c_step procedure converged diff --git a/sklearn/covariance/tests/test_robust_covariance.py b/sklearn/covariance/tests/test_robust_covariance.py index 27e423b410210..b34db354177d4 100644 --- a/sklearn/covariance/tests/test_robust_covariance.py +++ b/sklearn/covariance/tests/test_robust_covariance.py @@ -4,6 +4,8 @@ # # License: BSD 3 clause +import itertools + import numpy as np from sklearn.utils.testing import assert_almost_equal @@ -92,6 +94,38 @@ def test_mcd_issue1127(): mcd.fit(X) +def test_mcd_issue3367(): + # Check that MCD completes when the covariance matrix is singular + # i.e. 
one of the rows and columns is all zeros + rand_gen = np.random.RandomState(0) + + # Think of these as the values for X and Y -> 10 values between -5 and 5 + data_values = np.linspace(-5, 5, 10).tolist() + # Get the cartesian product of all possible coordinate pairs from above set + data = np.array(list(itertools.product(data_values, data_values))) + + # Add a third column that's all zeros to make our data a set of points + # within a plane, which means that the covariance matrix will be singular + data = np.hstack((data, np.zeros((data.shape[0], 1)))) + + # The below line of code should not raise an exception, even though the + # covariance matrix is singular. As a further test, since we have points in XYZ, the + # principal components (Eigenvectors) of these directly relate to the + # geometry of the points. Since it's a plane, we should be able to test + # that the Eigenvector that corresponds to the smallest Eigenvalue is the + # plane normal, specifically [0, 0, 1], since everything is in the XY plane + # (as I've set it up above). To do this one would start by: + # + # evals, evecs = np.linalg.eigh(mcd_fit.covariance_) + # normal = evecs[:, np.argmin(evals)] + # + # After which we need to assert that our `normal` is equal to [0, 0, 1]. + # Do note that there is floating point error associated with this, so it's + # best to subtract the two and then compare some small tolerance (e.g. + # 1e-12). + MinCovDet(random_state=rand_gen).fit(data) + + def test_outlier_detection(): rnd = np.random.RandomState(0) X = rnd.randn(100, 10)