Merge pull request #647 from danibene/feature/expand_outlier_identifi…

…cation [Feature] multiple methods of outlier identification
neuropsychology · Jun 1, 2022 · cf3535d · cf3535d
2 parents 59fce35 + 451dec1
commit cf3535d
Show file tree

Hide file tree

Showing 2 changed files with 86 additions and 15 deletions.
diff --git a/NEWS.rst b/NEWS.rst
@@ -15,6 +15,7 @@ New Features
 +++++++++++++
 
 * Add new time-domain measures in `hrv_time()`: `Prc20NN`, `Prc80NN`, `MinNN`, and `MaxNN`
+* Add argument `method` in `find_outliers()` to choose how outliers are identified (`"sd"`, `"norm"` or `"percentile"`)
 
 Fixes
 +++++++++++++

diff --git a/neurokit2/misc/find_outliers.py b/neurokit2/misc/find_outliers.py
@@ -4,44 +4,114 @@
 from ..stats import standardize
 
 
-def find_outliers(data, exclude=0.05, side="both"):
+def find_outliers(data, exclude=2, side="both", method="sd", **kwargs):
     """**Identify outliers (abnormal values)**
 
-    Extreme values identification.
+    Extreme values identification using different methods, such as:
+
+    * **sd**: Data is :func:`standardized <.standardize>`, i.e., centered and
+      scaled, and values whose absolute value exceeds a certain SD threshold are considered outliers.
+    * **norm**: Data is standardized and values beyond a certain theoretical percentile of the
+      normal distribution are considered outliers (assuming the data is normally distributed).
+      For example, with this method, ``exclude=0.025`` (one-sided) corresponds to the 2.5% lower
+      bound of the normal distribution, which corresponds to approx. -1.96 SD. This method is
+      related to the **SD** one, but instead of specifying the threshold in SDs, it is specified in
+      percentiles.
+    * **percentile**: Extreme values identified using percentiles.
 
     Parameters
     ----------
     data : list or ndarray
         Data array
     exclude : int, float
-        Proportion of extreme observation to be excluded.
+        Amount of outliers to detect (depends on the chosen method).
     side: str
-        Can be ``"both"``, ``"left"`` or ``"right"``. If ``exclude=0.05`` and ``side="both"``, 2.5%
-        of extreme observation of each side will be marked as outliers.
+        Can be ``"both"``, ``"left"`` or ``"right"``. If ``exclude=0.05`` and ``side="both"`` and
+        ``method="norm"``, 2.5% of extreme observations on each side will be marked as outliers.
+    method: str
+        Can be ``"sd"``, ``"norm"`` or ``"percentile"``. The default is ``"sd"``.
+    **kwargs : optional
+        Other arguments to be passed to :func:`standardize`.
 
     Returns
     ----------
     outliers : ndarray
-        A list of True/False with True being the outliers.
+        A boolean vector with ``True`` marking the outliers.
+
+    See Also
+    ----------
+    .standardize
 
     Example
     ----------
     .. ipython:: python
 
       import neurokit2 as nk
-      outliers = nk.find_outliers([1, 2, 1, 5, 666, 4, 1 ,3, 5])
-      outliers
+
+      data = [-12, 2, 1, 3, 66.6, 2, 1, 3, 2, -42, 2, 4, 1, 12]
+
+      # Outliers beyond 2 SD of the mean
+      outliers = nk.find_outliers(data, exclude=2, side="both", method="sd")
+      np.where(outliers)[0]
+
+      # Outliers beyond 1 MAD of the Median on one side
+      outliers = nk.find_outliers(data, exclude=1, side="left", method="sd", robust=True)
+      np.where(outliers)[0]
+
+      # 2.5% theoretical percentiles on each side
+      outliers = nk.find_outliers(data, exclude=0.05, method="norm")
+      np.where(outliers)[0]
+
+      # Outliers are beyond interquartile range
+      outliers = nk.find_outliers(data, exclude=(0.25, 0.75), method="percentile")
+      np.where(outliers)[0]
+
+      # Outliers are beyond interdecile range
+      outliers = nk.find_outliers(data, exclude=(0.1, 0.9), method="percentile")
+      np.where(outliers)[0]
 
     """
+    # Sanity checks
+    if side not in ["both", "left", "right"]:
+        raise ValueError("side must be 'both', 'left' or 'right'.")
+
+    method = method.lower()
+    if method not in ["standardize", "z", "sd", "percentile", "norm"]:
+        raise ValueError("method must be 'standardize' or 'percentile'.")
+
+    # Force array
+    data = np.array(data)
+
+    # Find thresholds
+    if method in ["percentile"]:
+        if isinstance(exclude, (list, tuple, np.ndarray)):
+            right = np.percentile(data, exclude[1] * 100)
+            left = np.percentile(data, exclude[0] * 100)
+        else:
+            right = np.percentile(data, (1 - (exclude / 2)) * 100)
+            left = np.percentile(data, (exclude / 2) * 100)
+
+    elif method in ["sd"]:
+        if isinstance(exclude, (list, tuple, np.ndarray)):
+            right = exclude[1]
+            left = exclude[0]
+        else:
+            right = exclude
+            left = -right
+    else:
+        if side == "both":
+            exclude = exclude / 2
+        right = scipy.stats.norm.ppf(1 - exclude)
+        left = -right
+
+    if method in ["standardize", "z", "sd", "norm"]:
+        data = np.array(standardize(data, **kwargs))
 
-    z = np.array(standardize(data))
     if side == "both":
-        outliers = abs(z) > scipy.stats.norm.ppf(1 - (exclude / 2))
+        outliers = (data < left) | (data > right)
     elif side == "left":
-        outliers = z < -scipy.stats.norm.ppf(1 - exclude)
+        outliers = data < left
     elif side == "right":
-        outliers = z > scipy.stats.norm.ppf(1 - exclude)
-    else:
-        raise ValueError("side must be 'both', 'left' or 'right'.")
+        outliers = data > right
 
-    return np.array(outliers)
+    return outliers