From d426a263588f3b38cda80ad2684f2abfdccc1787 Mon Sep 17 00:00:00 2001 From: Rob Hudson Date: Thu, 7 Sep 2017 15:22:04 -0700 Subject: [PATCH] Add Mann-Whitney U test for comparing histograms (bug 1395571) --- .gitignore | 3 ++ Dockerfile | 39 +++++++------- environment.yml | 1 + moztelemetry/stats.py | 115 ++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- tests/test_stats.py | 115 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 257 insertions(+), 18 deletions(-) create mode 100644 moztelemetry/stats.py create mode 100644 tests/test_stats.py diff --git a/.gitignore b/.gitignore index 0f68958..4fdd213 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,7 @@ *.o *~ .DS_Store +.coverage +.eggs/ +python_moztelemetry.egg-info/ docs/_build/ diff --git a/Dockerfile b/Dockerfile index ab8d5c1..2b37d16 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,27 +1,32 @@ -FROM java:8 +FROM openjdk:8 # Versions of spark + hbase to use for our testing environment -ENV SPARK_VERSION=2.0.2 -ENV HBASE_VERSION=1.2.3 +ENV SPARK_VERSION=2.0.2 \ + HBASE_VERSION=1.2.3 # install gcc -RUN apt-get update && apt-get install -y g++ libpython-dev libsnappy-dev +RUN apt-get update --fix-missing && \ + apt-get install -y \ + g++ libpython-dev libsnappy-dev # setup conda environment - # temporary workaround, pin miniconda version until it's fixed. 
-RUN wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh -RUN bash miniconda.sh -b -p /miniconda -ENV PATH="/miniconda/bin:${PATH}" -RUN hash -r -RUN conda config --set always_yes yes --set changeps1 no -# TODO: uncomment -# RUN conda update -q conda -RUN conda info -a # Useful for debugging any issues with conda - -# install spark/hbase -RUN wget -nv https://archive.apache.org/dist/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz -RUN tar -zxf hbase-$HBASE_VERSION-bin.tar.gz +RUN echo "export PATH=/miniconda/bin:${PATH}" > /etc/profile.d/conda.sh && \ + wget --quiet https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh && \ + /bin/bash miniconda.sh -b -p /miniconda && \ + rm miniconda.sh + +ENV PATH /miniconda/bin:${PATH} + +RUN hash -r && \ + conda config --set always_yes yes --set changeps1 no && \ + # TODO: uncomment \ + # RUN conda update -q conda && \ + conda info -a # Useful for debugging any issues with conda + +# install hbase +RUN wget -nv https://archive.apache.org/dist/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz && \ + tar -zxf hbase-$HBASE_VERSION-bin.tar.gz # build + activate conda environment COPY ./environment.yml /python_moztelemetry/ diff --git a/environment.yml b/environment.yml index ed99bb2..1b2c182 100644 --- a/environment.yml +++ b/environment.yml @@ -5,3 +5,4 @@ dependencies: - pyspark - python-snappy - snappy +- scipy diff --git a/moztelemetry/stats.py b/moztelemetry/stats.py new file mode 100644 index 0000000..323938b --- /dev/null +++ b/moztelemetry/stats.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# encoding: utf-8 + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +from __future__ import division + +import math +from collections import namedtuple + + +def _rank(sample): + """ + Assign numeric ranks to all values in the sample. + + The ranks begin with 1 for the smallest value. When there are groups of + tied values, assign a rank equal to the midpoint of unadjusted rankings. + + E.g.:: + + >>> _rank({3: 1, 5: 4, 9: 1}) + {3: 1.0, 5: 3.5, 9: 6.0} + + """ + rank = 1 + ranks = {} + + for k in sorted(sample.keys()): + n = sample[k] + ranks[k] = rank + (n - 1) / 2 + rank += n + + return ranks + + +def _tie_correct(sample): + """ + Returns the tie correction value for U. + + See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.tiecorrect.html + + """ + tc = 0 + n = sum(sample.values()) + + if n < 2: + return 1.0 # Avoid a ``ZeroDivisionError``. + + for k in sorted(sample.keys()): + tc += math.pow(sample[k], 3) - sample[k] + tc = 1 - tc / (math.pow(n, 3) - n) + + return tc + + +def ndtr(a): + """ + Returns the area under the Gaussian probability density function, + integrated from minus infinity to x. + + See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.ndtr.html#scipy.special.ndtr + + """ + sqrth = math.sqrt(2) / 2 + x = float(a) * sqrth + z = abs(x) + if z < sqrth: + y = 0.5 + 0.5 * math.erf(x) + else: + y = 0.5 * math.erfc(z) + if x > 0: + y = 1 - y + return y + + +mwu_result = namedtuple('Mann_Whitney_U', ('u', 'p')) + + +def mann_whitney_u(sample1, sample2, use_continuity=True): + """ + Computes the Mann-Whitney rank test on both samples. + + Each sample is expected to be of the form:: + + {1: 5, 2: 20, 3: 12, ...} + + Returns a named tuple with: + ``u`` equal to min(U for sample1, U for sample2), and + ``p`` equal to the p-value. + + """ + # Merge dictionaries, adding values if keys match. 
+ ranks = _rank(sample) + + sum_of_ranks = sum([sample1[k] * ranks[k] for k, v in sample1.items()]) + n1 = sum(sample1.values()) + n2 = sum(sample2.values()) + + # Calculate Mann-Whitney U for both samples. + u1 = sum_of_ranks - (n1 * (n1 + 1)) / 2 + u2 = n1 * n2 - u1 + + tie_correction = _tie_correct(sample) + sd_u = math.sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0) + mean_rank = n1 * n2 / 2.0 + 0.5 * use_continuity + z = abs((max(u1, u2) - mean_rank) / sd_u) + + return mwu_result(min(u1, u2), ndtr(-z)) diff --git a/setup.py b/setup.py index 976a689..e9fbdbe 100755 --- a/setup.py +++ b/setup.py @@ -22,5 +22,5 @@ setup_requires=['pytest-runner', 'setuptools_scm'], # put pytest last to workaround this bug # https://bitbucket.org/pypa/setuptools/issues/196/tests_require-pytest-pytest-cov-breaks - tests_require=['mock', 'pytest-timeout', 'moto', 'responses', 'pytest'], + tests_require=['mock', 'pytest-timeout', 'moto', 'responses', 'scipy', 'pytest'], ) diff --git a/tests/test_stats.py b/tests/test_stats.py new file mode 100644 index 0000000..d9c0919 --- /dev/null +++ b/tests/test_stats.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# encoding: utf-8 + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +This module implements test coverage for the stats functions in stats.py. +""" +import itertools + +import numpy.random +import scipy.stats + +import pytest +from moztelemetry import stats + + +def l2d(values): + # Convert a list of values to a histogram representation. + d = {} + for v in values: + d[v] = d.get(v, 0) + 1 + return d + + +# A normally distributed sample set. +norm1 = list(numpy.random.normal(5, 3.25, 1000)) +norm2 = list(numpy.random.normal(6, 2.5, 1000)) + +# A uniformly distributed sample set. 
+uni1 = numpy.random.randint(1, 100, 1000) +uni2 = numpy.random.randint(10, 120, 900) + +# A skewed normal distribution. +skew1 = list(scipy.stats.skewnorm.rvs(10, size=1000)) +skew2 = list(scipy.stats.skewnorm.rvs(5, size=900)) + + +samples = { + 'normalized': (norm1, norm2), + 'uniform': (uni1, uni2), + 'skewed': (skew1, skew2), +} + + +def test_rank(): + assert stats._rank({1: 1}) == {1: 1.0} + assert stats._rank({1: 5, 2: 4, 3: 3, 4: 2, 5: 1}) == { + 1: 3.0, + 2: 7.5, + 3: 11.0, + 4: 13.5, + 5: 15.0, + } + + +def test_tie_correct(): + assert stats._tie_correct({}) == 1.0 + assert stats._tie_correct({1: 1}) == 1.0 + + +def test_ndtr(): + # Test invalid values raise an error. + with pytest.raises(TypeError): + stats.ndtr(None) + with pytest.raises(ValueError): + stats.ndtr('a') + + assert round(stats.ndtr(0), 6) == 0.5 + assert round(stats.ndtr(1), 6) == 0.841345 + assert round(stats.ndtr(2), 6) == 0.977250 + assert round(stats.ndtr(3), 6) == 0.998650 + + assert round(stats.ndtr(0), 6) == round(scipy.special.ndtr(0), 6) + assert round(stats.ndtr(1), 6) == round(scipy.special.ndtr(1), 6) + assert round(stats.ndtr(1.5), 6) == round(scipy.special.ndtr(1.5), 6) + assert round(stats.ndtr(2), 6) == round(scipy.special.ndtr(2), 6) + assert round(stats.ndtr(3), 6) == round(scipy.special.ndtr(3), 6) + + +def test_mann_whitney_u(): + distribution_types = ('normalized', 'uniform', 'skewed') + + # Test different distributions against each other, including + # like distributions against themselves. + for sample1, sample2 in itertools.product(distribution_types, repeat=2): + + arr1, arr2 = samples[sample1][0], samples[sample2][1] + hist1, hist2 = l2d(arr1), l2d(arr2) + + # Basic test, with defaults. + res = stats.mann_whitney_u(hist1, hist2) + sci = scipy.stats.mannwhitneyu(arr1, arr2) + assert res.u == sci.statistic + assert round(res.p, 6) == round(sci.pvalue, 6) + + # Test that order of samples doesn't matter. 
+ res = stats.mann_whitney_u(hist2, hist1) + sci = scipy.stats.mannwhitneyu(arr1, arr2) + assert res.u == sci.statistic + assert round(res.p, 6) == round(sci.pvalue, 6) + + # Test exact same samples. + res = stats.mann_whitney_u(hist1, hist1) + sci = scipy.stats.mannwhitneyu(arr1, arr1) + assert res.u == sci.statistic + assert round(res.p, 6) == round(sci.pvalue, 6) + + # Test with use_continuity = False. + res = stats.mann_whitney_u(hist1, hist2, False) + sci = scipy.stats.mannwhitneyu(arr1, arr2, False) + assert res.u == sci.statistic + assert round(res.p, 6) == round(sci.pvalue, 6)