From d426a263588f3b38cda80ad2684f2abfdccc1787 Mon Sep 17 00:00:00 2001 From: Rob Hudson Date: Thu, 7 Sep 2017 15:22:04 -0700 Subject: [PATCH] Add Mann-Whitney U test for comparing histograms (bug 1395571) --- .gitignore | 3 ++ Dockerfile | 39 +++++++------- environment.yml | 1 + moztelemetry/stats.py | 115 ++++++++++++++++++++++++++++++++++++++++++ setup.py | 2 +- tests/test_stats.py | 115 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 257 insertions(+), 18 deletions(-) create mode 100644 moztelemetry/stats.py create mode 100644 tests/test_stats.py diff --git a/.gitignore b/.gitignore index 0f68958..4fdd213 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,7 @@ *.o *~ .DS_Store +.coverage +.eggs/ +python_moztelemetry.egg-info/ docs/_build/ diff --git a/Dockerfile b/Dockerfile index ab8d5c1..2b37d16 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,27 +1,32 @@ -FROM java:8 +FROM openjdk:8 # Versions of spark + hbase to use for our testing environment -ENV SPARK_VERSION=2.0.2 -ENV HBASE_VERSION=1.2.3 +ENV SPARK_VERSION=2.0.2 \ + HBASE_VERSION=1.2.3 # install gcc -RUN apt-get update && apt-get install -y g++ libpython-dev libsnappy-dev +RUN apt-get update --fix-missing && \ + apt-get install -y \ + g++ libpython-dev libsnappy-dev # setup conda environment - # temporary workaround, pin miniconda version until it's fixed. 
-RUN wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh -RUN bash miniconda.sh -b -p /miniconda -ENV PATH="/miniconda/bin:${PATH}" -RUN hash -r -RUN conda config --set always_yes yes --set changeps1 no -# TODO: uncomment -# RUN conda update -q conda -RUN conda info -a # Useful for debugging any issues with conda - -# install spark/hbase -RUN wget -nv https://archive.apache.org/dist/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz -RUN tar -zxf hbase-$HBASE_VERSION-bin.tar.gz +RUN echo "export PATH=/miniconda/bin:${PATH}" > /etc/profile.d/conda.sh && \ + wget --quiet https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh && \ + /bin/bash miniconda.sh -b -p /miniconda && \ + rm miniconda.sh + +ENV PATH /miniconda/bin:${PATH} + +RUN hash -r && \ + conda config --set always_yes yes --set changeps1 no && \ + # TODO: uncomment \ + # RUN conda update -q conda && \ + conda info -a # Useful for debugging any issues with conda + +# install hbase +RUN wget -nv https://archive.apache.org/dist/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz && \ + tar -zxf hbase-$HBASE_VERSION-bin.tar.gz # build + activate conda environment COPY ./environment.yml /python_moztelemetry/ diff --git a/environment.yml b/environment.yml index ed99bb2..1b2c182 100644 --- a/environment.yml +++ b/environment.yml @@ -5,3 +5,4 @@ dependencies: - pyspark - python-snappy - snappy +- scipy diff --git a/moztelemetry/stats.py b/moztelemetry/stats.py new file mode 100644 index 0000000..323938b --- /dev/null +++ b/moztelemetry/stats.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# encoding: utf-8 + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ +from __future__ import division + +import math +from collections import namedtuple + + +def _rank(sample): + """ + Assign numeric ranks to all values in the sample. + + The ranks begin with 1 for the smallest value. When there are groups of + tied values, assign a rank equal to the midpoint of unadjusted rankings. + + E.g.:: + + >>> _rank({3: 1, 5: 4, 9: 1}) + {3: 1.0, 5: 3.5, 9: 6.0} + + """ + rank = 1 + ranks = {} + + for k in sorted(sample.keys()): + n = sample[k] + ranks[k] = rank + (n - 1) / 2 + rank += n + + return ranks + + +def _tie_correct(sample): + """ + Returns the tie correction value for U. + + See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.tiecorrect.html + + """ + tc = 0 + n = sum(sample.values()) + + if n < 2: + return 1.0 # Avoid a ``ZeroDivisionError``. + + for k in sorted(sample.keys()): + tc += math.pow(sample[k], 3) - sample[k] + tc = 1 - tc / (math.pow(n, 3) - n) + + return tc + + +def ndtr(a): + """ + Returns the area under the Gaussian probability density function, + integrated from minus infinity to x. + + See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.ndtr.html#scipy.special.ndtr + + """ + sqrth = math.sqrt(2) / 2 + x = float(a) * sqrth + z = abs(x) + if z < sqrth: + y = 0.5 + 0.5 * math.erf(x) + else: + y = 0.5 * math.erfc(z) + if x > 0: + y = 1 - y + return y + + +mwu_result = namedtuple('Mann_Whitney_U', ('u', 'p')) + + +def mann_whitney_u(sample1, sample2, use_continuity=True): + """ + Computes the Mann-Whitney rank test on both samples. + + Each sample is expected to be of the form:: + + {1: 5, 2: 20, 3: 12, ...} + + Returns a named tuple with: + ``u`` equal to min(U for sample1, U for sample2), and + ``p`` equal to the p-value. + + """ + # Merge dictionaries, adding values if keys match. 
+ ranks = _rank(sample) + + sum_of_ranks = sum([sample1[k] * ranks[k] for k, v in sample1.items()]) + n1 = sum(sample1.values()) + n2 = sum(sample2.values()) + + # Calculate Mann-Whitney U for both samples. + u1 = sum_of_ranks - (n1 * (n1 + 1)) / 2 + u2 = n1 * n2 - u1 + + tie_correction = _tie_correct(sample) + sd_u = math.sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0) + mean_rank = n1 * n2 / 2.0 + 0.5 * use_continuity + z = abs((max(u1, u2) - mean_rank) / sd_u) + + return mwu_result(min(u1, u2), ndtr(-z)) diff --git a/setup.py b/setup.py index 976a689..e9fbdbe 100755 --- a/setup.py +++ b/setup.py @@ -22,5 +22,5 @@ setup_requires=['pytest-runner', 'setuptools_scm'], # put pytest last to workaround this bug # https://bitbucket.org/pypa/setuptools/issues/196/tests_require-pytest-pytest-cov-breaks - tests_require=['mock', 'pytest-timeout', 'moto', 'responses', 'pytest'], + tests_require=['mock', 'pytest-timeout', 'moto', 'responses', 'scipy', 'pytest'], ) diff --git a/tests/test_stats.py b/tests/test_stats.py new file mode 100644 index 0000000..d9c0919 --- /dev/null +++ b/tests/test_stats.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python +# encoding: utf-8 + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +""" +This module implements test coverage for the stats functions in stats.py. +""" +import itertools + +import numpy.random +import scipy.stats + +import pytest +from moztelemetry import stats + + +def l2d(values): + # Convert a list of values to a histogram representation. + d = {} + for v in values: + d[v] = d.get(v, 0) + 1 + return d + + +# A normally distributed sample set. +norm1 = list(numpy.random.normal(5, 3.25, 1000)) +norm2 = list(numpy.random.normal(6, 2.5, 1000)) + +# A uniformly distributed sample set. 
+uni1 = numpy.random.randint(1, 100, 1000) +uni2 = numpy.random.randint(10, 120, 900) + +# A skewed normal distribution. +skew1 = list(scipy.stats.skewnorm.rvs(10, size=1000)) +skew2 = list(scipy.stats.skewnorm.rvs(5, size=900)) + + +samples = { + 'normalized': (norm1, norm2), + 'uniform': (uni1, uni2), + 'skewed': (skew1, skew2), +} + + +def test_rank(): + assert stats._rank({1: 1}) == {1: 1.0} + assert stats._rank({1: 5, 2: 4, 3: 3, 4: 2, 5: 1}) == { + 1: 3.0, + 2: 7.5, + 3: 11.0, + 4: 13.5, + 5: 15.0, + } + + +def test_tie_correct(): + assert stats._tie_correct({}) == 1.0 + assert stats._tie_correct({1: 1}) == 1.0 + + +def test_ndtr(): + # Test invalid values raise an error. + with pytest.raises(TypeError): + stats.ndtr(None) + with pytest.raises(ValueError): + stats.ndtr('a') + + assert round(stats.ndtr(0), 6) == 0.5 + assert round(stats.ndtr(1), 6) == 0.841345 + assert round(stats.ndtr(2), 6) == 0.977250 + assert round(stats.ndtr(3), 6) == 0.998650 + + assert round(stats.ndtr(0), 6) == round(scipy.special.ndtr(0), 6) + assert round(stats.ndtr(1), 6) == round(scipy.special.ndtr(1), 6) + assert round(stats.ndtr(1.5), 6) == round(scipy.special.ndtr(1.5), 6) + assert round(stats.ndtr(2), 6) == round(scipy.special.ndtr(2), 6) + assert round(stats.ndtr(3), 6) == round(scipy.special.ndtr(3), 6) + + +def test_mann_whitney_u(): + distribution_types = ('normalized', 'uniform', 'skewed') + + # Test different distributions against each other, including + # like distributions against themselves. + for sample1, sample2 in itertools.product(distribution_types, repeat=2): + + arr1, arr2 = samples[sample1][0], samples[sample2][1] + hist1, hist2 = l2d(arr1), l2d(arr2) + + # Basic test, with defaults. + res = stats.mann_whitney_u(hist1, hist2) + sci = scipy.stats.mannwhitneyu(arr1, arr2) + assert res.u == sci.statistic + assert round(res.p, 6) == round(sci.pvalue, 6) + + # Test that order of samples doesn't matter. 
+ res = stats.mann_whitney_u(hist2, hist1) + sci = scipy.stats.mannwhitneyu(arr1, arr2) + assert res.u == sci.statistic + assert round(res.p, 6) == round(sci.pvalue, 6) + + # Test exact same samples. + res = stats.mann_whitney_u(hist1, hist1) + sci = scipy.stats.mannwhitneyu(arr1, arr1) + assert res.u == sci.statistic + assert round(res.p, 6) == round(sci.pvalue, 6) + + # Test with use_continuity = False. + res = stats.mann_whitney_u(hist1, hist2, False) + sci = scipy.stats.mannwhitneyu(arr1, arr2, False) + assert res.u == sci.statistic + assert round(res.p, 6) == round(sci.pvalue, 6)