Skip to content
This repository was archived by the owner on Jan 22, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,7 @@
*.o
*~
.DS_Store
.coverage
.eggs/
python_moztelemetry.egg-info/
docs/_build/
39 changes: 22 additions & 17 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,27 +1,32 @@
FROM java:8
FROM openjdk:8

# Versions of spark + hbase to use for our testing environment
ENV SPARK_VERSION=2.0.2
ENV HBASE_VERSION=1.2.3
ENV SPARK_VERSION=2.0.2 \
HBASE_VERSION=1.2.3

# install gcc
RUN apt-get update && apt-get install -y g++ libpython-dev libsnappy-dev
RUN apt-get update --fix-missing && \
apt-get install -y \
g++ libpython-dev libsnappy-dev

# setup conda environment

# temporary workaround, pin miniconda version until it's fixed.
RUN wget https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh
RUN bash miniconda.sh -b -p /miniconda
ENV PATH="/miniconda/bin:${PATH}"
RUN hash -r
RUN conda config --set always_yes yes --set changeps1 no
# TODO: uncomment
# RUN conda update -q conda
RUN conda info -a # Useful for debugging any issues with conda

# install spark/hbase
RUN wget -nv https://archive.apache.org/dist/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz
RUN tar -zxf hbase-$HBASE_VERSION-bin.tar.gz
RUN echo "export PATH=/miniconda/bin:${PATH}" > /etc/profile.d/conda.sh && \
wget --quiet https://repo.continuum.io/miniconda/Miniconda2-4.3.21-Linux-x86_64.sh -O miniconda.sh && \
/bin/bash miniconda.sh -b -p /miniconda && \
rm miniconda.sh

ENV PATH /miniconda/bin:${PATH}

RUN hash -r && \
conda config --set always_yes yes --set changeps1 no && \
# TODO: uncomment \
# RUN conda update -q conda && \
conda info -a # Useful for debugging any issues with conda

# install hbase
RUN wget -nv https://archive.apache.org/dist/hbase/$HBASE_VERSION/hbase-$HBASE_VERSION-bin.tar.gz && \
tar -zxf hbase-$HBASE_VERSION-bin.tar.gz

# build + activate conda environment
COPY ./environment.yml /python_moztelemetry/
Expand Down
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ dependencies:
- pyspark
- python-snappy
- snappy
- scipy
115 changes: 115 additions & 0 deletions moztelemetry/stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python
# encoding: utf-8

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from __future__ import division

import math
from collections import namedtuple


def _rank(sample):
"""
Assign numeric ranks to all values in the sample.

The ranks begin with 1 for the smallest value. When there are groups of
tied values, assign a rank equal to the midpoint of unadjusted rankings.

E.g.::

>>> rank({3: 1, 5: 4, 9: 1})
{3: 1.0, 5: 3.5, 9: 6.0}

"""
rank = 1
ranks = {}

for k in sorted(sample.keys()):
n = sample[k]
ranks[k] = rank + (n - 1) / 2
rank += n

return ranks


def _tie_correct(sample):
"""
Returns the tie correction value for U.

See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.tiecorrect.html

"""
tc = 0
n = sum(sample.values())

if n < 2:
return 1.0 # Avoid a ``ZeroDivisionError``.

for k in sorted(sample.keys()):
tc += math.pow(sample[k], 3) - sample[k]
tc = 1 - tc / (math.pow(n, 3) - n)

return tc


def ndtr(a):
    """
    Return the area under the Gaussian probability density function,
    integrated from minus infinity to ``a``.

    See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.special.ndtr.html#scipy.special.ndtr

    """
    sqrth = math.sqrt(2) / 2
    x = float(a) * sqrth
    z = abs(x)

    # Near zero, erf is accurate directly; in the tails, erfc avoids
    # catastrophic cancellation.
    if z < sqrth:
        return 0.5 + 0.5 * math.erf(x)

    tail = 0.5 * math.erfc(z)
    return 1 - tail if x > 0 else tail


mwu_result = namedtuple('Mann_Whitney_U', ('u', 'p'))


def mann_whitney_u(sample1, sample2, use_continuity=True):
    """
    Computes the Mann-Whitney rank test on both samples.

    Each sample is a histogram mapping a value to its count, e.g.::

        {1: 5, 2: 20, 3: 12, ...}

    :param sample1: histogram dict for the first sample.
    :param sample2: histogram dict for the second sample.
    :param use_continuity: when true, apply the 0.5 continuity
        correction to the mean rank (scipy's default behaviour).
    :returns: a named tuple with:
        ``u`` equal to min(U for sample1, U for sample2), and
        ``p`` equal to the p-value.

    """
    # Merge the histograms, adding counts when the same value occurs
    # in both samples.
    sample = sample1.copy()
    for k, v in sample2.items():
        sample[k] = sample.get(k, 0) + v

    # Create a ranking dictionary using same keys for lookups.
    ranks = _rank(sample)

    # Sum of ranks for sample1: each value contributes its rank once
    # per occurrence.  Use the iterated count directly instead of a
    # redundant second dict lookup, and feed sum() a generator rather
    # than building an intermediate list.
    sum_of_ranks = sum(count * ranks[k] for k, count in sample1.items())
    n1 = sum(sample1.values())
    n2 = sum(sample2.values())

    # Calculate Mann-Whitney U for both samples; u1 + u2 == n1 * n2.
    u1 = sum_of_ranks - (n1 * (n1 + 1)) / 2
    u2 = n1 * n2 - u1

    # Normal approximation with tie correction, as in scipy.
    tie_correction = _tie_correct(sample)
    sd_u = math.sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0)
    mean_rank = n1 * n2 / 2.0 + 0.5 * use_continuity
    z = abs((max(u1, u2) - mean_rank) / sd_u)

    return mwu_result(min(u1, u2), ndtr(-z))
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@
setup_requires=['pytest-runner', 'setuptools_scm'],
# put pytest last to workaround this bug
# https://bitbucket.org/pypa/setuptools/issues/196/tests_require-pytest-pytest-cov-breaks
tests_require=['mock', 'pytest-timeout', 'moto', 'responses', 'pytest'],
tests_require=['mock', 'pytest-timeout', 'moto', 'responses', 'scipy', 'pytest'],
)
115 changes: 115 additions & 0 deletions tests/test_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
#!/usr/bin/env python
# encoding: utf-8

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
This module implements test coverage for the stats functions in stats.py.
"""
import itertools
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add more tests, with some different distributions? We could add, for example:

  • Normally distributed histograms
  • Skewed distributions
  • Uniformly distributed histograms

And combinations thereof. We can create them randomly, and check that our result is within some tolerance of scipy's.


import numpy.random
import scipy.stats

import pytest
from moztelemetry import stats


def l2d(values):
    """Convert a list of values to a histogram representation."""
    # The original hand-rolled this counting loop;
    # collections.Counter is the stdlib histogram.
    from collections import Counter
    return dict(Counter(values))


# Fixture data: three pairs of sample sets, one pair per distribution
# shape, used by the cross-distribution tests below.

# Normally distributed samples.
norm1 = list(numpy.random.normal(5, 3.25, 1000))
norm2 = list(numpy.random.normal(6, 2.5, 1000))

# Uniformly distributed samples.
uni1 = numpy.random.randint(1, 100, 1000)
uni2 = numpy.random.randint(10, 120, 900)

# Skew-normal samples.
skew1 = list(scipy.stats.skewnorm.rvs(10, size=1000))
skew2 = list(scipy.stats.skewnorm.rvs(5, size=900))

samples = dict(
    normalized=(norm1, norm2),
    uniform=(uni1, uni2),
    skewed=(skew1, skew2),
)


def test_rank():
    # A singleton sample gets rank 1.
    assert stats._rank({1: 1}) == {1: 1.0}

    # Groups of tied values receive the midpoint of the unadjusted
    # ranks they occupy.
    expected = {
        1: 3.0,
        2: 7.5,
        3: 11.0,
        4: 13.5,
        5: 15.0,
    }
    assert stats._rank({1: 5, 2: 4, 3: 3, 4: 2, 5: 1}) == expected


def test_tie_correct():
    # Degenerate samples (fewer than two observations) need no
    # correction and must return exactly 1.0.
    for degenerate in ({}, {1: 1}):
        assert stats._tie_correct(degenerate) == 1.0


def test_ndtr():
    # The module only imports scipy.stats; scipy.special was reachable
    # solely because scipy.stats happens to import it transitively.
    # Import it explicitly so this test does not depend on scipy
    # internals.
    import scipy.special

    # Test invalid values raise an error.
    with pytest.raises(TypeError):
        stats.ndtr(None)
    with pytest.raises(ValueError):
        stats.ndtr('a')

    # Known values of the standard normal CDF.
    assert round(stats.ndtr(0), 6) == 0.5
    assert round(stats.ndtr(1), 6) == 0.841345
    assert round(stats.ndtr(2), 6) == 0.977250
    assert round(stats.ndtr(3), 6) == 0.998650

    # Agreement with scipy's reference implementation.
    for x in (0, 1, 1.5, 2, 3):
        assert round(stats.ndtr(x), 6) == round(scipy.special.ndtr(x), 6)


def test_mann_whitney_u():
    distribution_types = ('normalized', 'uniform', 'skewed')

    def check(res, sci):
        # The U statistic must match scipy exactly; the p-value to six
        # decimal places.
        assert res.u == sci.statistic
        assert round(res.p, 6) == round(sci.pvalue, 6)

    # Test different distributions against each other, including
    # like distributions against themselves.
    for name1, name2 in itertools.product(distribution_types, repeat=2):

        arr1 = samples[name1][0]
        arr2 = samples[name2][1]
        hist1 = l2d(arr1)
        hist2 = l2d(arr2)

        # Basic test, with defaults.
        check(stats.mann_whitney_u(hist1, hist2),
              scipy.stats.mannwhitneyu(arr1, arr2))

        # Test that order of samples doesn't matter.
        check(stats.mann_whitney_u(hist2, hist1),
              scipy.stats.mannwhitneyu(arr1, arr2))

        # Test exact same samples.
        check(stats.mann_whitney_u(hist1, hist1),
              scipy.stats.mannwhitneyu(arr1, arr1))

        # Test with use_continuity = False.
        check(stats.mann_whitney_u(hist1, hist2, False),
              scipy.stats.mannwhitneyu(arr1, arr2, False))