Skip to content
Browse files

Merge pull request #325 from utaal/descriptiveStatistics

SERVER-7592 descriptive statistics estimators
  • Loading branch information...
2 parents 3d12b1b + 6c82e1b commit 0027639682ee13763b75f15007710f371574131c @RedBeard0531 RedBeard0531 committed
View
4 src/mongo/SConscript
@@ -92,6 +92,10 @@ env.CppUnitTest('namespacestring_test', ['db/namespacestring_test.cpp'],
env.CppUnitTest('bson_extract_test', ['bson/util/bson_extract_test.cpp'], LIBDEPS=['bson'])
+env.CppUnitTest('descriptive_stats_test',
+ ['util/descriptive_stats_test.cpp'],
+ LIBDEPS=['foundation', 'bson']);
+
commonFiles = [ "pch.cpp",
"buildinfo.cpp",
"db/hasher.cpp",
View
22 src/mongo/unittest/unittest.h
@@ -22,6 +22,7 @@
#pragma once
+#include <cmath>
#include <sstream>
#include <string>
#include <vector>
@@ -77,6 +78,13 @@
#a, #b , __FILE__ , __LINE__ ).assert##COMPARISON( (a), (b) )
/**
+ * Approximate equality assertion. Useful for comparisons on limited precision floating point
+ * values.
+ */
+#define ASSERT_APPROX_EQUAL(a,b,ABSOLUTE_ERR) ::mongo::unittest::assertApproxEqual( \
+ #a, #b, a, b, ABSOLUTE_ERR, __FILE__, __LINE__)
+
+/**
* Verify that the evaluation of "EXPRESSION" throws an exception of type EXCEPTION_TYPE.
*
* If "EXPRESSION" throws no exception, or one that is neither of type "EXCEPTION_TYPE" nor
@@ -408,6 +416,20 @@ namespace mongo {
};
/**
+ * Helper for ASSERT_APPROX_EQUAL to ensure that the arguments are evaluated only once.
+ */
+ template < typename A, typename B, typename ERR >
+ inline void assertApproxEqual(const std::string& aexp, const std::string& bexp,
+ const A& a, const B& b, const ERR& absoluteErr,
+ const std::string& file , unsigned line) {
+ if (std::abs(a - b) <= absoluteErr)
+ return;
+ TestAssertion(file, line).fail(mongoutils::str::stream()
+ << "Expected " << aexp << " and " << bexp << " to be within " << absoluteErr
+ << " of each other ((" << a << ") - (" << b << ") = " << (a - b) << ")");
+ }
+
+ /**
* Hack to support the runaway test observer in dbtests. This is a hook that
* unit test running harnesses (unittest_main and dbtests) must implement.
*/
View
9 src/mongo/unittest/unittest_test.cpp
@@ -8,6 +8,7 @@
#include "mongo/unittest/unittest.h"
+#include <limits>
#include <string>
namespace {
@@ -33,6 +34,7 @@ namespace {
ASSERT_GREATER_THAN(5, 1);
ASSERT_GREATER_THAN_OR_EQUALS(5, 1);
ASSERT_GREATER_THAN_OR_EQUALS(5, 5);
+ ASSERT_APPROX_EQUAL(5, 6, 1);
}
TEST(UnitTestSelfTest, TestNumericComparisonFailures) {
@@ -42,6 +44,13 @@ namespace {
ASSERT_TEST_FAILS(ASSERT_GREATER_THAN(10, 10LL));
ASSERT_TEST_FAILS(ASSERT_NOT_LESS_THAN(9, 10LL));
ASSERT_TEST_FAILS(ASSERT_NOT_GREATER_THAN(1, 0LL));
+ ASSERT_TEST_FAILS(ASSERT_APPROX_EQUAL(5.0, 6.1, 1));
+ if (std::numeric_limits<double>::has_quiet_NaN) {
+ ASSERT_TEST_FAILS(ASSERT_APPROX_EQUAL(5, std::numeric_limits<double>::quiet_NaN(), 1));
+ }
+ if (std::numeric_limits<double>::has_infinity) {
+ ASSERT_TEST_FAILS(ASSERT_APPROX_EQUAL(5, std::numeric_limits<double>::infinity(), 1));
+ }
}
TEST(UnitTestSelfTest, TestStringComparisons) {
View
213 src/mongo/util/descriptive_stats-inl.h
@@ -0,0 +1,213 @@
+/* Copyright 2012 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ *
+ * Based upon boost.accumulators (www.boost.org/libs/accumulators/),
+ * distributed under the Boost Software License, Version 1.0.
+ * See distrc/THIRD_PARTY_NOTICES for the full License Notice for Boost.
+ *
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <limits>
+
+#include "mongo/util/mongoutils/str.h"
+
+namespace mongo {
+
+ template <class Sample>
+ BasicEstimators<Sample>::BasicEstimators() :
+ _count(0),
+ _sum(0),
+ _diff(0),
+ _min(std::numeric_limits<Sample>::max()),
+ _max(std::numeric_limits<Sample>::min()) {
+
+ }
+
+ template <class Sample>
+ BasicEstimators<Sample>& BasicEstimators<Sample>::operator <<(const Sample sample) {
+ const double oldMean = (_count > 0) ? _sum / _count : 0;
+ const double delta = oldMean - static_cast<double>(sample);
+ const double weight = static_cast<double>(_count) / (_count + 1);
+ _diff += delta * delta * weight;
+ _sum += static_cast<double>(sample);
+ _count++;
+ _min = std::min(sample, _min);
+ _max = std::max(sample, _max);
+ return *this;
+ }
+
+ template <class Sample>
+ void BasicEstimators<Sample>::appendBasicToBSONObjBuilder(BSONObjBuilder& b) const {
+ b << "count" << static_cast<long long>(count())
+ << "mean" << mean()
+ << "stddev" << stddev()
+ << "min" << min()
+ << "max" << max();
+ }
+
+ template <std::size_t NumQuantiles>
+ DistributionEstimators<NumQuantiles>::DistributionEstimators() :
+ _count(0) {
+
+ for(std::size_t i = 0; i < NumMarkers; i++) {
+ _actual_positions[i] = i + 1;
+ }
+
+ for(std::size_t i = 0; i < NumMarkers; i++) {
+ _desired_positions[i] = 1.0 + (2.0 * (NumQuantiles + 1.0) * _positions_increments(i));
+ }
+ }
+
+ /*
+ * The quantile estimation follows the extended_p_square implementation in boost.accumulators.
+ * It differs by removing the ability to request arbitrary quantiles and computing exactly
+ * 'NumQuantiles' equidistant quantiles (plus minimum and maximum) instead.
+ * See http://www.boost.org/doc/libs/1_51_0/doc/html/boost/accumulators/impl/extended_p_square_impl.html ,
+ * R. Jain and I. Chlamtac, The P^2 algorithmus for dynamic calculation of quantiles and histograms without storing observations, Communications of the ACM, Volume 28 (October), Number 10, 1985, p. 1076-1085. and
+ * K. E. E. Raatikainen, Simultaneous estimation of several quantiles, Simulation, Volume 49, Number 4 (October), 1986, p. 159-164.
+ */
+ template <std::size_t NumQuantiles>
+ DistributionEstimators<NumQuantiles>&
+ DistributionEstimators<NumQuantiles>::operator <<(const double sample) {
+
+ // first accumulate num_markers samples
+ if (_count++ < NumMarkers) {
+ _heights[_count - 1] = sample;
+
+ if (_count == NumMarkers)
+ {
+ std::sort(_heights, _heights + NumMarkers);
+ }
+ }
+ else {
+ std::size_t sample_cell = 1;
+
+ // find cell k = sample_cell such that heights[k-1] <= sample < heights[k]
+ if(sample < _heights[0])
+ {
+ _heights[0] = sample;
+ sample_cell = 1;
+ }
+ else if (sample >= _heights[NumMarkers - 1])
+ {
+ _heights[NumMarkers - 1] = sample;
+ sample_cell = NumMarkers - 1;
+ }
+ else {
+ double* it = std::upper_bound(_heights,
+ _heights + NumMarkers,
+ sample);
+
+ sample_cell = std::distance(_heights, it);
+ }
+
+ // update actual positions of all markers above sample_cell index
+ for(std::size_t i = sample_cell; i < NumMarkers; i++) {
+ _actual_positions[i]++;
+ }
+
+ // update desired positions of all markers
+ for(std::size_t i = 0; i < NumMarkers; i++) {
+ _desired_positions[i] += _positions_increments(i);
+ }
+
+ // adjust heights and actual positions of markers 1 to num_markers-2 if necessary
+ for(std::size_t i = 1; i <= NumMarkers - 2; i++) {
+ // offset to desired position
+ double d = _desired_positions[i] - _actual_positions[i];
+
+ // offset to next position
+ double dp = _actual_positions[i + 1] - _actual_positions[i];
+
+ // offset to previous position
+ double dm = _actual_positions[i - 1] - _actual_positions[i];
+
+ // height ds
+ double hp = (_heights[i + 1] - _heights[i]) / dp;
+ double hm = (_heights[i - 1] - _heights[i]) / dm;
+
+ if((d >= 1 && dp > 1) || (d <= -1 && dm < -1))
+ {
+ short sign_d = static_cast<short>(d / std::abs(d));
+
+ double h = _heights[i] + sign_d / (dp - dm) * ((sign_d - dm)*hp
+ + (dp - sign_d) * hm);
+
+ // try adjusting heights[i] using p-squared formula
+ if(_heights[i - 1] < h && h < _heights[i + 1])
+ {
+ _heights[i] = h;
+ }
+ else
+ {
+ // use linear formula
+ if(d > 0)
+ {
+ _heights[i] += hp;
+ }
+ if(d < 0)
+ {
+ _heights[i] -= hm;
+ }
+ }
+ _actual_positions[i] += sign_d;
+ }
+ }
+ }
+
+ return *this;
+ }
+
+ template <std::size_t NumQuantiles>
+ void DistributionEstimators<NumQuantiles>::appendQuantilesToBSONArrayBuilder(
+ BSONArrayBuilder& arr) const {
+
+ verify(quantilesReady());
+ for (std::size_t i = 0; i <= NumQuantiles + 1; i++) {
+ arr << quantile(i);
+ }
+ }
+
+ template <std::size_t NumQuantiles>
+ inline double DistributionEstimators<NumQuantiles>::_positions_increments(std::size_t i) const {
+ return static_cast<double>(i) / (2 * (NumQuantiles + 1));
+ }
+
+ template <class Sample, std::size_t NumQuantiles>
+ BSONObj SummaryEstimators<Sample, NumQuantiles>::statisticSummaryToBSONObj() const {
+ BSONObjBuilder b;
+ this->BasicEstimators<Sample>::appendBasicToBSONObjBuilder(b);
+ if (this->DistributionEstimators<NumQuantiles>::quantilesReady()) {
+ // Not using appendQuantiles to be explicit about which probability each quantile
+ // refers to. This way the user does not need to count the quantiles or know in
+ // advance how many quantiles were computed to figure out their meaning.
+ BSONObjBuilder quantilesBuilder(b.subobjStart("quantiles"));
+ for (size_t i = 1; i <= NumQuantiles; i++) {
+ const double probability =
+ this->DistributionEstimators<NumQuantiles>::probability(i);
+ const double quantile =
+ this->DistributionEstimators<NumQuantiles>::quantile(i);
+ quantilesBuilder.append(std::string(mongoutils::str::stream() << probability),
+ quantile);
+ }
+ quantilesBuilder.doneFast();
+ }
+ return b.obj();
+ }
+
+} // namespace mongo
View
227 src/mongo/util/descriptive_stats.h
@@ -0,0 +1,227 @@
+/* Copyright 2012 10gen Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cmath>
+
+#include "mongo/db/jsobj.h"
+#include "mongo/util/assert_util.h"
+
+/**
+ * These classes provide online descriptive statistics estimator capable
+ * of computing the mean, standard deviation and quantiles.
+ * Exactness is traded for the ability to obtain reasonable estimates
+ * without the need to store all the samples or perform multiple passes
+ * over the data.
+ *
+ * NOTEs on the estimator accessors provide information about accuracy
+ * of the approximation.
+ *
+ * The implementation of the estimators is heavily inspired by the algorithms used in
+ * boost.accumulators (www.boost.org/libs/accumulators/).
+ * It differs by being tailored for typical descriptive statistics use cases
+ * thus providing a simpler (even though less flexible) interface.
+ */
+namespace mongo {
+
+ /**
+ * Collects count, minimum and maximum, calculates mean and standard deviation.
+ *
+ * The 'Sample' template parameter is the type of the samples. It does not affect the calculated
+ * mean and standard deviation as all values are converted to double.
+ * However, setting the correct sample type prevents unnecessary casting or precision loss
+ * for min and max.
+ */
+ template <class Sample>
+ class BasicEstimators {
+ public:
+ BasicEstimators();
+
+ /**
+ * Update estimators with another observed value.
+ */
+ BasicEstimators& operator <<(const Sample sample);
+
+ /**
+ * @return number of observations so far
+ */
+ inline size_t count() const { return _count; }
+
+ /**
+ * @return mean of the observations seen so far
+ * NOTE: exact (within the limits of IEEE floating point precision).
+ */
+ inline double mean() const { return _sum / _count; }
+
+ /**
+ * @return standard deviation of the observations so far
+ * NOTE: exact (within the limits of IEEE floating point precision).
+ */
+ inline double stddev() const { return std::sqrt(_diff / _count); }
+
+ /**
+ * @return minimum observed value so far
+ * NOTE: exact.
+ */
+ inline Sample min() const { return _min; }
+
+ /**
+ * @return maximum observed value so far
+ * NOTE: exact.
+ */
+ inline Sample max() const { return _max; }
+
+ /**
+ * Appends the basic estimators to the provided BSONObjBuilder.
+ */
+ void appendBasicToBSONObjBuilder(BSONObjBuilder& b) const;
+
+ private:
+ size_t _count;
+ double _sum;
+ double _diff; // sum of squares of differences from the (then-current) mean
+ Sample _min;
+ Sample _max;
+ };
+
+ /**
+ * Computes 'NumQuantiles' quantiles.
+ *
+ * The quantiles at probability 0 and 1 (minimum and maximum observations) are always computed.
+ * Thus DistributionEstimators<3> computes the the 1st, 2nd and 3rd quartiles (probabilities
+ * .25, .50, .75) and the default 0th and 5th (min and max).
+ *
+ * The quantile estimators are mean square consistent (they become a better approximation of the
+ * actual quantiles as the sample size grows).
+ */
+ template <std::size_t NumQuantiles>
+ class DistributionEstimators {
+ public:
+ DistributionEstimators();
+
+ DistributionEstimators& operator <<(const double sample);
+
+ /**
+ * Number of computed quantiles, excluding minimum and maximum.
+ */
+ static const size_t numberOfQuantiles = NumQuantiles;
+
+ /**
+ * Updates the estimators with another observed value.
+ */
+ inline double quantile(std::size_t i) const {
+ massert(16476, "the requested value is out of the range of the computed quantiles",
+ i <= NumQuantiles + 1);
+ return this->_heights[2 * i];
+ }
+
+ /**
+ * @return true when enough value has been observed to output sensible quantiles
+ */
+ inline bool quantilesReady() const {
+ return _count >= NumMarkers;
+ }
+
+ /**
+ * @return estimated minimum
+ *
+ * NOTE: use SimpleEstimators::min for an exact value.
+ */
+ inline double min() const {
+ return quantile(0);
+ }
+
+ /**
+ * @return estimated maximum
+ *
+ * NOTE: use SimpleEstimators::max for an exact value.
+ */
+ inline double max() const {
+ return quantile(NumQuantiles + 1);
+ }
+
+ /**
+ * @return estimated median (2nd quartile)
+ */
+ inline double median() const {
+ return icdf(.5);
+ }
+
+ /**
+ * @return probability associated with the i-th quantile
+ */
+ inline double probability(std::size_t i) const {
+ return i * 1. / (NumQuantiles + 1);
+ }
+
+ /**
+ * @return value for the nearest available quantile for probability 'prob'
+ */
+ inline double icdf(double prob) const {
+ int quant = static_cast<int>(prob * (NumQuantiles + 1) + 0.5);
+ return quantile(quant);
+ }
+
+ /**
+ * Appends the quantiles to the provided BSONArrayBuilder.
+ * REQUIRES e.quantilesReady() == true
+ */
+ void appendQuantilesToBSONArrayBuilder(BSONArrayBuilder& arr) const;
+
+ private:
+ inline double _positions_increments(std::size_t i) const;
+
+ int _count;
+ enum { NumMarkers = 2 * NumQuantiles + 3 };
+ double _heights[NumMarkers]; // q_i
+ double _actual_positions[NumMarkers]; // n_i
+ double _desired_positions[NumMarkers]; // d_i
+ };
+
+ /**
+ * Provides the funcionality of both BasicEstimators and DistributionEstimators.
+ */
+ template <class Sample, std::size_t NumQuantiles>
+ class SummaryEstimators :
+ // Multiple-inheritance
+ public BasicEstimators<Sample>,
+ public DistributionEstimators<NumQuantiles> {
+ public:
+ // Dispatch samples to the inherited estimators
+ inline SummaryEstimators& operator<<(const Sample sample) {
+ this->BasicEstimators<Sample>::operator<<(sample);
+ this->DistributionEstimators<NumQuantiles>::operator<<(sample);
+ return *this;
+ }
+
+ // Expose the exact values
+ inline Sample min() const {
+ return this->BasicEstimators<Sample>::min();
+ }
+
+ inline Sample max() const {
+ return this->BasicEstimators<Sample>::max();
+ }
+
+ /**
+ * @return a summary of the computed estimators as a BSONObj.
+ */
+ BSONObj statisticSummaryToBSONObj() const;
+ };
+
+} // namespace mongo
+
+#include "mongo/util/descriptive_stats-inl.h"
View
126 src/mongo/util/descriptive_stats_test.cpp
@@ -0,0 +1,126 @@
+/**
+ * Tests for mongo/util/descriptive_stats.h
+ */
+
+#include <cstdlib>
+#include <cmath>
+#include <limits>
+#include <string>
+
+#include "mongo/unittest/unittest.h"
+#include "mongo/util/descriptive_stats.h"
+
+using namespace std;
+
+namespace {
+
+ const size_t NumQuantiles = 99;
+
+ TEST(DistributionEstimators, TestNominalResults) {
+ mongo::DistributionEstimators<NumQuantiles> d;
+
+ for (int i = 0; i < 100000; i++) {
+ d << double(i) / 100000;
+ }
+ ASSERT_TRUE(d.quantilesReady());
+ for (size_t quant = 1; quant <= NumQuantiles; ++quant) {
+ ASSERT_EQUALS(d.probability(quant), double(quant) / 100);
+ ASSERT_APPROX_EQUAL(d.quantile(quant), double(quant) / 100, 0.05);
+ double prob = double(quant) / 100;
+ ASSERT_APPROX_EQUAL(d.icdf(prob), prob, 0.05);
+ }
+ ASSERT_APPROX_EQUAL(d.min(), 0.0, 0.05);
+ ASSERT_APPROX_EQUAL(d.max(), 1.0, 0.05);
+ ASSERT_APPROX_EQUAL(d.median(), 0.5, 0.05);
+ }
+
+ TEST(DistributionEstimators, TestAppendQuantilesToBSONArrayBuilder) {
+ mongo::DistributionEstimators<NumQuantiles> d;
+
+ for (int i = 0; i < 10000; i++) {
+ d << static_cast<double>(i) / 10000;
+ }
+
+ mongo::BSONArrayBuilder arrayBuilder;
+ d.appendQuantilesToBSONArrayBuilder(arrayBuilder);
+ mongo::BSONArray arr = arrayBuilder.arr();
+
+ for (size_t i = 0; i <= NumQuantiles + 1; i++) {
+ ASSERT_EQUALS(arr[i].Number(), d.quantile(i));
+ }
+ }
+
+ TEST(BasicEstimators, TestNominalResults) {
+ mongo::BasicEstimators<unsigned int> d;
+
+ unsigned int count = 0;
+ // [50, 51, 52, ..., 99949, 99950]
+ for (int i = 50; i <= 100000 - 50; i++) {
+ d << unsigned(i);
+ count++;
+ }
+ ASSERT_EQUALS(d.min(), 50u);
+ ASSERT_EQUALS(d.max(), 100000u - 50u);
+ ASSERT_APPROX_EQUAL(d.mean(), 100000 / 2, 1e-15);
+ ASSERT_APPROX_EQUAL(d.stddev(), sqrt((static_cast<double>(count) * count - 1) / 12), 1e-15);
+ }
+
+ TEST(BasicEstimators, TestAppendBasicToBSONObjBuilder) {
+ mongo::BasicEstimators<unsigned int> b;
+
+ for (int i = 0; i < 10000; i++) {
+ b << i;
+ }
+
+ mongo::BSONObjBuilder builder;
+ b.appendBasicToBSONObjBuilder(builder);
+ mongo::BSONObj obj = builder.obj();
+
+ ASSERT_EQUALS(obj["count"].Number(), b.count());
+ ASSERT_EQUALS(obj["mean"].Number(), b.mean());
+ ASSERT_EQUALS(obj["stddev"].Number(), b.stddev());
+ ASSERT_EQUALS(obj["min"].Number(), b.min());
+ ASSERT_EQUALS(obj["max"].Number(), b.max());
+ }
+
+ TEST(SummaryEstimators, TestNominalResults) {
+ mongo::SummaryEstimators<int, NumQuantiles> d;
+
+ for (int a = -200; a <= 200; a++) {
+ d << a;
+ }
+ ASSERT_TRUE(d.quantilesReady());
+ for (size_t i = 0; i < d.numberOfQuantiles; i++) {
+ ASSERT_APPROX_EQUAL(d.quantile(i), -200 + static_cast<int>(i) * 4, 1);
+ }
+ ASSERT_EQUALS(d.min(), -200);
+ ASSERT_EQUALS(d.max(), 200);
+ ASSERT_APPROX_EQUAL(d.mean(), 0, 1e-15);
+ ASSERT_APPROX_EQUAL(d.icdf(.25), -100, 1e-15);
+ }
+
+ TEST(SummaryEstimators, TestStatisticSummaryToBSONObj) {
+ mongo::SummaryEstimators<double, NumQuantiles> e;
+
+ for (int i = 0; i < 10000; i++) {
+ e << static_cast<double>(i) / 100;
+ }
+ verify(e.quantilesReady());
+
+ mongo::BSONObj obj = e.statisticSummaryToBSONObj();
+
+ ASSERT_EQUALS(obj["count"].Number(), e.count());
+ ASSERT_EQUALS(obj["mean"].Number(), e.mean());
+ ASSERT_EQUALS(obj["stddev"].Number(), e.stddev());
+ ASSERT_EQUALS(obj["min"].Number(), e.min());
+ ASSERT_EQUALS(obj["max"].Number(), e.max());
+
+ mongo::BSONObj quantiles = obj["quantiles"].Obj();
+ ASSERT_EQUALS(static_cast<size_t>(quantiles.nFields()), NumQuantiles);
+ for (mongo::BSONObjIterator it = quantiles.begin(); it.more(); ++it) {
+ ASSERT_EQUALS((*it).Number(), e.icdf(atof((*it).fieldName())));
+ }
+ }
+
+} // namespace
+

0 comments on commit 0027639

Please sign in to comment.
Something went wrong with that request. Please try again.