Skip to content

Commit

Permalink
stats: pre-calculate the lower and upper scalar (#4623)
Browse files Browse the repository at this point in the history
  • Loading branch information
alivxxx authored and winoros committed Sep 26, 2017
1 parent 19ecf92 commit f6ccdb7
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 63 deletions.
26 changes: 17 additions & 9 deletions statistics/histogram.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,13 @@ type Histogram struct {
// Repeats is the number of repeats of the bucket value; it can be used to find popular values.
//
type Bucket struct {
Count int64
UpperBound types.Datum
LowerBound types.Datum
Repeats int64
Count int64
UpperBound types.Datum
LowerBound types.Datum
Repeats int64
lowerScalar float64
upperScalar float64
commonPfxLen int // when the bucket value type is KindString or KindBytes, commonPfxLen is the common prefix length of the lower bound and upper bound.
}

// SaveToStorage saves the histogram to storage.
Expand Down Expand Up @@ -139,11 +142,15 @@ func histogramFromStorage(ctx context.Context, tableID int64, colID int64, tp *t
return nil, errors.Trace(err)
}
}
lowerScalar, upperScalar, commonLength := preCalculateDatumScalar(&lowerBound, &upperBound)
hg.Buckets[bucketID] = Bucket{
Count: count,
UpperBound: upperBound,
LowerBound: lowerBound,
Repeats: repeats,
Count: count,
UpperBound: upperBound,
LowerBound: lowerBound,
Repeats: repeats,
lowerScalar: lowerScalar,
upperScalar: upperScalar,
commonPfxLen: commonLength,
}
}
for i := 1; i < bucketSize; i++ {
Expand Down Expand Up @@ -244,7 +251,8 @@ func (hg *Histogram) lessRowCount(sc *variable.StatementContext, value types.Dat
if c <= 0 {
return prevCount, nil
}
frac := calcFraction(&hg.Buckets[index].LowerBound, &hg.Buckets[index].UpperBound, &value)
valueScalar := convertDatumToScalar(&value, hg.Buckets[index].commonPfxLen)
frac := calcFraction(hg.Buckets[index].lowerScalar, hg.Buckets[index].upperScalar, valueScalar)
return prevCount + (lessThanBucketValueCount-prevCount)*frac, nil
}

Expand Down
97 changes: 44 additions & 53 deletions statistics/scalar.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,93 +22,84 @@ import (

// calcFraction calculates the fraction of the interval [lower, upper] that lies within [lower, value],
// using the continuous-value assumption.
func calcFraction(lower, upper, value *types.Datum) float64 {
lowerScalar, upperScalar, valueScalar := convertToScalar(lower, upper, value)
if upperScalar <= lowerScalar {
func calcFraction(lower, upper, value float64) float64 {
if upper <= lower {
return 0.5
}
if valueScalar <= lowerScalar {
if value <= lower {
return 0
}
if valueScalar >= upperScalar {
if value >= upper {
return 1
}
frac := (valueScalar - lowerScalar) / (upperScalar - lowerScalar)
frac := (value - lower) / (upper - lower)
if math.IsNaN(frac) || math.IsInf(frac, 0) || frac < 0 || frac > 1 {
return 0.5
}
return frac
}

// convertToScalar converts the datum to scalar values.
// TODO: We may cache results for some types.
func convertToScalar(lower, upper, value *types.Datum) (float64, float64, float64) {
// preCalculateDatumScalar converts the lower and upper bounds to scalars. When the datum kind is KindString or KindBytes, we also
// calculate their common prefix length, because when a value falls between lower and upper, the common prefix
// of lower and upper equals the common prefix of lower, upper, and the value.
func preCalculateDatumScalar(lower, upper *types.Datum) (float64, float64, int) {
common := 0
if lower.Kind() == types.KindString || lower.Kind() == types.KindBytes {
common = commonPrefixLength(lower.GetBytes(), upper.GetBytes())
}
return convertDatumToScalar(lower, common), convertDatumToScalar(upper, common), common
}

func convertDatumToScalar(value *types.Datum, commonPfxLen int) float64 {
switch value.Kind() {
case types.KindFloat32:
return float64(lower.GetFloat32()), float64(upper.GetFloat32()), float64(value.GetFloat32())
return float64(value.GetFloat32())
case types.KindFloat64:
return lower.GetFloat64(), upper.GetFloat64(), value.GetFloat64()
return value.GetFloat64()
case types.KindInt64:
return float64(lower.GetInt64()), float64(upper.GetInt64()), float64(value.GetInt64())
return float64(value.GetInt64())
case types.KindUint64:
return float64(lower.GetUint64()), float64(upper.GetUint64()), float64(value.GetUint64())
return float64(value.GetUint64())
case types.KindMysqlDecimal:
return convertDecimalToScalar(lower, upper, value)
scalar, err := value.GetMysqlDecimal().ToFloat64()
if err != nil {
return 0
}
return scalar
case types.KindMysqlDuration:
return float64(lower.GetMysqlDuration().Duration), float64(upper.GetMysqlDuration().Duration), float64(value.GetMysqlDuration().Duration)
return float64(value.GetMysqlDuration().Duration)
case types.KindMysqlTime:
lowerTime := lower.GetMysqlTime()
upperTime := upper.GetMysqlTime()
valueTime := value.GetMysqlTime()
return 0, float64(upperTime.Sub(&lowerTime).Duration), float64(valueTime.Sub(&lowerTime).Duration)
zeroTime := types.ZeroDatetime
zeroTime.Type = valueTime.Type
return float64(valueTime.Sub(&zeroTime).Duration)
case types.KindString, types.KindBytes:
return convertBytesToScalar(lower.GetBytes(), upper.GetBytes(), value.GetBytes())
bytes := value.GetBytes()
if len(bytes) <= commonPfxLen {
return 0
}
return convertBytesToScalar(bytes[commonPfxLen:])
default:
// do not know how to convert
return 0, 0, 0
}
}

// Decimal types are simply converted to their equivalent float64 values.
func convertDecimalToScalar(lower, upper, value *types.Datum) (float64, float64, float64) {
lowerScalar, err := lower.GetMysqlDecimal().ToFloat64()
if err != nil {
return 0, 0, 0
}
upperScalar, err := upper.GetMysqlDecimal().ToFloat64()
if err != nil {
return 0, 0, 0
}
valueScalar, err := value.GetMysqlDecimal().ToFloat64()
if err != nil {
return 0, 0, 0
return 0
}
return lowerScalar, upperScalar, valueScalar
}

// Bytes type is viewed as a base-256 value.
func convertBytesToScalar(lower, upper, value []byte) (float64, float64, float64) {
func commonPrefixLength(lower, upper []byte) int {
minLen := len(lower)
if len(upper) < minLen {
if minLen > len(upper) {
minLen = len(upper)
}
if len(value) < minLen {
minLen = len(value)
}
// remove their common prefix
common := 0
for common < minLen {
if lower[common] == upper[common] && lower[common] == value[common] {
common++
} else {
break
for i := 0; i < minLen; i++ {
if lower[i] != upper[i] {
return i
}
}
return convertOneBytesToScalar(lower[common:]), convertOneBytesToScalar(upper[common:]), convertOneBytesToScalar(value[common:])
return minLen
}

func convertOneBytesToScalar(value []byte) float64 {
// Since the base is 256, we only consider at most 8 bytes.
func convertBytesToScalar(value []byte) float64 {
// Bytes type is viewed as a base-256 value, so we only consider at most 8 bytes.
var buf [8]byte
copy(buf[:], value)
return float64(binary.BigEndian.Uint64(buf[:]))
Expand Down
4 changes: 3 additions & 1 deletion statistics/scalar_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,9 @@ func (t *testStatisticsSuite) TestCalcFraction(c *C) {
},
}
for _, test := range tests {
fraction := calcFraction(&test.lower, &test.upper, &test.value)
lower, upper, common := preCalculateDatumScalar(&test.lower, &test.upper)
value := convertDatumToScalar(&test.value, common)
fraction := calcFraction(lower, upper, value)
c.Check(math.Abs(fraction-test.fraction) < eps, IsTrue)
}
}
12 changes: 12 additions & 0 deletions statistics/statistics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,13 @@ func buildPK(ctx context.Context, numBuckets, id int64, records ast.RecordSet) (
return b.Count, b.hist, nil
}

// calculateScalar fills in the cached scalar fields (lowerScalar, upperScalar
// and commonPfxLen) of every bucket in hist, deriving them from each bucket's
// LowerBound and UpperBound via preCalculateDatumScalar.
//
// NOTE(review): presumably needed because histograms built directly in tests
// do not go through histogramFromStorage, which populates these fields —
// confirm against the builders.
func calculateScalar(hist *Histogram) {
	for idx := range hist.Buckets {
		// Work on the slice element in place rather than on a copy.
		bucket := &hist.Buckets[idx]
		lower, upper, pfxLen := preCalculateDatumScalar(&bucket.LowerBound, &bucket.UpperBound)
		bucket.lowerScalar = lower
		bucket.upperScalar = upper
		bucket.commonPfxLen = pfxLen
	}
}

func (s *testStatisticsSuite) TestBuild(c *C) {
bucketCount := int64(256)
sketch, _, _ := buildFMSketch(s.rc.(*recordSet).data, 1000)
Expand All @@ -162,6 +169,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
Sketch: sketch,
}
col, err := BuildColumn(ctx, bucketCount, 2, collector)
calculateScalar(col)
c.Check(err, IsNil)
c.Check(len(col.Buckets), Equals, 232)
count, err := col.equalRowCount(sc, types.NewIntDatum(1000))
Expand Down Expand Up @@ -193,6 +201,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
c.Check(int(count), Equals, 9)

tblCount, col, err := BuildIndex(ctx, bucketCount, 1, ast.RecordSet(s.rc))
calculateScalar(col)
c.Check(err, IsNil)
c.Check(int(tblCount), Equals, 100000)
count, err = col.equalRowCount(sc, encodeKey(types.NewIntDatum(10000)))
Expand All @@ -210,6 +219,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) {

s.pk.(*recordSet).cursor = 0
tblCount, col, err = buildPK(ctx, bucketCount, 4, ast.RecordSet(s.pk))
calculateScalar(col)
c.Check(err, IsNil)
c.Check(int(tblCount), Equals, 100000)
count, err = col.equalRowCount(sc, types.NewIntDatum(10000))
Expand Down Expand Up @@ -362,6 +372,7 @@ func (s *testStatisticsSuite) TestColumnRange(c *C) {
Sketch: sketch,
}
hg, err := BuildColumn(ctx, bucketCount, 2, collector)
calculateScalar(hg)
c.Check(err, IsNil)
col := &Column{Histogram: *hg}
tbl := &Table{
Expand Down Expand Up @@ -427,6 +438,7 @@ func (s *testStatisticsSuite) TestIntColumnRanges(c *C) {

s.pk.(*recordSet).cursor = 0
rowCount, hg, err := buildPK(ctx, bucketCount, 0, s.pk)
calculateScalar(hg)
c.Check(err, IsNil)
c.Check(rowCount, Equals, int64(100000))
col := &Column{Histogram: *hg}
Expand Down

0 comments on commit f6ccdb7

Please sign in to comment.