Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

stats: refine estimate in buckets #4601

Merged
merged 10 commits into from Sep 22, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion plan/cbo_test.go
Expand Up @@ -97,7 +97,7 @@ func (s *testAnalyzeSuite) TestIndexRead(c *C) {
},
{
sql: "select count(e) from t where t.b <= 20",
best: "IndexLookUp(Index(t.b_c)[[-inf <nil>,20 +inf]], Table(t)->HashAgg)->HashAgg",
best: "IndexLookUp(Index(t.b)[[-inf,20]], Table(t)->HashAgg)->HashAgg",
},
{
sql: "select count(e) from t where t.b <= 30",
Expand Down
3 changes: 2 additions & 1 deletion statistics/histogram.go
Expand Up @@ -244,7 +244,8 @@ func (hg *Histogram) lessRowCount(sc *variable.StatementContext, value types.Dat
if c <= 0 {
return prevCount, nil
}
return (prevCount + lessThanBucketValueCount) / 2, nil
frac := calcFraction(&hg.Buckets[index].LowerBound, &hg.Buckets[index].UpperBound, &value)
return prevCount + (lessThanBucketValueCount-prevCount)*frac, nil
}

// lessAndEqRowCount estimates the row count where the column less than or equal to value.
Expand Down
115 changes: 115 additions & 0 deletions statistics/scalar.go
@@ -0,0 +1,115 @@
// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package statistics

import (
"encoding/binary"
"math"

"github.com/pingcap/tidb/util/types"
)

// calcFraction estimates how much of the range [lower, upper] is covered by
// [lower, value], assuming values are spread continuously over the range.
// The result is always within [0, 1]. 0.5 is returned as a neutral guess when
// the converted range is empty or inverted, or when the computed ratio is not
// a usable number.
func calcFraction(lower, upper, value *types.Datum) float64 {
	lo, hi, v := convertToScalar(lower, upper, value)
	switch {
	case hi <= lo:
		// Degenerate range: there is nothing to interpolate over.
		return 0.5
	case v <= lo:
		return 0
	case v >= hi:
		return 1
	}
	ratio := (v - lo) / (hi - lo)
	// Guard against NaN/Inf inputs leaking through the comparisons above.
	usable := !math.IsNaN(ratio) && !math.IsInf(ratio, 0) && ratio >= 0 && ratio <= 1
	if !usable {
		return 0.5
	}
	return ratio
}

// convertToScalar maps the three datums onto float64 values so that they can be
// compared and interpolated numerically. The conversion is chosen from the kind
// of value; lower and upper are read with the same accessor.
// Kinds without a known conversion yield (0, 0, 0).
// TODO: We may cache results for some types.
func convertToScalar(lower, upper, value *types.Datum) (float64, float64, float64) {
	var lo, hi, v float64
	switch value.Kind() {
	case types.KindInt64:
		lo, hi, v = float64(lower.GetInt64()), float64(upper.GetInt64()), float64(value.GetInt64())
	case types.KindUint64:
		lo, hi, v = float64(lower.GetUint64()), float64(upper.GetUint64()), float64(value.GetUint64())
	case types.KindFloat32:
		lo, hi, v = float64(lower.GetFloat32()), float64(upper.GetFloat32()), float64(value.GetFloat32())
	case types.KindFloat64:
		lo, hi, v = lower.GetFloat64(), upper.GetFloat64(), value.GetFloat64()
	case types.KindMysqlDecimal:
		return convertDecimalToScalar(lower, upper, value)
	case types.KindMysqlDuration:
		lo = float64(lower.GetMysqlDuration().Duration)
		hi = float64(upper.GetMysqlDuration().Duration)
		v = float64(value.GetMysqlDuration().Duration)
	case types.KindMysqlTime:
		// Times are expressed as offsets from lower, so lower itself stays 0.
		base := lower.GetMysqlTime()
		hiTime := upper.GetMysqlTime()
		vTime := value.GetMysqlTime()
		hi = float64(hiTime.Sub(&base).Duration)
		v = float64(vTime.Sub(&base).Duration)
	case types.KindString, types.KindBytes:
		return convertBytesToScalar(lower.GetBytes(), upper.GetBytes(), value.GetBytes())
	}
	// Unknown kinds fall through with all three scalars left at zero.
	return lo, hi, v
}

// convertDecimalToScalar turns each decimal datum into its float64 value, in the
// order lower, upper, value. If any conversion fails, (0, 0, 0) is returned and
// the remaining datums are not converted.
func convertDecimalToScalar(lower, upper, value *types.Datum) (float64, float64, float64) {
	var scalars [3]float64
	for i, d := range [...]*types.Datum{lower, upper, value} {
		f, err := d.GetMysqlDecimal().ToFloat64()
		if err != nil {
			return 0, 0, 0
		}
		scalars[i] = f
	}
	return scalars[0], scalars[1], scalars[2]
}

// convertBytesToScalar treats each byte slice as a base-256 number. The prefix
// shared by all three inputs is dropped first, and the remainders are then
// converted one by one.
func convertBytesToScalar(lower, upper, value []byte) (float64, float64, float64) {
	// The shared prefix cannot be longer than the shortest input.
	limit := len(lower)
	if n := len(upper); n < limit {
		limit = n
	}
	if n := len(value); n < limit {
		limit = n
	}
	// Advance while all three inputs agree on the current byte.
	prefix := 0
	for prefix < limit && lower[prefix] == upper[prefix] && lower[prefix] == value[prefix] {
		prefix++
	}
	lo := convertOneBytesToScalar(lower[prefix:])
	hi := convertOneBytesToScalar(upper[prefix:])
	v := convertOneBytesToScalar(value[prefix:])
	return lo, hi, v
}

// convertOneBytesToScalar maps a byte slice to a float64 by reading its leading
// bytes as a big-endian unsigned 64-bit integer.
func convertOneBytesToScalar(value []byte) float64 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is complex, how about using BigEndian.Uint64?

var buf [8]byte
copy(buf[:], value)
return float64(BigEndian.Uint64(buf[:])

// Since the base is 256, we only consider at most 8 bytes.
// Inputs shorter than 8 bytes leave the trailing bytes of buf at zero;
// longer inputs contribute only their first 8 bytes.
var buf [8]byte
copy(buf[:], value)
return float64(binary.BigEndian.Uint64(buf[:]))
}
131 changes: 131 additions & 0 deletions statistics/scalar_test.go
@@ -0,0 +1,131 @@
// Copyright 2017 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package statistics

import (
"math"

. "github.com/pingcap/check"
"github.com/pingcap/tidb/mysql"
"github.com/pingcap/tidb/util/types"
)

// eps is the absolute tolerance used when comparing a computed fraction with
// its expected value.
const eps = 1e-9

// getDecimal builds a MyDecimal from a float64 for use in test fixtures.
func getDecimal(value float64) *types.MyDecimal {
	d := new(types.MyDecimal)
	// The conversion result is intentionally not checked in this test helper.
	d.FromFloat64(value)
	return d
}

// getDuration parses a duration string (fsp 0) for use in test fixtures.
func getDuration(value string) types.Duration {
	// The parse error is deliberately discarded in this test helper.
	parsed, _ := types.ParseDuration(value, 0)
	return parsed
}

// getTime parses a string as a mysql.TypeDate value (fsp 0) for use in test
// fixtures.
func getTime(value string) types.Time {
	// The parse error is deliberately discarded in this test helper.
	parsed, _ := types.ParseTime(value, mysql.TypeDate, 0)
	return parsed
}

// getBinaryLiteral parses a bit-string literal such as "0b100" for use in test
// fixtures.
func getBinaryLiteral(value string) types.BinaryLiteral {
	// The parse error is deliberately discarded in this test helper.
	lit, _ := types.ParseBitStr(value)
	return lit
}

// TestCalcFraction checks calcFraction against expected fractions for each
// datum kind listed in the table, within an absolute tolerance of eps.
func (t *testStatisticsSuite) TestCalcFraction(c *C) {
	type fractionCase struct {
		lower, upper, value types.Datum
		want                float64
	}
	cases := []fractionCase{
		// Signed integers: inside, at the upper bound, and below the lower bound.
		{types.NewIntDatum(0), types.NewIntDatum(4), types.NewIntDatum(1), 0.25},
		{types.NewIntDatum(0), types.NewIntDatum(4), types.NewIntDatum(4), 1},
		{types.NewIntDatum(0), types.NewIntDatum(4), types.NewIntDatum(-1), 0},
		// Unsigned integers and floating point values.
		{types.NewUintDatum(0), types.NewUintDatum(4), types.NewUintDatum(1), 0.25},
		{types.NewFloat64Datum(0), types.NewFloat64Datum(4), types.NewFloat64Datum(1), 0.25},
		{types.NewFloat32Datum(0), types.NewFloat32Datum(4), types.NewFloat32Datum(1), 0.25},
		// Decimals.
		{
			types.NewDecimalDatum(getDecimal(0)),
			types.NewDecimalDatum(getDecimal(4)),
			types.NewDecimalDatum(getDecimal(1)),
			0.25,
		},
		// Bit values expect the neutral 0.5; presumably they have no dedicated
		// scalar conversion. TODO confirm against convertToScalar.
		{
			types.NewMysqlBitDatum(getBinaryLiteral("0b0")),
			types.NewMysqlBitDatum(getBinaryLiteral("0b100")),
			types.NewMysqlBitDatum(getBinaryLiteral("0b1")),
			0.5,
		},
		// Durations and dates.
		{
			types.NewDurationDatum(getDuration("0:00:00")),
			types.NewDurationDatum(getDuration("4:00:00")),
			types.NewDurationDatum(getDuration("1:00:00")),
			0.25,
		},
		{
			types.NewTimeDatum(getTime("2017-01-01")),
			types.NewTimeDatum(getTime("2017-04-01")),
			types.NewTimeDatum(getTime("2017-02-01")),
			0.34444444444444444,
		},
		// Strings and raw bytes.
		{
			types.NewStringDatum("aasad"),
			types.NewStringDatum("addad"),
			types.NewStringDatum("abfsd"),
			0.32280253984063745,
		},
		{
			types.NewBytesDatum([]byte("aasad")),
			types.NewBytesDatum([]byte("asdff")),
			types.NewBytesDatum([]byte("abfsd")),
			0.0529216802217269,
		},
	}
	for i := range cases {
		tc := &cases[i]
		got := calcFraction(&tc.lower, &tc.upper, &tc.value)
		c.Check(math.Abs(got-tc.want) < eps, IsTrue)
	}
}
22 changes: 11 additions & 11 deletions statistics/statistics_test.go
Expand Up @@ -172,10 +172,10 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
c.Check(int(count), Equals, 10000)
count, err = col.lessRowCount(sc, types.NewIntDatum(2000))
c.Check(err, IsNil)
c.Check(int(count), Equals, 19964)
c.Check(int(count), Equals, 19995)
count, err = col.greaterRowCount(sc, types.NewIntDatum(2000))
c.Check(err, IsNil)
c.Check(int(count), Equals, 80034)
c.Check(int(count), Equals, 80003)
count, err = col.lessRowCount(sc, types.NewIntDatum(200000000))
c.Check(err, IsNil)
c.Check(int(count), Equals, 100000)
Expand All @@ -187,7 +187,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
c.Check(count, Equals, 0.0)
count, err = col.betweenRowCount(sc, types.NewIntDatum(3000), types.NewIntDatum(3500))
c.Check(err, IsNil)
c.Check(int(count), Equals, 5075)
c.Check(int(count), Equals, 5008)
count, err = col.lessRowCount(sc, types.NewIntDatum(1))
c.Check(err, IsNil)
c.Check(int(count), Equals, 9)
Expand All @@ -200,10 +200,10 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
c.Check(int(count), Equals, 1)
count, err = col.lessRowCount(sc, encodeKey(types.NewIntDatum(20000)))
c.Check(err, IsNil)
c.Check(int(count), Equals, 19983)
c.Check(int(count), Equals, 19999)
count, err = col.betweenRowCount(sc, encodeKey(types.NewIntDatum(30000)), encodeKey(types.NewIntDatum(35000)))
c.Check(err, IsNil)
c.Check(int(count), Equals, 4618)
c.Check(int(count), Equals, 4999)
count, err = col.lessRowCount(sc, encodeKey(types.NewIntDatum(0)))
c.Check(err, IsNil)
c.Check(int(count), Equals, 0)
Expand All @@ -217,13 +217,13 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
c.Check(int(count), Equals, 1)
count, err = col.lessRowCount(sc, types.NewIntDatum(20000))
c.Check(err, IsNil)
c.Check(int(count), Equals, 20223)
c.Check(int(count), Equals, 20000)
count, err = col.betweenRowCount(sc, types.NewIntDatum(30000), types.NewIntDatum(35000))
c.Check(err, IsNil)
c.Check(int(count), Equals, 5120)
c.Check(int(count), Equals, 5000)
count, err = col.greaterAndEqRowCount(sc, types.NewIntDatum(1001))
c.Check(err, IsNil)
c.Check(int(count), Equals, 99232)
c.Check(int(count), Equals, 98999)
count, err = col.lessAndEqRowCount(sc, types.NewIntDatum(99999))
c.Check(err, IsNil)
c.Check(int(count), Equals, 100000)
Expand All @@ -232,7 +232,7 @@ func (s *testStatisticsSuite) TestBuild(c *C) {
c.Check(int(count), Equals, 0)
count, err = col.greaterRowCount(sc, types.NewIntDatum(1001))
c.Check(err, IsNil)
c.Check(int(count), Equals, 99231)
c.Check(int(count), Equals, 98998)
count, err = col.lessRowCount(sc, types.NewIntDatum(99999))
c.Check(err, IsNil)
c.Check(int(count), Equals, 99999)
Expand Down Expand Up @@ -406,12 +406,12 @@ func (s *testStatisticsSuite) TestColumnRange(c *C) {
ran[0].HighExcl = true
count, err = tbl.GetRowCountByColumnRanges(sc, 0, ran)
c.Assert(err, IsNil)
c.Assert(int(count), Equals, 9964)
c.Assert(int(count), Equals, 9995)
ran[0].LowExcl = false
ran[0].HighExcl = false
count, err = tbl.GetRowCountByColumnRanges(sc, 0, ran)
c.Assert(err, IsNil)
c.Assert(int(count), Equals, 9965)
c.Assert(int(count), Equals, 9996)
ran[0].Low = ran[0].High
count, err = tbl.GetRowCountByColumnRanges(sc, 0, ran)
c.Assert(err, IsNil)
Expand Down