Skip to content

Commit

Permalink
MDEV-27229: Estimation for filtered rows less precise ... #5
Browse files Browse the repository at this point in the history
Fix special handling for values that are right next to buckets with ndv=1.
  • Loading branch information
spetrunia committed Jan 19, 2022
1 parent 67d4d04 commit 531dd70
Show file tree
Hide file tree
Showing 4 changed files with 148 additions and 55 deletions.
36 changes: 33 additions & 3 deletions mysql-test/main/statistics_json.result
Original file line number Diff line number Diff line change
Expand Up @@ -4631,12 +4631,12 @@ test t1_json a a-0 a-9 0.0000 3.0000 1.0000 10 JSON_HB {
}
explain extended select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 68.71 Using where
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 60.00 Using where
Warnings:
Note 1003 select `test`.`t1_json`.`a` AS `a` from `test`.`t1_json` where `test`.`t1_json`.`a` between 'a-3a' and 'zzzzzzzzz'
analyze select * from t1_json where a between 'a-3a' and 'zzzzzzzzz';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 68.71 60.00 Using where
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 10.00 60.00 60.00 Using where
explain extended select * from t1_json where a < 'b-1a';
id select_type table type possible_keys key key_len ref rows filtered Extra
1 SIMPLE t1_json ALL NULL NULL NULL NULL 10 100.00 Using where
Expand Down Expand Up @@ -8014,7 +8014,7 @@ test.t1 analyze status OK
analyze
select c from t1 where c > '1';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 80.47 75.00 Using where
1 SIMPLE t1 ALL NULL NULL NULL NULL 16 16.00 75.00 75.00 Using where
drop table t1;
#
# MDEV-26849: JSON Histograms: point selectivity estimates are off for non-existent values
Expand Down Expand Up @@ -8211,3 +8211,33 @@ analyze select COUNT(*) FROM t1 WHERE a < 'a';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 50.00 50.00 Using where
drop table t1;
#
# MDEV-27229: Estimation for filtered rows less precise ... #5
#
create table t1 (id int, a varchar(8));
insert into t1 select seq, 'bar' from seq_1_to_100;
insert into t1 select id, 'qux' from t1;
set histogram_type=JSON_HB;
analyze table t1 persistent for all;
Table Op Msg_type Msg_text
test.t1 analyze status Engine-independent statistics collected
test.t1 analyze status OK
analyze select COUNT(*) FROM t1 WHERE a > 'foo';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
analyze select COUNT(*) FROM t1 WHERE a > 'aaa';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
analyze select COUNT(*) FROM t1 WHERE a >='aaa';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
analyze select COUNT(*) FROM t1 WHERE a > 'bar';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
analyze select COUNT(*) FROM t1 WHERE a >='bar';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 100.00 100.00 Using where
analyze select COUNT(*) FROM t1 WHERE a <='bar';
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
1 SIMPLE t1 ALL NULL NULL NULL NULL 200 200.00 50.00 50.00 Using where
drop table t1;
26 changes: 26 additions & 0 deletions mysql-test/main/statistics_json.test
Original file line number Diff line number Diff line change
Expand Up @@ -390,3 +390,29 @@ analyze table t1 persistent for all;
analyze select COUNT(*) FROM t1 WHERE a <> 'a';
analyze select COUNT(*) FROM t1 WHERE a < 'a';
drop table t1;

--echo #
--echo # MDEV-27229: Estimation for filtered rows less precise ... #5
--echo #
create table t1 (id int, a varchar(8));
insert into t1 select seq, 'bar' from seq_1_to_100;
insert into t1 select id, 'qux' from t1;

set histogram_type=JSON_HB;
analyze table t1 persistent for all;
analyze select COUNT(*) FROM t1 WHERE a > 'foo';

analyze select COUNT(*) FROM t1 WHERE a > 'aaa';
analyze select COUNT(*) FROM t1 WHERE a >='aaa';

analyze select COUNT(*) FROM t1 WHERE a > 'bar';
analyze select COUNT(*) FROM t1 WHERE a >='bar';

# Can enable these after get_avg_frequency issue is resolved:
# analyze select COUNT(*) FROM t1 WHERE a < 'aaa';
# analyze select COUNT(*) FROM t1 WHERE a <='aaa';
# analyze select COUNT(*) FROM t1 WHERE a < 'bar';

analyze select COUNT(*) FROM t1 WHERE a <='bar';

drop table t1;
139 changes: 88 additions & 51 deletions sql/opt_histogram_json.cc
Original file line number Diff line number Diff line change
Expand Up @@ -910,12 +910,12 @@ double Histogram_json_hb::point_selectivity(Field *field, key_range *endpoint,

// If the value is outside of the histogram's range, this will "clip" it to
// first or last bucket.
bool equal;
int idx= find_bucket(field, key, &equal);
int endp_cmp;
int idx= find_bucket(field, key, &endp_cmp);

double sel;

if (buckets[idx].ndv == 1 && !equal)
if (buckets[idx].ndv == 1 && (endp_cmp!=0))
{
/*
The bucket has a single value and it doesn't match! Return a very
Expand Down Expand Up @@ -979,22 +979,27 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,

// Find the leftmost bucket that contains the lookup value.
// (If the lookup value is to the left of all buckets, find bucket #0)
bool equal;
int idx= find_bucket(field, min_key, &equal);
if (equal && exclusive_endp && buckets[idx].ndv==1 &&
idx < (int)buckets.size()-1)
int endp_cmp;
int idx= find_bucket(field, min_key, &endp_cmp);

double sel;
// Special handling for buckets with ndv=1:
if (buckets[idx].ndv == 1)
{
/*
The range is "col > $CONST" and we've found a bucket that contains
only the value $CONST. Move to the next bucket.
*/
idx++;
if (endp_cmp < 0)
sel= 0.0;
else if (endp_cmp > 0)
sel= 1.0;
else // endp_cmp == 0.0
sel= (exclusive_endp)? 1.0 : 0.0;
}
else
{
sel= position_in_interval(field, min_key, min_key_len,
buckets[idx].start_value,
get_end_value(idx));
}
double left_fract= get_left_fract(idx);
double sel= position_in_interval(field, min_key, min_key_len,
buckets[idx].start_value,
get_end_value(idx));

min= left_fract + sel * (buckets[idx].cum_fract - left_fract);
}
else
Expand All @@ -1012,42 +1017,49 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
max_key++;
max_key_len--;
}
bool equal;
int idx= find_bucket(field, max_key, &equal);
int endp_cmp;
int idx= find_bucket(field, max_key, &endp_cmp);

if (equal && !inclusive_endp && idx > 0)
if ((endp_cmp == 0) && !inclusive_endp)
{
/*
The range is "col < $CONST" and we've found a bucket starting with
$CONST. Move to the previous bucket.
$CONST.
*/
idx--;
equal= false;
if (idx > 0)
{
// Move to the previous bucket
endp_cmp= 1;
idx--;
}
else
endp_cmp= -1;
}
double left_fract= get_left_fract(idx);

double sel;
/* Special handling for singleton buckets */
if (buckets[idx].ndv == 1 && equal)

// Special handling for buckets with ndv=1:
if (buckets[idx].ndv == 1)
{
if (inclusive_endp)
sel= 1.0;
else
if (endp_cmp < 0)
sel= 0.0;
else if (endp_cmp > 0)
sel= 1.0;
else // endp_cmp == 0.0
sel= inclusive_endp? 1.0 : 0.0;
}
else
{
sel= position_in_interval(field, max_key, max_key_len,
buckets[idx].start_value,
get_end_value(idx));
}
double left_fract= get_left_fract(idx);
max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
}
else
max= 1.0;

double sel = max - min;
return sel;
return max - min;
}


Expand All @@ -1057,33 +1069,45 @@ void Histogram_json_hb::serialize(Field *field)
}


/*
  @brief
    Return the sign of the argument: -1, 0, or +1.

  @detail
    Used to compare the sign of two comparison results without caring about
    their magnitudes (key_cmp may return any negative/positive value).
*/
static int SGN(int x)
{
  // (x > 0) and (x < 0) are 0/1 ints; their difference is the sign.
  return (x > 0) - (x < 0);
}


/*
@brief
Find the leftmost histogram bucket such that "lookup_val >= start_value".
@param field Field object (used to do value comparisons)
@param lookup_val The lookup value in KeyTupleFormat.
@param equal OUT TRUE<=> the found bucket has left_bound=lookup_val
@param cmp OUT How the lookup_val compares to found_bucket.left_bound:
0 - lookup_val == bucket.left_bound
>0 - lookup_val > bucket.left_bound (the most typical)
<0 - lookup_val < bucket.left_bound. This can only happen
for the first bucket, for all other buckets we would just
pick the previous bucket and have cmp>=0.
@return
The bucket index
*/

int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
bool *equal)
int *cmp)
{
int res;
int low= 0;
int high= (int)buckets.size() - 1;
*equal= false;
*cmp= 1; // By default, (bucket[retval].start_value < *lookup_val)

while (low + 1 < high)
{
int middle= (low + high) / 2;
res= field->key_cmp((uchar*)buckets[middle].start_value.data(), lookup_val);
if (!res)
{
*equal= true;
*cmp= res;
low= middle;
goto end;
}
Expand All @@ -1104,31 +1128,44 @@ int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
*/
if (low == 0)
{
res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
if (!res)
*equal= true;
else if (res < 0) // buckets[0] < lookup_val
res= field->key_cmp(lookup_val, (uchar*)buckets[0].start_value.data());
if (res <= 0)
*cmp= res;
else // res>0, lookup_val > buckets[0].start_value
{
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
if (!res)
*equal= true;
if (res <= 0) // buckets[high] <= lookup_val
res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
if (res >= 0) // lookup_val >= buckets[high].start_value
{
// Move to that bucket
low= high;
*cmp= res;
}
else
*cmp= 1;
}
}
else if (high == (int)buckets.size() - 1)
{
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
if (!res)
*equal= true;
if (res <= 0)
res= field->key_cmp(lookup_val, (uchar*)buckets[high].start_value.data());
if (res >= 0)
{
// Ok the value is in the last bucket.
*cmp= res;
low= high;
}
else
{
// The value is in the 'low' bucket.
res= field->key_cmp(lookup_val, (uchar*)buckets[low].start_value.data());
*cmp= res;
}
}

end:
// Verification: *equal==TRUE <=> lookup value is equal to the found bucket.
DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(),
lookup_val)));
// Verification: *cmp has correct value
DBUG_ASSERT(SGN(*cmp) ==
SGN(field->key_cmp(lookup_val,
(uchar*)buckets[low].start_value.data())));
// buckets[low] <= lookup_val, with one exception of the first bucket.
DBUG_ASSERT(low == 0 ||
field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
Expand Down
2 changes: 1 addition & 1 deletion sql/opt_histogram_json.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,6 @@ class Histogram_json_hb : public Histogram_base

double get_left_fract(int idx);
std::string& get_end_value(int idx);
int find_bucket(const Field *field, const uchar *lookup_val, bool *equal);
int find_bucket(const Field *field, const uchar *lookup_val, int *cmp);
};

0 comments on commit 531dd70

Please sign in to comment.