corr update (#166)

nigelbayliss · web-flow · commit 5619125ed7f4 · 2021-07-21T09:11:18.000-07:00
diff --git a/optimizer/column_groups/corr_from_plan.sql b/optimizer/column_groups/corr_from_plan.sql
@@ -13,6 +13,13 @@
 --     Y/N where Y - will create the column groups immediately
 --               N - will print the column group creation script only
 --
+  --
+  -- The cursor C1 includes some predicates I've commented out
+  -- If uncommented, they will reduce the number of columns compared, but this
+  -- risks missing some correlated columns. I chose to leave these ideas
+  -- visible, but I think the best way to speed things up
+  -- is to reduce the row sample percentage.
+  --
 var create_now varchar2(1)
 set echo off
 column tab_owner format a20
@@ -83,7 +90,7 @@ declare
     select t1.column_name c1, t2.column_name c2
     from   w t1, w t2 /* , (select num_rows from dba_tables where owner = :ownname and table_name = :tabname) t */
     where  t1.column_name > t2.column_name
-    and    greatest(t1.num_distinct,t2.num_distinct)/least(t1.num_distinct,t2.num_distinct)<2 /* Similar number of distinct values */
+    --and    greatest(t1.num_distinct,t2.num_distinct)/least(t1.num_distinct,t2.num_distinct)<2 /* Similar number of distinct values? */
     --and    t1.num_distinct < t.num_rows/10   /* Perhaps eliminate sequenced columns? */
     order by t1.column_name;
   c number(6,5);
diff --git a/optimizer/column_groups/corr_from_sts.sql b/optimizer/column_groups/corr_from_sts.sql
@@ -5,7 +5,6 @@
 --    Correlation is set to 80% - an arbitary figure
 --    Data types limited
 --    Only columns with shorter strings compared 
---    Columns checked must have a 'similar' number of distinct values (NDVs must not differ by 2X)
 --    A sample of rows can be used to speed up execution time - which can be substantial
 --
 -- Parameters:
@@ -71,6 +70,12 @@ declare
   -- column pairs that have similar NDV - so some NULL cases will be missed.
   -- There is also an assumption that longer strings are rarely used in comparison
   --
+  -- The cursor C1 includes some predicates I've commented out
+  -- If uncommented, they will reduce the number of columns compared, but this
+  -- risks missing some correlated columns. I chose to leave these ideas
+  -- visible, but I think the best way to speed things up
+  -- is to reduce the row sample percentage.
+  --
   cursor c1 is
     with w as (
     select column_name, num_distinct
@@ -86,7 +91,7 @@ declare
     select t1.column_name c1, t2.column_name c2
     from   w t1, w t2 /* , (select num_rows from dba_tables where owner = :ownname and table_name = :tabname) t */
     where  t1.column_name > t2.column_name
-    and    greatest(t1.num_distinct,t2.num_distinct)/least(t1.num_distinct,t2.num_distinct)<2 /* Similar number of distinct values */
+    --and    greatest(t1.num_distinct,t2.num_distinct)/least(t1.num_distinct,t2.num_distinct)<2 /* Similar number of distinct values? */
     --and    t1.num_distinct < t.num_rows/10   /* Perhaps eliminate sequenced columns? */
     order by t1.column_name;
   c number(6,5);
diff --git a/optimizer/column_groups/corr_from_table.sql b/optimizer/column_groups/corr_from_table.sql
@@ -5,7 +5,6 @@
 --    Correlation is set to 80% - an arbitary figure
 --    Data types limited
 --    Only columns with shorter strings compared 
---    Columns checked must have a 'similar' number of distinct values (NDVs must not differ by 2X)
 --    A sample of rows can be used to speed up execution time - which can be substantial
 --
 -- Parameters:
@@ -15,6 +14,13 @@
 --    Y/N - Y to create the column groups immediately
 --          N to spool SQL to create column groups
 --
+ --
+  -- The cursor C1 includes some predicates I've commented out
+  -- If uncommented, they will reduce the number of columns compared, but this
+  -- risks missing some correlated columns. I chose to leave these ideas
+  -- visible, but I think the best way to speed things up
+  -- is to reduce the row sample percentage.
+  --
 var create_now varchar2(1)
 set echo off
 column tab_owner format a20
@@ -80,7 +86,7 @@ declare
     select t1.column_name c1, t2.column_name c2
     from   w t1, w t2 /* , (select num_rows from dba_tables where owner = :ownname and table_name = :tabname) t */
     where  t1.column_name > t2.column_name
-    and    greatest(t1.num_distinct,t2.num_distinct)/least(t1.num_distinct,t2.num_distinct)<2 /* Similar number of distinct values */
+    --and    greatest(t1.num_distinct,t2.num_distinct)/least(t1.num_distinct,t2.num_distinct)<2 /* Similar number of distinct values? */
     --and    t1.num_distinct < t.num_rows/10   /* Perhaps eliminate sequenced columns? */
     order by t1.column_name;
   c number(6,5);