55-- Correlation is set to 80% - an arbitrary figure
66-- Data types limited
77-- Only columns with shorter strings compared
8- -- Columns checked must have a 'similar' number of distinct values (NDVs must not differ by 2X)
98-- A sample of rows can be used to speed up execution time - which can be substantial
109--
1110-- Parameters:
@@ -71,6 +70,12 @@ declare
7170 -- column pairs that have similar NDV - so some NULL cases will be missed.
7271 -- There is also an assumption that longer strings are rarely used in comparison
7372 --
73+ -- The cursor W includes some predicates I've commented out
74+ -- If uncommented, they will reduce the number of columns compared, but this
75+ -- risks missing some correlated columns. I chose to leave these ideas
76+ -- visible, but I think the best way to speed things up
77+ -- is to reduce the row sample percentage.
78+ --
7479 cursor c1 is
7580 with w as (
7681 select column_name, num_distinct
@@ -86,7 +91,7 @@ declare
8691 select t1 .column_name c1, t2 .column_name c2
8792 from w t1, w t2 /* , (select num_rows from dba_tables where owner = :ownname and table_name = :tabname) t */
8893 where t1 .column_name > t2 .column_name
89- and greatest(t1 .num_distinct ,t2 .num_distinct )/ least(t1 .num_distinct ,t2 .num_distinct )< 2 /* Similar number of distinct values */
94+ -- and greatest(t1.num_distinct,t2.num_distinct)/least(t1.num_distinct,t2.num_distinct)<2 /* Similar number of distinct values? */
9095 -- and t1.num_distinct < t.num_rows/10 /* Perhaps eliminate sequenced columns? */
9196 order by t1 .column_name ;
9297 c number (6 ,5 );
0 commit comments