Permalink
Browse files

First draft version of Rank Compare matview. Removed depricated funct…

…ions from databasetimefunctions. Postdeployment fixes on some 2.4 patches.
  • Loading branch information...
1 parent ba671f8 commit 79fb63dc16920c9676ba949e31be8724ae34a573 @jberkus jberkus committed Jan 17, 2012
@@ -58,72 +58,6 @@ utc_day_near
Checks whether the provided timestamp with time zone is within an hour of the provided UTC day, expressed as either a timestamp without time zone or a date. Used for matching when related records may cross over midnight.
-utc_day_begins_pacific
-----------------------
-
-::
-
- utc_day_begins_pacific (
- DATE
- )
- RETURNS TIMESTAMP WITHOUT TIME ZONE
-
- SELECT utc_day_begins_pacific ( '2011-11-28' );
- returns: '2011-11-27 16:00:00'
-
-Given as specific date as a UTC day, returns as a timestamp without time zone the date and time in the Pacific time zone when that day begins. Used primarily to compare UTC days against reports.date_processed. DST-sensitive.
-
-
-utc_day_ends_pacific
---------------------
-
-::
-
- utc_day_ends_pacific (
- DATE
- )
- RETURNS TIMESTAMP WITHOUT TIME ZONE
-
- SELECT utc_day_begins_pacific ( '2011-11-28' );
- returns: '2011-11-28 16:00:00'
-
- reports.date_processed >= utc_day_begins_pacific('2011-11-27')
- AND reports.date_processed < utc_day_ends_pacific('2011-11-29')
-
-
-Given as specific date as a UTC day, returns as a timestamp without time zone the date and time in the Pacific time zone when that day ends. Used primarily to compare UTC days against reports.date_processed. DST-sensitive.
-
-
-ts2pacific
-----------
-
-::
-
- ts2pacific (
- TIMESTAMP
- )
- RETURNS TIMESTAMPTZ
-
- SELECT ts2pacific ( '2011-11-25 13:33:20' );
-
-Given a timestamp without time zone, assumed to be in the Pacific time zone, converts it to a timestamp with time zone.
-
-
-tz2pac_ts
----------
-
-::
-
- tz2pac_ts (
- TIMESTAMPTZ
- )
- RETURNS TIMESTAMP
-
- SELECT tz2pac_ts ( '2011-11-25 21:33:20-00' );
-
-Given a timestamp with time zone, converts it to a timestamp without time zone in Pacific time.
-
-
week_begins_utc
---------------
@@ -68,7 +68,7 @@ def runload(load_command):
else:
# if this failed, check why.
sys.exit('unable to drop database %s probably because connections to it are still open: %s'
- % (database_name, code,))
+ % (options.database_name, code,))
cur.execute("""CREATE DATABASE %s""" % options.database_name)
@@ -0,0 +1,102 @@
+\set ON_ERROR_STOP 1
+
+-- create new ranking table
+-- note that it only holds ranking data for the current date
+-- it gets truncated and regenerated every day
+
+SELECT create_table_if_not_exists ( 'rank_compare', $q$
+create table rank_compare (
+ product_version_id int not null,
+ signature_id int not null,
+ rank_days int not null,
+ report_count int,
+ total_reports bigint,
+ rank_report_count int,
+ percent_of_total numeric,
+ constraint rank_compare_key primary key ( product_version_id, signature_id, rank_days )
+);$q$, 'breakpad_rw',
+ARRAY [ 'product_version_id,rank_report_count', 'signature_id' ]);
+
+-- update procedure
+
+-- daily update function
+CREATE OR REPLACE FUNCTION update_rank_compare (checkdata BOOLEAN default TRUE )
+RETURNS BOOLEAN
+LANGUAGE plpgsql
+SET work_mem = '512MB'
+SET temp_buffers = '512MB'
+SET client_min_messages = 'ERROR'
+AS $f$
+DECLARE updateday;
+BEGIN
+-- this function populates a daily matview
+-- for rankings of signatures on TCBS
+-- depends on the new reports_clean
+
+-- always being run for yesterday
+updateday := ( CURRENT_DATE -1 );
+
+-- don't care if we've been run
+-- since there's no historical data
+
+-- check if reports_clean is complete
+IF NOT reports_clean_done(updateday) THEN
+ IF checkdata THEN
+ RAISE EXCEPTION 'Reports_clean has not been updated to the end of %',updateday;
+ ELSE
+ RETURN TRUE;
+ END IF;
+END IF;
+
+-- create temporary table with totals from reports_clean
+
+CREATE TEMPORARY TABLE prod_sig_counts
+AS SELECT product_version_id, signature_id, count(*) as report_count
+FROM reports_clean
+WHERE utc_day_is(date_processed, updateday);
+
+-- truncate rank_compare since we don't need the old data
+
+TRUNCATE rank_compare CASCADE;
+
+-- now insert the new records
+INSERT INTO rank_compare (
+ product_version_id, signature_id,
+ rank_days,
+ report_count,
+ total_reports,
+ rank_report_count,
+ percent_of_total)
+SELECT product_version_id, signature_id,
+ rank_days, total_count,
+ count_rank,
+ round(( report_count::numeric / total_count ),5)
+FROM (
+ SELECT product_version_id, signature_id,
+ report_count,
+ sum(report_count) over () as total_count,
+ dense_rank() over (partition by product_version_id
+ order by report_count desc) as count_rank
+ FROM prod_sig_counts
+) as initrank;
+
+RETURN TRUE;
+END; $f$;
+
+-- now create a backfill function
+-- so that we can backfill missing data
+CREATE OR REPLACE FUNCTION backfill_rank_compare( )
+RETURNS BOOLEAN
+LANGUAGE plpgsql AS
+$f$
+BEGIN
+
+PERFORM update_rank_compare(false);
+
+RETURN TRUE;
+END; $f$;
+
+
+
+
+
@@ -1,95 +0,0 @@
-\set ON_ERROR_STOP 1
-
--- create new ranking table
--- note that we added this in 2.2 and then dropped it because it
--- wasn't being used.
-
-SELECT create_table_if_not_exists ( 'tcbs_ranking', $q$
-create table tcbs_ranking (
- product_version_id int not null,
- signature_id int not null,
- report_date DATE not null,
- process_type citext,
- total_reports bigint,
- rank_report_count int,
- percent_of_total numeric,
- constraint tcbs_ranking_key primary key ( product_version_id, signature_id, process_type, report_date )
-);$q$, 'breakpad_rw',
-ARRAY [ 'product_version_id', 'signature_id', 'report_date' ]);
-
--- update procedure
-
--- daily update function
-CREATE OR REPLACE FUNCTION update_tcbs_ranking (
- updateday DATE, checkdata BOOLEAN default TRUE )
-RETURNS BOOLEAN
-LANGUAGE plpgsql
-SET work_mem = '512MB'
-SET temp_buffers = '512MB'
-SET client_min_messages = 'ERROR'
-AS $f$
-BEGIN
--- this function populates a daily matview
--- for rankings of signatures on TCBS
--- depends on the new reports_clean
-
--- check if we've been run
-IF checkdata THEN
- PERFORM 1 FROM tcbs_ranking
- WHERE report_date = updateday
- LIMIT 1;
- IF FOUND THEN
- RAISE EXCEPTION 'tcbs_ranking has already been run for %.',updateday;
- END IF;
-END IF;
-
--- check if tcbs is complete
-PERFORM 1 FROM tcbs
-WHERE report_date = updateday
-LIMIT 1;
-IF NOT FOUND THEN
- IF checkdata THEN
- RAISE EXCEPTION 'Reports_clean has not been updated to the end of %',updateday;
- ELSE
- RETURN TRUE;
- END IF;
-END IF;
-
--- now insert the new records
-INSERT INTO tcbs_ranking (
- product_version_id, signature_id,
- process_type,
- total_reports, rank_report_count )
-SELECT product_version_id, signature_id,
- 'All',
- sum(report_count) over () as total_count,
- dense_rank() over (order by report_count desc) as tcbs_rank
-FROM (
- SELECT product_version_id, signature_id,
- sum(report_count) as report_count
- FROM tcbs
- WHERE report_date = updateday
- GROUP BY product_version_id, signature_id
-) as tcbs_r;
-
-RETURN TRUE;
-END; $f$;
-
--- now create a backfill function
--- so that we can backfill missing data
-CREATE OR REPLACE FUNCTION backfill_tcbs_ranking(
- updateday DATE )
-RETURNS BOOLEAN
-LANGUAGE plpgsql AS
-$f$
-BEGIN
-
-DELETE FROM tcbs_ranking WHERE report_date = updateday;
-PERFORM update_tcbs_ranking(updateday, false);
-
-RETURN TRUE;
-END; $f$;
-
-
-
-
@@ -16,3 +16,7 @@ END IF;
END;$f$;
+-- finally, analyze the whole database
+-- this takes a while
+ANALYSE VERBOSE;
+
@@ -29,8 +29,8 @@ echo 'bug 715333'
/data/socorro/application/scripts/parallel_sql_jobs.py --dbname breakpad -j 8 --stop < /tmp/partition_constraints.txt
echo '*********************************************'
-echo 'change data type on raw_adu'
-echo 'this can take up to 20 min'
+echo 'change data type on raw_adu and analyze the database'
+echo 'this can take up to 40 min'
echo 'bug 715333'
psql -f ${CURDIR}/fix_adu_date.sql breakpad

0 comments on commit 79fb63d

Please sign in to comment.