From d30cdc0dc59057dcc04ee213d7a30ccdb09d2825 Mon Sep 17 00:00:00 2001
From: Michael Lynch <mtlynch@google.com>
Date: Tue, 22 Dec 2015 10:27:10 -0500
Subject: [PATCH] Adding query generator to test table equivalence

Adding TableEquivalenceQueryGenerator which generates sets of queries
that can be executed to test whether two tables contain equivalent data.
---
 bigsanity/query_construct.py  | 121 ++++++++++++++
 tests/test_query_construct.py | 287 ++++++++++++++++++++++++++++++++++
 2 files changed, 408 insertions(+)
 create mode 100644 bigsanity/query_construct.py
 create mode 100644 tests/test_query_construct.py

diff --git a/bigsanity/query_construct.py b/bigsanity/query_construct.py
new file mode 100644
index 0000000..be62710
--- /dev/null
+++ b/bigsanity/query_construct.py
@@ -0,0 +1,121 @@
+# Copyright 2015 Measurement Lab
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import re
+
+import constants
+import table_names
+
+
+def _construct_equivalence_query(tables, conditions):
+    """Constructs BigQuery SQL to be used in a table equivalence check.
+
+    Args:
+        tables: A list of BigQuery table names to query.
+        conditions: A list of BigQuery WHERE clauses to apply to the query.
+
+    Returns:
+        A BigQuery SQL query corresponding to the provided parameters.
+    """
+    return """
+SELECT
+    test_id
+FROM
+    {tables}
+WHERE
+    {conditions}
+ORDER BY
+    test_id""".format(tables=',\n\t'.join(tables),
+                      conditions='\n    AND '.join(conditions)).strip()
+
+
+def _format_project_condition(project):
+    return 'project = %d' % project
+
+
+def _to_unix_timestamp(dt):
+    """Converts a datetime to Unix timestamp."""
+    return int((dt - datetime.datetime(1970, 1, 1)).total_seconds())
+
+
+def _to_human_readable_date(dt):
+    """Converts a datetime to a human readable date of the form YYYY-MM-DD."""
+    return dt.strftime('%Y-%m-%d')
+
+
+def _project_has_intermediate_snapshots(project):
+    # TODO(mtlynch): This should just be applied to all web100 projects, but
+    # there is currently a bug with SideStream where snapshots are
+    # incorrectly marked as intermediate snapshots.
+    return (project == constants.PROJECT_ID_NDT or
+            project == constants.PROJECT_ID_NPAD)
+
+
+class TableEquivalenceQueryGenerator(object):
+    """Generates queries to test the equivalence of two M-Lab tables."""
+
+    def __init__(self, project, time_range_start, time_range_end):
+        """Creates a new TableEquivalenceQueryGenerator.
+
+        Args:
+            project: Numerical ID of M-Lab project in BigQuery (e.g. NDT = 0).
+            time_range_start: Start of window (inclusive) for which to generate
+                query (as datetime).
+            time_range_start: End of time window (not inclusive) for which to
+                generate query (as datetime).
+        """
+        self._project = project
+        self._time_range_start = time_range_start
+        self._time_range_end = time_range_end
+
+    def generate_queries(self):
+        """Generate two or more equivalance queries.
+
+        Generates two or more equivalence queries. That is, distinct queries
+        that should yield the exact same resulting data.
+
+        Returns:
+            A tuple of BigQuery SQL query strings.
+        """
+        return (self._generate_legacy_per_month_query(),
+                self._generate_per_project_query())
+
+    def _generate_legacy_per_month_query(self):
+        conditions = []
+        conditions.append(_format_project_condition(self._project))
+        if _project_has_intermediate_snapshots(self._project):
+            conditions.append('web100_log_entry.is_last_entry = True')
+        conditions.append(self._format_time_range_condition())
+        tables = table_names.legacy_monthly_tables(self._time_range_start,
+                                                   self._time_range_end)
+        return _construct_equivalence_query(tables, conditions)
+
+    def _generate_per_project_query(self):
+        tables = [table_names.per_project_table(self._project)]
+        conditions = [self._format_time_range_condition()]
+        return _construct_equivalence_query(tables, conditions)
+
+    def _format_time_range_condition(self):
+        start_time = _to_unix_timestamp(self._time_range_start)
+        start_time_human = _to_human_readable_date(self._time_range_start)
+        end_time = _to_unix_timestamp(self._time_range_end)
+        end_time_human = _to_human_readable_date(self._time_range_end)
+        return (
+            '((web100_log_entry.log_time >= {start_time}) AND\t-- {start_time_human}'
+            '\n\t (web100_log_entry.log_time < {end_time}))\t-- {end_time_human}'
+        ).format(start_time=start_time,
+                 start_time_human=start_time_human,
+                 end_time=end_time,
+                 end_time_human=end_time_human)
diff --git a/tests/test_query_construct.py b/tests/test_query_construct.py
new file mode 100644
index 0000000..09a3dc6
--- /dev/null
+++ b/tests/test_query_construct.py
@@ -0,0 +1,287 @@
+# Copyright 2015 Measurement Lab
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+import os
+import re
+import sys
+import unittest
+
+sys.path.insert(1, os.path.abspath(os.path.join(
+    os.path.dirname(__file__), '../bigsanity')))
+import constants
+import query_construct
+
+
+def _normalize_whitespace(original):
+    return re.sub(r'\s+', ' ', original).strip()
+
+
+def _split_and_normalize_query(query_string):
+    lines = []
+    for line in query_string.strip().splitlines():
+        # omit blank lines
+        if not line:
+            continue
+        lines.append(_normalize_whitespace(line))
+    return lines
+
+
+class QueryGeneratorTest(unittest.TestCase):
+
+    def assertQueriesEqual(self, expected, actual):
+        expected_lines = _split_and_normalize_query(expected)
+        actual_lines = _split_and_normalize_query(actual)
+
+        self.assertSequenceEqual(expected_lines, actual_lines)
+
+    def assertQuerySetEqual(self, expected, actual):
+        if len(expected) != len(actual):
+            print '\n\nexpected: %s\n\nactual: %s' % ('\n'.join(expected),
+                                                      '\n'.join(actual))
+        self.assertEqual(len(expected), len(actual))
+        for i in range(len(expected)):
+            self.assertQueriesEqual(expected[i], actual[i])
+
+
+class TableEquivalenceQueryGeneratorTest(QueryGeneratorTest):
+
+    def _get_queries_actual(self, project, start_time, end_time):
+        return query_construct.TableEquivalenceQueryGenerator(
+            project, start_time, end_time).generate_queries()
+
+    def test_correct_query_generation_for_ndt_full_month(self):
+        """Queries on border of a month should spill into adjacent months.
+
+        If a query's time window falls within one day of the end of the month,
+        it should include tables from the adjacent months in the legacy
+        per-month query.
+        """
+        legacy_per_month_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.2009_02.all,
+            plx.google:m_lab.2009_03.all,
+            plx.google:m_lab.2009_04.all
+        WHERE
+            project = 0
+            AND web100_log_entry.is_last_entry = True
+            AND ((web100_log_entry.log_time >= 1235865600) AND  -- 2009-03-01
+                 (web100_log_entry.log_time <  1238544000))     -- 2009-04-01
+        ORDER BY
+            test_id"""
+
+        per_project_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.ndt.all
+        WHERE
+            ((web100_log_entry.log_time >= 1235865600) AND  -- 2009-03-01
+             (web100_log_entry.log_time <  1238544000))     -- 2009-04-01
+        ORDER BY
+            test_id"""
+
+        query_set_expected = (legacy_per_month_query_expected,
+                              per_project_query_expected)
+        start_time = datetime.datetime(2009, 3, 1)
+        end_time = datetime.datetime(2009, 4, 1)
+        query_set_actual = self._get_queries_actual(constants.PROJECT_ID_NDT,
+                                                    start_time, end_time)
+        self.assertQuerySetEqual(query_set_expected, query_set_actual)
+
+    def test_correct_query_generation_for_ndt_across_months(self):
+        """Queries not on border of months should not spill over.
+
+        Queries with time windows that do not start or end within one day of the
+        end of month should not include tables for the adjacent month in the
+        legacy per month query, even when the time window spans multiple months.
+        """
+        legacy_per_month_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.2009_03.all,
+            plx.google:m_lab.2009_04.all
+        WHERE
+            project = 0
+            AND web100_log_entry.is_last_entry = True
+            AND ((web100_log_entry.log_time >= 1237075200) AND  -- 2009-03-15
+                 (web100_log_entry.log_time <  1239753600))     -- 2009-04-15
+        ORDER BY
+            test_id"""
+
+        per_project_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.ndt.all
+        WHERE
+            ((web100_log_entry.log_time >= 1237075200) AND  -- 2009-03-15
+             (web100_log_entry.log_time <  1239753600))     -- 2009-04-15
+        ORDER BY
+            test_id"""
+
+        query_set_expected = (legacy_per_month_query_expected,
+                              per_project_query_expected)
+        start_time = datetime.datetime(2009, 3, 15)
+        end_time = datetime.datetime(2009, 4, 15)
+        query_set_actual = self._get_queries_actual(constants.PROJECT_ID_NDT,
+                                                    start_time, end_time)
+        self.assertQuerySetEqual(query_set_expected, query_set_actual)
+
+    def test_correct_query_generation_for_ndt_within_single_month(self):
+        """Queries not on border of months should not spill over.
+
+        When a query's time window is within a single month and does not come
+        within one day of the month boundary, we should not query any adjacent
+        month in the legacy per month query.
+        """
+        legacy_per_month_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.2009_03.all
+        WHERE
+            project = 0
+            AND web100_log_entry.is_last_entry = True
+            AND ((web100_log_entry.log_time >= 1237075200) AND  -- 2009-03-15
+                 (web100_log_entry.log_time <  1237507200))     -- 2009-03-20
+        ORDER BY
+            test_id"""
+
+        per_project_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.ndt.all
+        WHERE
+            ((web100_log_entry.log_time >= 1237075200) AND  -- 2009-03-15
+             (web100_log_entry.log_time <  1237507200))     -- 2009-03-20
+        ORDER BY
+            test_id"""
+
+        query_set_expected = (legacy_per_month_query_expected,
+                              per_project_query_expected)
+        start_time = datetime.datetime(2009, 3, 15)
+        end_time = datetime.datetime(2009, 3, 20)
+        query_set_actual = self._get_queries_actual(constants.PROJECT_ID_NDT,
+                                                    start_time, end_time)
+        self.assertQuerySetEqual(query_set_expected, query_set_actual)
+
+    def test_correct_query_generation_for_npad(self):
+        legacy_per_month_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.2009_02.all,
+            plx.google:m_lab.2009_03.all,
+            plx.google:m_lab.2009_04.all
+        WHERE
+            project = 1
+            AND web100_log_entry.is_last_entry = True
+            AND ((web100_log_entry.log_time >= 1235865600) AND  -- 2009-03-01
+                 (web100_log_entry.log_time <  1238544000))     -- 2009-04-01
+        ORDER BY
+            test_id"""
+
+        per_project_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.npad.all
+        WHERE
+            ((web100_log_entry.log_time >= 1235865600) AND  -- 2009-03-01
+             (web100_log_entry.log_time <  1238544000))     -- 2009-04-01
+        ORDER BY
+            test_id"""
+
+        query_set_expected = (legacy_per_month_query_expected,
+                              per_project_query_expected)
+        start_time = datetime.datetime(2009, 3, 1)
+        end_time = datetime.datetime(2009, 4, 1)
+        query_set_actual = self._get_queries_actual(constants.PROJECT_ID_NPAD,
+                                                    start_time, end_time)
+        self.assertQuerySetEqual(query_set_expected, query_set_actual)
+
+    def test_correct_query_generation_for_sidestream(self):
+        legacy_per_month_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.2014_12.all,
+            plx.google:m_lab.2015_01.all
+        WHERE
+            project = 2
+            AND ((web100_log_entry.log_time >= 1419724800) AND  -- 2014-12-28
+                 (web100_log_entry.log_time <  1420243200))     -- 2015-01-03
+        ORDER BY
+            test_id"""
+
+        per_project_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.sidestream.all
+        WHERE
+            ((web100_log_entry.log_time >= 1419724800) AND  -- 2014-12-28
+             (web100_log_entry.log_time <  1420243200))     -- 2015-01-03
+        ORDER BY
+            test_id"""
+
+        query_set_expected = (legacy_per_month_query_expected,
+                              per_project_query_expected)
+        start_time = datetime.datetime(2014, 12, 28)
+        end_time = datetime.datetime(2015, 1, 3)
+        query_set_actual = self._get_queries_actual(
+            constants.PROJECT_ID_SIDESTREAM, start_time, end_time)
+        self.assertQuerySetEqual(query_set_expected, query_set_actual)
+
+    def test_correct_query_generation_for_paris_traceroute(self):
+        legacy_per_month_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.2014_12.all,
+            plx.google:m_lab.2015_01.all
+        WHERE
+            project = 3
+            AND ((web100_log_entry.log_time >= 1419724800) AND  -- 2014-12-28
+                 (web100_log_entry.log_time <  1420243200))     -- 2015-01-03
+        ORDER BY
+            test_id"""
+
+        per_project_query_expected = """
+        SELECT
+            test_id
+        FROM
+            plx.google:m_lab.paris_traceroute.all
+        WHERE
+            ((web100_log_entry.log_time >= 1419724800) AND  -- 2014-12-28
+             (web100_log_entry.log_time <  1420243200))     -- 2015-01-03
+        ORDER BY
+            test_id"""
+
+        query_set_expected = (legacy_per_month_query_expected,
+                              per_project_query_expected)
+        start_time = datetime.datetime(2014, 12, 28)
+        end_time = datetime.datetime(2015, 1, 3)
+        query_set_actual = self._get_queries_actual(
+            constants.PROJECT_ID_PARIS_TRACEROUTE, start_time, end_time)
+        self.assertQuerySetEqual(query_set_expected, query_set_actual)
+
+
+if __name__ == '__main__':
+    unittest.main()