This repository has been archived by the owner on Aug 25, 2023. It is now read-only.

Merge remote-tracking branch 'origin/master' into tablesWithoutSchemaShouldNotBeBackedUp
radkomateusz committed Aug 7, 2018
2 parents 3758b1e + 408f67a commit f1151ef
Showing 3 changed files with 42 additions and 9 deletions.
7 changes: 4 additions & 3 deletions README.md
@@ -59,9 +59,10 @@ In such a scenario we're not able to restore data using BigQuery built-in features
* Dataset/table labels as they are not copied by BigQuery copy job (again, you can use [GCP Census](https://github.com/ocadotechnology/gcp-census) for that)

### Known caveats
* Modification of table metadata (including the table description) qualifies a table to be backed up at the next cycle. This can be a problem for partitioned tables, where such a change updates the last-modified time in every partition. BBQ will then back up all partitions again, even though there was no actual change in the partition data,
* There's a 10,000 [copy jobs per project per day limit](https://cloud.google.com/bigquery/quotas#copy_jobs), which you may hit on the first day. This limit can be increased by Google Support,
* Data in a table's streaming buffer will be backed up on the next run, once the buffer is flushed. BBQ uses a [copy-job](https://cloud.google.com/bigquery/docs/managing-tables#copy-table) for creating backups and *"Records in the streaming buffer are not considered when a copy or extract job runs"* (see [Life of a BigQuery streaming insert](https://cloud.google.com/blog/big-data/2017/06/life-of-a-bigquery-streaming-insert) for more details),
* When a table name is longer than 400 characters, in rare cases BBQ may back up the table more than once (see the sketch after this section). Such backup duplicates are automatically removed by the retention process.

# High level architecture

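To make the 400-character caveat above concrete, here is a minimal sketch of how the copy-job task name is assembled from the pieces shown in the `copy_job_task_name.py` diff below. It is not BBQ's actual code: `build_task_name`, `TASK_NAME_LIMIT`, and the `project.dataset.table` rendering of the source table are illustrative assumptions standing in for `str(BigQueryTable)`.

```python
# Minimal sketch (not the BBQ implementation) of the task-name construction
# and why very long table names disable de-duplication.
from datetime import datetime

TASK_NAME_LIMIT = 500  # task names must match ^[a-zA-Z0-9_-]{1,500}$

def build_task_name(project, dataset, table, retry_count, suffix):
    # Join the parts the same way the diff below does, then sanitise
    # characters that are not allowed in task names.
    name = '_'.join([
        datetime.utcnow().strftime("%Y-%m-%d"),
        '.'.join([project, dataset, table]),
        str(retry_count),
        suffix,
    ]).replace('$', '_').replace('.', '_').replace(':', '_')
    # Returning None keeps the copy job enqueueable, but the name-based
    # "at most one backup per table per day" guarantee is lost.
    return name if len(name) <= TASK_NAME_LIMIT else None

# A 500-character table name plus the date, project, dataset, retry count and
# suffix exceeds the 500-character limit, so None is returned here.
print(build_task_name('source_project', 'source_dataset', 'x' * 500, 0, 'task_name_suffix'))
```

When the name is dropped like this, nothing de-duplicates repeated copy jobs for the same table on the same day, which is why the retention process has to clean up the occasional duplicate backup.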
13 changes: 11 additions & 2 deletions src/backup/copy_job_async/copy_job/copy_job_task_name.py
@@ -8,9 +8,18 @@ class CopyJobTaskName(object):
def __init__(self, copy_job_request):
self.__copy_job_request = copy_job_request

# Regarding the API restriction - a task name needs to match
# the following expression: "^[a-zA-Z0-9_-]{1,500}$".
#
# This method returns None when the generated name exceeds 500 characters,
# which protects us from failures where the source table name is very long
# but still valid.
#
# The disadvantage is that when task_name is None, it is not unique,
# so the same backup may be created more than once.
def create(self):
logging.info("INFO: %s", self.__copy_job_request)
task_name = '_'.join([
datetime.utcnow().strftime("%Y-%m-%d"),
str(self.__copy_job_request.source_big_query_table),
str(self.__copy_job_request.retry_count),
@@ -19,4 +28,4 @@ def create(self):
.replace('$', '_')\
.replace('.', '_')\
.replace(':', '_')
return task_name if len(task_name) <= 500 else None
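As a quick sanity check on the constraint quoted in the new comment, here is a small hypothetical snippet (not part of BBQ) that validates a generated task name against the `^[a-zA-Z0-9_-]{1,500}$` pattern; `TASK_NAME_RE` is an assumption for illustration and the sample name is the one asserted in the test below.

```python
import re

# Hypothetical validation helper, not part of the BBQ codebase.
TASK_NAME_RE = re.compile(r'^[a-zA-Z0-9_-]{1,500}$')

# Sample name taken from test_copy_job_task_name.py below.
name = '2017-12-06_source_project_source_dataset_source_table_0_task_name_suffix'
assert TASK_NAME_RE.match(name) is not None   # sanitised, short names pass
assert TASK_NAME_RE.match('x' * 501) is None  # names over 500 chars would be rejected
```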
31 changes: 27 additions & 4 deletions tests/backup/copy_job_async/copy_job/test_copy_job_task_name.py
@@ -11,7 +11,7 @@ class TestCopyJobNameCreator(unittest.TestCase):

@freeze_time("2017-12-06")
def test_creating_task_name(self):
# given
copy_job_request = CopyJobRequest(
task_name_suffix='task_name_suffix',
copy_job_type_id="unknown-copying",
@@ -24,8 +24,31 @@ def test_creating_task_name(self):
retry_count=0
)

# when
copy_job_task_name = CopyJobTaskName(copy_job_request).create()

# then
self.assertEqual(copy_job_task_name, '2017-12-06_source_project_source_dataset_source_table_0_task_name_suffix')

@freeze_time("2017-12-06")
def test_return_none_if_calculated_name_is_too_long(self):
# given
task_name_suffix = "x" * 501

copy_job_request = CopyJobRequest(
task_name_suffix=task_name_suffix,
copy_job_type_id="unknown-copying",
source_big_query_table=BigQueryTable('source_project',
'source_dataset',
'source_table'),
target_big_query_table=BigQueryTable('target_project',
'target_dataset',
'target_table'),
retry_count=0
)

# when
copy_job_task_name = CopyJobTaskName(copy_job_request).create()

# then
self.assertIsNone(copy_job_task_name)
