Bug 1284432 - Make cycling data more efficient
Don't get all job guids per project at once, then chunk, then delete the
chunks. Instead, get a chunk, delete that chunk, then get another, delete
that, and so on. This should be more memory- and time-efficient.
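For illustration, here is a minimal standalone sketch of the fetch-then-delete loop this commit adopts, assuming a generic Django QuerySet; the helper name and use of 'pk' are hypothetical, not part of the commit:

    # Hypothetical helper, not Treeherder code: delete every row matched by
    # `queryset`, holding at most `chunk_size` primary keys in memory at once.
    def delete_in_chunks(queryset, chunk_size):
        deleted = 0
        while True:
            # Each iteration issues a fresh, sliced query for one chunk of ids.
            pks = list(queryset.values_list('pk', flat=True)[:chunk_size])
            if not pks:
                return deleted  # nothing left to delete
            queryset.model.objects.filter(pk__in=pks).delete()
            deleted += len(pks)

Because each pass re-queries for just one chunk, deleted rows never accumulate in memory, unlike the previous fetch-everything-then-chunk approach.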
wlach committed Mar 6, 2017
1 parent f0e483c commit abbacd9
Showing 1 changed file with 11 additions and 15 deletions.
26 changes: 11 additions & 15 deletions treeherder/model/models.py
@@ -593,21 +593,17 @@ def cycle_data(self, repository, cycle_interval, chunk_size, sleep_time):

         # Retrieve list of jobs to delete
         jobs_max_timestamp = datetime.datetime.now() - cycle_interval
-        job_guids_to_cycle = list(self.filter(
-            repository=repository,
-            submit_time__lt=jobs_max_timestamp).values_list('guid',
-                                                            flat=True))
-
-        if not job_guids_to_cycle:
-            return 0
-
-        # group the job in chunks
-        jobs_chunk_list = zip(*[iter(job_guids_to_cycle)] * chunk_size)
-        # append the remaining job data not fitting in a complete chunk
-        jobs_chunk_list.append(
-            job_guids_to_cycle[-(len(job_guids_to_cycle) % chunk_size):])
+        jobs_cycled = 0
+        while True:
+            jobs_chunk = list(self.filter(
+                repository=repository,
+                submit_time__lt=jobs_max_timestamp).values_list(
+                    'guid', flat=True)[:chunk_size])
+            if not jobs_chunk:
+                # no more jobs to cycle, we're done!
+                return jobs_cycled
 
-        for jobs_chunk in jobs_chunk_list:
             self.filter(guid__in=jobs_chunk).delete()
 
             # Remove ORM entries for these jobs that don't currently have a foreign key
@@ -635,12 +631,12 @@ def cycle_data(self, repository, cycle_interval, chunk_size, sleep_time):
                 failure_line_max_id = None
             failure_lines_to_delete.delete()
 
+            jobs_cycled += len(jobs_chunk)
+
             if sleep_time:
                 # Allow some time for other queries to get through
                 time.sleep(sleep_time)
 
-        return len(job_guids_to_cycle)
 
 
 class Job(models.Model):
     """
