Bug 1284432 - Make cycling data more efficient
Don't get all job guids per project at once, then chunk, then delete the
chunks. Instead, get a chunk, delete that chunk, then get another, delete
that, and so on. This should be more memory- and time-efficient.
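For illustration, here is a minimal standalone sketch of the fetch-then-delete loop this commit adopts, assuming a generic Django QuerySet; the helper name and use of 'pk' are hypothetical, not part of the commit:

    # Hypothetical helper, not Treeherder code: delete every row matched by
    # `queryset`, holding at most `chunk_size` primary keys in memory at once.
    def delete_in_chunks(queryset, chunk_size):
        deleted = 0
        while True:
            # Each iteration issues a fresh, sliced query for one chunk of ids.
            pks = list(queryset.values_list('pk', flat=True)[:chunk_size])
            if not pks:
                return deleted  # nothing left to delete
            queryset.model.objects.filter(pk__in=pks).delete()
            deleted += len(pks)

Because each pass re-queries for just one chunk, deleted rows never accumulate in memory, unlike the previous fetch-everything-then-chunk approach.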
wlach committed Mar 6, 2017
1 parent f0e483c commit abbacd9
Showing 1 changed file with 11 additions and 15 deletions.
26 changes: 11 additions & 15 deletions treeherder/model/models.py
@@ -593,21 +593,17 @@ def cycle_data(self, repository, cycle_interval, chunk_size, sleep_time):

         # Retrieve list of jobs to delete
         jobs_max_timestamp = datetime.datetime.now() - cycle_interval
-        job_guids_to_cycle = list(self.filter(
-            repository=repository,
-            submit_time__lt=jobs_max_timestamp).values_list('guid',
-                                                            flat=True))
-
-        if not job_guids_to_cycle:
-            return 0
-
-        # group the job in chunks
-        jobs_chunk_list = zip(*[iter(job_guids_to_cycle)] * chunk_size)
-        # append the remaining job data not fitting in a complete chunk
-        jobs_chunk_list.append(
-            job_guids_to_cycle[-(len(job_guids_to_cycle) % chunk_size):])
+        jobs_cycled = 0
+        while True:
+            jobs_chunk = list(self.filter(
+                repository=repository,
+                submit_time__lt=jobs_max_timestamp).values_list(
+                    'guid', flat=True)[:chunk_size])
+            if not jobs_chunk:
+                # no more jobs to cycle, we're done!
+                return jobs_cycled
 
-        for jobs_chunk in jobs_chunk_list:
             self.filter(guid__in=jobs_chunk).delete()
 
             # Remove ORM entries for these jobs that don't currently have a foreign key
@@ -635,12 +631,12 @@ def cycle_data(self, repository, cycle_interval, chunk_size, sleep_time):
                 failure_line_max_id = None
             failure_lines_to_delete.delete()
 
+            jobs_cycled += len(jobs_chunk)
+
             if sleep_time:
                 # Allow some time for other queries to get through
                 time.sleep(sleep_time)
 
-        return len(job_guids_to_cycle)
 
 
 class Job(models.Model):
     """
