Skip to content

Commit

Permalink
[autoroller] All commits in updates(), only roll interesting ones.
Browse files Browse the repository at this point in the history
Previously `updates` would only include the 'interesting' updates, which
meant that the new autoroller algorithm skips over chunks of the repo
history. If multiple repos have chunks of non-interesting history, the
autoroller can end up not proposing any good candidates.

This CL simplifies the repo interface by having `updates()` always
return the full list of commits, but adds a new field to the
CommitMetadata which indicates that a given commit was `interesting`.
This way the full history of every repo is available when picking
candidates (e.g. for `CommitList.advance_to`), but find_best_rev will
only consider rolling to interesting commits.

R=dnj@chromium.org, phajdan.jr@chromium.org
BUG=

Review-Url: https://codereview.chromium.org/2833723003
  • Loading branch information
riannucci authored and Commit bot committed Apr 20, 2017
1 parent de548b9 commit 30925bd
Show file tree
Hide file tree
Showing 7 changed files with 154 additions and 99 deletions.
13 changes: 9 additions & 4 deletions recipe_engine/autoroll_impl/candidate_algorithm.py
Expand Up @@ -31,8 +31,13 @@ def get_commitlists(deps):
def find_best_rev(repos):
"""Returns the project_id of the best repo to roll.
"Best" is determined by "moves the least amount of commits, globally". There
are two ways that rolling a repo can move commits:
"Best" is determined by "is an interesting commit and moves the least amount
of commits, globally".
"Interesting" means "the commit modifies one or more recipe related files",
and is defined by CommitMetadata.roll_candidate.
There are two ways that rolling a repo can move commits:
1) As dependencies. Rolling repo A that depends on (B, C) will take
a penalty for each commit that B and C need to move in order to be
Expand Down Expand Up @@ -77,13 +82,12 @@ def find_best_rev(repos):

for project_id, clist in repos.iteritems():
assert isinstance(clist, CommitList)
candidate = clist.next
candidate, movement_score = clist.next_roll_candidate
if not candidate:
continue

unaccounted_repos = set(repo_set)

movement_score = 0
# first, determine if rolling this repo will force other repos to move.
for d_pid, dep in candidate.spec.deps.iteritems():
unaccounted_repos.discard(d_pid)
Expand Down Expand Up @@ -204,4 +208,5 @@ def get_roll_candidates(context, package_spec):

print('found %d/%d good/bad candidates in %0.2f seconds' % (
len(ret_good), len(ret_bad), time.time()-start))
sys.stdout.flush()
return ret_good, ret_bad, repos
23 changes: 23 additions & 0 deletions recipe_engine/autoroll_impl/commit_list.py
Expand Up @@ -24,6 +24,7 @@ def __init__(self, commit_list):
assert all(isinstance(c, CommitMetadata) for c in commit_list)
self._commits = list(commit_list)
self._cur_idx = 0
self._next_roll_candidate_idx = None

# This maps from commit hash -> index in _commits.
self._rev_idx = {}
Expand Down Expand Up @@ -69,6 +70,28 @@ def next(self):
return None
return self._commits[nxt_idx]

@property
def next_roll_candidate(self):
"""Gets the next CommitMetadata and distance with roll_candidate==True
without advancing the current index.
Returns (CommitMetadata, <distance>), or (None, None) if there is no next
roll_candidate.
"""
# Compute and cache the next roll candidate index.
nxt_idx = self._next_roll_candidate_idx
if nxt_idx is None or nxt_idx <= self._cur_idx:
nxt_idx = None
for i, commit in enumerate(self._commits[self._cur_idx+1:]):
if commit.roll_candidate:
nxt_idx = self._cur_idx + i + 1
break
self._next_roll_candidate_idx = nxt_idx

if nxt_idx is not None:
return self._commits[nxt_idx], nxt_idx - self._cur_idx
return (None, None)

def advance(self):
"""Advances the current CommitMetadata by one.
Expand Down
82 changes: 44 additions & 38 deletions recipe_engine/fetch.py
Expand Up @@ -31,6 +31,14 @@
LOGGER = logging.getLogger(__name__)


def has_interesting_changes(spec, changed_files):
# TODO(iannucci): analyze bundle_extra_paths.txt too.
return (
'infra/config/recipes.cfg' in changed_files or
any(f.startswith(spec.recipes_path) for f in changed_files)
)


class FetchError(Exception):
pass

Expand All @@ -48,9 +56,12 @@ class UnresolvedRefspec(Exception):
# commit_timestamp (int): the unix commit timestamp for this commit
# message_lines (tuple(str)): the message of this commit
# spec (package_pb2.Package): the parsed infra/config/recipes.cfg file or None.
# roll_candidate (bool): if this commit contains changes which are known to
# affect the behavior of the recipes (i.e. modifications within recipe_path
# and/or modifications to recipes.cfg)
CommitMetadata = namedtuple(
'_CommitMetadata',
'revision author_email commit_timestamp message_lines spec')
'revision author_email commit_timestamp message_lines spec roll_candidate')


class Backend(object):
Expand Down Expand Up @@ -114,11 +125,7 @@ def commit_metadata(self, refspec):
The refspec will be resolved if it's not absolute.
Returns {
'author': '<author name>',
'message': '<commit message>',
'spec': package_pb2.Package or None, # the parsed recipes.cfg file.
}
Returns (CommitMetadata).
"""
revision = self.resolve_refspec(refspec)
cache = self._GIT_METADATA_CACHE.setdefault(self.repo_url, {})
Expand All @@ -140,18 +147,16 @@ def resolve_refspec(self, refspec):
return refspec
return self._resolve_refspec_impl(refspec)

def updates(self, revision, other_revision, paths):
def updates(self, revision, other_revision):
"""Returns a list of revisions |revision| through |other_revision|
(inclusive).
If |paths| is a non-empty list, the history is scoped just to these paths.
Returns list(CommitMetadata) - The commit metadata in the range
(revision,other_revision].
"""
self.assert_resolved(revision)
self.assert_resolved(other_revision)
return self._updates_impl(revision, other_revision, paths)
return self._updates_impl(revision, other_revision)

### direct overrides. These are public methods which must be overridden.

Expand Down Expand Up @@ -185,12 +190,10 @@ def checkout(self, refspec):
### private overrides. Override these in the implementations, but don't call
### externally.

def _updates_impl(self, revision, other_revision, paths):
def _updates_impl(self, revision, other_revision):
"""Returns a list of revisions |revision| through |other_revision|. This
includes |revision| and |other_revision|.
If |paths| is a non-empty list, the history is scoped just to these paths.
Args:
revision (str) - the first git commit
other_revision (str) - the second git commit
Expand Down Expand Up @@ -327,15 +330,13 @@ def checkout(self, refspec):
except GitError:
self._git('reset', '-q', '--hard', revision)

def _updates_impl(self, revision, other_revision, paths):
def _updates_impl(self, revision, other_revision):
args = [
'rev-list',
'--reverse',
'--topo-order',
'%s..%s' % (revision, other_revision),
]
if paths:
args.extend(['--'] + paths)
return [
self.commit_metadata(rev)
for rev in self._git(*args).strip().split('\n')
Expand Down Expand Up @@ -367,9 +368,14 @@ def _commit_metadata_impl(self, revision):
except GitError:
spec = None

# check diff to see if it touches anything interesting.
changed_files = set(self._git(
'diff-tree', '-r', '--no-commit-id', '--name-only', '%s^!' % revision)
.splitlines())

return CommitMetadata(revision, meta[0],
int(meta[1]), tuple(meta[2:]),
spec)
spec, has_interesting_changes(spec, changed_files))

class GitilesFetchError(FetchError):
"""An HTTP error that occurred during Gitiles fetching."""
Expand All @@ -393,16 +399,22 @@ def transient(e):
# commit (str) - the git commit hash
# author_email (str) - the author email for this commit
# message_lines (tuple) - the lines of the commit message
# changed_files (frozenset) - all paths touched by this commit
class _GitilesCommitJson(namedtuple(
'_GitilesCommitJson',
'commit author_email commit_timestamp message_lines')):
'commit author_email commit_timestamp message_lines changed_files')):
@classmethod
def from_raw_json(cls, raw):
mod_files = set()
for entry in raw['tree_diff']:
mod_files.add(entry['old_path'])
mod_files.add(entry['new_path'])
return cls(
raw['commit'],
raw['author']['email'],
calendar.timegm(time.strptime(raw['committer']['time'])),
tuple(raw['message'].splitlines()),
frozenset(mod_files),
)


Expand Down Expand Up @@ -432,9 +444,14 @@ def _fetch_gitiles(self, url_fmt, *args):
raise GitilesFetchError(resp.status_code, resp.text)
return resp

def _fetch_gitiles_json(self, url_fmt, *args):
def _fetch_gitiles_committish_json(self, url_fmt, *args):
"""Fetches a remote URL path and expects a JSON object on success.
This appends two GET params to url_fmt:
format=JSON - Does what you expect
name-status=1 - Ensures that commit objects returned have a 'tree_diff'
member which shows the diff for that commit.
Args:
url_fmt (str) - the url path fragment as a python %format string, like
'%s/foo/bar?something=value'
Expand All @@ -443,7 +460,7 @@ def _fetch_gitiles_json(self, url_fmt, *args):
Returns the decoded JSON object
"""
resp = self._fetch_gitiles(url_fmt, *args)
resp = self._fetch_gitiles(url_fmt+'?name-status=1&format=JSON', *args)
if not resp.text.startswith(self._GERRIT_XSRF_HEADER):
raise GitilesFetchError(resp.status_code, 'Missing XSRF prefix')
return json.loads(resp.text[len(self._GERRIT_XSRF_HEADER):])
Expand All @@ -466,7 +483,7 @@ def _fetch_commit_json(self, refspec):
if refspec in c:
return c[refspec]

raw = self._fetch_gitiles_json('+/%s?format=JSON', refspec)
raw = self._fetch_gitiles_committish_json('+/%s', refspec)
ret = _GitilesCommitJson.from_raw_json(raw)
if self.is_resolved_revision(refspec):
c[refspec] = ret
Expand Down Expand Up @@ -525,33 +542,21 @@ def checkout(self, refspec):
with tarfile.open(fileobj=StringIO(archive_response.content)) as tf:
tf.extractall(recipes_path)

def _updates_impl(self, revision, other_revision, paths):
def _updates_impl(self, revision, other_revision):
self.assert_remote('_updates_impl')

# TODO(iannucci): implement paging

# To include info about touched paths (tree_diff), pass name-status=1 below.
log_json = self._fetch_gitiles_json(
'+log/%s..%s?name-status=1&format=JSON', revision, other_revision)
log_json = self._fetch_gitiles_committish_json(
'+log/%s..%s', revision, other_revision)

c = self._COMMIT_JSON_CACHE.setdefault(self.repo_url, {})

results = []
for entry in log_json['log']:
commit = entry['commit']
c[commit] = _GitilesCommitJson.from_raw_json(entry)

matched = False
for path in paths:
for diff_entry in entry['tree_diff']:
if (diff_entry['old_path'].startswith(path) or
diff_entry['new_path'].startswith(path)):
matched = True
break
if matched:
break
if matched or not paths:
results.append(commit)
results.append(commit)

results.reverse()
return map(self.commit_metadata, results)
Expand All @@ -576,4 +581,5 @@ def _commit_metadata_impl(self, revision):
rev_json.author_email,
rev_json.commit_timestamp,
rev_json.message_lines,
spec)
spec,
has_interesting_changes(spec, rev_json.changed_files))
17 changes: 4 additions & 13 deletions recipe_engine/package.py
Expand Up @@ -209,18 +209,8 @@ def updates(self):
Returns list(CommitMetadata)
"""
spec = self.spec_pb()

paths = []
subdir = spec.recipes_path
if subdir:
# We add package_file to the list of paths to check because it might
# contain other upstream rolls, which we want.
paths.extend([subdir + os.path.sep,
InfraRepoConfig().relative_recipes_cfg])

other_revision = self.backend.resolve_refspec(self._branch_for_remote)
return self.backend.updates(self.revision, other_revision, paths)
return self.backend.updates(
self.revision, self.backend.resolve_refspec(self._branch_for_remote))

def _components(self):
return (self.project_id, self.repo, self.revision, self.path)
Expand Down Expand Up @@ -250,7 +240,8 @@ def current(self):
'',
0,
(),
self.spec_pb()
self.spec_pb(),
False
)

def updates(self):
Expand Down

0 comments on commit 30925bd

Please sign in to comment.