Skip to content

Commit 42bb5a5

Browse files
committed
[libc++] Add a simple way to find outliers in historical benchmark data
1 parent 2bbc740 commit 42bb5a5

File tree

1 file changed

+47
-1
lines changed

1 file changed

+47
-1
lines changed

libcxx/utils/visualize-historical

Lines changed: 47 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,12 @@ class Commit:
4242
raise RuntimeError(f'Error when trying to obtain the commit order for {self._sha} and {other._sha}')
4343
return res.returncode == 0
4444

45+
def __hash__(self):
    """
    Return a hash value for this commit, computed from its full revision.
    """
    # NOTE(review): hashing must stay consistent with the class's equality
    # comparison (not visible from here) -- confirm it also uses the full revision.
    revision = self.fullrev
    return hash(revision)
50+
4551
def show(self, include_diff=False):
4652
"""
4753
Return the commit information equivalent to `git show` associated to this commit.
@@ -153,6 +159,29 @@ def parse_lnt(lines):
153159
results[name][metric].append(float(value))
154160
return results
155161

162+
def find_outliers(xs, ys, threshold):
    """
    Given a list of x coordinates and a list of y coordinates, find (x, y) pairs where the y
    value differs from the previous y value by more than the given relative difference.

    The threshold is given as a floating point representing a percentage, e.g. 0.25 will result in
    detecting points that differ from their previous value by more than 25%. The difference is in
    absolute value, i.e. both positive and negative spikes are detected.

    Points whose y value is None are skipped entirely: they are never reported and they are not
    used as the previous value for the point that follows them. The first point that has a value
    has nothing to be compared against, so it is never reported.

    When the previous value is 0 the relative difference is undefined. In that case any non-zero
    value is reported as an outlier and a value of 0 is not.

    Returns the list of (x, y) outlier pairs, in the order they were encountered.
    """
    outliers = []
    previous = None
    for (x, y) in zip(xs, ys):
        if y is None: # skip data points that don't contain values
            continue

        if previous is not None:
            if previous == 0:
                # Avoid dividing by zero: going from 0 to anything else is an
                # unbounded relative change, staying at 0 is no change at all.
                is_outlier = y != 0
            else:
                # Take the absolute value so that drops are detected as well as increases.
                is_outlier = abs((y - previous) / previous) > threshold
            if is_outlier:
                outliers.append((x, y))
        previous = y
    return outliers
183+
184+
156185
def main(argv):
157186
parser = argparse.ArgumentParser(
158187
prog='visualize-historical',
@@ -176,6 +205,13 @@ def main(argv):
176205
'Since the chart is interactive, it generally makes most sense to include all the benchmarks '
177206
'and to then filter them in the browser, but in some cases producing a chart with a reduced '
178207
'number of data series is useful.')
208+
parser.add_argument('--find-outliers', metavar='FLOAT', type=float, required=False,
209+
help='When building the chart, detect commits that show a large spike (more than the given relative threshold) '
210+
'with the previous result and print those to standard output. This can be used to generate a list of '
211+
'potential outliers that we might want to re-generate the data for. The threshold is expressed as a '
212+
'floating point number, e.g. 0.25 will detect points that differ by more than 25%% from their previous '
213+
'result. This option respects --filter, i.e. only benchmarks that match the filter will be analyzed for '
214+
'outliers.')
179215
parser.add_argument('--git-repo', type=directory_path, default=pathlib.Path(os.getcwd()),
180216
help='Path to the git repository to use for ordering commits in time. '
181217
'By default, the current working directory is used.')
@@ -214,10 +250,20 @@ def main(argv):
214250
regex = re.compile(args.filter)
215251
benchmarks = {b for b in benchmarks if regex.search(b)}
216252

253+
# If requested, perform a basic pass to detect outliers
254+
if args.find_outliers is not None:
255+
threshold = args.find_outliers
256+
outliers = set()
257+
for benchmark in benchmarks:
258+
commits = [commit for (commit, _) in historical_data]
259+
series = [commit_data.get(benchmark, None) for (_, commit_data) in historical_data]
260+
outliers |= set(commit for (commit, _) in find_outliers(commits, series, threshold=threshold))
261+
print(f'Outliers (more than {threshold * 100}%): {" ".join(str(x) for x in outliers)}')
262+
217263
# Plot the data for all the required benchmarks
218264
figure = create_plot([commit for (commit, _) in historical_data],
219265
sorted(list(benchmarks)),
220-
[data for (_, data) in historical_data])
266+
[commit_data for (_, commit_data) in historical_data])
221267
do_open = args.output is None or args.open
222268
output = args.output if args.output is not None else tempfile.NamedTemporaryFile(suffix='.html').name
223269
plotly.io.write_html(figure, file=output, auto_open=do_open)

0 commit comments

Comments
 (0)