
Start benchmarking the parser performance, and other changes.

benchmarks/osh-parser.sh compares OSH vs other shells.

- benchmarks/time.py: replacement for 'time' that records the elapsed
  time and exit code as a CSV row (example invocation below)
- osh-parser.R: reshape and analyze the results
- osh-parser.sh: output an HTML report
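
For example (foo.sh and _tmp/times.csv are placeholders, not files in this
commit):

    benchmarks/time.py \
      --output _tmp/times.csv \
      --field bash --field foo.sh -- \
      bash -n foo.sh
    # Appends one CSV row: exit code, elapsed seconds, then the --field
    # values, e.g.  0,0.0123,bash,foo.sh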

Wild test changes in wild.sh:

- Parsing 150K lines of shell code in FreeBSD.
- Find more shell scripts with a 'find' expression for extensionless
  executables (sketch below).
- Search for unparsed features like a[x]=1.
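
The exact 'find' expression isn't spelled out here; a minimal sketch of the
idea (GNU find, with $dir as a placeholder for a checkout) is:

    # Executable files with no extension whose first line looks like a
    # shell shebang.
    find "$dir" -type f -executable ! -name '*.*' | while read -r path; do
      head -n 1 "$path" | grep -q '^#!.*sh' && echo "$path"
    done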

Oil code:

- Add --ast-format none, so we get more accurate measurements of parsing
  time (timing sketch after this list).
- Experiment with disabling ASDL's dynamic type checking.
  - Parsing performance improved from ~1.7 lines per ms to ~2.0.  I
    thought it would be a bigger change!
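
A rough way to reproduce the lines-per-ms numbers by hand (not part of this
commit; $f stands for any large script from the manifest):

    f=path/to/big-script.sh                             # placeholder
    wc -l "$f"
    time bin/osh -n "$f" > /dev/null                    # parse and print the AST
    time bin/osh -n --ast-format none "$f" > /dev/null  # parse only
    # lines/ms = line count / elapsed milliseconds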

Other tweaks:

- id_kind_test.py: Print stats about number of IDs and Kinds
- osh/lex.py: Make note of remaining tokens in OSH lexer
- osh/osh.asdl: update comments about code representation
- Update the line count script.  Look at the parser vs. the runtime.
Andy Chu committed Oct 19, 2017
1 parent b260e9a commit 2f864e5e1d22f2b09bc2a5aad9f1124d727f48f8
@@ -179,6 +179,8 @@ def _Init(self, args, kwargs):
         raise AssertionError('Duplicate assignment of field %r' % name)
       self.__setattr__(name, val)
 
+    # Disable type checking here
+    #return
     for name in self.FIELDS:
       if not self._assigned[name]:
         # If anything was set, then required fields raise an error.
@@ -195,22 +197,23 @@ def CheckUnassigned(self):
     if unassigned:
       raise ValueError("Fields %r were't be assigned" % unassigned)
 
-  def __setattr__(self, name, value):
-    if name == '_assigned':
-      self.__dict__[name] = value
-      return
-    try:
-      desc = self.DESCRIPTOR_LOOKUP[name]
-    except KeyError:
-      raise AttributeError('Object of type %r has no attribute %r' %
-                           (self.__class__.__name__, name))
-    if not _CheckType(value, desc):
-      raise AssertionError("Field %r should be of type %s, got %r (%s)" %
-                           (name, desc, value, value.__class__))
-    self._assigned[name] = True  # check this later when encoding
-    self.__dict__[name] = value
+  if 1:  # Disable type checking here
+    def __setattr__(self, name, value):
+      if name == '_assigned':
+        self.__dict__[name] = value
+        return
+      try:
+        desc = self.DESCRIPTOR_LOOKUP[name]
+      except KeyError:
+        raise AttributeError('Object of type %r has no attribute %r' %
+                             (self.__class__.__name__, name))
+      if not _CheckType(value, desc):
+        raise AssertionError("Field %r should be of type %s, got %r (%s)" %
+                             (name, desc, value, value.__class__))
+      self._assigned[name] = True  # check this later when encoding
+      self.__dict__[name] = value
 
   def __repr__(self):
     ast_f = fmt.TextOutput(util.Buffer())  # No color by default.
@@ -0,0 +1,11 @@
# These files were selected to be big with test/wild.sh count-lines
/home/andy/git/alpine/abuild/abuild
/home/andy/git/other/staticpython/build.sh
/home/andy/git/other/git/t/t9300-fast-import.sh
/home/andy/git/other/kubernetes/hack/make-rules/test-cmd-util.sh
/home/andy/git/other/kubernetes/cluster/gce/gci/configure-helper.sh
/home/andy/src/mksh/Build.sh
/home/andy/git/basis-build/_tmp/debootstrap/functions
/home/andy/git/other/git/t/t4014-format-patch.sh
/home/andy/git/other/kythe/third_party/proto/configure
/home/andy/git/other/kythe/third_party/proto/ltmain.sh
benchmarks/osh-parser.R
@@ -0,0 +1,88 @@
#!/usr/bin/Rscript
#
# osh-parser.R
#
# Analyze output from shell scripts.
library(dplyr)
library(tidyr)
Log = function(fmt, ...) {
cat(sprintf(fmt, ...))
cat('\n')
}
main = function(argv) {
# num_lines, path
lines = read.csv(argv[[1]])
# status, elapsed, shell, path
times = read.csv(argv[[2]])
out_dir = argv[[3]]
# TODO:
# - compute lines per second for every cell?
#print(lines)
#print(times)
# Remove failures
times %>% filter(status == 0) %>% select(-c(status)) -> times
# Add the number of lines, joining on path, and compute lines/sec
# TODO: Is there a better way to compute lines_per_ms and then drop lines_per_sec?
times %>%
left_join(lines, by = c('path')) %>%
mutate(elapsed_ms = elapsed_secs * 1000,
lines_per_ms = num_lines / elapsed_ms) %>%
select(-c(elapsed_secs)) ->
joined
#print(joined)
# Summarize rates
joined %>%
group_by(shell) %>%
summarize(total_lines = sum(num_lines), total_ms = sum(elapsed_ms)) %>%
mutate(lines_per_ms = total_lines / total_ms) ->
rate_summary
# Put OSH last!
first = rate_summary %>% filter(shell != 'osh')
last = rate_summary %>% filter(shell == 'osh')
rate_summary = bind_rows(list(first, last))
print(rate_summary)
# Elapsed seconds by file and shell
joined %>%
select(-c(lines_per_ms)) %>%
spread(key = shell, value = elapsed_ms) %>%
arrange(num_lines) %>%
select(c(bash, dash, mksh, zsh, osh, num_lines, path)) ->
elapsed
print(elapsed)
# Rates by file and shell
joined %>%
select(-c(elapsed_ms)) %>%
spread(key = shell, value = lines_per_ms) %>%
arrange(num_lines) %>%
select(c(bash, dash, mksh, zsh, osh, num_lines, path)) ->
rate
print(rate)
write.csv(elapsed, file.path(out_dir, 'elapsed.csv'), row.names = F)
write.csv(rate, file.path(out_dir, 'rate.csv'), row.names = F)
write.csv(rate_summary, file.path(out_dir, 'rate_summary.csv'), row.names = F)
Log('Wrote %s', out_dir)
Log('PID %d done', Sys.getpid())
}
if (length(sys.frames()) == 0) {
# increase ggplot font size globally
#theme_set(theme_grey(base_size = 20))
main(commandArgs(TRUE))
}
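
This script is normally invoked by the summarize step of osh-parser.sh below,
but it can also be run by hand on the raw CSVs, using the same paths that
script uses:

    benchmarks/osh-parser.R \
      _tmp/osh-parser/raw/line-counts.csv \
      _tmp/osh-parser/raw/times.csv \
      _tmp/osh-parser/stage1
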
benchmarks/osh-parser.sh
@@ -0,0 +1,190 @@
#!/bin/bash
#
# Usage:
# ./osh-parser.sh <function name>
set -o nounset
set -o pipefail
set -o errexit
readonly BASE_DIR=_tmp/osh-parser
readonly SORTED=$BASE_DIR/input/sorted.txt
readonly TIMES_CSV=$BASE_DIR/raw/times.csv
readonly LINES_CSV=$BASE_DIR/raw/line-counts.csv
# NOTE --ast-format none eliminates print time! That is more than half of it!
# ( 60 seconds with serialization, 29 seconds without.)
#
# TODO: Lines per second is about 1700
# Run each file twice and compare timing?
# TODO: Use the compiled version without our Python, not system Python!
# Compilation flags are different.
# - Well maybe we want both.
osh-parse-one() {
local path=$1
echo "--- $path ---"
TIMEFORMAT="%R osh $path" # elapsed time
benchmarks/time.py \
--output $TIMES_CSV \
--field osh --field "$path" -- \
bin/osh -n --ast-format none $path
}
sh-one() {
local sh=$1
local path=$2
echo "--- $sh -n $path ---"
# Since we're running benchmarks serially, just append to the same file.
TIMEFORMAT="%R $sh $path" # elapsed time
# exit code, time in seconds, sh, path. \0 would have been nice here!
benchmarks/time.py \
--output $TIMES_CSV \
--field "$sh" --field "$path" -- \
$sh -n $path || echo FAILED
}
write-sorted-manifest() {
local files=${1:-benchmarks/osh-parser-files.txt}
local counts=$BASE_DIR/raw/line-counts.txt
local csv=$LINES_CSV
# Remove comments and sort by line count
grep -v '^#' $files | xargs wc -l | sort -n > $counts
# Raw list of paths
cat $counts | awk '$2 != "total" { print $2 }' > $SORTED
# Make a LINES_CSV from wc output
cat $counts | awk '
BEGIN { print "num_lines,path" }
$2 != "total" { print $1 "," $2 }' \
> $csv
cat $SORTED
echo ---
cat $csv
}
run() {
mkdir -p $BASE_DIR/{input,raw,stage1,www}
write-sorted-manifest
local sorted=$SORTED
# Header
echo 'status,elapsed_secs,shell,path' > $TIMES_CSV
# 20ms for ltmain.sh; 34ms for configure
cat $sorted | xargs -n 1 $0 sh-one bash || true
# Wow dash is a lot faster, 5 ms / 6 ms. It even gives one syntax error.
cat $sorted | xargs -n 1 $0 sh-one dash || true
# mksh is in between: 11 / 23 ms.
cat $sorted | xargs -n 1 $0 sh-one mksh || true
# zsh really slow: 45 ms and 124 ms.
cat $sorted | xargs -n 1 $0 sh-one zsh || true
# 4 s and 15 s. So 1000x speedup would be sufficient, not 10,000x!
time cat $sorted | xargs -n 1 $0 osh-parse-one
cat $TIMES_CSV
}
summarize() {
local out=_tmp/osh-parser/stage1
mkdir -p $out
benchmarks/osh-parser.R $LINES_CSV $TIMES_CSV $out
tree $BASE_DIR
}
_print-report() {
local base_url='../../../web/table'
cat <<EOF
<!DOCTYPE html>
<html>
<head>
<title>OSH Parser Benchmark</title>
<script type="text/javascript" src="$base_url/table-sort.js"></script>
<link rel="stylesheet" type="text/css" href="$base_url/table-sort.css" />
<style>
td { text-align: right; }
body {
margin: 0 auto;
width: 60em;
}
code { color: green; }
</style>
</head>
<body>
<h2>OSH Parser Benchmark</h2>
<p>We run <code>\$sh -n \$file</code> for various files under various
shells. This means that shell startup time is included in the
elapsed time measurements, but long files are chosen to minimize its
effect.</p>
<h3>Elapsed Time by File and Shell (milliseconds)</h3>
<table id="elapsed">
EOF
web/table/csv_to_html.py < $BASE_DIR/stage1/elapsed.csv
cat <<EOF
</table>
<h3>Parsing Rate by File and Shell (lines/millisecond)</h3>
<table id="rate">
EOF
web/table/csv_to_html.py < $BASE_DIR/stage1/rate.csv
cat <<EOF
</table>
<h3>Summary</h3>
<table id="rate-summary">
EOF
web/table/csv_to_html.py < $BASE_DIR/stage1/rate_summary.csv
cat <<EOF
</table>
</body>
</html>
EOF
}
report() {
local out=$BASE_DIR/www/summary.html
_print-report > $out
echo "Wrote $out"
}
# TODO:
# - Parse the test file -> csv. Have to get rid of syntax errors?
# - I really want --output.
# - benchmarks/time.py is probably appropriate now.
# - reshape, total, and compute lines/sec
# - that is really a job for R
# - maybe you need awk to massage wc output into LINES_CSV
# - csv_to_html.py
# - Then a shell script here to put CSS and JS around it.
# - wild-static
# - Publish to release/0.2.0/benchmarks/MACHINE/wild/
time-test() {
benchmarks/time.py \
--field bash --field foo.txt --output _tmp/bench.csv \
sleep 0.123
cat _tmp/bench.csv
}
"$@"
benchmarks/time.py
@@ -0,0 +1,59 @@
#!/usr/bin/python
"""
time.py -- Replacement for coreutils 'time'.
The interface of this program is modelled after:
/usr/bin/time --append --output foo.txt --format '%x %e'
Problems with /usr/bin/time:
- elapsed time only has 2 digits of precision
Problems with bash time builtin
- has no way to get the exit code
- writes to stderr, so you it's annoying to get both process stderr and
and
This program also writes CSV directly, so you can have commas in fields, etc.
"""
import csv
import optparse
import sys
import subprocess
import time
def Options():
"""Returns an option parser instance."""
p = optparse.OptionParser('time.py [options] ARGV...')
p.add_option(
'-o', '--output', dest='output', default=None,
help='Name of output file to write to')
p.add_option(
'--field', dest='fields', default=[], action='append',
help='A string to append to each row, after the exit code and status')
return p
def main(argv):
(opts, child_argv) = Options().parse_args(argv[1:])
start_time = time.time()
exit_code = subprocess.call(child_argv)
elapsed = time.time() - start_time
fields = tuple(opts.fields)
with open(opts.output, 'a') as f:
out = csv.writer(f)
row = (exit_code, '%.4f' % elapsed) + fields
out.writerow(row)
# Preserve the command's exit code. (This means you can't distinguish
# between a failure of time.py and the command, but that's better than
# swallowing the error.)
return exit_code
if __name__ == '__main__':
sys.exit(main(sys.argv))