Permalink
Please sign in to comment.
Browse files
Start benchmarking the parser performance, and other changes.
benchmarks/osh-parser.sh compares OSH vs other shells.
- benchmarks/time.py: Replacement for 'time' to get elapsed time /
exit code in CSV
- osh-parser.R: reshape and analyze the results
- osh-parser.sh: output an HTML report
Wild test changes in wild.sh:
- Parsing 150K lines of shell code in FreeBSD.
- find more shell scripts with a 'find' expression for extensionless
executables.
- search for unparsed features like a[x]=1
Oil code:
- Add --ast-format none, so we get more accurate measurements of parsing
time
- Experiment to disable ASDL's dynamic type checking.
- Parsing performance improved from ~1.7 lines per ms to ~2.0. I
thought it would be a bigger change!
Other tweaks:
- id_kind_test.py: Print stats about number of IDs and Kinds
- osh/lex.py: Make note of remaining tokens in OSH lexer
- osh/osh.asdl: update comments about code representation
- Update the line count script. Look at the parser vs. the runtime.
Showing
with
504 additions
and 29 deletions.
- +17 −14 asdl/py_meta.py
- +11 −0 benchmarks/osh-parser-files.txt
- +88 −0 benchmarks/osh-parser.R
- +190 −0 benchmarks/osh-parser.sh
- +59 −0 benchmarks/time.py
- +6 −3 bin/oil.py
- +7 −3 core/id_kind_test.py
- +20 −1 osh/lex.py
- +2 −1 osh/osh.asdl
- +19 −3 scripts/count.sh
- +1 −1 test/sh_spec.py
- +1 −1 test/shebang.sh
- +80 −0 test/wild.sh
- +3 −2 web/table/csv_to_html.py
| @@ -0,0 +1,11 @@ | ||
| # These files were selected to be big with test/wild.sh count-lines | ||
| /home/andy/git/alpine/abuild/abuild | ||
| /home/andy/git/other/staticpython/build.sh | ||
| /home/andy/git/other/git/t/t9300-fast-import.sh | ||
| /home/andy/git/other/kubernetes/hack/make-rules/test-cmd-util.sh | ||
| /home/andy/git/other/kubernetes/cluster/gce/gci/configure-helper.sh | ||
| /home/andy/src/mksh/Build.sh | ||
| /home/andy/git/basis-build/_tmp/debootstrap/functions | ||
| /home/andy/git/other/git/t/t4014-format-patch.sh | ||
| /home/andy/git/other/kythe/third_party/proto/configure | ||
| /home/andy/git/other/kythe/third_party/proto/ltmain.sh |
| @@ -0,0 +1,88 @@ | ||
| #!/usr/bin/Rscript | ||
| # | ||
| # osh-parser.R | ||
| # | ||
| # Analyze output from shell scripts. | ||
| library(dplyr) | ||
| library(tidyr) | ||
# Printf-style logging helper: format the message and terminate it
# with a newline on stdout.
Log = function(fmt, ...) {
  msg = sprintf(fmt, ...)
  cat(msg, '\n', sep = '')
}
main = function(argv) {
  # argv[[1]]: line-counts CSV (num_lines, path)
  # argv[[2]]: timings CSV (status, elapsed_secs, shell, path)
  # argv[[3]]: output directory for the stage1 CSV files
  lines = read.csv(argv[[1]])
  times = read.csv(argv[[2]])
  out_dir = argv[[3]]

  # TODO:
  # - compute lines per second for every cell?

  # Keep only successful runs; the status column is no longer needed.
  times = times %>% filter(status == 0) %>% select(-c(status))

  # Join line counts on path, convert seconds to milliseconds, and
  # compute the per-file parsing rate.
  # TODO: Is there a better way to compute lines_per_ms and then drop
  # elapsed_secs?
  joined = times %>%
    left_join(lines, by = c('path')) %>%
    mutate(elapsed_ms = elapsed_secs * 1000,
           lines_per_ms = num_lines / elapsed_ms) %>%
    select(-c(elapsed_secs))

  # Overall rate per shell, across all files.
  rate_summary = joined %>%
    group_by(shell) %>%
    summarize(total_lines = sum(num_lines), total_ms = sum(elapsed_ms)) %>%
    mutate(lines_per_ms = total_lines / total_ms)

  # Reorder the summary rows so OSH appears last.
  others = rate_summary %>% filter(shell != 'osh')
  osh_row = rate_summary %>% filter(shell == 'osh')
  rate_summary = bind_rows(list(others, osh_row))
  print(rate_summary)

  # Wide table: elapsed ms, one column per shell, one row per file,
  # smallest file first.
  elapsed = joined %>%
    select(-c(lines_per_ms)) %>%
    spread(key = shell, value = elapsed_ms) %>%
    arrange(num_lines) %>%
    select(c(bash, dash, mksh, zsh, osh, num_lines, path))
  print(elapsed)

  # Wide table: parsing rate (lines/ms) per shell per file.
  rate = joined %>%
    select(-c(elapsed_ms)) %>%
    spread(key = shell, value = lines_per_ms) %>%
    arrange(num_lines) %>%
    select(c(bash, dash, mksh, zsh, osh, num_lines, path))
  print(rate)

  write.csv(elapsed, file.path(out_dir, 'elapsed.csv'), row.names = F)
  write.csv(rate, file.path(out_dir, 'rate.csv'), row.names = F)
  write.csv(rate_summary, file.path(out_dir, 'rate_summary.csv'), row.names = F)

  Log('Wrote %s', out_dir)
  Log('PID %d done', Sys.getpid())
}
# Run main() only when invoked as a script (e.g. via Rscript), not when
# this file is source()'d for interactive use.
if (length(sys.frames()) == 0) {
  # increase ggplot font size globally
  #theme_set(theme_grey(base_size = 20))
  main(commandArgs(TRUE))
}
| @@ -0,0 +1,190 @@ | ||
| #!/bin/bash | ||
| # | ||
| # Usage: | ||
| # ./osh-parser.sh <function name> | ||
# Strict mode: fail on unset vars, failed pipeline stages, and errors.
set -o nounset
set -o pipefail
set -o errexit

# All benchmark artifacts live under this directory.
readonly BASE_DIR=_tmp/osh-parser
# Manifest of input files, sorted by line count (smallest first).
readonly SORTED=$BASE_DIR/input/sorted.txt
# Raw timing rows: one row per (shell, file) run.
readonly TIMES_CSV=$BASE_DIR/raw/times.csv
# num_lines,path for each input file.
readonly LINES_CSV=$BASE_DIR/raw/line-counts.csv

# NOTE --ast-format none eliminates print time!  That is more than half of it!
# ( 60 seconds with serialization, 29 seconds without.)
#
# TODO: Lines per second is about 1700
# Run each file twice and compare timing?

# TODO: Use the compiled version without our Python, not system Python!
# Compilation flags are different.
# - Well maybe we want both.
# Parse one file with 'bin/osh -n' and append a timing row to $TIMES_CSV.
# $1: path to the shell script to parse.
osh-parse-one() {
  local path=$1
  echo "--- $path ---"

  # Leftover from timing with the bash 'time' builtin; benchmarks/time.py
  # does the measurement now, so this only sets an unused variable.
  TIMEFORMAT="%R osh $path"  # elapsed time

  # Quote expansions so paths with spaces don't word-split.
  benchmarks/time.py \
    --output "$TIMES_CSV" \
    --field osh --field "$path" -- \
    bin/osh -n --ast-format none "$path"
}
# Time '$sh -n $path' (parse only) and append a row to $TIMES_CSV.
# $1: shell binary, $2: path to the script to parse.
sh-one() {
  local sh=$1
  local path=$2
  echo "--- $sh -n $path ---"

  # Since we're running benchmarks serially, just append to the same file.
  # Leftover from the bash 'time' builtin; unused by time.py.
  TIMEFORMAT="%R $sh $path"  # elapsed time

  # exit code, time in seconds, sh, path.  \0 would have been nice here!
  # Quote expansions so paths with spaces don't word-split.
  benchmarks/time.py \
    --output "$TIMES_CSV" \
    --field "$sh" --field "$path" -- \
    "$sh" -n "$path" || echo FAILED
}
# Build the sorted input manifest ($SORTED) and a num_lines CSV
# ($LINES_CSV) from 'wc -l' output.
# $1 (optional): file listing input paths; defaults to the checked-in list.
write-sorted-manifest() {
  local files=${1:-benchmarks/osh-parser-files.txt}
  local counts=$BASE_DIR/raw/line-counts.txt
  local csv=$LINES_CSV

  # Strip comment lines, then count lines per file, smallest first.
  grep -v '^#' $files | xargs wc -l | sort -n > $counts

  # Second column of wc output is the path; skip the "total" row.
  awk '$2 != "total" { print $2 }' $counts > $SORTED

  # Turn the wc output into a CSV with a header row.
  awk '
    BEGIN { print "num_lines,path" }
    $2 != "total" { print $1 "," $2 }' $counts \
    > $csv

  cat $SORTED
  echo ---
  cat $csv
}
# Run the whole benchmark: build the manifest, then time every input
# file under each existing shell, then under OSH, appending rows to
# $TIMES_CSV.
run() {
  mkdir -p $BASE_DIR/{input,raw,stage1,www}
  write-sorted-manifest
  local sorted=$SORTED

  # CSV header; each sh-one / osh-parse-one invocation appends one row.
  echo 'status,elapsed_secs,shell,path' > $TIMES_CSV

  # Rough per-file timings observed (ltmain.sh / configure):
  #   bash: 20 ms / 34 ms
  #   dash:  5 ms /  6 ms -- a lot faster; it even gives one syntax error
  #   mksh: 11 ms / 23 ms -- in between
  #   zsh:  45 ms / 124 ms -- really slow
  local sh
  for sh in bash dash mksh zsh; do
    cat $sorted | xargs -n 1 $0 sh-one $sh || true
  done

  # osh: 4 s and 15 s.  So 1000x speedup would be sufficient, not 10,000x!
  time cat $sorted | xargs -n 1 $0 osh-parse-one

  cat $TIMES_CSV
}
# Analyze the raw CSVs with R, writing summary tables under stage1/.
summarize() {
  # Was hard-coded as _tmp/osh-parser/stage1; derive it from BASE_DIR so
  # the two can't drift apart.
  local out=$BASE_DIR/stage1
  mkdir -p "$out"
  benchmarks/osh-parser.R "$LINES_CSV" "$TIMES_CSV" "$out"
  tree "$BASE_DIR"
}
# Emit the full HTML report to stdout, converting each stage1 CSV into a
# sortable HTML table.  (Fixes the "Elasped" typo in the heading.)
_print-report() {
  local base_url='../../../web/table'

  cat <<EOF
<!DOCTYPE html>
<html>
  <head>
    <title>OSH Parser Benchmark</title>
    <script type="text/javascript" src="$base_url/table-sort.js"></script>
    <link rel="stylesheet" type="text/css" href="$base_url/table-sort.css" />

    <style>
      td { text-align: right; }
      body {
        margin: 0 auto;
        width: 60em;
      }
      code { color: green; }
    </style>
  </head>
  <body>
    <h2>OSH Parser Benchmark</h2>

    <p>We run <code>\$sh -n \$file</code> for various files under various
    shells.  This means that shell startup time is included in the
    elapsed time measurements, but long files are chosen to minimize its
    effect.</p>

    <h3>Elapsed Time by File and Shell (milliseconds)</h3>

    <table id="elapsed">
EOF
  web/table/csv_to_html.py < $BASE_DIR/stage1/elapsed.csv
  cat <<EOF
    </table>

    <h3>Parsing Rate by File and Shell (lines/millisecond)</h3>

    <table id="rate">
EOF
  web/table/csv_to_html.py < $BASE_DIR/stage1/rate.csv
  cat <<EOF
    </table>

    <h3>Summary</h3>

    <table id="rate-summary">
EOF
  web/table/csv_to_html.py < $BASE_DIR/stage1/rate_summary.csv
  cat <<EOF
    </table>
  </body>
</html>
EOF
}
# Render the HTML report into the www/ directory.
report() {
  local out
  out=$BASE_DIR/www/summary.html

  _print-report > $out
  echo "Wrote $out"
}
| # TODO: | ||
| # - Parse the test file -> csv. Have to get rid of syntax errors? | ||
| # - I really want --output. | ||
| # - benchmarks/time.py is probably appropriate now. | ||
| # - reshape, total, and compute lines/sec | ||
| # - that is really a job for R | ||
| # - maybe you need awk to massage wc output into LINES_CSV | ||
| # - csv_to_html.py | ||
| # - Then a shell script here to put CSS and JS around it. | ||
| # - wild-static | ||
| # - Publish to release/0.2.0/benchmarks/MACHINE/wild/ | ||
# Smoke test for benchmarks/time.py: time a short sleep and show the
# CSV row it appends.
time-test() {
  benchmarks/time.py \
    --output _tmp/bench.csv \
    --field bash --field foo.txt \
    sleep 0.123

  cat _tmp/bench.csv
}

# Dispatch: run the function named by the first argument.
"$@"
| @@ -0,0 +1,59 @@ | ||
| #!/usr/bin/python | ||
| """ | ||
| time.py -- Replacement for coreutils 'time'. | ||
| The interface of this program is modelled after: | ||
| /usr/bin/time --append --output foo.txt --format '%x %e' | ||
| Problems with /usr/bin/time: | ||
| - elapsed time only has 2 digits of precision | ||
| Problems with bash time builtin | ||
| - has no way to get the exit code | ||
- writes to stderr, so it's annoying to separate the timing output from
  the process's own stderr
| This program also writes CSV directly, so you can have commas in fields, etc. | ||
| """ | ||
| import csv | ||
| import optparse | ||
| import sys | ||
| import subprocess | ||
| import time | ||
def Options():
  """Returns an option parser instance for time.py's flags.

  Everything after '--' (or the first non-option) is the command to run.
  """
  p = optparse.OptionParser('time.py [options] ARGV...')
  p.add_option(
      '-o', '--output', dest='output', default=None,
      help='Name of CSV file to append the timing row to')
  p.add_option(
      '--field', dest='fields', default=[], action='append',
      # Help text fixed: the row is exit code then ELAPSED TIME, not
      # "exit code and status" (which named the same value twice).
      help='A string to append to each row, after the exit code and '
           'elapsed time')
  return p
def main(argv):
  """Time a child command and append a CSV row to the --output file.

  The row is: exit code, elapsed seconds (4 decimal places), then any
  --field values in order.  Returns the child's exit code.
  """
  p = Options()
  (opts, child_argv) = p.parse_args(argv[1:])

  # Fail with a usage message instead of an obscure traceback
  # (previously: subprocess.call([]) / open(None) would blow up).
  if not child_argv:
    p.error('Expected a command to run')
  if opts.output is None:
    p.error('Expected -o / --output')

  start_time = time.time()
  exit_code = subprocess.call(child_argv)
  elapsed = time.time() - start_time

  fields = tuple(opts.fields)
  # Append so serial benchmark runs accumulate rows in one file.
  with open(opts.output, 'a') as f:
    out = csv.writer(f)
    row = (exit_code, '%.4f' % elapsed) + fields
    out.writerow(row)

  # Preserve the command's exit code.  (This means you can't distinguish
  # between a failure of time.py and the command, but that's better than
  # swallowing the error.)
  return exit_code
if __name__ == '__main__':
  # Propagate the child command's exit code as our own.
  status = main(sys.argv)
  sys.exit(status)
Oops, something went wrong.
0 comments on commit
2f864e5