benchmarks/osh-parser.R
@@ -1,79 +1,90 @@
 #!/usr/bin/Rscript
 #
-# osh-parser.R
-#
-# Analyze output from shell scripts.
+# osh-parser.R -- Analyze output from shell scripts.
+#
+# Usage:
+#   osh-parser.R OUT_DIR [TIMES_CSV...]
 
 library(dplyr)
 library(tidyr)
 
 options(stringsAsFactors = F)
 
 Log = function(fmt, ...) {
   cat(sprintf(fmt, ...))
   cat('\n')
 }
 
 main = function(argv) {
-  # num_lines, path
-  lines = read.csv(argv[[1]])
-  # status, elapsed, shell, path
-  times = read.csv(argv[[2]])
-  out_dir = argv[[3]]
-
-  # TODO:
-  # - compute lines per second for every cell?
-
-  #print(lines)
-  #print(times)
-
-  # Remove failures
-  times %>% filter(status == 0) %>% select(-c(status)) -> times
-
-  # Add the number of lines, joining on path, and compute lines/sec
-  # TODO: Is there a better way to compute lines_per_ms and then drop lines_per_sec?
-  times %>%
-    left_join(lines, by = c('path')) %>%
-    mutate(elapsed_ms = elapsed_secs * 1000,
-           lines_per_ms = num_lines / elapsed_ms) %>%
-    select(-c(elapsed_secs)) ->
-    joined
-
-  #print(joined)
-
-  # Summarize rates
-  joined %>%
-    group_by(shell) %>%
+  out_dir = argv[[1]]
+
+  hosts = list()
+  for (i in 2:length(argv)) {
+    times_path = argv[[i]]
+    # Find it in the same directory
+    lines_path = gsub('.times.', '.lines.', times_path, fixed = T)
+
+    Log('times: %s', times_path)
+    Log('lines: %s', lines_path)
+
+    times = read.csv(times_path)
+    lines = read.csv(lines_path)
+
+    # Remove failures
+    times %>% filter(status == 0) %>% select(-c(status)) -> times
+
+    # Add the number of lines, joining on path, and compute lines/sec
+    # TODO: Is there a better way to compute lines_per_ms and then drop
+    # lines_per_sec?
+    times %>%
+      left_join(lines, by = c('path')) %>%
+      mutate(elapsed_ms = elapsed_secs * 1000,
+             lines_per_ms = num_lines / elapsed_ms) %>%
+      select(-c(elapsed_secs)) ->
+      host_rows
+
+    hosts[[i-1]] = host_rows
+  }
+  all_times = bind_rows(hosts)
+  print(all_times)
+
+  # Summarize rates by platform/shell
+  all_times %>%
+    group_by(shell_id, platform_id) %>%
     summarize(total_lines = sum(num_lines), total_ms = sum(elapsed_ms)) %>%
     mutate(lines_per_ms = total_lines / total_ms) ->
     rate_summary
 
   # Put OSH last!
   first = rate_summary %>% filter(shell != 'osh')
   last = rate_summary %>% filter(shell == 'osh')
   rate_summary = bind_rows(list(first, last))
   print(rate_summary)
 
-  # Elapsed seconds by file and shell
-  joined %>%
+  # Elapsed seconds for each shell by platform and file
+  all_times %>%
     select(-c(lines_per_ms)) %>%
-    spread(key = shell, value = elapsed_ms) %>%
-    arrange(num_lines) %>%
-    select(c(bash, dash, mksh, zsh, osh, num_lines, path)) ->
+    spread(key = shell_id, value = elapsed_ms) %>%
+    arrange(platform_id, num_lines) ->
     elapsed
+    #select(c(bash, dash, mksh, zsh, osh, num_lines, path)) ->
 
   Log('\n')
   Log('ELAPSED')
   print(elapsed)
 
   # Rates by file and shell
-  joined %>%
+  all_times %>%
     select(-c(elapsed_ms)) %>%
-    spread(key = shell, value = lines_per_ms) %>%
-    arrange(num_lines) %>%
-    select(c(bash, dash, mksh, zsh, osh, num_lines, path)) ->
+    spread(key = shell_id, value = lines_per_ms) %>%
+    arrange(platform_id, num_lines) ->
     rate
+    #select(c(bash, dash, mksh, zsh, osh, num_lines, path)) ->
 
   Log('\n')
   Log('RATE')
   print(rate)
 
-  write.csv(rate_summary,
-            file.path(out_dir, 'rate_summary.csv'), row.names = F)
   write.csv(elapsed, file.path(out_dir, 'elapsed.csv'), row.names = F)
   write.csv(rate, file.path(out_dir, 'rate.csv'), row.names = F)
+  write.csv(rate_summary, file.path(out_dir, 'rate_summary.csv'), row.names = F)
 
   Log('Wrote %s', out_dir)
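
# Usage sketch for the new interface: the output directory comes first, then
# any number of per-host times CSVs.  Each X.times.csv must sit next to its
# X.lines.csv sibling, because the gsub() call above derives one path from
# the other.  (The host names and timestamps below are made up.)
#
#   benchmarks/osh-parser.R _tmp/osh-parser/stage1 \
#     ../benchmark-data/osh-parser/lisa.2017-11-22__10-00-00.times.csv \
#     ../benchmark-data/osh-parser/flanders.2017-11-22__11-00-00.times.csv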
benchmarks/osh-parser.sh
@@ -9,43 +9,46 @@ set -o nounset
 set -o pipefail
 set -o errexit
 
 # TODO: The raw files should be published, in both
 # ~/git/oilshell/benchmarks-data and in the /release/ hierarchy?
 
 readonly BASE_DIR=_tmp/osh-parser
-readonly SORTED=$BASE_DIR/input/sorted.txt
-readonly TIMES_CSV=$BASE_DIR/raw/times.csv
-readonly LINES_CSV=$BASE_DIR/raw/line-counts.csv
+readonly SORTED=$BASE_DIR/tmp/sorted.txt
 
-# NOTE: --ast-format none eliminates print time!  That is more than half of it!
-# (60 seconds with serialization, 29 seconds without.)
+import-files() {
+  grep -v '^#' benchmarks/osh-parser-originals.txt |
+    xargs --verbose -I {} -- cp {} benchmarks/testdata
+}
+
+# NOTE: --ast-format none eliminates print time!  That is more than
+# half of it!  (60 seconds with serialization, 29 seconds without.)
+# TODO: That is the only difference... hm.
+#
 # TODO:
 # - Have OSH --parse-and-dump-path
 #   - it can dump /proc/self/meminfo
 
-osh-parse-one() {
-  local append_out=$1
-  local path=$2
-  echo "--- $path ---"
-  benchmarks/time.py \
-    --output $append_out \
-    --field osh --field "$path" -- \
-    bin/osh -n --ast-format none $path
-}
-
 sh-one() {
   local append_out=$1
   local sh=$2
-  local path=$3
-  echo "--- $sh -n $path ---"
+  local platform_id=$3
+  local shell_id=$4
+  local path=$5
+  echo "--- $sh $path ---"
 
   # Since we're running benchmarks serially, just append to the same file.
-  TIMEFORMAT="%R $sh $path"  # elapsed time
+  # Can't use an array here because of the set -u bug, which was only fixed
+  # in bash 4.4.
+  extra_args=''
 
-  # exit code, time in seconds, sh, path.  \0 would have been nice here!
+  if [[ $sh == */osh ]]; then
+    extra_args='--ast-format none'
+  fi
+
+  # exit code, time in seconds, platform_id, shell_id, path.  \0 would have
+  # been nice here!
   benchmarks/time.py \
     --output $append_out \
-    --field "$sh" --field "$path" -- \
-    $sh -n $path || echo FAILED
+    --field "$platform_id" --field "$shell_id" --field "$path" -- \
+    "$sh" -n $extra_args "$path" || echo FAILED
 }
@@ -56,7 +59,7 @@ import-files() {
 write-sorted-manifest() {
   local files=${1:-benchmarks/osh-parser-files.txt}
   local counts=$BASE_DIR/raw/line-counts.txt
-  local csv=$LINES_CSV
+  local csv=$2
 
   # Remove comments and sort by line count
   grep -v '^#' $files | xargs wc -l | sort -n > $counts
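
# Data-format sketch: the lines CSV that write-sorted-manifest produces pairs
# each input file with its line count.  The column names follow the R
# script's read.csv() usage; the values here are made up.
#
#   num_lines,path
#   14992,benchmarks/testdata/configure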
@@ -75,47 +78,67 @@ write-sorted-manifest() {
   cat $csv
 }
 
+# runtime_id, platform_id, toolchain_id (which you sometimes don't know)
+
 run() {
-  mkdir -p $BASE_DIR/{input,raw,stage1,www}
+  local preview=${1:-}
+
+  local job_id
+  job_id="$(hostname).$(date +%Y-%m-%d__%H-%M-%S)"
 
-  write-sorted-manifest
+  local out_dir='../benchmark-data/osh-parser/'
+  local out="$out_dir/$job_id.times.csv"
+  local lines_out="$out_dir/$job_id.lines.csv"
+
+  mkdir -p \
+    $(dirname $out) \
+    $BASE_DIR/{tmp,raw,stage1,www}
+
+  write-sorted-manifest '' $lines_out
   local sorted=$SORTED
 
-  # This file is appended to
-  local out=$TIMES_CSV
+  # Write the header of the CSV file that is appended to.
+  echo 'status,elapsed_secs,platform_id,shell_id,path' > $out
+
+  local tmp_dir=_tmp/platform-id/$(hostname)
+  benchmarks/id.sh dump-platform-id $tmp_dir
+
+  local shell_id
+  local platform_id
 
-  # Header
-  echo 'status,elapsed_secs,shell,path' > $TIMES_CSV
+  platform_id=$(benchmarks/id.sh publish-platform-id $tmp_dir)
+  echo $platform_id
 
-  # 20ms for ltmain.sh; 34ms for configure
-  cat $sorted | xargs -n 1 $0 sh-one $out bash || true
+  #for sh_path in bash dash mksh zsh; do
+  for sh_path in bash dash mksh zsh bin/osh _bin/osh; do
+    # There will be two different OSH binaries
+    local name=$(basename $sh_path)
 
-  # Wow dash is a lot faster, 5 ms / 6 ms.  It even gives one syntax error.
-  cat $sorted | xargs -n 1 $0 sh-one $out dash || true
+    tmp_dir=_tmp/shell-id/$name
+    benchmarks/id.sh dump-shell-id $sh_path $tmp_dir
 
-  # mksh is in between: 11 / 23 ms.
-  cat $sorted | xargs -n 1 $0 sh-one $out mksh || true
+    shell_id=$(benchmarks/id.sh publish-shell-id $tmp_dir)
 
-  # zsh is really slow: 45 ms and 124 ms.
-  cat $sorted | xargs -n 1 $0 sh-one $out zsh || true
+    echo "ID $shell_id"
 
-  # TODO:
-  # - Run OSH under OVM
-  # - Run OSH compiled with OPy
-  # Maybe these are gradual release upgrades?
-  return
+    # TODO: Should the shell ID be separate columns?
+    # It's really a shell_version_id?
 
-  # 4 s and 15 s.  So a 1000x speedup would be sufficient, not 10,000x!
-  time cat $sorted | xargs -n 1 $0 osh-parse-one $out
+    if ! test -n "$preview"; then
+      # 20ms for ltmain.sh; 34ms for configure
+      cat $sorted | xargs -n 1 -- $0 \
+        sh-one $out $sh_path $platform_id $shell_id || true
+    fi
+  done
 
-  cat $TIMES_CSV
-  echo $TIMES_CSV
+  cat $out
+  echo "Wrote $out"
 }
 
 summarize() {
   local out=_tmp/osh-parser/stage1
   mkdir -p $out
-  benchmarks/osh-parser.R $LINES_CSV $TIMES_CSV $out
+  benchmarks/osh-parser.R $out ../benchmark-data/osh-parser/*.times.csv
 
   tree $BASE_DIR
 }
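
# End-to-end sketch, assuming this script uses the usual "$@" dispatch at the
# bottom (not shown in this diff):
#
#   benchmarks/osh-parser.sh run        # one <host>.<date>.times.csv and
#                                       # .lines.csv pair per job, under
#                                       # ../benchmark-data/osh-parser/
#   benchmarks/osh-parser.sh summarize  # run osh-parser.R over *.times.csv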
@@ -148,6 +171,13 @@ _print-report() {
 elapsed time measurements, but long files are chosen to minimize its
 effect.</p>
 
+    <h3>Summary</h3>
+
+    <table id="rate-summary">
+EOF
+  web/table/csv_to_html.py < $BASE_DIR/stage1/rate_summary.csv
+  cat <<EOF
+    </table>
+
     <h3>Elapsed Time by File and Shell (milliseconds)</h3>
     <table id="elapsed">
@@ -164,12 +194,6 @@ EOF
   cat <<EOF
     </table>
 
-    <h3>Summary</h3>
-
-    <table id="rate-summary">
-EOF
-  web/table/csv_to_html.py < $BASE_DIR/stage1/rate_summary.csv
-  cat <<EOF
-    </table>
   </body>
 </html>
@@ -182,76 +206,6 @@ report() {
   echo "Wrote $out"
 }
 
-#
-# Record Provenance: Code, Data, Env
-#
-
-# - code: We will run against different shells (bash, dash, OSH).  The OSH
-#   code will improve over time.
-# - env: We test it on different machines (machine architecture, OS, distro,
-#   etc.)
-# - data ID: (name, num_lines) is sufficient, I think.  Don't bother with a
-#   hash.
-#   - or does (name, hash) make sense?
 
-# TODO:
-# - Add code_id to the CSV (time.py), and code-id.txt?
-
-code-id() {
-  # columns for osh:
-  #   vm,compiler
-  # columns for other shells:
-  #   --version
-
-  # osh --version?
-  # git branch, etc.?
-  # running under system Python, or OVM?
-  echo TODO
-}
-
-# Just hash the files?
-data-id() {
-  echo TODO
-}
-
-# Events that will change the env for a given machine:
-# - kernel upgrade
-# - distro upgrade
-
-env-id() {
-  local out_dir=${1:-_tmp/env-id-$(hostname)}
-  mkdir -p $out_dir
-
-  hostname > $out_dir/hostname.txt
-
-  # Does it make sense to dump individual fields like -m, to avoid parsing?
-  # We care about the kernel and the CPU architecture, but there is a lot of
-  # redundant information there.
-  uname -m > $out_dir/machine.txt
-
-  # machine
-  { uname --kernel-release
-    uname --kernel-version
-  } > $out_dir/kernel.txt
-
-  cat /proc/cpuinfo > $out_dir/cpuinfo.txt
-
-  # Does mem info make a difference?  I guess it's just nice to check that
-  # the machine isn't swapping, but it shouldn't be part of the hash.
-  cat /proc/meminfo > $out_dir/meminfo.txt
-
-  cat /etc/lsb-release > $out_dir/lsb-release.txt
-  cat /etc/debian_version > $out_dir/debian_version.txt
-
-  head $out_dir/*
-
-  # Now should I create a hash from this, like x86_64__linux__distro?
-  # There is already the concept of a triple.
-}
-
 _banner() {
   echo -----
   echo "$@"