Incorporate virtual memory measurements into the osh-parser benchmark.

- Added stage1:
  - csv_concat.py to prepare times data
  - virtual_memory.py to parse saved /proc/$PID/status
- The R code in stage2 is now simpler.
  - dplyr is awesome for manipulating VM data, very concise!
- Display virtual memory in the HTML report: 37–214 MB!

Also: make a CSV for vm-baseline using virtual_memory.py.  Not published
yet.
Andy Chu committed Nov 19, 2017
1 parent 3a6c86e commit 371c3cda1e15ed978d9631f3b7b0e742b4b27d04
Showing with 292 additions and 57 deletions.
  1. +58 −38 benchmarks/osh-parser.R
  2. +76 −19 benchmarks/osh-parser.sh
  3. +14 −0 benchmarks/virtual-memory.sh
  4. +75 −0 benchmarks/virtual_memory.py
  5. +34 −0 tools/csv-concat-test.sh
  6. +35 −0 tools/csv_concat.py
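
For reference, the VmPeak and VmRSS lines that virtual_memory.py extracts
from a saved /proc/$PID/status file look like this (values here are
illustrative, not from the benchmark; the kernel reports them in kB):

    VmPeak:    49348 kB
    VmRSS:     38026 kB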
benchmarks/osh-parser.R
@@ -20,41 +20,46 @@ sourceUrl = function(path) {
   sprintf('https://github.com/oilshell/oil/blob/master/%s', path)
 }
 
 # Takes a filename, not a path.
 sourceUrl2 = function(filename) {
   sprintf(
     'https://github.com/oilshell/oil/blob/master/benchmarks/testdata/%s',
     filename)
 }
 
 main = function(argv) {
-  out_dir = argv[[1]]
-
-  # Merge all the inputs
-  hosts = list()
-  raw_times_list = list()
-  for (i in 2:length(argv)) {
-    times_path = argv[[i]]
-    # Find it in the same directory
-    lines_path = gsub('.times.', '.lines.', times_path, fixed = T)
-
-    Log('times: %s', times_path)
-    Log('lines: %s', lines_path)
-
-    times = read.csv(times_path)
-    lines = read.csv(lines_path)
-
-    # Remove failures
-    times %>% filter(status == 0) %>% select(-c(status)) -> times
-
-    # Add the number of lines, joining on path, and compute lines/sec
-    # TODO: Is there a better way to compute lines_per_ms and then drop
-    # lines_per_sec?
-    times %>%
-      left_join(lines, by = c('path')) %>%
-      mutate(elapsed_ms = elapsed_secs * 1000,
-             lines_per_ms = num_lines / elapsed_ms) %>%
-      select(-c(elapsed_secs)) ->
-      host_rows
-
-    hosts[[i-1]] = host_rows
-    raw_times_list[[i-1]] = times_path
-  }
+  in_dir = argv[[1]]
+  out_dir = argv[[2]]
+
+  times = read.csv(file.path(in_dir, 'times.csv'))
+  lines = read.csv(file.path(in_dir, 'lines.csv'))
+  raw_data = read.csv(file.path(in_dir, 'raw-data.csv'))
+  vm = read.csv(file.path(in_dir, 'virtual-memory.csv'))
+
+  # For joining by filename
+  lines_by_filename = data_frame(
+    num_lines = lines$num_lines,
+    filename = basename(lines$path)
+  )
+
+  # Remove failures
+  times %>% filter(status == 0) %>% select(-c(status)) -> times
+
+  # Add the number of lines, joining on path, and compute lines/sec
+  # TODO: Is there a better way to compute lines_per_ms and then drop
+  # lines_per_sec?
+  times %>%
+    left_join(lines, by = c('path')) %>%
+    mutate(elapsed_ms = elapsed_secs * 1000,
+           lines_per_ms = num_lines / elapsed_ms) %>%
+    select(-c(elapsed_secs)) ->
+    all_times
+
+  #print(head(times))
+  #print(head(lines))
+  #print(head(vm))
+  #print(head(all_times))
 
-  all_times = bind_rows(hosts)
   print(summary(all_times))
 
   #
@@ -66,7 +71,6 @@ main = function(argv) {
   distinct_hosts$host_label = distinct_hosts$host_name
   print(distinct_hosts)
 
   all_times %>% distinct(shell_name, shell_hash) -> distinct_shells
   print(distinct_shells)
@@ -136,6 +140,20 @@ main = function(argv) {
       num_lines, filename, filename_HREF)) ->
     rate
 
+  # Just show osh-ovm because we know from the 'baseline' benchmark that it
+  # uses significantly less than osh-cpython.
+  vm %>%
+    left_join(distinct_shells, by = c('shell_name', 'shell_hash')) %>%
+    select(-c(shell_name, shell_hash)) %>%
+    filter(shell_label == 'osh-ovm') %>%
+    select(-c(shell_label)) %>%
+    spread(key = metric_name, value = metric_value) %>%
+    left_join(lines_by_filename, by = c('filename')) %>%
+    arrange(host, num_lines) %>%
+    mutate(filename_HREF = sourceUrl2(filename)) %>%
+    select(c(host, VmPeak, VmRSS, num_lines, filename, filename_HREF)) ->
+    vm_table
+
   Log('\n')
   Log('RATE')
   print(rate)
@@ -166,19 +184,21 @@ main = function(argv) {
   )
   print(shell_table)
 
-  raw_times = data_frame(
-    filename = basename(as.character(raw_times_list)),
+  raw_data_table = data_frame(
+    filename = basename(as.character(raw_data$path)),
     filename_HREF = benchmarkDataLink('osh-parser', filename, '')
   )
-  print(raw_times)
+  print(raw_data_table)
 
   writeCsv(host_table, file.path(out_dir, 'hosts'))
   writeCsv(shell_table, file.path(out_dir, 'shells'))
-  writeCsv(raw_times, file.path(out_dir, 'raw_times'))
+  writeCsv(raw_data_table, file.path(out_dir, 'raw-data'))
   writeCsv(shell_summary, file.path(out_dir, 'summary'))
   writeCsv(elapsed, file.path(out_dir, 'elapsed'))
   writeCsv(rate, file.path(out_dir, 'rate'))
+  writeCsv(vm_table, file.path(out_dir, 'virtual-memory'))
 
   Log('Wrote %s', out_dir)
   Log('PID %d done', Sys.getpid())
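
A note on the vm_table pipeline above: spread() pivots the long-format
(metric_name, metric_value) rows into one wide row per file, so each file
gets a VmPeak and a VmRSS column. Per the final select(), the
virtual-memory.csv written by this script has the header:

    host,VmPeak,VmRSS,num_lines,filename,filename_HREF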
benchmarks/osh-parser.sh
@@ -9,6 +9,8 @@ set -o nounset
 set -o pipefail
 set -o errexit
 
+source test/common.sh  # die
+
 # TODO: The raw files should be published. In both
 # ~/git/oilshell/benchmarks-data and also in the /release/ hierarchy?
 readonly BASE_DIR=_tmp/osh-parser
@@ -156,19 +158,60 @@ run() {
   echo "Wrote $times_out, $lines_out, and $vm_out_dir/"
 }
 
-# TODO:
-summarize() {
+#
+# Data Preparation and Analysis
+#
+
+csv-concat() {
+  tools/csv_concat.py "$@"
+}
+
+stage1() {
+  local out=_tmp/osh-parser/stage1
+  mkdir -p $out
+
+  local vm_csv=$out/virtual-memory.csv
+  local -a x=(../benchmark-data/osh-parser/flanders.*.virtual-memory)
+  local -a y=(../benchmark-data/osh-parser/lisa.*.virtual-memory)
+  benchmarks/virtual_memory.py osh-parser ${x[-1]} ${y[-1]} > $vm_csv
+
+  local times_csv=$out/times.csv
   # Globs are in lexicographical order, which works for our dates.
-  local -a m1=(../benchmark-data/osh-parser/flanders.*.times.csv)
-  local -a m2=(../benchmark-data/osh-parser/lisa.*.times.csv)
+  local -a a=(../benchmark-data/osh-parser/flanders.*.times.csv)
+  local -a b=(../benchmark-data/osh-parser/lisa.*.times.csv)
+  csv-concat ${a[-1]} ${b[-1]} > $times_csv
+
+  # Construct a one-column CSV file
+  local raw_data_csv=$out/raw-data.csv
+  { echo 'path'
+    echo ${a[-1]}
+    echo ${b[-1]}
+  } > $raw_data_csv
+
+  # Verify that the files are equal, and pass one of them.
+  local lines_csv=$out/lines.csv
+  local -a c=(../benchmark-data/osh-parser/flanders.*.lines.csv)
+  local -a d=(../benchmark-data/osh-parser/lisa.*.lines.csv)
+
+  local left=${c[-1]}
+  local right=${d[-1]}
+
+  if ! diff $left $right; then
+    die "Benchmarks were run on different files ($left != $right)"
+  fi
+
+  # They are the same, output one of them.
+  cat $left > $lines_csv
+
+  head $out/*
+  wc -l $out/*
+}
 
-  # The last one
-  local -a latest=(${m1[-1]} ${m2[-1]})
+stage2() {
+  local out=_tmp/osh-parser/stage2
+  mkdir -p $out
 
-  benchmarks/osh-parser.R $out "${latest[@]}"
+  benchmarks/osh-parser.R _tmp/osh-parser/stage1 $out
 
   tree $BASE_DIR
 }
@@ -180,6 +223,7 @@ summarize() {
 # NOTE: not bothering to make it sortable now. Just using the CSS.
 _print-report() {
+  local in_dir=$1
   local base_url='../../web/table'
 
   cat <<EOF
@@ -240,46 +284,59 @@ _print-report() {
     elapsed time measurements, but long files are chosen to minimize its
     effect.</p>
 
-    <h3>Summary</h3>
+    <h3>Parse Time Summary</h3>
 EOF
-  web/table/csv2html.py $BASE_DIR/stage1/summary.csv
+  web/table/csv2html.py $in_dir/summary.csv
+
+  cat <<EOF
+    <h3>Memory Used to Parse</h3>
+
+    <p>For <code>osh-ovm</code>.</p>
+EOF
+  web/table/csv2html.py $in_dir/virtual-memory.csv
 
   cat <<EOF
     <h3>Shell and Host Details</h3>
 EOF
-  web/table/csv2html.py $BASE_DIR/stage1/shells.csv
-  web/table/csv2html.py $BASE_DIR/stage1/hosts.csv
+  web/table/csv2html.py $in_dir/shells.csv
+  web/table/csv2html.py $in_dir/hosts.csv
 
   cat <<EOF
-    <h3>Raw Timing Data</h3>
+    <h3>Raw Data</h3>
 EOF
-  web/table/csv2html.py $BASE_DIR/stage1/raw_times.csv
+  web/table/csv2html.py $in_dir/raw-data.csv
 
   cat <<EOF
-    <h3>Per-File Breakdown</h3>
+    <h3>Parse Time Breakdown by File</h3>
 
     <h4>Elapsed Time in milliseconds</h4>
 EOF
-  web/table/csv2html.py $BASE_DIR/stage1/elapsed.csv
+  web/table/csv2html.py $in_dir/elapsed.csv
 
   cat <<EOF
     <h4>Parsing Rate in lines/millisecond</h4>
 EOF
-  web/table/csv2html.py $BASE_DIR/stage1/rate.csv
+  web/table/csv2html.py $in_dir/rate.csv
 
   cat <<EOF
   </body>
 </html>
 EOF
 }
 
-report() {
+stage3() {
   local out=$BASE_DIR/index.html
   mkdir -p $(dirname $out)
-  _print-report > $out
+  _print-report $BASE_DIR/stage2 > $out
   echo "Wrote $out"
 }
 
+report() {
+  stage1
+  stage2
+  stage3
+}
+
 _banner() {
   echo -----
   echo "$@"
benchmarks/virtual-memory.sh
@@ -48,6 +48,20 @@ baseline() {
   done
 }
 
+baseline-csv() {
+  local out=_tmp/vm-baseline/stage1
+  mkdir -p $out
+
+  # Globs are in lexicographical order, which works for our dates.
+  local -a m1=(../benchmark-data/vm-baseline/flanders.*)
+  local -a m2=(../benchmark-data/vm-baseline/lisa.*)
+
+  # The last one
+  local -a latest=(${m1[-1]} ${m2[-1]})
+
+  benchmarks/virtual_memory.py baseline "${latest[@]}"
+}
+
 # TODO: parse 10 osh-parser files, measure virtual memory at the end.  However
 # this only applies to OSH, because you need a hook to dump the /proc/$$/status
 # file.
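
On the TODO above: since OSH runs under CPython here, such a hook could be
as simple as copying the process's own status file, which is the same file
a shell script sees as /proc/$$/status. A minimal sketch (the output path
is hypothetical):

    import shutil

    def dump_vm_status(out_path):
      # /proc/self/status is this process's own status file; copying it
      # preserves the VmPeak/VmRSS lines for later parsing.
      shutil.copyfile('/proc/self/status', out_path)

    dump_vm_status('_tmp/osh-parser/osh.status')  # hypothetical location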
benchmarks/virtual_memory.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python
+"""
+virtual_memory.py
+"""
+
+import csv
+import os
+import sys
+import re
+
+# VmSize, VmData might be interesting too.
+METRIC_RE = re.compile('^(VmPeak|VmRSS):\s*(\d+)')
+
+
+def main(argv):
+  action = argv[1]
+
+  if action == 'baseline':
+    input_dirs = argv[2:]
+
+    out = csv.writer(sys.stdout)
+    out.writerow(
+        ('host', 'shell_name', 'shell_hash', 'metric_name', 'metric_value'))
+
+    for input_dir in input_dirs:
+      d = os.path.basename(input_dir)
+      host, job_id = d.split('.')
+
+      for name in os.listdir(input_dir):
+        n, _ = os.path.splitext(name)
+        shell_name, shell_hash = n.split('-')
+
+        path = os.path.join(input_dir, name)
+        with open(path) as f:
+          for line in f:
+            m = METRIC_RE.match(line)
+            if m:
+              name, value = m.groups()
+              row = (host, shell_name, shell_hash, name, value)
+              out.writerow(row)
+
+  elif action == 'osh-parser':
+    input_dirs = argv[2:]
+
+    out = csv.writer(sys.stdout)
+    HEADER = (
+        'host', 'shell_name', 'shell_hash', 'filename', 'metric_name',
+        'metric_value')
+    out.writerow(HEADER)
+
+    for input_dir in input_dirs:
+      d = os.path.basename(input_dir)
+      host, job_id, _ = d.split('.')
+
+      for name in os.listdir(input_dir):
+        n, _ = os.path.splitext(name)
+        shell_id, filename = n.split('__')
+        shell_name, shell_hash = shell_id.split('-')
+
+        path = os.path.join(input_dir, name)
+        with open(path) as f:
+          for line in f:
+            m = METRIC_RE.match(line)
+            if m:
+              name, value = m.groups()
+              row = (host, shell_name, shell_hash, filename, name, value)
+              out.writerow(row)
+
+  else:
+    raise RuntimeError('Invalid action')
+
+
+if __name__ == '__main__':
+  try:
+    main(sys.argv)
+  except RuntimeError as e:
+    print >>sys.stderr, 'FATAL: %s' % e
+    sys.exit(1)
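
The diffs for tools/csv_concat.py and tools/csv-concat-test.sh did not
load above. Judging from how stage1 calls csv-concat, it merges CSV files
that share a header row, writing the header once. A minimal sketch of that
idea (assumed behavior, not the committed implementation):

    #!/usr/bin/python
    """
    csv_concat.py (sketch)
    """

    import sys


    def concat(paths, out_f):
      # Write the first file's header once, then every data row; refuse to
      # merge files whose headers differ.
      expected = None
      for path in paths:
        with open(path) as f:
          header = f.readline()
          if expected is None:
            expected = header
            out_f.write(header)
          elif header != expected:
            raise RuntimeError('Mismatched header in %s' % path)
          for line in f:
            out_f.write(line)


    if __name__ == '__main__':
      concat(sys.argv[1:], sys.stdout)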