Skip to content

Commit

Permalink
[release] Polish / tweak bytecode metrics for release.
Browse files Browse the repository at this point in the history
- Print the data frames in a wider format.  That is, don't wrap.
- Add ShowFrame() and ShowValue().
- Generally, sort by descending frequency.
  • Loading branch information
Andy Chu committed Oct 6, 2018
1 parent 7444e2b commit 6c6cf55
Show file tree
Hide file tree
Showing 4 changed files with 143 additions and 76 deletions.
172 changes: 103 additions & 69 deletions benchmarks/bytecode.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,51 @@ library(stringr)

source('benchmarks/common.R')

options(stringsAsFactors = F)
options(stringsAsFactors = F,
# Make the report wide. tibble.width doesn't appear to do this?
width=200
)

# Print a data frame to the plain-text report with a label above it and a
# blank line after it.  Part of the report-formatting helpers added so every
# table in the output has a consistent header.
#
# Args:
#   description: label string, printed via Log (printf-style pass-through).
#   df: data frame (or tibble) to print with the default print method.
ShowFrame = function(description, df) {
Log(description)
print(df)
# Trailing blank line separates this table from the next report section.
Log('')
}

# Print a single scalar metric line to the report, prefixed with '-- ' so it
# stands out from surrounding tables, followed by a blank line.
#
# Args:
#   msg: printf-style format string forwarded to Log.
#   ...: values substituted into msg by Log.
ShowValue = function(msg, ...) {
cat('-- '); Log(msg, ...)
Log('')
}

# Report basic whole-corpus metrics: file count, total bytecode bytes, total
# instruction count, and a duplicate-frame diagnostic.
#
# Args:
#   ctx: list of data frames loaded by Load(); this function reads
#        ctx$frames (one row per code object) and ctx$ops (one row per
#        bytecode instruction).
Basic = function(ctx) {
Banner('BASIC METRICS')

# Number of distinct source files contributing frames.
ctx$frames %>% count(path) -> by_path
ShowValue('Number of files: %d', nrow(by_path))

# Total payload size of all bytecode.  (Was ~216K at the time of writing.)
b = sum(ctx$frames$bytecode_bytes)
ShowValue('Total bytecode bytes: %d', b)

num_insts = nrow(ctx$ops)
ShowValue('Total instructions: %d', num_insts)

# NOTE(review): counting by (path, code_name) is not a reliable uniqueness
# key because code names can repeat within a file; firstlineno would be
# needed to disambiguate.  Shown here as a diagnostic of that duplication.
ctx$frames %>% count(path, code_name) %>% arrange(desc(n)) %>% head() -> f1
ShowFrame('Duplicate path/name', f1)
}

BigStrings = function(consts) {
Banner('BIG STRINGS')

strs = consts %>% filter(type == 'str') %>% arrange(desc(len_or_val))
strs %>% head(20) %>% print()
total_bytes = sum(strs$len_or_val)

# 184 KB of strings! That's just the payload; the header is probably more.
Log('total string bytes: %d', total_bytes)
ShowValue('total string bytes: %d', total_bytes)

# This plot says:
#
Expand All @@ -34,71 +70,70 @@ BigStrings = function(consts) {
}

Consts = function(consts) {
Banner('CONSTS')

# count of types of constants. Strings dominate of course.
# But there are only 7 or so immutable types!

# - only 2 float constants.
# - get rid of the unicode constants in posixpath.

consts %>% count(type) %>% arrange(n) %>% tail(20)
consts %>% count(type) %>% arrange(desc(n)) %>% head(20) -> frequent
ShowFrame('Types of constants', frequent)
}

# Frames by number of consts, number of ops, etc.
Frames = function(ctx) {
Log('Frames with many consts')
ctx$consts %>% count(path, code_name, sort=T) %>% print()
Banner('FRAMES')

Log('Frames with many ops')
ctx$ops %>% count(path, code_name, sort=T) %>% print()
ctx$consts %>% count(path, code_name, sort=T) -> f1
ShowFrame('Frames with many consts', f1)

Log('Frames with large stacksize')
ctx$frames %>% arrange(desc(stacksize)) %>% head(10) %>% print()
ctx$ops %>% count(path, code_name, sort=T) -> f2
ShowFrame('Frames with many ops', f2)

Log('Frames with many locals')
ctx$frames %>% arrange(desc(nlocals)) %>% head(10) %>% print()
ctx$frames %>% arrange(desc(stacksize)) %>% head(10) -> f3
ShowFrame('Frames with large stacksize', f3)

ctx$frames %>% arrange(desc(nlocals)) %>% head(10) -> f4
ShowFrame('Frames with many locals', f4)
}

Ops = function(ops) {
ops %>% count(op_name) %>% arrange(n) -> op_freq
Banner('OPS')

ops %>% count(op_name) %>% arrange(desc(n)) -> op_freq

Log('common:')
op_freq %>% tail(n=20) %>% print()
Log('rare:')
op_freq %>% head(n=20) %>% print()
op_freq %>% head(n=20) -> common
ShowFrame('Common:', common)

op_freq %>% tail(n=20) -> rare
ShowFrame('Rare:', rare)

# These are all the big jump targets! Max is 3,852, which is a lot less than
# 65,536. We don't need EXTENDED_ARG!
ops %>% arrange(op_arg) %>% tail(10) %>% print()
ops %>% arrange(desc(op_arg)) %>% head(10) -> f1
ShowFrame('Large op_arg (jump targets):', f1)
}

Flags = function(flags) {
flags %>% count(flag) %>% arrange(n) %>% print()
Banner('FLAGS')

flags %>% count(flag) %>% arrange(desc(n)) -> f1
ShowFrame('Common flags', f1)
}

Names = function(names) {
Banner('NAMES')

# Common types: free, cell, etc.
names %>% count(kind) %>% arrange(desc(n)) %>% print()
names %>% count(kind) %>% arrange(desc(n)) -> f1
ShowFrame('Common types', f1)

# Common names:
# self, None, True, False, append, len
names %>% count(name) %>% arrange(desc(n)) %>% print()
}

Basic = function(ctx) {
# Number of files
ctx$frames %>% count(path) -> by_path
Log('number of files: %d', nrow(by_path))

# Hm this isn't reliable because the code name isn't unique! I think we need
# firstlineno
ctx$frames %>% count(path, code_name) %>% print()

# 216K
b = sum(ctx$frames$bytecode_bytes)
Log('Total bytecode bytes: %d', b)

num_insts = nrow(ctx$ops)
Log('Total instructions: %d', num_insts)
names %>% count(name) %>% arrange(desc(n)) -> f2
ShowFrame('Common names', f2)
}

# Hm max unique ops is 58
Expand All @@ -110,34 +145,35 @@ Basic = function(ctx) {

# Written by opy/metrics.sh. Could get rid of that file.
UniqueOpsByFile = function(ops, ops_defined = '_tmp/opcodes-defined.txt') {
Banner('UNIQUE OPS')

# This is a row for every path/op_name
u = ops %>% group_by(path) %>% distinct(op_name)
u %>% count(path) %>% arrange(n) -> ops_by_file

Log('files with few ops:')
ops_by_file %>% head(20) %>% print()
ops_by_file %>% head(20) -> f1
ShowFrame('Files with few ops:', f1)

ops_by_file %>% tail(10) -> f2
ShowFrame('Files with many ops:', f2)

Log('files with many ops:')
ops_by_file %>% tail(10) %>% print()
ops_by_file %>% filter(grepl('reader|lex|parse', path)) -> f3
ShowFrame('Unique ops for files that just parse:', f3) # 17, 23, 34, 34, 46

Log('parsing:') # 17, 23, 34, 34, 46
ops_by_file %>% filter(grepl('reader|lex|parse', path)) %>% print()
ops %>% filter(grepl('reader|lex|parse', path)) %>% distinct(op_name) -> string_ops
Log('Total for parsing: %d', nrow(string_ops))
ops %>% filter(grepl('reader|lex|parse', path)) %>% distinct(op_name) ->
string_ops
ShowValue('Unique opcodes for parsing: %d', nrow(string_ops))

Log('')
u2 = ops %>% distinct(op_name)
Log('Total unique opcodes: %d', nrow(u2))
ShowValue('Total unique opcodes: %d', nrow(u2))

if (ops_defined != '') {
defined = read.table(ops_defined, header=F)
colnames(defined) = c('op_name')

Log('Unused opcodes:')
setdiff(defined, u2) %>% print()
setdiff(defined, u2) -> f4
ShowFrame('Unused opcodes:', f4)
}

list(string_ops = string_ops)
}

# OPy emits 88 distinct opcodes out of 119. Interesting.
Expand Down Expand Up @@ -212,33 +248,33 @@ Load = function(in_dir) {
)
}

# TODO: Just take a table of (py_path, pyc_path, key) and then produce bytes
# for py_path and pyc_path. Does R have getsize?
#
# file.info()$size of both. And then
# This takes a table of (py_path, pyc_path) and calls file.info()$size on both.
# Then it computes the ratio.

MeasureFileSizes = function(all_deps_py) {
FileSizes = function(all_deps_py, pyc_base_dir) {
py_pyc = read.table(all_deps_py, header=F)
colnames(py_pyc) = c('py_path', 'pyc_path')

py_pyc$py_bytes = file.info(py_pyc$py_path)$size

pyc_paths = file.path('_build/oil/bytecode-opy', py_pyc$pyc_path)
pyc_paths = file.path(pyc_base_dir, py_pyc$pyc_path)
py_pyc$pyc_bytes = file.info(pyc_paths)$size

py_pyc %>% mutate(ratio = pyc_bytes / py_bytes) %>% arrange(ratio) -> py_pyc
py_pyc %>% filter(py_bytes != 0) %>% mutate(ratio = pyc_bytes / py_bytes) %>%
arrange(ratio) -> py_pyc

Log('small .pyc files:')
py_pyc %>% head(10) %>% print()
py_pyc %>% head(10) -> small
ShowFrame('small .pyc files:', small)

Log('big .pyc files:')
py_pyc %>% tail(10) %>% print()
py_pyc %>% tail(10) -> big
ShowFrame('big .pyc files:', big)

# This ratio is a little misleading because it counts comments.
py_total = sum(py_pyc$py_bytes)
pyc_total = sum(py_pyc$pyc_bytes)
Log('Overall: %d bytes of .py -> %d bytes of .pyc', py_total, pyc_total)
Log('Ratio: %f', pyc_total / py_total)

ShowValue('Overall: %d bytes of .py -> %d bytes of .pyc', py_total, pyc_total)
ShowValue('Ratio: %f', pyc_total / py_total)

py_pyc
}
Expand All @@ -248,20 +284,18 @@ main = function(argv) {

if (action == 'metrics') {
in_dir = argv[[2]]

ctx = Load(in_dir)
#out_dir = argv[[3]]
Report(ctx)

} else if (action == 'pyc-ratio') { # This takes different inputs
} else if (action == 'src-bin-ratio') { # This takes different inputs
all_deps_py = argv[[2]]
ctx = MeasureFileSizes(all_deps_py)
pyc_base_dir = argv[[3]]
ctx = FileSizes(all_deps_py, pyc_base_dir)

} else {
Log("Invalid action '%s'", action)
quit(status = 1)
}
Log('PID %d done', Sys.getpid())
}

if (length(sys.frames()) == 0) {
Expand Down
39 changes: 32 additions & 7 deletions benchmarks/bytecode.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@
#!/bin/bash
#
# Metrics for Oil bytecode produced by the OPy compiler.
#
# This is more like a metric than a benchmark. In particular, we do NOT need
# to run it on multiple machines! It doesn't need the provenance of binaries
# and so forth.
#
# But it IS like a benchmark in that we use R to analyze data and want HTML
# reports.
#
# NOTE: We will eventually have benchmarks for OPy compile time.
#
# Usage:
# ./opy.sh <function name>
# ./bytecode.sh <function name>

set -o nounset
set -o pipefail
Expand All @@ -26,7 +37,7 @@ for n in names:
# are ~131K rows in ~8.5 MB altogether. The biggest table is the 'ops' table.

dis-tables() {
local out_dir=$BASE_DIR
local out_dir=$BASE_DIR/opy
mkdir -p $out_dir

# Pass the .pyc files in the bytecode-opy.zip file to 'opyc dis'
Expand Down Expand Up @@ -76,13 +87,27 @@ report() {
R_LIBS_USER=$R_PATH benchmarks/bytecode.R "$@"
}

# Reads the 5 tables and produces some metrics
metrics() {
report metrics $BASE_DIR
# Reads the 5 tables and produces some metrics.
metrics-opy() {
report metrics $BASE_DIR/opy
}

pyc-ratio() {
report pyc-ratio _build/oil/all-deps-py.txt
# Reads a .py / .pyc manifest and calculates the ratio of input/output file
# sizes.
src-bin-ratio() {
# Pass the manifest and the base directory of .pyc files.
report src-bin-ratio _build/oil/all-deps-py.txt _build/oil/bytecode-opy
}

run-for-release() {
dis-tables
dis-tables-cpython

report metrics $BASE_DIR/opy > $BASE_DIR/opy-metrics.txt
report metrics $BASE_DIR/cpython > $BASE_DIR/cpython-metrics.txt

src-bin-ratio > $BASE_DIR/src-bin-ratio.txt
log "Wrote $BASE_DIR/src-bin-ratio.txt"
}

# TODO:
Expand Down
5 changes: 5 additions & 0 deletions benchmarks/common.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ Log = function(fmt, ...) {
cat('\n')
}

# Print a section banner for the plain-text report: a '===== ' prefix, the
# formatted message via Log, then a blank line.  Used by the bytecode report
# to visually separate major sections (BASIC METRICS, OPS, FLAGS, ...).
#
# Args:
#   fmt: printf-style format string forwarded to Log.
#   ...: values substituted into fmt by Log.
Banner = function(fmt, ...) {
cat('===== '); Log(fmt, ...)
cat('\n')
}

# Same precision for all columns.
SamePrecision = function(precision = 1) {
return(function(column_name) {
Expand Down
3 changes: 3 additions & 0 deletions scripts/release.sh
Original file line number Diff line number Diff line change
Expand Up @@ -667,6 +667,9 @@ metrics() {

line-counts $out/line-counts

# NOTE: Could move these files and scripts/count.sh to a metrics/ dir?
benchmarks/bytecode.sh run-for-release

tree $out
}

Expand Down

0 comments on commit 6c6cf55

Please sign in to comment.