Permalink
Browse files

Fix an issue that made OPy's bytecode output nondeterministic.

The flow graph can be flattened correctly in many ways, but we want a
consistent order.

I used sorted(), and that works, but it should probably be made more
efficient.

- Add an 'opyc dis-md5' command to verify the results.  It skips the
  timestamp in the .pyc file.
  • Loading branch information...
Andy Chu
Andy Chu committed Mar 17, 2018
1 parent 7e4beb8 commit 04fedae770a90d9352363335b5def019e9145f73
Showing with 102 additions and 9 deletions.
  1. +3 −3 opy/compiler2/pyassem.py
  2. +47 −0 opy/determinism.sh
  3. +19 −0 opy/opy_main.py
  4. +33 −6 opy/smoke.sh
View
@@ -120,7 +120,7 @@ def order_blocks(start_block, exit_block):
# A block is dominated by another block if that block must be emitted
# before it.
dominators = {}
for b in remaining:
for b in sorted(remaining): # ANDY FIX: sorted() determinism
if __debug__ and b.next:
assert b is b.next[0].prev[0], (b, b.next)
# Make sure every block appears in dominators, even if no
@@ -140,8 +140,8 @@ def order_blocks(start_block, exit_block):
def find_next():
# Find a block that can be emitted next.
for b in remaining:
for c in dominators[b]:
for b in sorted(remaining): # ANDY FIX: sorted determinism
for c in sorted(dominators[b]): # ditto
if c in remaining:
break # can't emit yet, dominated by a remaining block
else:
View
@@ -13,12 +13,53 @@ set -o nounset
set -o pipefail
set -o errexit
# Trying to reproduce problem with pyassem.py and Block() in order_blocks, but
# this does NOT do it.
# I had to add sorted() to make it stable there, but here I do not? Why?
# See also: https://github.com/NixOS/nixpkgs/issues/22570
#
# "No, the sets are built as real sets and then marshalled to .pyc files in a
# separate step. So on CPython an essentially random order will end up in the
# .pyc file. Even CPython 3.6 gives a deterministic order to dictionaries but
# not sets. You could ensure sets are marshalled in a known order by changing
# the marshalling code, e.g. to emit them in sorted order (on Python 2.x; on
# 3.x it is more messy because different types are more often non-comparable)."
#
# Is that accurate? The issue here is not sets as marshalled constants; it's
# USING sets in the compiler.
#
# set([1, 2, 3]) and {'a': 'b'} do not produce literal constants!
dictset() {
local n=${1:-30}
local python=${2:-python}
seq $n | $python -c '
import sys
class Block:
def __init__(self, x):
self.x = x
def __repr__(self):
return str(self.x)
s = set()
hashes = []
for line in sys.stdin:
b = Block(line.strip())
hashes.append(hash(b))
s.add(b)
print s
print hashes
'
}
dictset2() {
local n=${1:-30}
local python=${2:-python}
seq $n | $python -c '
import sys
d = {}
s = set()
for line in sys.stdin:
@@ -39,6 +80,12 @@ compare-iters() {
}
# Changing the seed changes the order.
# Aha! hash(Block()) is still not deterministic with a fixed seed, because it
# uses the address?
#
# https://stackoverflow.com/questions/11324271/what-is-the-default-hash-in-python
compare-seed() {
for seed in 1 2 3; do
echo "seed = $seed"
View
@@ -5,6 +5,7 @@
from __future__ import print_function
import cStringIO
import hashlib
import optparse
import os
import sys
@@ -278,6 +279,24 @@ def py2st(gr, raw_node):
v.Report(report_f)
elif action == 'dis-md5':
pyc_paths = argv[1:]
if not pyc_paths:
raise args.UsageError('dis-md5: At least one .pyc path is required.')
for path in pyc_paths:
h = hashlib.md5()
with open(path) as f:
magic = f.read(4)
h.update(magic)
ignored_timestamp = f.read(4)
while True:
b = f.read(64 * 1024)
if not b:
break
h.update(b)
print('%s %s' % (h.hexdigest(), path))
# NOTE: Unused
elif action == 'old-compile':
py_path = argv[1]
View
@@ -198,20 +198,44 @@ determinism-loop() {
$func $file _tmp/det/$name.1
$func $file _tmp/det/$name.2
local size1=$(stat --format '%s' _tmp/det/$name.1)
local size2=$(stat --format '%s' _tmp/det/$name.2)
if test $size1 != $size2; then
echo "mismatched sizes: $size1 $size2"
# TODO: Import from run.sh
opyc-dis-md5 _tmp/det/$name.{1,2} | tee _tmp/det/md5.txt
awk '
NR == 1 { left = $1 }
NR == 2 { right = $1 }
END { if (NR != 2) {
print "Expected two rows, got " NR
exit(1)
}
if (left != right) {
print "FATAL: .pyc files are different!"
exit(1)
}
}
' _tmp/det/md5.txt
local status=$?
if test $status != 0; then
compare-bytecode _tmp/det/$name.{1,2}
echo "Found problem after $i iterations"
break
fi
#local size1=$(stat --format '%s' _tmp/det/$name.1)
#local size2=$(stat --format '%s' _tmp/det/$name.2)
#if test $size1 != $size2; then
# echo "mismatched sizes: $size1 $size2"
# # TODO: Import from run.sh
# compare-bytecode _tmp/det/$name.{1,2}
# echo "Found problem after $i iterations"
# break
#fi
done
}
opyc-dis() { ../bin/opyc dis "$@"; }
opyc-compile() { ../bin/opyc compile "$@"; }
opyc-dis() { ../bin/opyc dis "$@"; }
opyc-dis-md5() { ../bin/opyc dis-md5 "$@"; }
stdlib-compile() { misc/stdlib_compile.py "$@"; }
# FAILS
@@ -228,6 +252,9 @@ stdlib-determinism-loop() {
determinism-loop stdlib-compile $file
}
# BUG: FlowGraph flattening was non-deterministic. It's a graph that is
# correct in several orders. See order_blocks() in pyassem.py.
# Not able to reproduce the non-determinism with d.keys()? Why not?
hash-determinism() {
local in=$1

0 comments on commit 04fedae

Please sign in to comment.