In [None]:
%%javascript
$('#run_all_cells_below').click()

In [None]:
from hypothesis import settings
# Register and activate a Hypothesis settings profile named "presentation"
# for the rest of the notebook: max_examples is set to 100 and
# database_file to None (presumably so runs do not reuse previously saved
# examples -- confirm against the installed Hypothesis version).
settings.register_profile("presentation", settings(database_file=None, max_examples=100))
settings.load_profile("presentation")

# Property Based Testing
## (Using Hypothesis)<br><br><br>
### Amsterdam Python Meetup
### 26 April 2017<br><br><br>
### Daniel Bradburn

Property based testing

Choosing properties

Generating data

Model based testing

Django

Examples

## Property Based Testing

Say we have a run-length encoding function: it encodes a string as each character followed by the number of consecutive occurrences of that character. Let's test it out with something simple.

In [None]:
def encode(input_string):
    """Run-length encode ``input_string``.

    Each run of consecutive identical characters is emitted as the character
    followed by the length of the run, e.g. ``'hellllllo'`` -> ``'h1e1l6o1'``.

    NOTE: an empty ``input_string`` raises ``UnboundLocalError`` because the
    loop variable ``character`` is never bound. That is the failure the
    property based test later in this notebook reports.
    """
    count = 1  # length of the run currently being counted
    prev = ''  # character of the current run; '' until the first character is seen
    lst = []  # completed (character, run length) pairs
    for character in input_string:
        if character != prev:
            # A new run starts: flush the previous one, unless this is the
            # very first character and there is nothing to flush yet.
            if prev:
                lst.append((prev, count))
            count = 1
            prev = character
        else:
            count += 1
    else:
        # The loop contains no ``break``, so this branch always runs once the
        # input is exhausted: flush the final run.
        lst.append((character, count))
    return ''.join(x + str(n) for x, n in lst)

In [149]:
encode('hellllllo')

'h1e1l6o1'

We also have a decode function, which reconstructs the original string. Let's check it using the output from the encode call above.

In [None]:
def decode(lst):
    """Expand a run-length encoded string back into the original text.

    ``lst`` is a sequence of characters in which every letter is followed by
    the decimal length of its run, e.g. ``'h1e1l6o1'`` -> ``'hellllllo'``.
    A character counts as part of a run length only if it is one of the ASCII
    digits ``0``-``9``; any other character starts a new run. A letter that is
    directly followed by another letter (no count) contributes nothing.

    Empty input decodes to the empty string. A non-empty input whose last
    letter has no count still raises ``ValueError``, as before.
    """
    digits = frozenset('0123456789')  # built once instead of once per character
    curr_digits, curr_letter = '', ''
    pieces = []  # decoded runs, joined once at the end
    for c in lst:
        if c in digits:
            curr_digits += c
        else:
            if curr_digits:
                pieces.append(curr_letter * int(curr_digits))
            curr_letter, curr_digits = c, ''
    # Flush the final run. For empty input nothing was read at all, so there
    # is no run to flush and no count to parse.
    if curr_letter or curr_digits:
        pieces.append(curr_letter * int(curr_digits))
    return ''.join(pieces)

In [151]:
decode('h1e1l6o1')

'hellllllo'

But it's probably best to formalize this in a unit test. I'm using pytest here, but you could use unittest or your favourite test runner; the principle is the same.

In [None]:
def test_run_length_encode():
    """Encoding 'hello' gives each character followed by its run length."""
    assert encode('hello') == 'h1e1l2o1'

In [152]:
!sh pytest_run.sh test_run_length_encode

.


1 passed, 34 deselected in 0.40 seconds


In [None]:
def test_run_length_decode():
    """Decoding 'h1e1l2o1' reconstructs the word 'hello'."""
    assert decode('h1e1l2o1') == 'hello'

In [153]:
!sh pytest_run.sh test_run_length_decode

.


1 passed, 34 deselected in 0.39 seconds


In [None]:
import pytest

examples = ['hello', 'python', 'uhm...']

@pytest.mark.parametrize('input_data', examples)
def test_parameterized_run_length_encode_decode(input_data):
    """Each hand-picked example survives an encode/decode round trip."""
    round_tripped = decode(encode(input_data))
    assert round_tripped == input_data

In [66]:
!sh pytest_run.sh test_parameterized_run_length_encode_decode

...


3 passed, 68 deselected in 0.39 seconds


In [184]:
from random import seed, choice, randint
from itertools import repeat

# Fixed seed so the words generated below are the same on every run.
seed(0)


def randletter(_):
    """Return one random character with a code point from 1 to 254.

    The argument is ignored; it only exists so the function can be mapped
    over a range.
    """
    return chr(choice(range(1, 255)))


def randrange(length):
    """Return ``range(k)`` for a random ``k`` between 0 and ``length`` inclusive."""
    return range(randint(0, length))


def randword(length):
    """Return a random word of at most ``length`` characters."""
    return ''.join(randletter(position) for position in randrange(length))


def randwords(num, length):
    """Return a lazy generator of ``num`` random words, each at most ``length`` long."""
    return (randword(length) for _ in range(num))

@pytest.mark.parametrize('input_data', randwords(num=10, length=10))
def test_fuzzed_run_length_encode_decode(input_data):
    """Each randomly generated word survives an encode/decode round trip."""
    round_tripped = decode(encode(input_data))
    assert round_tripped == input_data

In [185]:
!sh pytest_run.sh test_fuzzed_run_length_encode_decode

..........


10 passed, 25 deselected in 0.42 seconds


In [None]:
from hypothesis import strategies as st, given, assume

@given(st.text())
def test_property_based_run_length_encode_decode(input_data):
    """Round-trip property: decoding an encoded text gives back the text."""
    round_tripped = decode(encode(input_data))
    assert round_tripped == input_data

In [70]:
!sh pytest_run.sh test_property_based_run_length_encode_decode

F



____________ test_property_based_run_length_encode_decode ____________


test.py:76: in test_property_based_run_length_encode_decode
    def test_property_based_run_length_encode_decode(input_data):

.venv/hypothesis/core.py:524: in wrapped_test
    print_example=True, is_final=True

.venv/hypothesis/executors.py:58: in default_new_style_executor
    return function(data)

.venv/hypothesis/core.py:111: in run
    return test(*args, **kwargs)

test.py:77: in test_property_based_run_length_encode_decode
    assert decode(encode(input_data)) == input_data

test.py:19: in encode
    lst.append((character, count))
E   UnboundLocalError: local variable 'character' referenced before assignment

----------------------------- Hypothesis -----------------------------

Falsifying example: test_property_based_run_length_encode_decode(input_data='')


1 failed, 70 deselected in 0.49 seconds


In [None]:
def encode_fixed(input_string):
    """Run-length encode ``input_string`` without failing on empty input.

    Each run of consecutive identical characters becomes the character
    followed by the run length, e.g. ``'hellllllo'`` -> ``'h1e1l6o1'``.
    An empty input yields ``'1'``: an empty character with a count of one.
    """
    runs = []
    current, length = '', 1
    symbol = ''  # pre-bound so the final flush also works for empty input
    for symbol in input_string:
        if symbol == current:
            length += 1
            continue
        # A different character: store the finished run (if any) and restart.
        if current:
            runs.append((current, length))
        current, length = symbol, 1
    # Flush the last run using the last character seen.
    runs.append((symbol, length))
    return ''.join(char + str(size) for char, size in runs)

In [None]:
@given(st.text())
def test_property_based_fixed_run_length_encode_decode(input_data):
    """Round-trip property checked against ``encode_fixed``."""
    round_tripped = decode(encode_fixed(input_data))
    assert round_tripped == input_data

In [71]:
!sh pytest_run.sh test_property_based_fixed_run_length_encode_decode

F



_________ test_property_based_fixed_run_length_encode_decode _________


test.py:99: in test_property_based_fixed_run_length_encode_decode
    def test_property_based_fixed_run_length_encode_decode(input_data):

.venv/hypothesis/core.py:524: in wrapped_test
    print_example=True, is_final=True

.venv/hypothesis/executors.py:58: in default_new_style_executor
    return function(data)

.venv/hypothesis/core.py:111: in run
    return test(*args, **kwargs)

test.py:100: in test_property_based_fixed_run_length_encode_decode
    assert decode(encode_fixed(input_data)) == input_data
E   AssertionError

----------------------------- Hypothesis -----------------------------

Falsifying example: test_property_based_fixed_run_length_encode_decode(input_data='0')


1 failed, 70 deselected in 0.51 seconds


In [None]:
from hypothesis import settings, Verbosity

@settings(verbosity=Verbosity.verbose)
@given(st.text())
def test_property_based_show_fixed_run_length_encode_decode(input_data):
    """Same round-trip property, run with verbose Hypothesis output."""
    round_tripped = decode(encode_fixed(input_data))
    assert round_tripped == input_data

In [72]:
!sh pytest_run.sh test_property_based_show_fixed_run_length_encode_decode

F



______ test_property_based_show_fixed_run_length_encode_decode _______


test.py:106: in test_property_based_show_fixed_run_length_encode_decode
    @given(st.text())

.venv/hypothesis/core.py:524: in wrapped_test
    print_example=True, is_final=True

.venv/hypothesis/executors.py:58: in default_new_style_executor
    return function(data)

.venv/hypothesis/core.py:111: in run
    return test(*args, **kwargs)

test.py:108: in test_property_based_show_fixed_run_length_encode_decode
    assert decode(encode_fixed(input_data)) == input_data
E   AssertionError

----------------------------- Hypothesis -----------------------------

Trying example: test_property_based_show_fixed_run_length_encode_decode(input_data='G\nd\x1c겊')
Trying example: test_property_based_show_fixed_run_length_encode_decode(input_data='O')
Trying example: test_property_based_show_fixed_run_length_encode_decode(input_data='Or\n')
Trying example: test_property_based_show_fixed_run_leng

In [180]:
@pytest.mark.parametrize('input_data', randwords(num=10, length=20))
def test_fuzzed_more_run_length_encode_decode(input_data):
    """Round trip through ``encode_fixed`` for longer random words."""
    round_tripped = decode(encode_fixed(input_data))
    assert round_tripped == input_data

In [187]:
!sh pytest_run.sh test_fuzzed_more_run_length_encode_decode

...F..FF..


__ test_fuzzed_more_run_length_encode_decode[\xb5\xdf\x111\xeb\x929>\xce\xf8] __

test.py:114: in test_fuzzed_more_run_length_encode_decode
    assert decode(encode_fixed(input_data)) == input_data
E   AssertionError
 test_fuzzed_more_run_length_encode_decode[\xef~\x1cN\x8eK\xb5 \x8dV\xd1\xed\x8b5\xf7\xcd] 
 test_fuzzed_more_run_length_encode_decode[\x8d\x97Jr\x18\x99\xcdcR\x94>K01\xd30\t\x9d\xfc] 


3 failed, 7 passed, 30 deselected in 0.45 seconds


Summary (TODO)

## Property patterns

In [None]:
from hypothesis import given, strategies as st

@given(st.lists(st.integers(), min_size=1))
def test_round_and_around(c):
    """Reversing a list twice gives back the original list."""
    once = c[::-1]
    twice = once[::-1]
    assert twice == c

In [77]:
!sh pytest_run.sh test_round_and_around

.


1 passed, 30 deselected in 0.44 seconds


In [None]:
@given(st.integers(), st.integers())
def test_different_paths_same_destination_add(x, y):
    """Adding two integers gives the same result in either order."""
    forwards = x + y
    backwards = y + x
    assert forwards == backwards

In [78]:
!sh pytest_run.sh test_different_paths_same_destination_add

.


1 passed, 30 deselected in 0.41 seconds


In [80]:
@given(st.integers(), st.integers())
def test_there_and_back_again_add(m, n):
    """Adding ``n`` and then subtracting it again gives back ``m``."""
    total = m + n
    assert total - n == m

@given(st.text())
def test_there_and_back_again_encode_decode(t):
    """Encoding text as UTF-8 and decoding it again gives back the text."""
    raw = t.encode('utf-8')
    assert raw.decode('utf-8') == t

In [82]:
!sh pytest_run.sh test_there_and_back_again

..


2 passed, 29 deselected in 0.48 seconds


In [None]:
from heapq import heapify, heappop

@given(st.lists(st.integers(), min_size=1))
def test_some_things_never_change(c):
    """The first item popped from a heapified list is the list's minimum."""
    expected = min(c)  # taken before heapify reorders the list in place
    heapify(c)
    popped = heappop(c)
    assert popped == expected

In [83]:
!sh pytest_run.sh test_some_things_never_change

.


1 passed, 30 deselected in 0.43 seconds


In [None]:
@given(st.lists(st.integers()))
def test_the_more_things_change_the_more_they_stay_the_same(c):
    """Converting to a set a second time changes nothing."""
    unique = set(c)
    assert unique == set(unique)

In [84]:
!sh pytest_run.sh test_the_more_things_change_the_more_they_stay_the_same

.


1 passed, 30 deselected in 0.43 seconds


In [None]:
def test_hard_to_prove_easy_to_verify():
    # TODO: placeholder with no assertions yet, so it passes trivially.
    pass

In [85]:
!sh pytest_run.sh test_hard_to_prove_easy_to_verify

.


1 passed, 30 deselected in 0.38 seconds


In [None]:
def test_two_heads_are_better_than_one():
    # TODO: placeholder with no assertions yet, so it passes trivially.
    pass

In [86]:
!sh pytest_run.sh test_two_heads_are_better_than_one

.


1 passed, 30 deselected in 0.38 seconds


Summary (TODO)

## Data generation

In [95]:
from hypothesis import strategies as st

st.integers().example()

-113

In [103]:
st.text().example()

'ê\U00014cde\x0b觡瀾\x1ewÔ壞ÂƇ⚦㺎\U00060425'

In [120]:
st.floats().example()

0.3333333333333333

In [134]:
st.lists(st.integers()).example()

[-10490,
 -73,
 62652,
 -6695,
 82,
 177,
 -22358454268,
 -166,
 24254,
 -243,
 42165,
 52024,
 15,
 -45,
 60716,
 -61,
 -109,
 188,
 -73,
 124,
 39]

In [136]:
# Leaf values of the generated structure: a float, a boolean, a text or None.
nodes = st.floats() | st.booleans() | st.text() | st.none()
# Given a strategy x, build either a list of x or a dict from text keys to x.
children = lambda x: st.lists(x) | st.dictionaries(st.text(), x)
# Draw one example from the recursive combination of leaves and containers.
st.recursive(nodes, children).example()

{'': [1.333431775331741e-273,
  [],
  -2.630755031582877e+18,
  [None, None],
  {'\U0005accc\x13\U000ae5f7': ''},
  {'ő\U0005c4d1': True},
  -7.494265036033925e+18,
  ['H\x18', None],
  {'': '\U00032ed08Ū컣ßŌ\x11䛵㉪\U0007ba8aý\x18\nI',
   '1橼\n\U000a5bfb\n𖤛\x08F': False,
   'G\x92\U0010c2f4,': None,
   'æ*': -1.7976931348623157e+308,
   'ŋ\U00032791': '',
   'ୠm\n\U000c9700\nə绿': -inf,
   '᪐¥ꋡWR\x013ŏ(悇\x15杌\xadŞ': True,
   '뛡\U0010565fw㱠': None,
   '\U0004228b': None,
   '\U00069ca7¦㲠': None,
   '\U000df3cb{7': -3.3932096819187e+18,
   '\U000f494f\n墩\t\x0b︼\x13\x17': '\U000f263b\U00064a00³\x12pæ⏪'},
  nan,
  {'': None,
   '\x14\U000d71b3怃-𞡏ꀳ': 8.612447227626119e+18,
   '\U000916b7': 'ā\U0006abb4'}],
 '偋#\n쵒I瘮Ü\U000c168eü\U000bc182\U00042969\n䩯': '\t効\n謕\U00045649',
 "姝%\U0004f48cU'bÃ{": [None],
 '즌\x17©\U0005ded0\U001023d1': [None, False, False, True]}

In [None]:
@st.composite
def composite_strategy(draw):
    # TODO: placeholder; the body never uses ``draw`` and returns None.
    pass

Summary (TODO)

## Model Based Testing

In [None]:
class Queue(object):
    """FIFO queue stored in a fixed-size circular buffer.

    ``_in`` is the slot the next ``put`` writes to and ``_out`` the slot the
    next ``get`` reads from; both wrap around modulo ``max_size``. The length
    is computed from the distance between the two, so once ``max_size`` items
    have been put without a ``get`` the indices coincide again and ``len``
    reports 0.
    """

    def __init__(self, max_size):
        """Allocate ``max_size`` empty slots and reset both indices."""
        self._buffer = [None] * max_size
        self._in = 0
        self._out = 0
        self.max_size = max_size

    def _wrap(self, position):
        """Map ``position`` back into the buffer's index range."""
        return position % self.max_size

    def put(self, item):
        """Store ``item`` in the next write slot and advance the write index."""
        slot = self._in
        self._buffer[slot] = item
        self._in = self._wrap(slot + 1)

    def get(self):
        """Return the item in the next read slot and advance the read index."""
        slot = self._out
        oldest = self._buffer[slot]
        self._out = self._wrap(slot + 1)
        return oldest

    def __len__(self):
        """Return the wrapped distance from the read index to the write index."""
        return self._wrap(self._in - self._out)

In [None]:
import itertools

In [137]:
operations = 'new', 'put', 'get', 'size'
list(itertools.permutations(operations))

[('new', 'put', 'get', 'size'),
 ('new', 'put', 'size', 'get'),
 ('new', 'get', 'put', 'size'),
 ('new', 'get', 'size', 'put'),
 ('new', 'size', 'put', 'get'),
 ('new', 'size', 'get', 'put'),
 ('put', 'new', 'get', 'size'),
 ('put', 'new', 'size', 'get'),
 ('put', 'get', 'new', 'size'),
 ('put', 'get', 'size', 'new'),
 ('put', 'size', 'new', 'get'),
 ('put', 'size', 'get', 'new'),
 ('get', 'new', 'put', 'size'),
 ('get', 'new', 'size', 'put'),
 ('get', 'put', 'new', 'size'),
 ('get', 'put', 'size', 'new'),
 ('get', 'size', 'new', 'put'),
 ('get', 'size', 'put', 'new'),
 ('size', 'new', 'put', 'get'),
 ('size', 'new', 'get', 'put'),
 ('size', 'put', 'new', 'get'),
 ('size', 'put', 'get', 'new'),
 ('size', 'get', 'new', 'put'),
 ('size', 'get', 'put', 'new')]

What about performing the same operation 2 or 3 times in a row? Or passing different values
to the various arguments these operations take? As we can see, in even
the simplest of systems the number of test cases for a brute force method
is unmanageable. There must be a better way.
Of course not all permutations are valid, for example we don't want a test case of the
form ``put``, ``get``, ``new``, ``size`` - it doesn't make sense to perform any of
the other operations on a queue until after it is created. What we need is a way to
specify the valid operations for the system under test.
In hypothesis we can derive a class from ``RuleBasedStateMachine`` where methods decorated
with ``@rule`` are treated as states in the system. The ``@rule`` decorator is a bit like the
``@given`` decorator, defining the strategies to use for generating argument values. However
``@rule`` is only allowed in the context of a ``RuleBasedStateMachine``.
All transitions between states are assumed valid, but the ``@precondition`` decorator can be
used on a method to indicate whether a transition to this state is valid or not. In this
way we can build a specification for the system under test.
Using the ``Queue`` example, this is how the specification would look. To create
a new queue we have the precondition that the queue must not already be
created. For all other operations we check the precondition that the queue
must have been created. And to get an item from the queue, we want to check
that the queue is not empty.

In [None]:
from hypothesis import strategies as st
from hypothesis.stateful import RuleBasedStateMachine, rule, precondition

class QueueMachine(RuleBasedStateMachine):
    """State machine that drives a queue and a model with the same operations.

    ``Actual`` is the class under test and ``Model`` the reference it is
    compared against; both are class attributes so subclasses can swap them.
    Each ``@rule`` method is one operation, and its ``@precondition`` decides
    whether that operation may be chosen in the current state.
    """

    Actual, Model = Queue, list

    def is_created(self):
        """Return True once ``new`` has run and set ``self.actual``."""
        return hasattr(self, 'actual')

    @precondition(lambda self: not self.is_created())
    @rule(max_size=st.integers(min_value=1, max_value=10))
    def new(self, max_size):
        """Create the queue under test and an empty model; only allowed once."""
        self.actual, self.model = self.Actual(max_size), self.Model()
        self.max_size = max_size

    @precondition(is_created)
    @rule(item=st.integers())
    def put(self, item):
        """Put ``item`` on the queue and append it to the end of the model."""
        self.actual.put(item)
        self.model.append(item)

    def is_not_empty(self):
        """Return a truthy value when the model exists and holds an item."""
        # Returns the model's length rather than a bool; only truthiness is
        # used by the precondition.
        return self.is_created() and len(self.model)

    @precondition(is_not_empty)
    @rule()
    def get(self):
        """Take one item from the queue and one from the model and compare them."""
        # list.pop() removes the LAST element of the model.
        actual, model = self.actual.get(), self.model.pop()
        assert actual == model

    @precondition(is_created)
    @rule()
    def size(self):
        """Check that the queue and the model report the same length."""
        actual, model = len(self.actual), len(self.model)
        assert actual == model
        
test_model_based_1 = QueueMachine.TestCase

Besides specifying the various states and valid transitions, we want these
methods to actually invoke the operation it represents on the system under
test, but also we want to have a model which represents our system under
test, and perform some similar operation on the model as well. We can then
assert post conditions comparing the model and the system under test to
determine if we got the expected behaviour or not. Of course you don't have
to use a model, you can assert other properties about the system under test
using this method.
So in our queue example we can use a list as our model.

In [143]:
!sh pytest_run.sh test_model_based_1

F



_____________________ test_model_based_1.runTest _____________________


.venv/hypothesis/stateful.py:182: in runTest
    run_state_machine_as_test(state_machine_class)

.venv/hypothesis/stateful.py:109: in run_state_machine_as_test
    breaker.run(state_machine_factory(), print_steps=True)

.venv/hypothesis/stateful.py:237: in run
    state_machine.execute_step(value)

.venv/hypothesis/stateful.py:512: in execute_step
    result = rule.function(self, **data)

test.py:220: in get
    assert actual == model
E   AssertionError

----------------------------- Hypothesis -----------------------------

Step #1: new(max_size=2)
Step #2: put(item=0)
Step #3: put(item=1)
Step #4: get()


1 failed, 32 deselected in 0.52 seconds


RuleBasedStateMachine exposes a TestCase class which can be used like the
standard TestCase classes.
Let's run some tests.
As you can see we got some failures, let's have a look at the error, it appears
we created a new queue of size 2, we put a 0 into it, then put a 1 into it, and
then performed a get. If we look at the assertion failure, we see that at the get
our system under test gave us a 0, while the model gave us a 1. Now this is a
FIFO queue, which is the behaviour we see from the actual system under test. We
have here a bug in our model, our model is insufficient to represent the system
under test. Luckily we can fix that by changing the append to a prepend in the
put state.

In [140]:
class QueueMachine2(QueueMachine):
    """``QueueMachine`` with a ``put`` that inserts at the front of the model."""

    @precondition(QueueMachine.is_created)
    @rule(item=st.integers())
    def put(self, item):
        """Put ``item`` on the queue and insert it at index 0 of the model."""
        self.actual.put(item)
        # New items go to the front, so the inherited get(), which pops from
        # the end of the list, takes the oldest item first.
        self.model.insert(0, item)
        
test_model_based_2 = QueueMachine2.TestCase

Let's run the tests again and see what happens.

In [144]:
!sh pytest_run.sh test_model_based_2

F



_____________________ test_model_based_2.runTest _____________________


.venv/hypothesis/stateful.py:182: in runTest
    run_state_machine_as_test(state_machine_class)

.venv/hypothesis/stateful.py:109: in run_state_machine_as_test
    breaker.run(state_machine_factory(), print_steps=True)

.venv/hypothesis/stateful.py:237: in run
    state_machine.execute_step(value)

.venv/hypothesis/stateful.py:512: in execute_step
    result = rule.function(self, **data)

test.py:226: in size
    assert actual == model
E   AssertionError

----------------------------- Hypothesis -----------------------------

Step #1: new(max_size=1)
Step #2: put(item=0)
Step #3: size()


1 failed, 32 deselected in 0.52 seconds


This time we encounter a different error. We create a new queue of size 1, we
put a 0 into it, then put another 0 into it, and then ask for the size. Our
actual system under test reports 0, while the model reports 2. Now the 0 is
clearly wrong, but what is going on here? Well I create a queue of size 1
and then I put 2 items into it. It's debatable what the system should
actually do here (maybe raise an exception), but for the sake of the tests
we generated an invalid test case here, so this is a bug in our specification.
We can fix this by altering the precondition to ensure we don't try to put
items on the queue if it is already full.

In [None]:
class QueueMachine3(QueueMachine2):
    """``QueueMachine2`` whose ``put`` is only allowed while there is room."""

    def is_not_full(self):
        """Return True when the model exists and holds fewer than ``max_size`` items."""
        return self.is_created() and len(self.model) < self.max_size

    @precondition(is_not_full)
    @rule(item=st.integers())
    def put(self, item):
        """Put ``item`` on the queue and insert it at index 0 of the model."""
        self.actual.put(item)
        self.model.insert(0, item)
        
test_model_based_3 = QueueMachine3.TestCase

Let's run the tests again with the updated model and specification.

In [145]:
!sh pytest_run.sh test_model_based_3




33 deselected in 0.40 seconds


Now we get yet another error: this time we create a new queue of size 1, we put
an item into it and we ask for the size. Our model gives the correct answer, 1,
but our actual system under test gives a size of 0. That's clearly wrong; I
think this is a bug in the actual implementation of the system under test. Let's
take a look at what is going on here.

In [None]:
class Queue2(Queue):
    """``Queue`` that allocates one more slot than the requested size."""

    def __init__(self, max_size):
        # In the base Queue, storing max_size items wraps _in back onto _out,
        # so __len__ reports 0. Passing max_size + 1 keeps one slot spare
        # when max_size items are stored.
        super(Queue2, self).__init__(max_size + 1)

class QueueMachine4(QueueMachine3):
    """``QueueMachine3`` run against ``Queue2`` instead of ``Queue``."""
    Actual = Queue2
    
test_model_based_4 = QueueMachine4.TestCase

In [146]:
!sh pytest_run.sh test_model_based_4

.


1 passed, 34 deselected in 0.47 seconds


&nbsp;

Tackles the problem of testing interactions between features

Complexity, is this a bug in the spec, model or system under test?

## Django

## Real World Examples

In [61]:
%%javascript
$('#clear_all_output').click()

<IPython.core.display.Javascript object>

In [63]:
%%HTML
<link href="https://fonts.googleapis.com/css?family=ABeeZee" rel="stylesheet">
<style>body { font-family: 'ABeeZee', serif !important; }</style>