In [None]:
%%javascript
$('#run_all_cells_below').click()

# Property Based Testing
## (Using Hypothesis)<br><br><br>
### Amsterdam Python Meetup
### 26 April 2017<br><br><br>
### Daniel Bradburn

Property based testing

Choosing properties

Generating data

Model based testing

Django

Examples

## Property Based Testing

Say we have a run-length encoding function: it encodes a string as a list of (character, count) pairs, where count is the number of consecutive occurrences of that character. Let's try it out on something simple.

In [None]:
def encode(input_string):
    """Run-length encode ``input_string`` into a list of ``(character, count)`` pairs.

    Each pair holds a character and the number of consecutive occurrences of
    it, in input order, e.g. ``'hello'`` gives
    ``[('h', 1), ('e', 1), ('l', 2), ('o', 1)]``.

    NOTE: for an empty input the loop body never runs, so ``character`` is
    never bound and the ``else`` clause raises ``UnboundLocalError``.
    ``encode_fixed`` further down initialises ``character`` to avoid this.
    """
    count = 1
    prev = ''  # empty string is falsy: marks "no run started yet"
    lst = []
    for character in input_string:
        if character != prev:
            # A new run starts; flush the previous run unless this is the
            # very first character.
            if prev:
                lst.append((prev, count))
            count = 1
            prev = character
        else:
            count += 1
    else:
        # The loop has no ``break``, so this always runs once it finishes:
        # flush the final run.
        lst.append((character, count))
    return lst

In [None]:
encode('hello')

We also have a decode function which reconstructs the original string. Let's check it by feeding it the output of the encode call above.

In [None]:
def decode(lst):
    """Rebuild a string from run-length ``(character, count)`` pairs."""
    pieces = []
    for character, count in lst:
        # Repeat each character as many times as its run is long.
        pieces.append(character * count)
    return ''.join(pieces)

In [None]:
decode([('h', 1), ('e', 1), ('l', 2), ('o', 1)])

But it's probably best to formalize this in a unit test. I'm using pytest here, but you could use unittest or your favourite test runner; the principle is the same.

In [None]:
def test_run_length_encode():
    """Encoding ``"hello"`` yields one pair per run of identical characters."""
    assert encode("hello") == [('h', 1), ('e', 1), ('l', 2), ('o', 1)]

In [None]:
!sh pytest_run.sh test_run_length_encode

In [None]:
def test_run_length_decode():
    """Decoding the run-length pairs for ``"hello"`` gives the string back."""
    assert decode([('h', 1), ('e', 1), ('l', 2), ('o', 1)]) == "hello"

In [None]:
!sh pytest_run.sh test_run_length_decode

In [None]:
import pytest

examples = ['hello', 'python', 'uhm...']

@pytest.mark.parametrize('input_data', examples)
def test_parameterized_run_length_encode_decode(input_data):
    """Decoding an encoded example must give back the example unchanged."""
    round_tripped = decode(encode(input_data))
    assert round_tripped == input_data

In [None]:
!sh pytest_run.sh test_parameterized_run_length_encode_decode

In [None]:
import random, string

# Seed the global generator so the fuzzed inputs are the same on every run.
random.seed(0)


def random_letter():
    """Return one random ASCII letter."""
    return random.choice(string.ascii_letters)


def random_range(m):
    """Return a ``range`` whose length is a random integer from 0 to ``m`` inclusive."""
    return range(random.randint(0, m))


def random_word(m):
    """Return a generator of at most ``m`` random letters (possibly none)."""
    return (random_letter() for _ in random_range(m))


def random_words(n, m):
    """Return a generator of ``n`` random words, each at most ``m`` letters long.

    A word may be the empty string, because a word's length is drawn from
    0 to ``m`` inclusive.
    """
    # The loop variable is deliberately not called ``n`` so that it does not
    # shadow the parameter.
    return (''.join(random_word(m)) for _ in range(n))

@pytest.mark.parametrize('input_data', random_words(5, 10))
def test_fuzzed_run_length_encode_decode(input_data):
    """Round-trip each randomly generated word through encode and decode."""
    round_tripped = decode(encode(input_data))
    assert round_tripped == input_data

In [None]:
!sh pytest_run.sh test_fuzzed_run_length_encode_decode

In [None]:
from hypothesis import strategies as st
from hypothesis import given

@given(st.text())
def test_property_based_run_length_encode_decode(input_data):
    """For any text hypothesis generates, decode must invert encode."""
    round_tripped = decode(encode(input_data))
    assert round_tripped == input_data

In [None]:
!sh pytest_run.sh test_property_based_run_length_encode_decode

In [None]:
def encode_fixed(input_string):
    """Run-length encode ``input_string`` into a list of ``(character, count)`` pairs.

    Each pair holds a character and the number of consecutive occurrences of
    it, in input order, e.g. ``'hello'`` gives
    ``[('h', 1), ('e', 1), ('l', 2), ('o', 1)]``.

    An empty input gives an empty list, so no run is reported that is not
    present in the input.
    """
    lst = []
    prev = None
    count = 0  # 0 means "no run started yet"
    for character in input_string:
        if character == prev and count:
            count += 1
        else:
            # A new run starts; flush the previous one if there was one.
            if count:
                lst.append((prev, count))
            prev, count = character, 1
    # Flush the final run; skipped for empty input, where no run ever started.
    if count:
        lst.append((prev, count))
    return lst

In [None]:
@given(st.text())
def test_property_based_run_length_encode_decode_fixed(input_data):
    """For any text hypothesis generates, decode must invert ``encode_fixed``."""
    round_tripped = decode(encode_fixed(input_data))
    assert round_tripped == input_data

In [None]:
!sh pytest_run.sh test_property_based_run_length_encode_decode_fixed

Summary (TODO)

## Property patterns

In [None]:
from hypothesis import given, strategies as st

@given(st.lists(st.integers(), min_size=1))
def test_round_and_around(c):
    """Reversing a list twice gives back the original list."""
    reversed_once = c[::-1]
    reversed_twice = reversed_once[::-1]
    assert reversed_twice == c

In [None]:
!sh pytest_run.sh test_round_and_around

In [None]:
@given(st.integers(), st.integers())
def test_different_paths_same_destination_add(x, y):
    """Adding two integers gives the same result in either operand order."""
    one_way = x + y
    other_way = y + x
    assert one_way == other_way

In [None]:
!sh pytest_run.sh test_different_paths_same_destination_add

In [None]:
@given(st.integers(), st.integers())
def test_there_and_back_again_add(m, n):
    """Adding ``n`` and then subtracting it again leaves ``m`` unchanged."""
    total = m + n
    assert total - n == m

@given(st.text())
def test_there_and_back_again_encode_decode(t):
    """Encoding text as UTF-8 and decoding it again gives the original text."""
    encoded = t.encode('utf-8')
    assert encoded.decode('utf-8') == t

In [None]:
!sh pytest_run.sh test_there_and_back_again

In [None]:
from heapq import heapify, heappop

@given(st.lists(st.integers(), min_size=1))
def test_some_things_never_change(c):
    """The first item popped from a heapified list is the list's minimum."""
    # Record the minimum before heapify rearranges the list in place.
    expected_minimum = min(c)
    heapify(c)
    popped = heappop(c)
    assert popped == expected_minimum

In [None]:
!sh pytest_run.sh test_some_things_never_change

In [None]:
@given(st.lists(st.integers()))
def test_the_more_things_change_the_more_they_stay_the_same(c):
    """Building a set from a set changes nothing (``set`` is idempotent)."""
    applied_once = set(c)
    applied_twice = set(set(c))
    assert applied_once == applied_twice

In [None]:
!sh pytest_run.sh test_the_more_things_change_the_more_they_stay_the_same

In [None]:
def test_hard_to_prove_easy_to_verify():
    """Placeholder: no property is checked yet, so this test always passes."""
    pass

In [None]:
!sh pytest_run.sh test_hard_to_prove_easy_to_verify

In [None]:
def test_two_heads_are_better_than_one():
    """Placeholder: no property is checked yet, so this test always passes."""
    pass

In [None]:
!sh pytest_run.sh test_two_heads_are_better_than_one

Summary (TODO)

## Data generation

In [None]:
from hypothesis import strategies as st

st.integers().example()

In [None]:
st.text().example()

In [None]:
st.floats().example()

In [None]:
st.lists(st.integers()).example()

In [None]:
# Leaves of the generated structure: a float, a boolean, text or None.
nodes = st.floats() | st.booleans() | st.text() | st.none()


def children(x):
    """Wrap strategy ``x`` in either a list or a text-keyed dictionary."""
    return st.lists(x) | st.dictionaries(st.text(), x)


st.recursive(nodes, children).example()

In [None]:
@st.composite
def composite_strategy(draw):
    """Placeholder composite strategy: the body is empty, so ``draw`` is never used."""
    pass

Summary (TODO)

## Model Based Testing

In [None]:
class Queue(object):
    """Fixed-size FIFO queue stored in a pre-allocated list used as a ring buffer.

    ``_in`` is the index the next ``put`` writes to and ``_out`` the index the
    next ``get`` reads from; both wrap around modulo ``max_size``. Neither
    ``put`` nor ``get`` checks whether the queue is full or empty.
    """

    def __init__(self, max_size):
        """Allocate ``max_size`` slots filled with None and set both indices to 0."""
        self._buffer = [None] * max_size
        self._in, self._out, self.max_size = 0, 0, max_size

    def put(self, item):
        """Store ``item`` at the write index, then advance it, wrapping at ``max_size``."""
        self._buffer[self._in] = item
        self._in = (self._in + 1) % self.max_size

    def get(self):
        """Return the item at the read index, then advance it, wrapping at ``max_size``."""
        result = self._buffer[self._out]
        self._out = (self._out + 1) % self.max_size
        return result

    def __len__(self):
        """Return the distance from the read index to the write index, modulo ``max_size``.

        After ``max_size`` puts without a get the two indices coincide again,
        so the result is 0 - the same as for an empty queue.
        """
        return (self._in - self._out) % self.max_size

In [None]:
operations = 'new', 'put', 'get', 'size'

import itertools

list(itertools.permutations(operations))

But what if a bug only occurs if we perform the same operation twice in a row?

In [None]:
# ensure we also test the case where the same operation is
# performed twice
len(list(itertools.permutations(operations * 2)))

What about performing the same operation 3 times in a row? Or passing different values
to the various arguments these operations take? As we can see, in even
the simplest of systems the number of test cases for a brute force method
is unmanageable. There must be a better way.
Of course not all permutations are valid, for example we don't want a test case of the
form ``put``, ``get``, ``new``, ``size`` - it doesn't make sense to perform any of
the other operations on a queue until after it is created. What we need is a way to
specify the valid operations for the system under test.
In hypothesis we can derive a class from RuleBasedStateMachine where methods decorated
with @rule are treated as states in the system. The @rule decorator is a bit like the
@given decorator, defining the strategies to use for generating argument values. However
@rule is only allowed in the context of a RuleBasedStateMachine.
All transitions between states are assumed valid, but the @precondition decorator can be
used on a method to indicate whether a transition to this state is valid or not. In this
way we can build a specification for the system under test.
Using the Queue example, this is how the specification would look. To create
a new queue we have the precondition that the queue must not already be
created. For all other operations we check the precondition that the queue
must have been created. And to get an item from the queue, we want to check
that the queue is not empty.

In [None]:
from hypothesis import strategies as st
from hypothesis.stateful import RuleBasedStateMachine, rule, precondition

class QueueMachine(RuleBasedStateMachine):
    """Stateful test that drives a ``Queue`` and a plain ``list`` model side by side.

    Each ``@rule`` method is an operation hypothesis may pick as a step; its
    ``@precondition`` decides whether that step is currently allowed.
    """

    # Implementation under test and the model it is compared against;
    # subclasses can override either one.
    Actual, Model = Queue, list

    def is_created(self):
        """Return True once ``new`` has run, i.e. ``self.actual`` exists."""
        return hasattr(self, 'actual')

    @precondition(lambda self: not self.is_created())
    @rule(max_size=st.integers(min_value=1, max_value=10))
    def new(self, max_size):
        """Create the queue under test and an empty model; allowed only while none exists."""
        self.actual, self.model = self.Actual(max_size), self.Model()
        self.max_size = max_size

    @precondition(is_created)
    @rule(item=st.integers())
    def put(self, item):
        """Put ``item`` on the queue and append it to the end of the model."""
        self.actual.put(item)
        self.model.append(item)

    def is_not_empty(self):
        """Return a truthy value when the queue exists and the model holds items."""
        return self.is_created() and len(self.model)

    @precondition(is_not_empty)
    @rule()
    def get(self):
        """Take one item from the queue and one from the model and compare them.

        The model item comes from ``pop()`` with no index, i.e. the most
        recently appended one.
        """
        actual, model = self.actual.get(), self.model.pop()
        assert actual == model

    @precondition(is_created)
    @rule()
    def size(self):
        """Check that the queue and the model report the same length."""
        actual, model = len(self.actual), len(self.model)
        assert actual == model

Besides specifying the various states and valid transitions, we want these
methods to actually invoke the operation it represents on the system under
test, but also we want to have a model which represents our system under
test, and perform some similar operation on the model as well. We can then
assert post conditions comparing the model and the system under test to
determine if we got the expected behaviour or not. Of course you don't have
to use a model, you can assert other properties about the system under test
using this method.
So in our queue example we can use a list as our model.

In [54]:
test_model_based_1 = QueueMachine.TestCase
!sh pytest_run.sh test_model_based_1

F



_____________________ test_model_based_1.runTest _____________________


.venv/hypothesis/stateful.py:182: in runTest
    run_state_machine_as_test(state_machine_class)

.venv/hypothesis/stateful.py:109: in run_state_machine_as_test
    breaker.run(state_machine_factory(), print_steps=True)

.venv/hypothesis/stateful.py:237: in run
    state_machine.execute_step(value)

.venv/hypothesis/stateful.py:512: in execute_step
    result = rule.function(self, **data)

test_4_model_based_testing.py:131: in get
    assert actual == model
E   AssertionError: assert 0 == 1

----------------------------- Hypothesis -----------------------------

Step #1: new(max_size=2)
Step #2: put(item=0)
Step #3: put(item=1)
Step #4: get()


1 failed, 23 deselected in 0.17 seconds


RuleBasedStateMachine exposes a TestCase class which can be used like the
standard TestCase classes.
Let's run some tests.
As you can see we got a failure. Let's have a look at the error: it appears
we created a new queue of size 2, we put a 0 into it, then put a 1 into it, and
then performed a get. If we look at the assertion failure, we see that at the get
our system under test gave us a 0, while the model gave us a 1. Now this is a
FIFO queue, which is the behaviour we see from the actual system under test. We
have here a bug in our model: our model is insufficient to represent the system
under test. Luckily we can fix that by changing the append to a prepend in the
put state.

In [None]:
class QueueMachine2(QueueMachine):
    """``QueueMachine`` whose model is filled from the front.

    With new items inserted at index 0, the model's ``pop()`` in ``get``
    returns the oldest item, matching a FIFO queue.
    """

    @precondition(QueueMachine.is_created)
    @rule(item=st.integers())
    def put(self, item):
        """Put ``item`` on the queue and insert it at the front of the model."""
        self.actual.put(item)
        self.model.insert(0, item)

Let's run the tests again and see what happens.

In [55]:
test_model_based_2 = QueueMachine2.TestCase
!sh pytest_run.sh test_model_based_2

F



_____________________ test_model_based_2.runTest _____________________


.venv/hypothesis/stateful.py:182: in runTest
    run_state_machine_as_test(state_machine_class)

.venv/hypothesis/stateful.py:109: in run_state_machine_as_test
    breaker.run(state_machine_factory(), print_steps=True)

.venv/hypothesis/stateful.py:237: in run
    state_machine.execute_step(value)

.venv/hypothesis/stateful.py:512: in execute_step
    result = rule.function(self, **data)

test_4_model_based_testing.py:137: in size
    assert actual == model
E   AssertionError: assert 0 == 2

----------------------------- Hypothesis -----------------------------

Step #1: new(max_size=1)
Step #2: put(item=0)
Step #3: put(item=0)
Step #4: size()


1 failed, 23 deselected in 0.17 seconds


This time we encounter a different error. We create a new queue of size 1, we
put a 0 into it, then put another 0 into it, and then ask for the size. Our
actual system under test reports 0, while the model reports 2. Now the 0 is
clearly wrong, but what is going on here? Well I create a queue of size 1
and then I put 2 items into it. It's debatable what the system should
actually do here, maybe raise an exception, but for the sake of the tests
we generated an invalid test case here, so this is a bug in our specification.
We can fix this by altering the pre condition to ensure we don't try and put
items on the queue if it is already full.

In [None]:
class QueueMachine3(QueueMachine2):
    """``QueueMachine2`` that only puts items while the model is below ``max_size``."""

    def is_not_full(self):
        """Return True when the queue exists and the model holds fewer than ``max_size`` items."""
        if not self.is_created():
            return False
        return len(self.model) < self.max_size

    @precondition(is_not_full)
    @rule(item=st.integers())
    def put(self, item):
        """Put ``item`` on the queue and insert it at the front of the model."""
        self.actual.put(item)
        self.model.insert(0, item)

Let's run the tests again with the updated model and specification.

In [56]:
test_model_based_3 = QueueMachine3.TestCase
!sh pytest_run.sh test_model_based_3

F



_____________________ test_model_based_3.runTest _____________________


.venv/hypothesis/stateful.py:182: in runTest
    run_state_machine_as_test(state_machine_class)

.venv/hypothesis/stateful.py:109: in run_state_machine_as_test
    breaker.run(state_machine_factory(), print_steps=True)

.venv/hypothesis/stateful.py:237: in run
    state_machine.execute_step(value)

.venv/hypothesis/stateful.py:512: in execute_step
    result = rule.function(self, **data)

test_4_model_based_testing.py:137: in size
    assert actual == model
E   AssertionError: assert 0 == 1

----------------------------- Hypothesis -----------------------------

Step #1: new(max_size=1)
Step #2: put(item=0)
Step #3: size()


1 failed, 23 deselected in 0.15 seconds


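Now we get yet another error: this time we create a new queue of size 1, we put
an item into it and we ask the size. Our model gives the correct answer, 1,
but our actual system under test gives a size of 0, that's clearly wrong, I
think this is a bug in the actual implementation of the system under test. Let's
take a look at what is going on here. With max_size=1 the put advances the write
index from 0 to (0 + 1) % 1 = 0, so it equals the read index again and the length
(_in - _out) % max_size comes out as 0: a buffer with N slots cannot tell a full
queue from an empty one, so we allocate one spare slot.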

In [None]:
class Queue2(Queue):
    """``Queue`` that allocates one slot more than the requested ``max_size``.

    With the spare slot, the read and write indices still differ after
    ``max_size`` puts, so ``len()`` reports ``max_size`` instead of 0. Note
    that the ``max_size`` attribute then holds the requested size plus one.
    """

    def __init__(self, max_size):
        """Initialise the underlying buffer with ``max_size + 1`` slots."""
        buffer_slots = max_size + 1
        super(Queue2, self).__init__(buffer_slots)

class QueueMachine4(QueueMachine3):
    """``QueueMachine3`` run against ``Queue2`` instead of ``Queue``."""

    # Swap in the implementation that allocates one spare buffer slot.
    Actual = Queue2

In [57]:
test_model_based_4 = QueueMachine4.TestCase
!sh pytest_run.sh test_model_based_4

.


1 passed, 23 deselected in 0.28 seconds


Tackles the problem of testing interactions between features

Complexity, is this a bug in the spec, model or system under test?

## Django

## Real World Examples

In [51]:
%%javascript
$('#clear_all_output').click()

<IPython.core.display.Javascript object>

In [53]:
%%HTML
<link href="https://fonts.googleapis.com/css?family=ABeeZee" rel="stylesheet">
<style>body { font-family: 'ABeeZee', serif !important; }</style>