# SpecialCase Tests4Py: MarkUp

In [1]:
# Call tests4py's `deactivate` (from its logging module) before importing the
# benchmark repository. NOTE(review): presumably this turns tests4py logging off
# for the rest of the notebook - confirm; the statement order is kept as written.
from tests4py.api.logging import deactivate
deactivate()
from debugging_benchmark.tests4py_benchmark.repository import MarkUpBenchmarkRepository

tests4py :: INFO     :: Loading projects


In [2]:
# Build the programs provided by the MarkUp benchmark repository.
repository = MarkUpBenchmarkRepository()
programs = repository.build()

# There is only one MarkUp subject, so work with the first entry.
# Ending the cell with the bare name displays it.
program = programs[0]
program

Tests4PyBenchmarkProgram(markup_1)

In [3]:
# Dictionary form of the benchmark program; the same `to_dict()` call is unpacked
# as keyword arguments when Avicenna is constructed further down.
param = program.to_dict()

In [4]:
# Uncomment the next line to inspect the dictionary built from `program`:
# param

In [8]:
from avicenna.avicenna import Avicenna

# Configure Avicenna from the program's dictionary form, passing a minimum
# recall of 0.7 alongside those keyword arguments.
avicenna = Avicenna(min_recall=0.7, **program.to_dict())

In [None]:
from typing import List, Tuple
from isla.language import Formula

diagnosis: Tuple[Formula, float, float] = avicenna.explain()
# The result is used below as a triple: the learned ISLa formula (index 0),
# its precision (index 1) and its recall (index 2).

In [10]:
from isla.language import ISLaUnparser

# Report the learned constraint in ISLa syntax, followed by the precision and
# recall stored next to it in `diagnosis` (shown as percentages).
print("Avicenna determined the following constraints to describe the failure circumstances:\n")
print(ISLaUnparser(diagnosis[0]).unparse())
print(
    f"Avicenna calculated a precision of {diagnosis[1]*100:.2f}% "
    f"and a recall of {diagnosis[2]*100:.2f}%",
    end="\n\n",
)

Avicenna determined the following constraints to describe the failure circumstances:

exists <char> elem in start:
  (= elem "\"")
Avicenna calculated a precision of 93.41% and a recall of 89.58%



In [11]:
# List every formula that Avicenna reports as equivalent to the best one.
print("\nEquivalent Representations:")
equivalent_representations = avicenna.get_equivalent_best_formulas()

# The loop variable must not be named `diagnosis`: a `for` target rebinds the
# notebook-level name, and the evaluation cells further down read `diagnosis[0]`.
# Reusing the name would silently swap the formula being evaluated whenever at
# least one equivalent representation exists.
if equivalent_representations:
    for equivalent_diagnosis in equivalent_representations:
        print(ISLaUnparser(equivalent_diagnosis[0]).unparse())



Equivalent Representations:


In [12]:
## Evaluation of the learned Constraint

In [71]:
from debugging_framework.fuzzingbook.fuzzer import GrammarFuzzer
from debugging_framework.input.input import Input

grammar = program.get_grammar()
fuzzer = GrammarFuzzer(grammar)

# Fuzz 10000 strings from the grammar and turn each one into an Input.
# Collecting them in a set drops duplicates, so fewer than 10000 may remain.
evaluation_data_set = {
    Input.from_str(grammar=grammar, input_string=fuzzer.fuzz())
    for _ in range(10000)
}

print(f"Generated {len(evaluation_data_set)} inputs for evaluation!")

Generated 4308 inputs for evaluation!


In [72]:
from debugging_framework.input.input import OracleResult

# Label every generated input with the program's oracle and split the inputs
# into a failing and a passing set. Inputs with any other oracle result end up
# in neither set.
oracle = program.get_oracle()
failing = set()
passing = set()

for inp in evaluation_data_set:
    # The oracle returns a pair; only the verdict is used here.
    oracle_result, exception = oracle(inp)

    if oracle_result == OracleResult.FAILING:
        failing.add(inp)
    elif oracle_result == OracleResult.PASSING:
        passing.add(inp)

print(f"Generated {len(passing)} passing inputs for evaluation!")
# Fixed label: this line reports the failing set, not the passing one.
print(f"Generated {len(failing)} failing inputs for evaluation!")

Generated 4098 passing inputs for evaluation!
Generated 210 passing inputs for evaluation!


In [73]:
from isla.evaluator import evaluate

# Evaluate the learned constraint (first entry of `diagnosis`) on the `tree` of
# every labelled input; each result is coerced to a plain bool.
eval_results_passing = [
    bool(evaluate(diagnosis[0], inp.tree, grammar)) for inp in passing
]
eval_results_failing = [
    bool(evaluate(diagnosis[0], inp.tree, grammar)) for inp in failing
]

In [74]:
# Treat the constraint as a predictor of failure and count its outcomes.
tp = sum(int(entry) for entry in eval_results_failing)  # failing inputs the constraint holds for
fn = len(eval_results_failing) - tp                     # failing inputs the constraint misses
fp = sum(int(entry) for entry in eval_results_passing)  # passing inputs the constraint holds for

# Guard both ratios: if the constraint holds for no input at all (tp + fp == 0)
# or there are no failing inputs (tp + fn == 0), the metric is undefined and the
# plain division would raise ZeroDivisionError. Report 0.0 in that case.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0

print(f"The Diagnosis achieved a Precision of {precision*100:.2f}% and a Recall of {recall*100:.2f}%")

The Diagnosis achieved a Precision of 85.84% and a Recall of 46.19%


In [76]:
# NOTE(review): this cell recomputes the same precision/recall as the cell
# before it from the same two result lists; consider keeping only one of them.
# `eval_results_failing` is already a list, so no extra `list(...)` copy is needed.
tp = sum(int(entry) for entry in eval_results_failing)  # failing inputs the constraint holds for
fn = len(eval_results_failing) - tp                     # failing inputs the constraint misses
fp = sum(int(entry) for entry in eval_results_passing)  # passing inputs the constraint holds for

# An empty denominator means the metric is undefined; report 0.0 instead of
# letting the division raise ZeroDivisionError.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0

print(f"The Diagnosis achieved a Precision of {precision*100:.2f}% and a Recall of {recall*100:.2f}%")

The Diagnosis achieved a Precision of 85.84% and a Recall of 46.19%
