# Tests4Py Benchmark: MarkUp
This notebook runs the Avicenna diagnosis on the MarkUp subject of the Tests4Py benchmark and then evaluates the learned constraint on freshly generated inputs.


In [1]:
# Suppress logging for the notebook so that the cell outputs stay readable.
import logging

# Disable every logging call of severity CRITICAL and below, i.e. all standard
# levels, for the whole process. Comment this line out to see log output again.
logging.disable(logging.CRITICAL)

### Build Program from Repository

In [2]:
# deactivate() is called before the benchmark repository is imported and built.
# NOTE(review): presumably this switches off Tests4Py's own log output - confirm
# against tests4py.api.logging before moving the call.
from tests4py.api.logging import deactivate
deactivate()

from debugging_benchmark.tests4py_benchmark.repository import MarkUpBenchmarkRepository

# Initialize the benchmark repository and select the first program
repository = MarkUpBenchmarkRepository()
# build() returns an indexable collection of benchmark programs.
programs = repository.build()
program = programs[0]  # Assuming there is only one MarkUp Subject; we use markup_1

### Initialize Diagnostic System

Initialize the `Avicenna` diagnostic system with specific parameters including minimum recall.

In [3]:
# Create an Avicenna instance with configurations for diagnosis
from avicenna.avicenna import Avicenna

# Flatten the benchmark program into the keyword arguments Avicenna expects.
param = program.to_dict()

# Build the diagnosis engine from those arguments plus the minimum-recall setting.
avicenna = Avicenna(min_recall=0.7, **param)

### Diagnosis Execution and Explanation

In [4]:
# Perform the diagnosis using Avicenna and store the results
from typing import Tuple
from isla.language import Formula

# Run the diagnosis. The result is indexed by the following cells as
# (formula, precision, recall).
try:
    diagnosis: Tuple[Formula, float, float] = avicenna.explain()
except Exception as e:
    print(f"Error during diagnosis: {e}")
    # Re-raise instead of swallowing: every later cell reads `diagnosis`, so
    # continuing would either fail with an unrelated NameError or silently
    # reuse a stale `diagnosis` left in the kernel by an earlier run.
    raise
else:
    # Only reached when explain() returned without raising.
    print("Diagnosis complete.")

Diagnosis complete.


In [5]:
from isla.language import ISLaUnparser

print("Avicenna determined the following constraints to describe the failure circumstances:\n")

# Element 0 of the diagnosis is the learned ISLa formula; render it as text.
learned_formula = diagnosis[0]
print(ISLaUnparser(learned_formula).unparse())

# Elements 1 and 2 are the precision and recall reported by Avicenna (as fractions).
reported_precision, reported_recall = diagnosis[1], diagnosis[2]
print(
    f"Avicenna calculated a precision of {reported_precision*100:.2f}% and a recall of {reported_recall*100:.2f}%",
    end="\n\n",
)

Avicenna determined the following constraints to describe the failure circumstances:

exists <char> elem in start:
  (= elem "\"")
Avicenna calculated a precision of 96.03% and a recall of 98.04%



### Evaluation

Generate test inputs using a grammar-based fuzzer, and classify these inputs as passing or failing using the program's oracle. The resulting labels serve as ground truth for evaluating the learned constraints.

In [6]:
from debugging_framework.fuzzingbook.fuzzer import GrammarFuzzer
from debugging_framework.input.input import Input, OracleResult

def generate_inputs(grammar, num_inputs=1000, max_attempts=None):
    """Fuzz ``grammar`` until ``num_inputs`` distinct inputs have been collected.

    Args:
        grammar: Grammar handed to ``GrammarFuzzer`` and to ``Input.from_str``.
        num_inputs: Number of unique inputs to collect.
        max_attempts: Upper bound on the number of fuzzer calls. ``None``
            (the default) allows ``100 * num_inputs`` calls.

    Returns:
        A set containing exactly ``num_inputs`` ``Input`` objects.

    Raises:
        RuntimeError: If the attempt budget is exhausted before enough unique
            inputs were produced (e.g. the grammar cannot derive that many
            distinct strings). Without this bound the loop would never end.
    """
    if max_attempts is None:
        max_attempts = 100 * num_inputs

    fuzzer = GrammarFuzzer(grammar)
    evaluation_data_set = set()
    attempts = 0

    # The set drops duplicates, so keep fuzzing until enough distinct inputs exist.
    while len(evaluation_data_set) < num_inputs:
        if attempts >= max_attempts:
            raise RuntimeError(
                f"Only {len(evaluation_data_set)} unique inputs generated after "
                f"{attempts} attempts; {num_inputs} were requested."
            )
        attempts += 1
        # The fuzzer output is passed on as the input string to be parsed.
        input_string = fuzzer.fuzz()
        evaluation_data_set.add(Input.from_str(grammar=grammar, input_string=input_string))

    return evaluation_data_set

def classify_inputs(program, evaluation_data_set):
    """Split the evaluation inputs by the verdict of the program's oracle.

    Args:
        program: Object whose ``get_oracle()`` returns a callable mapping an
            input to a ``(result, exception)`` pair.
        evaluation_data_set: Iterable of inputs to run through the oracle.

    Returns:
        A ``(passing, failing)`` tuple of sets. Inputs whose verdict is neither
        ``OracleResult.PASSING`` nor ``OracleResult.FAILING`` appear in neither set.
    """
    run_oracle = program.get_oracle()
    passed = set()
    failed = set()

    for candidate in evaluation_data_set:
        # The second element of the oracle's result is not used for classification.
        verdict, _ = run_oracle(candidate)
        if verdict == OracleResult.PASSING:
            passed.add(candidate)
            continue
        if verdict == OracleResult.FAILING:
            failed.add(candidate)

    return passed, failed

In [7]:
# Build the evaluation set from the subject's grammar and label it with the oracle.
grammar = program.get_grammar()
evaluation_data_set = generate_inputs(grammar)
passing, failing = classify_inputs(program, evaluation_data_set)

# Report the size of the evaluation set and of each oracle-labelled partition.
report = (
    f"Generated {len(evaluation_data_set)} unique inputs for evaluation.",
    f"Generated {len(passing)} passing inputs for evaluation!",
    f"Generated {len(failing)} failing inputs for evaluation!",
)
print(*report, sep="\n")

Generated 1000 unique inputs for evaluation.
Generated 961 passing inputs for evaluation!
Generated 39 failing inputs for evaluation!


Calculate and display the precision and recall for the diagnostic results based on the test evaluations.

In [8]:
from isla.evaluator import evaluate

# Score the learned constraint against the oracle-labelled evaluation inputs.
def _constraint_holds(inp):
    """Return True when the learned formula evaluates as true for ``inp.tree``."""
    return bool(evaluate(diagnosis[0], inp.tree, grammar))


# True positives: failing inputs the constraint matches.
tp = sum(1 for inp in failing if _constraint_holds(inp))
# False negatives: failing inputs the constraint misses.
fn = len(failing) - tp
# False positives: passing inputs the constraint matches anyway.
fp = sum(1 for inp in passing if _constraint_holds(inp))

# Fall back to 0 when a denominator would be zero.
if (tp + fp) > 0:
    precision = tp / (tp + fp)
else:
    precision = 0

if (tp + fn) > 0:
    recall = tp / (tp + fn)
else:
    recall = 0

print(
    f"The Diagnosis achieved a Precision of {precision*100:.2f}% "
    f"a Recall of {recall*100:.2f}%"
)

The Diagnosis achieved a Precision of 92.00% a Recall of 58.97%
