# Tests4Py Benchmark: MarkUp
This notebook handles benchmark tests for MarkUp in the Tests4Py framework.


In [1]:
# Suppress logging for the notebook; uncomment the last line to disable Avicenna logs
import logging

# This will disable all logging messages
logging.disable(logging.CRITICAL)

### Build Program from Repository

In [2]:
from tests4py.api.logging import deactivate
deactivate()

from debugging_benchmark.tests4py_benchmark.repository import MarkUpBenchmarkRepository

# Initialize the benchmark repository and select the first program
repository = MarkUpBenchmarkRepository()
programs = repository.build()
program = programs[0]  # Assuming there is only one MarkUp Subject; we use markup_1


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Initialize Avicenna

Initialize the `Avicenna` diagnostic system with specific parameters including minimum recall.

In [3]:
# Create an Avicenna instance with configurations for diagnosis
from avicenna import Avicenna

# Convert program to dictionary format for Avicenna initialization
param = program.to_dict()

# Initialize Avicenna with a minimum recall configuration
avicenna = Avicenna(
    **param,
    min_recall=0.7,
)

### Diagnosis Execution and Explanation

In [4]:
# Perform the diagnosis using Avicenna and store the results
from typing import Tuple
from isla.language import Formula

try:
    diagnosis = avicenna.explain()
    print("Diagnosis complete.")
except Exception as e:
    print(f"Error during diagnosis: {e}")

Diagnosis complete.


In [5]:
from isla.language import ISLaUnparser

failure_diagnosis = diagnosis.pop(0)
    
print(f"Avicenna determined the following constraints for {program}:\n")
print(ISLaUnparser(failure_diagnosis.formula).unparse())
print(f"Avicenna calculated a precision of {failure_diagnosis.precision()*100:.2f}% and a recall of {failure_diagnosis.recall()*100:.2f}%", end="\n\n")

Avicenna determined the following constraints for Tests4PyBenchmarkProgram(markup_1):

exists <chars> elem_xy in start:
  inside(elem_xy, start)
Avicenna calculated a precision of 26.19% and a recall of 100.00%



### Super low Precision, Let's try a different Metric

In [18]:
from avicenna.learning.metric import F1ScoreFitness

candidates = avicenna.learner.get_candidates()
strat = F1ScoreFitness()
sorted_ = sorted(candidates, key=lambda c: strat.evaluate(c), reverse=True)

for dia in sorted_[:3]:
    print(ISLaUnparser(dia.formula).unparse())
    print(f"Avicenna calculated a precision of {dia.precision()*100:.2f}% and a recall of {dia.recall()*100:.2f}%", end="\n\n")

exists <char> elem in start:
  (= elem "\"")
Avicenna calculated a precision of 86.96% and a recall of 90.91%

(exists <char> elem in start:
   (= elem "\"") and
forall <html> container in start:
  exists <open> elem_0 in container:
    (= (str.len elem_0) (str.to.int "2")))
Avicenna calculated a precision of 93.33% and a recall of 63.64%

(exists <char> elem in start:
   (= elem "\"") and
forall <html> container in start:
  exists <open> elem_0 in container:
    (= elem_0 "<>"))
Avicenna calculated a precision of 93.33% and a recall of 63.64%



In [19]:
failure_diagnosis = sorted_[0]

The constraint: 

```
exists <char> elem in start:
    (= elem "\"")
```

means that the error is predicted when there is a double quote character (`"`) in the evaluated string or text elements. The diagnosis suggests that the presence of a double quote might be triggering errors, likely due to how these characters are handled or expected within the markup context.

## Evaluation

### Predictor

Generate test inputs using a grammar-based fuzzer, and classify these inputs as passing or failing based on the learned constraints.

In [20]:
from debugging_framework.fuzzingbook.fuzzer import GrammarFuzzer
from debugging_framework.input.input import Input, OracleResult

def generate_inputs(grammar, num_inputs=1000):
    fuzzer = GrammarFuzzer(grammar)
    evaluation_data_set = set()

    while len(evaluation_data_set) < num_inputs:
        tree = fuzzer.fuzz()
        evaluation_data_set.add(Input.from_str(grammar=grammar, input_string=tree))

    return evaluation_data_set

def classify_inputs(program, evaluation_data_set):
    oracle = program.get_oracle()
    failing, passing = set(), set()

    for inp in evaluation_data_set:
        oracle_result, exception = oracle(inp)
        if oracle_result == OracleResult.FAILING:
            failing.add(inp)
        elif oracle_result == OracleResult.PASSING:
            passing.add(inp)

    return passing, failing

In [21]:
grammar = program.get_grammar()
evaluation_data_set = generate_inputs(grammar)
passing, failing = classify_inputs(program, evaluation_data_set)

print(f"Generated {len(evaluation_data_set)} unique inputs for evaluation.")
print(f"Generated {len(passing)} passing inputs for evaluation!")
print(f"Generated {len(failing)} failing inputs for evaluation!")

Generated 1000 unique inputs for evaluation.
Generated 963 passing inputs for evaluation!
Generated 37 failing inputs for evaluation!


Calculate and display the precision and recall for the diagnostic results based on the test evaluations.

In [22]:
from isla.evaluator import evaluate

# Calculate Precision and Recall
tp = sum(bool(evaluate(failure_diagnosis.formula, inp.tree, grammar)) for inp in failing)
fn = len(failing) - tp
fp = sum(bool(evaluate(failure_diagnosis.formula, inp.tree, grammar)) for inp in passing)

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

print(f"The Diagnosis achieved a Precision of {precision*100:.2f}% " +
      f"a Recall of {recall*100:.2f}%")

The Diagnosis achieved a Precision of 90.00% a Recall of 48.65%


### Producer

#### Generating more Failing Inputs from Diagnosis

In [23]:
formula = """
exists <char> elem in start:
    (= elem "\\"")
"""

In [24]:
from isla.solver import ISLaSolver

solver = ISLaSolver(
    grammar,
    formula=formula,
    enable_optimized_z3_queries=False
)

In [25]:
failing_inputs = []
for _ in range(100):
    try:
        inp = solver.solve()
        failing_inputs.append(inp)
        # print(str(inp).ljust(30), oracle(inp))
    except StopIteration:
        continue

In [26]:
from typing import List

oracle = program.get_oracle()

producer_failing: List[bool] = []
for inp in failing_inputs:
    oracle_result, exception = oracle(inp)
    producer_failing.append(
        oracle_result.is_failing()
    )

In [27]:
print(f"Generated {len(failing_inputs)} inputs which are expected to be failing. ({sum(not(inp) for inp in producer_failing)} inputs are passing)")

Generated 100 inputs which are expected to be failing. (0 inputs are passing)


#### Generating Passing Inputs by Negating Constraint

In [28]:
# Negated Constraint
formula = """
not(exists <char> elem in start:
    (= elem "\\""))
"""

In [29]:
from isla.solver import ISLaSolver

solver = ISLaSolver(
    grammar,
    formula=formula,
    enable_optimized_z3_queries=False
)

In [30]:
passing_inputs = []
for _ in range(100):
    try:
        inp = solver.solve()
        passing_inputs.append(inp)
        # print(str(inp).ljust(30), oracle(inp))
    except StopIteration:
        continue

In [31]:
oracle = program.get_oracle()

producer_passing: List[bool] = []
for inp in passing_inputs:
    oracle_result, exception = oracle(inp)
    producer_passing.append(
        oracle_result.is_failing()
    )

In [32]:
print(f"Generated {len(passing_inputs)} inputs which are expected to be passing. ({sum(producer_passing)} inputs are failing)")

Generated 11 inputs which are expected to be passing. (1 inputs are failing)


In [33]:
from isla.evaluator import evaluate

# Calculate Precision and Recall
tp = sum(inp for inp in producer_failing)
fn = len(producer_failing) - tp
fp = sum(inp for inp in producer_passing)

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

print(f"Producer Evaluation:")
print(f"The Diagnosis achieved a Precision of {precision*100:.2f}% " +
      f"a Recall of {recall*100:.2f}%")

Producer Evaluation:
The Diagnosis achieved a Precision of 99.01% a Recall of 100.00%
