In [1]:
# Enable IPython's autoreload extension in mode 2, so imported project modules
# are re-imported before each cell runs and edits under src/ are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import os

from src.evaluation.metrics import aggregate_scores, calculate_scores
from src.data.data_loading import load_config
from src.data.code_processor import parse_code

In [3]:
# Set the working directory to the project root.
# The guard makes this cell safe to re-run: it only moves one level up while
# the config file (loaded below via the relative path 'config/config.yaml') is
# not reachable from the current directory. An unconditional chdir('..') would
# climb one directory further on every execution.
if not os.path.isfile('config/config.yaml'):
    os.chdir('..')
os.getcwd()

'C:\\Users\\merti\\PycharmProjects\\cadenza-playwright-llm'

In [4]:
# Load the project configuration. The path is relative, so it resolves against
# the current working directory, which the chdir cell above must already have
# moved to the project root.
config = load_config(config_path='config/config.yaml')

# Scoring
Metrics implemented and used in this project (see `src/evaluation/metrics.py`):
* **Weighted BLEU** $ \in [0.0, 1.0] $: The BLEU score proposed by [Papineni et al. (2002)](https://aclanthology.org/P02-1040.pdf) [1], [2] is a metric that measures the similarity between two sequences of text. The weighted BLEU score is a variant implemented in this project that takes a weighted average of two BLEU scores: one for the precondition part and one for the newly generated part of the generated test script. The weights are defined in the configuration file `config/config.yaml`.
* **Success Rate** $ \in [0.0, 1.0] $: The success rate is the proportion of generated test scripts that run successfully, no matter if they are semantically correct or not.
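* **Levenshtein Distance** $ d(s, t) \in \mathbb{N} $: The Levenshtein distance between strings $ s $ and $ t $ is an integer that measures the minimum number of single-character edits (insertions, deletions, or substitutions) required to change $ s $ into $ t $. Note that the values reported in the cells below are fractions (e.g. $ 0.336 $), which suggests that the implementation returns a normalized variant rather than the raw edit count; see `src/evaluation/metrics.py` for the exact definition.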

In [28]:
# Build the example test cases.
# For each case id, the prediction '<id>_2.pred.ts' is paired with the script
# '<id>_2.spec.ts' (validation) and the script '<id>_1.spec.ts' (precondition).
test_cases = [
    {
        'generated_code': parse_code(config['paths']['prediction_dir'] + f'/{case_id}_2.pred.ts'),
        'validation_code': parse_code(config['dataloading']['test_script_dir'] + f'/{case_id}_2.spec.ts'),
        'precondition_code': parse_code(config['dataloading']['test_script_dir'] + f'/{case_id}_1.spec.ts'),
    }
    for case_id in (1, 2)
]

2024-07-11 19:25:36 [[34msrc.data.code_processor:15[0m] [DEBUG[0m] >>>> Code parsed successfully. - Lines of Code: 16[0m
2024-07-11 19:25:36 [[34msrc.data.code_processor:15[0m] [DEBUG[0m] >>>> Code parsed successfully. - Lines of Code: 31[0m
2024-07-11 19:25:36 [[34msrc.data.code_processor:15[0m] [DEBUG[0m] >>>> Code parsed successfully. - Lines of Code: 32[0m
2024-07-11 19:25:36 [[34msrc.data.code_processor:15[0m] [DEBUG[0m] >>>> Code parsed successfully. - Lines of Code: 12[0m
2024-07-11 19:25:36 [[34msrc.data.code_processor:15[0m] [DEBUG[0m] >>>> Code parsed successfully. - Lines of Code: 13[0m
2024-07-11 19:25:36 [[34msrc.data.code_processor:15[0m] [DEBUG[0m] >>>> Code parsed successfully. - Lines of Code: 13[0m


In [29]:
# Smoke-test the scorer on the first example only. calculate_scores is handed a
# list of test cases, so the single case is wrapped in one.
# The result gets its own name so that re-running this cell cannot overwrite the
# all-cases `scores` variable that the aggregation cell reads.
first_case_scores = calculate_scores([test_cases[0]])
first_case_scores

{'weighted bleu': [0.4912019643332622],
 'success rate': [None],
 'levenshtein distance': [0.33620689655172414]}

In [30]:
# Score every example at once. The recorded output is a dict with one list per
# metric, holding one entry per test case in input order (the first entries
# match the single-case run above).
scores = calculate_scores(test_cases)
scores

{'weighted bleu': [0.4912019643332622, 0.3995064945165301],
 'success rate': [None, None],
 'levenshtein distance': [0.33620689655172414, 0.2632398753894081]}

In [32]:
# Aggregate the per-case score lists into a single value per metric.
# In the recorded output the aggregated numbers equal the mean of the per-case
# values, and 'success rate' stays None because no per-case value was available.
agg_scores = aggregate_scores(scores)
agg_scores



{'weighted bleu': 0.4453542294248961,
 'success rate': None,
 'levenshtein distance': 0.29972338597056614}

In [None]:
# Run evaluation script

# References
[1]
```bibtex
@INPROCEEDINGS{Papineni02bleu:a,
    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
    title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
    booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics},
    year = {2002},
    pages = {311--318}
}
```
[2]
```bibtex
@inproceedings{lin-och-2004-orange,
    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
    author = "Lin, Chin-Yew  and
      Och, Franz Josef",
    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
    month = "aug 23{--}aug 27",
    year = "2004",
    address = "Geneva, Switzerland",
    publisher = "COLING",
    url = "https://www.aclweb.org/anthology/C04-1072",
    pages = "501--507",
}
```
