In [1]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed, so edits to project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from src.evaluation.metrics import aggregate_scores, calculate_scores
from src.data.data_loading import load_config
from src.data.code_processor import parse_code

In [3]:
# Plot style: use the STIX fonts for regular text as well as math text
plt.rcParams.update({
    'mathtext.fontset': 'stix',
    'font.family': 'STIXGeneral',
    'font.size': 12,
})
#%config InlineBackend.figure_format = 'retina'
# Seaborn defaults: 'Paired' color palette, 'notebook' scaling context
sns.set_palette(palette='Paired')
sns.set_context(context='notebook')

# Set working directory

In [4]:
# Set working directory to the project root.
# All relative paths used in this notebook (e.g. 'config/config.yaml') are resolved
# against the project root, which is the parent of the directory the notebook starts in.
# The guard makes this cell safe to re-run: once the config file is reachable from the
# current directory we are already at the root and must not climb another level.
if not os.path.isfile(os.path.join('config', 'config.yaml')):
    os.chdir('..')
os.getcwd()

'C:\\Users\\merti\\PycharmProjects\\cadenza-playwright-llm'

# Load data + config

In [35]:
# Load the default project configuration; its 'paths' and 'dataloading' entries
# are used by the cells below to locate prediction, test script and score files.
config = load_config(config_path='config/config.yaml')

# Scoring
Metrics implemented and used in this project (see `src/evaluation/metrics.py`):
* **Weighted BLEU** $ \in [0.0, 1.0] $: The BLEU score proposed by [Papineni et al. (2002)](https://aclanthology.org/P02-1040.pdf) [1], [2] is a metric that measures the similarity between two sequences of text. The weighted BLEU score is a variant implemented in this project that uses a weighted average of the BLEU scores of the precondition part and the actually generated additional part of the generated test script. The weights are defined in the configuration file `config/config.yaml`.
* **Success Rate** $ \in [0.0, 1.0] $: The success rate is the proportion of generated test scripts that run successfully, no matter if they are semantically correct or not.
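* **Levenshtein Distance** $ d(s, t) \in \mathbb{N} $: The Levenshtein distance between strings $ s $ and $ t $ is an integer that measures the minimum number of single-character edits (insertions, deletions, or substitutions) required to change $ s $ into $ t $. Note that the values reported in this notebook are fractional, i.e. the implementation appears to return a normalized variant of this distance rather than the raw edit count (see `src/evaluation/metrics.py` for the exact definition).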

**Manual Evaluation**
First we are going to test the core functionality of the scoring functions by defining some example test cases and running the scoring functions on them.

In [6]:
# Define example test cases.
# Each entry pairs the generated script of a test step with its reference
# (validation) script and with the reference script of the preceding step,
# which serves as the precondition code.
test_cases = [
    {
        'test_case': case_id,
        'test_step': step_id,
        'generated_code': parse_code(
            config['paths']['prediction_dir'] + f'/{case_id}_{step_id}.spec.ts'
        ),
        'validation_code': parse_code(
            config['dataloading']['test_script_dir'] + f'/{case_id}_{step_id}.spec.ts'
        ),
        'precondition_code': parse_code(
            config['dataloading']['test_script_dir'] + f'/{case_id}_{precondition_step_id}.spec.ts'
        ),
    }
    for case_id, step_id, precondition_step_id in (('1', '2', '1'), ('2', '2', '1'))
]

In [7]:
# Inspect the generated code of the second example test case
test_cases[1]['generated_code']

"import { test, expect } from '@playwright/test';\nimport { writeFileSync } from 'fs';\n\ntest('test', async ({ page }) => {\n  await page.goto('http://localhost:8080/cadenza/');\n  await page.getByRole('link', { name: 'Anmelden' }).click();\n  await page.getByLabel('Benutzername *').click();\n  await page.getByLabel('Benutzername *').fill('Admin');\n  await page.getByRole('button', { name: 'Anmelden' }).click();\n  await page.getByTestId('create-workbook-button').click();\n});\n"

In [8]:
# Run scoring for the example test cases.
# The result maps each metric name to a list holding one score per test case
# (same order as `test_cases`).
scores = calculate_scores(test_cases, config)
scores

2024-07-19 12:27:23 [[34msrc.evaluation.metrics:70[0m] [DEBUG[0m] >>>> Calculating scores for test case 1_2...[0m
Current working directory: C:\Users\merti\PycharmProjects\cadenza-playwright-llm
Screenshot directory: data\temp\eval_run\screenshot
Evaluation run directory: data\temp\eval_run
Created screenshot directory: data\temp\eval_run\screenshot
Created temp directory: data\temp\eval_run\test_script
Temp file path: data\temp\eval_run\test_script\1_2.spec.ts
File data\temp\eval_run\test_script\1_2.spec.ts created successfully.
Verified that the file data\temp\eval_run\test_script\1_2.spec.ts exists.
2024-07-19 12:27:50 [[34msrc.evaluation.metrics:285[0m] [DEBUG[0m] >>>> Playwright test result: 1[0m
1
2024-07-19 12:27:50 [[34msrc.evaluation.metrics:70[0m] [DEBUG[0m] >>>> Calculating scores for test case 2_2...[0m
Current working directory: C:\Users\merti\PycharmProjects\cadenza-playwright-llm
Screenshot directory: data\temp\eval_run\screenshot
Evaluation run directory: da

{'weighted bleu': [0.4912019643332622, 0.3988451029814817],
 'success rate': [1, 1],
 'levenshtein distance': [0.33620689655172414, 0.2632398753894081]}

In [9]:
# Aggregate the per-test-case scores into a single value per metric
agg_scores = aggregate_scores(scores)
agg_scores

{'weighted bleu': 0.4450235336573719,
 'success rate': 1.0,
 'levenshtein distance': 0.29972338597056614}

**Automated Evaluation**
For simplicity an evaluation test script is implemented in `scripts/evaluation.py` that runs the scoring functions on all test cases available in the prediction directory defined in the configuration file `config/config.yaml`. The evaluation results are also automatically saved as a pickle file in the scoring results directory defined in the configuration file `config/config.yaml`.

In [10]:
# Run complete automated evaluation script
# (`!` runs the command in a system shell; both paths are relative to the
# current working directory, i.e. the project root)
!python scripts/evaluation.py --config=config/config.yaml

2024-07-19 12:28:00 [[34m__main__:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-19 12:28:00 [[34msrc.evaluation.metrics:70[0m] [DEBUG[0m] >>>> Calculating scores for test case 1_2...[0m
Error: No tests found.
Make sure that arguments are regular expressions matching test files.
You may need to escape symbols like "$" or "*" and quote the arguments.

Current working directory: C:\Users\merti\PycharmProjects\cadenza-playwright-llm
Screenshot directory: data\temp\eval_run\screenshot
Evaluation run directory: data\temp\eval_run
Created screenshot directory: data\temp\eval_run\screenshot
Created temp directory: data\temp\eval_run\test_script
Temp file path: data\temp\eval_run\test_script\1_2.spec.ts
File data\temp\eval_run\test_script\1_2.spec.ts created successfully.
Verified that the file data\temp\eval_run\test_script\1_2.spec.ts exists.
2024-07-19 12:28:04 [[34msrc.evaluation.metrics:285[0m] [DEBUG[0m] >>>> Playwright test result: 1[0m
1
2024-07-19 12:28:04 [[

It is also possible to run the evaluation inside the notebook without launching the script as a separate process, by importing the script's `evaluate_test_cases()` function and calling it directly:

In [45]:
# Run complete automated evaluation inside notebook
# NOTE(review): this import sits outside the import cell at the top of the notebook;
# consider moving it there so all dependencies are visible in one place.
from scripts.evaluation import evaluate_test_cases
# NOTE(review): `results` is reassigned further down when the saved scores are loaded.
results = evaluate_test_cases(config)
results

2024-07-19 15:07:51 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-19 15:07:51 [[34mscripts.evaluation:43[0m] [DEBUG[0m] >>>> Test cases: [{'test_case': '1', 'test_step': '2', 'generated_code': "import { test, expect } from '@playwright/test';\nimport { writeFileSync } from 'fs';\n\ntest('test', async ({ page }) => {\n  await page.goto('http://localhost:8080/cadenza/');\n  await page.getByRole('link', { name: 'Anmelden' }).click();\n  await page.getByLabel('Benutzername *').click();\n  await page.getByLabel('Benutzername *').fill('Admin');\n  await page.getByLabel('Benutzername *').press('Tab');\n  await page.getByPlaceholder(' ').fill('Admin');\n  await page.getByRole('button', { name: 'Anmelden' }).click();\n  await page.getByText('Verzeichnis Gewässergüte', { exact: true }).click();\n  const parentElement = await page.getByText('Arbeitsmappe Übersicht Messstellen').locator('..');\n  await parentElement.locator('.d-icon.d-icon-bold.status-i

Unnamed: 0,file_id,test_case,test_step,weighted bleu,success rate,levenshtein distance
0,1_2,1,2,0.500697,1,0.092308
1,2_2,2,2,0.398845,1,0.26324


# Run Scoring on all Different Predictions
We generated different predictions using different input configurations. Now we want to evaluate all of them and compare the results.

In [18]:
# Adjust file names from pred.ts to spec.ts
def rename_files_in_directory(directory: str, old_suffix: str = '.pred.ts', new_suffix: str = '.spec.ts') -> None:
    """Rename every entry of ``directory`` that ends in ``old_suffix`` to end in ``new_suffix``.

    Only the trailing suffix is exchanged; an occurrence of ``old_suffix`` earlier
    in the file name is left untouched. Entries that do not end in ``old_suffix``
    are ignored, so calling the function again after a successful run is a no-op.
    One line is printed per renamed file.

    Args:
        directory: Path of the directory whose entries are renamed (not recursive).
        old_suffix: File name ending to look for. Defaults to ``'.pred.ts'``.
        new_suffix: File name ending to put in its place. Defaults to ``'.spec.ts'``.

    Raises:
        OSError: Propagated from ``os.listdir`` / ``os.rename``, e.g. if the
            directory does not exist.
    """
    # Sorted so that the order of renames (and of the printed log) is deterministic
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith(old_suffix):
            continue
        # Cut off exactly the trailing suffix; str.replace would also rewrite
        # any earlier occurrence of the suffix inside the name
        stem = file_name[:len(file_name) - len(old_suffix)]
        new_file_name = stem + new_suffix
        # Create the full paths to the old and new file names
        old_file_path = os.path.join(directory, file_name)
        new_file_path = os.path.join(directory, new_file_name)
        # Rename the file
        os.rename(old_file_path, new_file_path)
        print(f"Renamed '{old_file_path}' to '{new_file_path}'")

**Input Configuration 1:**
* HTML Concat Mode: Single
* Max HTML Attribute Length: 50
* Includes: HTML, Screenshot, Precondition Code

In [19]:
# Rename files in the directory to make sure they have the correct file extension
# so that playwright test runner can run them.
# Files that do not end in '.pred.ts' are left alone, so re-running this cell
# after a successful rename changes nothing.
directory_path = './data/prediction/pred_test_script_template_1_html_concat_mode_single_max_attr_length_50_pretrained'
rename_files_in_directory(directory_path)

Renamed './data/prediction/pred_test_script_template_1_html_concat_mode_single_max_attr_length_50_pretrained\10_1.pred.ts' to './data/prediction/pred_test_script_template_1_html_concat_mode_single_max_attr_length_50_pretrained\10_1.spec.ts'
Renamed './data/prediction/pred_test_script_template_1_html_concat_mode_single_max_attr_length_50_pretrained\10_2.pred.ts' to './data/prediction/pred_test_script_template_1_html_concat_mode_single_max_attr_length_50_pretrained\10_2.spec.ts'
Renamed './data/prediction/pred_test_script_template_1_html_concat_mode_single_max_attr_length_50_pretrained\10_3.pred.ts' to './data/prediction/pred_test_script_template_1_html_concat_mode_single_max_attr_length_50_pretrained\10_3.spec.ts'
Renamed './data/prediction/pred_test_script_template_1_html_concat_mode_single_max_attr_length_50_pretrained\10_4.pred.ts' to './data/prediction/pred_test_script_template_1_html_concat_mode_single_max_attr_length_50_pretrained\10_4.spec.ts'
Renamed './data/prediction/pred_test

In [32]:
# Load the configuration used to evaluate the predictions of input configuration 1
config_1 = load_config(config_path='config/config_pred_01.yaml')

In [43]:
%%time
# Score all predictions of input configuration 1 (%%time must stay on the first line of the cell).
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt and the
# execution counts around it are out of order; re-run from a fresh kernel before
# relying on `result_1`.
result_1 = evaluate_test_cases(config_1)

2024-07-19 15:06:21 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-19 15:06:22 [[34mscripts.evaluation:43[0m] [DEBUG[0m] >>>> Test cases: [{'test_case': '10', 'test_step': '1', 'generated_code': "import { test, expect } from '@playwright/test';\nimport { writeFileSync } from 'fs';\n\ntest('Cadenz Test', async ({ page }) => {\n  await page.goto('http://localhost:8080/cadenza/');\n  await page.getByRole('link', { name: 'Anmelden' }).click();\n  await page.getByLabel('Benutzername *').click();\n  await page.getByLabel('Benutzername *').fill('Admin');\n  await page.getByRole('button', { name: 'Anmelden' }).click();\n\n  // Additional actions based on UI test description\n  // ...\n\n  // Take a screenshot of the page\n  await page.screenshot({ path: 'cadenz_test.png' });\n\n  // Write the screenshot to a file\n  writeFileSync('cadenz_test.png', '');\n});", 'validation_code': "import { test, expect } from '@playwright/test';\nimport { writeFileSyn

KeyboardInterrupt: 

In [34]:
# Show the per-test-step scores for input configuration 1
result_1

Unnamed: 0,file_id,test_case,test_step,weighted bleu,success rate,levenshtein distance
0,10_1,10,1,0.402686,1,0.239198
1,10_2,10,2,0.403467,1,0.266467
2,10_3,10,3,0.000000,1,0.814661
3,10_4,10,4,0.435755,1,0.199507
4,10_5,10,5,0.469436,1,0.124567
...,...,...,...,...,...,...
95,9_2,9,2,0.403467,1,0.266467
96,9_3,9,3,0.000000,1,0.816187
97,9_4,9,4,0.000000,1,0.831671
98,9_5,9,5,0.469601,1,0.139050


# Evaluation

In [29]:
# Load saved evaluation results.
# os.path.join inserts a path separator only when one is needed, so the path is
# correct whether or not the configured scores directory ends with a slash.
# NOTE(review): unpickling can execute arbitrary code; only load score files that
# were written by this project's own evaluation runs.
results = pd.read_pickle(os.path.join(config['paths']['scores_dir'], 'eval_scores_20240719-135530.pkl'))
results

Unnamed: 0,file_id,test_case,test_step,weighted bleu,success rate,levenshtein distance
0,10_1,10,1,0.385737,0.0,0.291425
1,10_2,10,2,0.385737,0.0,0.291425
2,10_3,10,3,0.385737,0.0,0.291425
3,10_4,10,4,0.385737,0.0,0.291425
4,10_5,10,5,0.385737,0.0,0.291425
...,...,...,...,...,...,...
95,9_2,9,2,0.385737,0.0,0.291425
96,9_3,9,3,0.385737,0.0,0.291425
97,9_4,9,4,0.385737,0.0,0.291425
98,9_5,9,5,0.385737,0.0,0.291425


In [None]:
# Display evaluation results in a boxplot.
# Only the metric columns are plotted: passing the whole frame to a wide-form
# boxplot would also draw identifier columns such as test_case / test_step
# whenever they are stored as numbers.
metric_columns = ['weighted bleu', 'success rate', 'levenshtein distance']
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=results[metric_columns], ax=ax, orient='h')
ax.set_title('Evaluation Results', fontsize=16, fontweight='bold')
ax.set_xlabel('Score')
plt.xticks(rotation=45)
plt.show()

# References
[1]
```bibtex
@INPROCEEDINGS{Papineni02bleu:a,
    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
    title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
    booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics},
    year = {2002},
    pages = {311--318}
}
```
[2]
```bibtex
@inproceedings{lin-och-2004-orange,
    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
    author = "Lin, Chin-Yew  and
      Och, Franz Josef",
    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
    month = "aug 23{--}aug 27",
    year = "2004",
    address = "Geneva, Switzerland",
    publisher = "COLING",
    url = "https://www.aclweb.org/anthology/C04-1072",
    pages = "501--507",
}
```
