In [1]:
# Reload edited project modules automatically before each cell is executed
%load_ext autoreload
%autoreload 2

# Imports

In [12]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

from src.evaluation.metrics import aggregate_scores, calculate_scores
from src.data.data_loading import load_config
from src.data.code_processor import parse_code

In [3]:
# Plot appearance: STIX fonts for text and math, 12 pt base font size
plt.rcParams.update({
    'mathtext.fontset': 'stix',
    'font.family': 'STIXGeneral',
    'font.size': 12,
})
#%config InlineBackend.figure_format = 'retina'
# Seaborn defaults: 'Paired' colour cycle and 'notebook' scaling context
sns.set_palette('Paired')
sns.set_context('notebook')

# Set working directory

In [4]:
# Set working directory to the project root.
# The guard makes this cell safe to re-run: it only moves up one level while the
# project's config file is not reachable from the current directory, so a second
# execution no longer climbs above the project root.
if not os.path.isfile(os.path.join('config', 'config.yaml')):
    os.chdir('..')
os.getcwd()

'C:\\Users\\merti\\PycharmProjects\\cadenza-playwright-llm'

# Load data + config

In [5]:
# Read the project configuration (paths, data-loading settings, metric weights)
CONFIG_PATH = 'config/config.yaml'
config = load_config(config_path=CONFIG_PATH)

# Scoring
Metrics implemented and used in this project (see `src/evaluation/metrics.py`):
* **Weighted BLEU** $ \in [0.0, 1.0] $: The BLEU score proposed by [Papineni et al. (2002)](https://aclanthology.org/P02-1040.pdf) [1], [2] is a metric that measures the similarity between two sequences of text. The weighted BLEU score is a variant implemented in this project that uses a weighted average of the BLEU scores of the precondition part and the actual additional part generated in the test script. The weights are defined in the configuration file `config/config.yaml`.
* **Success Rate** $ \in [0.0, 1.0] $: The success rate is the proportion of generated test scripts that run successfully, no matter if they are semantically correct or not.
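* **Levenshtein Distance** $ d(s, t) \in \mathbb{N} $: The Levenshtein distance between strings $ s $ and $ t $ is an integer that measures the minimum number of single-character edits (insertions, deletions, or substitutions) required to change $ s $ into $ t $. Note that the values reported below are fractions between 0 and 1, so the implementation appears to normalize the distance (presumably by string length; see `src/evaluation/metrics.py`).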

**Manual Evaluation**
First we are going to test the core functionality of the scoring functions by defining some example test cases and running the scoring functions on them.

In [6]:
# Define example test cases
def _example_test_case(test_case: str, test_step: str, precondition_step: str) -> dict:
    """Load the scripts belonging to one example test step.

    The generated script is read from the prediction directory; the validation
    script (same case/step) and the precondition script (same case,
    ``precondition_step``) are read from the test script directory.
    """
    script_dir = config['dataloading']['test_script_dir']
    step_file = f'/{test_case}_{test_step}.spec.ts'
    return {
        'test_case': test_case,
        'test_step': test_step,
        'generated_code': parse_code(config['paths']['prediction_dir'] + step_file),
        'validation_code': parse_code(script_dir + step_file),
        'precondition_code': parse_code(script_dir + f'/{test_case}_{precondition_step}.spec.ts'),
    }


test_cases = [
    _example_test_case('1', '2', precondition_step='1'),
    _example_test_case('2', '2', precondition_step='1'),
]

In [7]:
# Inspect the generated Playwright script of the second example test case
test_cases[1]['generated_code']

"import { test, expect } from '@playwright/test';\nimport { writeFileSync } from 'fs';\n\ntest('test', async ({ page }) => {\n  await page.goto('http://localhost:8080/cadenza/');\n  await page.getByRole('link', { name: 'Anmelden' }).click();\n  await page.getByLabel('Benutzername *').click();\n  await page.getByLabel('Benutzername *').fill('Admin');\n  await page.getByRole('button', { name: 'Anmelden' }).click();\n  await page.getByTestId('create-workbook-button').click();\n});\n"

In [8]:
# Run scoring for test cases
# `scores` maps each metric name to a list holding one value per test case
scores = calculate_scores(test_cases, config)
scores

2024-07-20 11:21:32 [[34msrc.evaluation.metrics:86[0m] [DEBUG[0m] >>>> Calculating scores for test case 1_2...[0m
data\temp\eval_run\screenshots
2024-07-20 11:21:32 [[34msrc.evaluation.metrics:209[0m] [DEBUG[0m] >>>> Current working directory: C:\Users\merti\PycharmProjects\cadenza-playwright-llm[0m
2024-07-20 11:21:32 [[34msrc.evaluation.metrics:210[0m] [DEBUG[0m] >>>> Screenshot directory: data\temp\eval_run\screenshots[0m
2024-07-20 11:21:32 [[34msrc.evaluation.metrics:211[0m] [DEBUG[0m] >>>> Evaluation run directory: data\temp\eval_run[0m
2024-07-20 11:21:32 [[34msrc.evaluation.metrics:219[0m] [DEBUG[0m] >>>> Screenshot path: data/temp/eval_run/screenshots/1_2.png[0m
2024-07-20 11:21:32 [[34msrc.evaluation.metrics:258[0m] [DEBUG[0m] >>>> Created temp directory: data\temp\eval_run\test_script[0m
2024-07-20 11:21:32 [[34msrc.evaluation.metrics:261[0m] [DEBUG[0m] >>>> Temp file path: data\temp\eval_run\test_script\1_2.spec.ts[0m
2024-07-20 11:21:32 [[34msr

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\merti/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:04<00:00, 10.6MB/s]

2024-07-20 11:21:56 [[34msrc.evaluation.metrics:86[0m] [DEBUG[0m] >>>> Calculating scores for test case 2_2...[0m





data\temp\eval_run\screenshots
2024-07-20 11:21:56 [[34msrc.evaluation.metrics:209[0m] [DEBUG[0m] >>>> Current working directory: C:\Users\merti\PycharmProjects\cadenza-playwright-llm[0m
2024-07-20 11:21:56 [[34msrc.evaluation.metrics:210[0m] [DEBUG[0m] >>>> Screenshot directory: data\temp\eval_run\screenshots[0m
2024-07-20 11:21:56 [[34msrc.evaluation.metrics:211[0m] [DEBUG[0m] >>>> Evaluation run directory: data\temp\eval_run[0m
2024-07-20 11:21:56 [[34msrc.evaluation.metrics:219[0m] [DEBUG[0m] >>>> Screenshot path: data/temp/eval_run/screenshots/2_2.png[0m
2024-07-20 11:21:56 [[34msrc.evaluation.metrics:258[0m] [DEBUG[0m] >>>> Created temp directory: data\temp\eval_run\test_script[0m
2024-07-20 11:21:56 [[34msrc.evaluation.metrics:261[0m] [DEBUG[0m] >>>> Temp file path: data\temp\eval_run\test_script\2_2.spec.ts[0m
2024-07-20 11:21:56 [[34msrc.evaluation.metrics:267[0m] [DEBUG[0m] >>>> File data\temp\eval_run\test_script\2_2.spec.ts created successfully.[

{'weighted bleu': [0.5006969523919225, 0.3988451029814817],
 'success rate': [1, 1],
 'levenshtein distance': [0.09230769230769231, 0.2632398753894081],
 'similarity': [0, 0]}

In [9]:
# Aggregate scores
# Collapses the per-test-case lists into a single value per metric
# (appears to be the mean; 'similarity' comes back as None in this run)
agg_scores = aggregate_scores(scores)
agg_scores



{'weighted bleu': 0.4497710276867021,
 'success rate': 1.0,
 'levenshtein distance': 0.1777737838485502,
 'similarity': None}

**Automated Evaluation**
For convenience, the evaluation script `scripts/evaluation.py` runs the scoring functions on all test cases available in the prediction directory defined in the configuration file `config/config.yaml`. The evaluation results are also automatically saved as a pickle file in the scoring results directory defined in the same configuration file.

In [10]:
# Run complete automated evaluation script
# (executed as a shell command; both paths are relative to the current working directory)
!python scripts/evaluation.py --config=config/config.yaml

2024-07-20 11:22:05 [[34m__main__:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 11:22:05 [[34m__main__:43[0m] [DEBUG[0m] >>>> Test cases: [{'test_case': '1', 'test_step': '2', 'generated_code': "import { test, expect } from '@playwright/test';\nimport { writeFileSync } from 'fs';\n\ntest('test', async ({ page }) => {\n  await page.goto('http://localhost:8080/cadenza/');\n  await page.getByRole('link', { name: 'Anmelden' }).click();\n  await page.getByLabel('Benutzername *').click();\n  await page.getByLabel('Benutzername *').fill('Admin');\n  await page.getByLabel('Benutzername *').press('Tab');\n  await page.getByPlaceholder(' ').fill('Admin');\n  await page.getByRole('button', { name: 'Anmelden' }).click();\n  await page.getByText('Verzeichnis Gewässergüte', { exact: true }).click();\n  const parentElement = await page.getByText('Arbeitsmappe Übersicht Messstellen').locator('..');\n  await parentElement.locator('.d-icon.d-icon-bold.status-icon').click();\n});\



It is also possible to evaluate the test cases inside the notebook without running the whole script, by directly calling the script's main function, `evaluate_test_cases()`:

In [11]:
# Run complete automated evaluation inside notebook
# NOTE(review): this import sits outside the notebook's import cell; consider
# moving it there if `scripts` is importable at that point — TODO confirm
from scripts.evaluation import evaluate_test_cases
results = evaluate_test_cases(config)
results

2024-07-20 11:22:12 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 11:22:12 [[34mscripts.evaluation:43[0m] [DEBUG[0m] >>>> Test cases: [{'test_case': '1', 'test_step': '2', 'generated_code': "import { test, expect } from '@playwright/test';\nimport { writeFileSync } from 'fs';\n\ntest('test', async ({ page }) => {\n  await page.goto('http://localhost:8080/cadenza/');\n  await page.getByRole('link', { name: 'Anmelden' }).click();\n  await page.getByLabel('Benutzername *').click();\n  await page.getByLabel('Benutzername *').fill('Admin');\n  await page.getByLabel('Benutzername *').press('Tab');\n  await page.getByPlaceholder(' ').fill('Admin');\n  await page.getByRole('button', { name: 'Anmelden' }).click();\n  await page.getByText('Verzeichnis Gewässergüte', { exact: true }).click();\n  const parentElement = await page.getByText('Arbeitsmappe Übersicht Messstellen').locator('..');\n  await parentElement.locator('.d-icon.d-icon-bold.status-i

Unnamed: 0,file_id,test_case,test_step,weighted bleu,success rate,levenshtein distance,similarity
0,1_2,1,2,0.500697,1,0.092308,0
1,2_2,2,2,0.398845,1,0.26324,0


# Run Scoring on all Different Predictions
We generated different predictions using different input configurations. Now we want to evaluate all of them and compare the results.

In [14]:
# Adjust file names from pred.ts to spec.ts
def rename_files_in_directory(directory: str) -> None:
    """Rename every ``*.pred.ts`` file in ``directory`` to ``*.spec.ts``.

    Only the trailing ``.pred.ts`` suffix is swapped; any earlier occurrence of
    that text inside the file name is left untouched. Entries without the
    suffix are ignored, and each rename is reported on stdout.

    Args:
        directory: Path of the directory whose entries are scanned
            (non-recursive).

    Raises:
        OSError: Propagated from ``os.listdir`` / ``os.rename``, e.g. when
            ``directory`` does not exist.
    """
    old_suffix = '.pred.ts'
    new_suffix = '.spec.ts'
    for file_name in os.listdir(directory):
        if not file_name.endswith(old_suffix):
            continue
        # Slice off the suffix instead of using str.replace(), which would
        # also rewrite '.pred.ts' occurring earlier in the name.
        new_file_name = file_name[:-len(old_suffix)] + new_suffix
        old_file_path = os.path.join(directory, file_name)
        new_file_path = os.path.join(directory, new_file_name)
        os.rename(old_file_path, new_file_path)
        print(f"Renamed '{old_file_path}' to '{new_file_path}'")

In [34]:
%%time
# Go through prediction directory and run the renaming function and evaluation function on each directory
results = []
for dir in tqdm(os.listdir("data/prediction/")):
    if dir == '.gitkeep':
        continue
    print('\n'+'='*80+'\n')
    print(f"Processing directory '{dir}'...")
    rename_files_in_directory("data/prediction/" + dir)
    config['paths']['prediction_dir'] = "data/prediction/" + dir + "/"
    result = evaluate_test_cases(config)
    results.append(result)

# merge
results = pd.concat(results).reset_index(drop=True)
results.to_pickle(config['paths']['scores_dir']+'eval_scores_all.pkl')

  0%|          | 0/25 [00:00<?, ?it/s]



Processing directory 'pred_test_script'...
2024-07-20 12:11:06 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:11:12 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 12:11:12 [[34mscripts.evaluation:49[0m] [[32mINFO[0m] >>>> Aggregating...[0m
2024-07-20 12:11:12 [[34mscripts.evaluation:52[0m] [[32mINFO[0m] >>>> Scores aggregated: {'weighted bleu': 0.4497710276867021, 'success rate': 1.0, 'levenshtein distance': 0.1777737838485502, 'similarity': None}[0m
2024-07-20 12:11:12 [[34mscripts.evaluation:66[0m] [[32mINFO[0m] >>>> Results saved to ./data/scores/eval_scores_20240720-121112.pkl[0m


  8%|▊         | 2/25 [00:05<01:08,  3.00s/it]



Processing directory 'pred_test_script_finetuned_T1_sc+_html+_single'...
2024-07-20 12:11:12 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:11:22 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_4: Line 29: Unexpected token ILLEGAL[0m
2024-07-20 12:11:38 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 11_3: Line 28: Unexpected token ILLEGAL[0m
2024-07-20 12:11:44 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_1: Line 22: Unexpected token ILLEGAL[0m
2024-07-20 12:11:50 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_3: Line 24: Unexpected token ILLEGAL[0m
2024-07-20 12:12:10 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 14_1: Line 29: Unex

 12%|█▏        | 3/25 [05:22<49:04, 133.82s/it]



Processing directory 'pred_test_script_finetuned_T1_sc+_html+_single_test_set'...
2024-07-20 12:16:29 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:16:36 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 15_3: Line 24: Unexpected token ILLEGAL[0m
2024-07-20 12:16:42 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 15_5: Line 26: Unexpected token ILLEGAL[0m
2024-07-20 12:16:45 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 28_1: Line 29: Unexpected token ILLEGAL[0m
2024-07-20 12:17:32 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 9_6: Line 23: Unexpected token ILLEGAL[0m
2024-07-20 12:17:35 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 12:17:35 [[34mscripts.evalua

 16%|█▌        | 4/25 [06:29<38:00, 108.61s/it]



Processing directory 'pred_test_script_finetuned_T1_sc-_html+_single'...
2024-07-20 12:17:35 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:18:13 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_3: Line 23: Unexpected token ILLEGAL[0m
2024-07-20 12:18:16 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_4: Line 21: Unexpected token ILLEGAL[0m
2024-07-20 12:19:49 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 1_3: Line 23: Unexpected token ILLEGAL[0m
2024-07-20 12:21:36 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 3_2: Line 21: Unexpected token ILLEGAL[0m
2024-07-20 12:23:16 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 12:23:16 [[34mscripts.evaluation:49[0

 20%|██        | 5/25 [12:09<1:03:05, 189.26s/it]



Processing directory 'pred_test_script_finetuned_T1_sc-_html+_single_test_set'...
2024-07-20 12:23:16 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:24:17 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 12:24:17 [[34mscripts.evaluation:49[0m] [[32mINFO[0m] >>>> Aggregating...[0m
2024-07-20 12:24:17 [[34mscripts.evaluation:52[0m] [[32mINFO[0m] >>>> Scores aggregated: {'weighted bleu': 0.7972442574430477, 'success rate': 1.0, 'levenshtein distance': 0.05943636180917157, 'similarity': None}[0m
2024-07-20 12:24:17 [[34mscripts.evaluation:66[0m] [[32mINFO[0m] >>>> Results saved to ./data/scores/eval_scores_20240720-122417.pkl[0m


 24%|██▍       | 6/25 [13:11<46:29, 146.82s/it]  



Processing directory 'pred_test_script_finetuned_T5_sc+_html+_single'...
2024-07-20 12:24:17 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:24:18 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_1: Line 30: Unexpected token ILLEGAL[0m
2024-07-20 12:24:27 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_4: Line 25: Unexpected token ILLEGAL[0m
2024-07-20 12:24:41 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 11_3: Line 26: Unexpected token ILLEGAL[0m
2024-07-20 12:25:01 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 13_1: Line 30: Unexpected token ILLEGAL[0m
2024-07-20 12:25:19 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 15_1: Line 30: Unex

 28%|██▊       | 7/25 [18:06<58:25, 194.74s/it]



Processing directory 'pred_test_script_finetuned_T5_sc+_html+_single_test_set'...
2024-07-20 12:29:13 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:29:13 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 15_1: Line 30: Unexpected token ILLEGAL[0m
2024-07-20 12:29:28 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 28_1: Line 30: Unexpected token ILLEGAL[0m
2024-07-20 12:29:59 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 9_1: Line 30: Unexpected token ILLEGAL[0m
2024-07-20 12:30:16 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 12:30:16 [[34mscripts.evaluation:49[0m] [[32mINFO[0m] >>>> Aggregating...[0m
2024-07-20 12:30:16 [[34mscripts.evaluation:52[0m] [[32mINFO[0m] >>>> Scores aggregated: {'weighted bleu': 0.5654

 32%|███▏      | 8/25 [19:10<43:26, 153.33s/it]



Processing directory 'pred_test_script_finetuned_T5_sc-_html+_single'...
2024-07-20 12:30:16 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:33:42 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 30_3: Line 26: Unexpected token ILLEGAL[0m
2024-07-20 12:33:48 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 3_1: Line 11: Unexpected token ILLEGAL[0m
2024-07-20 12:33:51 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 3_2: Line 21: Unexpected token ILLEGAL[0m
2024-07-20 12:35:17 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 12:35:17 [[34mscripts.evaluation:49[0m] [[32mINFO[0m] >>>> Aggregating...[0m
2024-07-20 12:35:17 [[34mscripts.evaluation:52[0m] [[32mINFO[0m] >>>> Scores aggregated: {'weighted bleu': 0.77125753798096

 36%|███▌      | 9/25 [24:10<53:02, 198.92s/it]



Processing directory 'pred_test_script_finetuned_T5_sc-_html+_single_test_set'...
2024-07-20 12:35:17 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:35:56 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 30_3: Line 26: Unexpected token ILLEGAL[0m
2024-07-20 12:36:19 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 12:36:19 [[34mscripts.evaluation:49[0m] [[32mINFO[0m] >>>> Aggregating...[0m
2024-07-20 12:36:19 [[34mscripts.evaluation:52[0m] [[32mINFO[0m] >>>> Scores aggregated: {'weighted bleu': 0.7545435429993087, 'success rate': 1.0, 'levenshtein distance': 0.06334898614530313, 'similarity': None}[0m
2024-07-20 12:36:19 [[34mscripts.evaluation:66[0m] [[32mINFO[0m] >>>> Results saved to ./data/scores/eval_scores_20240720-123619.pkl[0m


 40%|████      | 10/25 [25:12<39:15, 157.03s/it]



Processing directory 'pred_test_script_pretr_T1_sc+_html+_all'...
2024-07-20 12:36:19 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:36:25 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_3: Line 28: Unexpected token ILLEGAL[0m
2024-07-20 12:36:51 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_2: Line 37: Unexpected token ILLEGAL[0m
2024-07-20 12:37:38 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 16_2: Line 42: Unexpected token ILLEGAL[0m
2024-07-20 12:37:41 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 16_3: Line 26: Unexpected token ILLEGAL[0m
2024-07-20 12:37:59 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 18_2: Line 46: Unexpected 

 44%|████▍     | 11/25 [30:26<47:46, 204.75s/it]



Processing directory 'pred_test_script_pretr_T1_sc+_html+_single'...
2024-07-20 12:41:33 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:41:40 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_3: Line 29: Unexpected token ILLEGAL[0m
2024-07-20 12:41:52 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 11_1: Line 23: Unexpected token ILLEGAL[0m
2024-07-20 12:42:04 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_1: Line 24: Unexpected token ILLEGAL[0m
2024-07-20 12:42:57 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 16_3: Line 23: Unexpected token ILLEGAL[0m
2024-07-20 12:43:44 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 25_1: Line 24: Unexpect

 48%|████▊     | 12/25 [35:22<50:20, 232.34s/it]



Processing directory 'pred_test_script_pretr_T1_sc+_html-_single'...
2024-07-20 12:46:28 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:47:23 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 14_1: Line 30: Unexpected token ILLEGAL[0m
2024-07-20 12:47:47 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 16_1: Line 30: Unexpected token ILLEGAL[0m
2024-07-20 12:48:08 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 18_1: Line 30: Unexpected token ILLEGAL[0m
2024-07-20 12:49:51 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 30_3: Line 47: Unexpected token ILLEGAL[0m
2024-07-20 12:51:26 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 12:51:26 [[34mscripts.evaluation:49[0m]

 52%|█████▏    | 13/25 [40:19<50:23, 251.93s/it]



Processing directory 'pred_test_script_pretr_T1_sc-_html+_single'...
2024-07-20 12:51:26 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:51:42 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_6: Line 12: Unexpected token ILLEGAL[0m
2024-07-20 12:51:45 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 11_1: Line 24: Unexpected token ILLEGAL[0m
2024-07-20 12:51:59 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_2: Line 24: Unexpected token ILLEGAL[0m
2024-07-20 12:52:22 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 14_2: Line 36: Unexpected token ILLEGAL[0m
2024-07-20 12:52:34 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 15_3: Line 23: Unexpect

 56%|█████▌    | 14/25 [45:15<48:37, 265.25s/it]



Processing directory 'pred_test_script_pretr_T1_sc-_html-_single'...
2024-07-20 12:56:22 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 12:57:32 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 15_3: Line 27: Unexpected token ILLEGAL[0m
2024-07-20 12:59:12 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 27_2: Line 25: Unexpected token ILLEGAL[0m
2024-07-20 12:59:46 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 30_2: Line 26: Unexpected token ILLEGAL[0m
2024-07-20 13:00:01 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 4_1: Line 31: Unexpected token ILLEGAL[0m
2024-07-20 13:00:13 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 4_5: Line 44: Unexpected

 60%|██████    | 15/25 [50:17<46:02, 276.21s/it]



Processing directory 'pred_test_script_pretr_T2_sc+_html+_single'...
2024-07-20 13:01:23 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 13:01:33 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_4: Line 29: Unexpected token ILLEGAL[0m
2024-07-20 13:02:47 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 16_3: Line 21: Unexpected token ILLEGAL[0m
2024-07-20 13:02:59 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 17_4: Line 20: Unexpected token ILLEGAL[0m
2024-07-20 13:04:06 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 27_1: Line 30: Unexpected token ILLEGAL[0m
2024-07-20 13:04:29 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 28_4: Line 21: Unexpect

 64%|██████▍   | 16/25 [55:13<42:21, 282.42s/it]



Processing directory 'pred_test_script_pretr_T3_sc+_html+_single'...
2024-07-20 13:06:20 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 13:06:21 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_1: Line 23: Unexpected token ILLEGAL[0m
2024-07-20 13:06:24 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_2: Line 32: Unexpected token ILLEGAL[0m
2024-07-20 13:06:31 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_4: Line 31: Unexpected token ILLEGAL[0m
2024-07-20 13:06:39 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 11_1: Line 23: Unexpected token ILLEGAL[0m
2024-07-20 13:06:52 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_1: Line 24: Unexpect

 68%|██████▊   | 17/25 [1:00:21<38:39, 289.98s/it]



Processing directory 'pred_test_script_pretr_T4_sc+_html+_single'...
2024-07-20 13:11:28 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 13:11:32 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_2: Line 32: Unexpected token ILLEGAL[0m
2024-07-20 13:11:46 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 11_1: Line 24: Unexpected token ILLEGAL[0m
2024-07-20 13:12:09 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_5: Line 24: Unexpected token ILLEGAL[0m
2024-07-20 13:12:48 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 16_2: Line 40: Unexpected token ILLEGAL[0m
2024-07-20 13:13:08 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 18_2: Line 38: Unexpect

 72%|███████▏  | 18/25 [1:05:19<34:06, 292.35s/it]



Processing directory 'pred_test_script_pretr_T5_sc+_html+_all'...
2024-07-20 13:16:26 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 13:16:32 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 10_3: Line 26: Unexpected token ILLEGAL[0m
2024-07-20 13:17:03 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_3: Line 33: Unexpected token ILLEGAL[0m
2024-07-20 13:17:21 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 14_1: Line 25: Unexpected token ILLEGAL[0m
2024-07-20 13:17:45 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 16_1: Line 25: Unexpected token ILLEGAL[0m
2024-07-20 13:18:06 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 18_1: Line 25: Unexpected 

 76%|███████▌  | 19/25 [1:10:18<29:27, 294.52s/it]



Processing directory 'pred_test_script_pretr_T5_sc+_html+_single'...
2024-07-20 13:21:25 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 13:23:43 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 25_4: Line 29: Unexpected token ILLEGAL[0m
2024-07-20 13:23:58 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 26_4: Line 27: Unexpected token ILLEGAL[0m
2024-07-20 13:24:41 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 30_2: Line 30: Unexpected token ILLEGAL[0m
2024-07-20 13:25:33 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 7_2: Line 28: Unexpected token ILLEGAL[0m
2024-07-20 13:26:17 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 13:26:17 [[34mscripts.evaluation:49[0m] 

 80%|████████  | 20/25 [1:15:10<24:27, 293.57s/it]



Processing directory 'pred_test_script_pretr_T5_sc-_html+_single'...
2024-07-20 13:26:17 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 13:26:56 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 12_4: Line 28: Unexpected token ILLEGAL[0m
2024-07-20 13:27:10 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 14_1: Line 35: Unexpected token ILLEGAL[0m
2024-07-20 13:27:31 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 15_5: Line 38: Unexpected token ILLEGAL[0m
2024-07-20 13:27:34 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 16_1: Line 35: Unexpected token ILLEGAL[0m
2024-07-20 13:27:54 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 18_1: Line 35: Unexpect

 84%|████████▍ | 21/25 [1:20:04<19:35, 293.77s/it]



Processing directory 'pred_test_script_template_1_html_concat_mode_single_max_attr_length_50_pretrained'...
2024-07-20 13:31:11 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 13:36:10 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 13:36:10 [[34mscripts.evaluation:49[0m] [[32mINFO[0m] >>>> Aggregating...[0m
2024-07-20 13:36:10 [[34mscripts.evaluation:52[0m] [[32mINFO[0m] >>>> Scores aggregated: {'weighted bleu': 0.38573672117195584, 'success rate': 1.0, 'levenshtein distance': 0.2914253065987123, 'similarity': None}[0m
2024-07-20 13:36:10 [[34mscripts.evaluation:66[0m] [[32mINFO[0m] >>>> Results saved to ./data/scores/eval_scores_20240720-133610.pkl[0m


 88%|████████▊ | 22/25 [1:25:04<14:46, 295.54s/it]



Processing directory 'pred_test_script_template_1_no_html_pretrained'...
2024-07-20 13:36:10 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 13:41:11 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 13:41:11 [[34mscripts.evaluation:49[0m] [[32mINFO[0m] >>>> Aggregating...[0m
2024-07-20 13:41:11 [[34mscripts.evaluation:52[0m] [[32mINFO[0m] >>>> Scores aggregated: {'weighted bleu': 0.2354291135542711, 'success rate': 1.0, 'levenshtein distance': 0.46249194171936, 'similarity': None}[0m
2024-07-20 13:41:11 [[34mscripts.evaluation:66[0m] [[32mINFO[0m] >>>> Results saved to ./data/scores/eval_scores_20240720-134111.pkl[0m


 92%|█████████▏| 23/25 [1:30:04<09:54, 297.08s/it]



Processing directory 'pred_test_script_template_2_html_concat_mode_all_max_attr_length_50_pretrained'...
2024-07-20 13:41:11 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 13:44:38 [[34msrc.evaluation.metrics:132[0m] [[31mERROR[0m] >>>> Error calculating weighted bleu for test case 3_1: Line 19: Unexpected token ILLEGAL[0m
2024-07-20 13:46:06 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 13:46:06 [[34mscripts.evaluation:49[0m] [[32mINFO[0m] >>>> Aggregating...[0m
2024-07-20 13:46:06 [[34mscripts.evaluation:52[0m] [[32mINFO[0m] >>>> Scores aggregated: {'weighted bleu': 0.2522897304237842, 'success rate': 1.0, 'levenshtein distance': 0.4574200337451043, 'similarity': None}[0m
2024-07-20 13:46:06 [[34mscripts.evaluation:66[0m] [[32mINFO[0m] >>>> Results saved to ./data/scores/eval_scores_20240720-134606.pkl[0m


 96%|█████████▌| 24/25 [1:34:59<04:56, 296.44s/it]



Processing directory 'pred_test_script_template_2_html_concat_mode_single_max_attr_length_50_pretrained'...
2024-07-20 13:46:06 [[34mscripts.evaluation:22[0m] [[32mINFO[0m] >>>> Calculating scores...[0m
2024-07-20 14:12:43 [[34mscripts.evaluation:46[0m] [[32mINFO[0m] >>>> Scores calculated.[0m
2024-07-20 14:12:43 [[34mscripts.evaluation:49[0m] [[32mINFO[0m] >>>> Aggregating...[0m
2024-07-20 14:12:43 [[34mscripts.evaluation:52[0m] [[32mINFO[0m] >>>> Scores aggregated: {'weighted bleu': 0.29558613115819576, 'success rate': 1.0, 'levenshtein distance': 0.4192614982918608, 'similarity': None}[0m
2024-07-20 14:12:43 [[34mscripts.evaluation:66[0m] [[32mINFO[0m] >>>> Results saved to ./data/scores/eval_scores_20240720-141243.pkl[0m


100%|██████████| 25/25 [2:01:37<00:00, 291.89s/it]

CPU times: total: 3min 19s
Wall time: 2h 1min 37s





# Evaluation

In [37]:
# Load the combined evaluation results of all prediction runs.
# os.path.join keeps the path valid whether or not the configured scores
# directory ends with a path separator.
# NOTE(review): unpickling can execute arbitrary code - only load score files
# that were produced by this project's own evaluation run.
results = pd.read_pickle(os.path.join(config['paths']['scores_dir'], 'eval_scores_all.pkl'))
results

Unnamed: 0,file_id,test_case,test_step,prediction_dir,weighted bleu,success rate,levenshtein distance,similarity
0,1_2,1,2,data/prediction/pred_test_script/,0.500697,1,0.092308,0
1,2_2,2,2,data/prediction/pred_test_script/,0.398845,1,0.263240,0
2,10_1,10,1,data/prediction/pred_test_script_finetuned_T1_...,0.513906,1,0.578158,0
3,10_2,10,2,data/prediction/pred_test_script_finetuned_T1_...,0.725558,1,0.029940,0
4,10_3,10,3,data/prediction/pred_test_script_finetuned_T1_...,0.900160,1,0.013831,0
...,...,...,...,...,...,...,...,...
1981,9_2,9,2,data/prediction/pred_test_script_template_2_ht...,0.563547,1,0.224000,0
1982,9_3,9,3,data/prediction/pred_test_script_template_2_ht...,0.000000,1,0.816187,0
1983,9_4,9,4,data/prediction/pred_test_script_template_2_ht...,0.000000,1,0.831671,0
1984,9_5,9,5,data/prediction/pred_test_script_template_2_ht...,0.340417,1,0.259560,0


In [36]:
# Summarise column dtypes and non-null counts of the loaded results
# (useful to spot metrics with missing values).
results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1986 entries, 0 to 1985
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   file_id               1986 non-null   object 
 1   test_case             1986 non-null   object 
 2   test_step             1986 non-null   object 
 3   prediction_dir        1986 non-null   object 
 4   weighted bleu         1799 non-null   float64
 5   success rate          1986 non-null   int64  
 6   levenshtein distance  1986 non-null   float64
 7   similarity            1986 non-null   int64  
dtypes: float64(2), int64(2), object(4)
memory usage: 124.3+ KB


In [None]:
# Display evaluation results in a boxplot.
# `results` is passed in wide form, so seaborn draws one horizontal box per
# numeric column of the frame.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=results, ax=ax, orient='h')
ax.set_title('Evaluation Results', fontsize=16, fontweight='bold')
ax.set_xlabel('Score')
# Rotate the score tick labels on the explicit Axes rather than through
# pyplot's implicit "current axes", keeping the cell on one plotting interface.
ax.tick_params(axis='x', labelrotation=45)
plt.show()

# References
[1]
```bibtex
@INPROCEEDINGS{Papineni02bleu:a,
    author = {Kishore Papineni and Salim Roukos and Todd Ward and Wei-jing Zhu},
    title = {BLEU: a Method for Automatic Evaluation of Machine Translation},
    booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics},
    year = {2002},
    pages = {311--318}
}
```
[2]
```bibtex
@inproceedings{lin-och-2004-orange,
    title = "{ORANGE}: a Method for Evaluating Automatic Evaluation Metrics for Machine Translation",
    author = "Lin, Chin-Yew  and
      Och, Franz Josef",
    booktitle = "{COLING} 2004: Proceedings of the 20th International Conference on Computational Linguistics",
    month = "aug 23{--}aug 27",
    year = "2004",
    address = "Geneva, Switzerland",
    publisher = "COLING",
    url = "https://www.aclweb.org/anthology/C04-1072",
    pages = "501--507",
}
```
