In [1]:
%load_ext autoreload
%autoreload 2

Structure of the notebook:

In [25]:
import sqlite3
from bs4 import BeautifulSoup

from src.main import main, main_cluster_multimodal_model
from src.utils.helpers import truncate_text, clean_string
from src.utils.logger import setup_logger
from src.llm.access_2_cluster import Access2Cluster

import logging
from src.data.html_processor import extract_html_info
from src.data.code_processor import parse_code
from src.data.image_processor import extract_text_from_image
from src.input_builder import create_input
from src.ui_tests.test_generation import generate_code
from src.evaluation import calculate_scores
from src.utils.logger import setup_logger


logger = setup_logger(__name__, level='DEBUG') # Change to 'INFO' for less verbosity

# Load Data

In [3]:
# Connect to the database
conn = sqlite3.connect('../data/raw/playwright_script.db')
cursor = conn.cursor()

res = cursor.execute("SELECT * FROM tests")
items = res.fetchall()

print("There are {} data.".format(len(items)))

There are 100 data.


In [4]:
# Check the first item
items[0]

('1.1',
 '[1.1] Öffne die Arbeitsmappe "Übersicht Messstellen" im Ordner "Gewässergüte".',
 '[1.1] Expected result: Die Arbeitsmappe wird geöffnet, der Analysekontext ist nicht sichtbar.',
 '.\\html\\1_1.html',
 '.\\screenshot\\1_1.png',
 '.\\test_script\\1_1.spec.ts')

# Approach

Use a Pre-trained LLM:
* GPT-3, GPT-3.5, or a lighter version like GPT-2 (well-suited for text generation tasks)
* Fine-tuning or adapting for specific tasks later possible

Steps:
1. ✅ HTML Processing: Extract relevant information from the HTML file.
    * Use **BeautifulSoup** or lxml in Python to parse and extract information from the HTML file. ➡️ see src.data.html_processor.py
2. ✅ Image Processing: Extract relevant information from the image:
    * Use image-to-text models like **Tesseracts** or pytesseract (OCR libraries) to extract text from the image. ➡️ see src.data.image_processor.py
    * Use OpenCV or PIL (Pillow) in Python to process the image and extract relevant information.
3. Summarize the image and HTML information and the prompt from the playwright test code using T5 model. (optional)
4. ✅ Python Processing: Parse the given playwright test code for previous step as a precondition. ➡️ see src.data.python_processor.py
5. ✅ Combine the extracted information from the HTML and the image with the prompt for the language model. ➡️ see src.data.input_combiner.py
6. ✅ Pass the combined input to the language model for generating the UI test code. ➡️ see src.ui_tests.test_generation.py

➡️ Run locally via notebook or script using the src.main.py

# HTML Processing

In [5]:
# load example file
html_path = './html/0_1.html'

Option 1: Parse all HTML content by extracting the text from it using BeautifulSoup:

In [6]:
def parse_html(html_path: str, max_length: int = 200) -> str:
    """ Parse the HTML content from a file using BeautifulSoup. It extracts the text content and truncates it to the given maximum length.

    :param max_length: The maximum length of the text.
    :param html_path: The path to the HTML file.
    :return: The text content of the HTML file.
    """
    # Load HTML content from a file
    with open(html_path, "r") as file:
        html_content = file.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Get the text content
    html_text = soup.get_text(strip=True)

    # Truncate the text to the maximum length
    html_text = truncate_text(html_text, max_length=max_length)

    logger.debug(f"HTML content parsed successfully. - Lines of Code: {len(html_text.splitlines())}")

    return html_text

In [7]:
html_text = parse_html(html_path, max_length=2000)
print(html_text)

2024-06-25 12:11:53 [[34m__main__:21[0m] [DEBUG[0m] >>>> HTML content parsed successfully. - Lines of Code: 3[0m
Startseite - disy CadenzaAchtung!Ihr Browser unterstÃ¼tzt kein JavaScript oder JavaScript wurde in Ihrem Browser deaktiviert.Bitte verwenden Sie einen Browser, der JavaScript unterstÃ¼tzt, oder aktivieren Sie JavaScript in Ihrem Browser. Ohne aktiviertes JavaScript ist die Anwendung nicht nutzbar.OfflineVerbinden â€¦Zum Navigatorbaum springenZum Hauptbereich springenStartseiteKartedisy Cadenza[{"printName":"Lernmodule â€“ Tutorials und mehr","url":"/help-learning/index.html","targetFrame":"_blank","id":"help","type":"help","webApplication":false},{"printName":"Hilfe","url":"/help/","targetFrame":"_blank","id":"help","type":"help","webApplication":false},{"printName":"Hilfe zu Classic","url":"/help-classic/","targetFrame":"_blank","id":"help-classic","type":"help-classic","webApplication":false}]Admin[{"printName":"Profil","url":"/pages/access/userprofile.xhtml","targetFr

Option 2: Extract only Elements (input fields, buttons, links) from the HTML file using BeautifulSoup.

In [8]:
def extract_html_info(file_path):
    """Extract relevant information from an HTML file.

    :param file_path: The path to the HTML file.
    :return: A formatted string containing the extracted HTML elements.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'html.parser')

    buttons = soup.find_all('button')
    inputs = soup.find_all('input')
    links = soup.find_all('a')

    html_elements = "HTML Elements:\n"

    for button in buttons:
        button_text = clean_string(button.text)
        button_id = button.get("id", "No ID")
        button_class = ' '.join(button.get("class", []))  # Convert list to string with space separator
        button_type = button.get("type", "button")
        html_elements += f'Button: {button_text} - ID: {button_id} - Class: {button_class} - Type: {button_type}\n'

    for input_field in inputs:
        input_name = input_field.get("name", "No Name")
        input_type = input_field.get("type", "text")
        input_value = input_field.get("value", "")
        input_placeholder = clean_string(input_field.get("placeholder", ""))
        input_id = input_field.get("id", "No ID")
        input_class = ' '.join(input_field.get("class", []))  # Convert list to string with space separator
        input_label = input_field.get("aria-label", "")
        html_elements += f'Input: {input_name} - Type: {input_type} - Value: {input_value} - Placeholder: {input_placeholder} - ID: {input_id} - Class: {input_class} - Label: {input_label}\n'

    for link in links:
        link_text = clean_string(link.text)
        link_href = link.get("href", "#")
        link_id = link.get("id", "No ID")
        link_class = ' '.join(link.get("class", []))  # Convert list to string with space separator
        html_elements += f'Link: {link_text} - Href: {link_href} - ID: {link_id} - Class: {link_class}\n'

    logger.debug(f"HTML elements extracted successfully. - Number of Elements: {len(html_elements.splitlines())}")

    return html_elements.strip()

In [9]:
html_elements = extract_html_info(html_path)
print(html_elements)

2024-06-25 12:11:53 [[34m__main__:42[0m] [DEBUG[0m] >>>> HTML elements extracted successfully. - Number of Elements: 48[0m
HTML Elements:
Button:  - ID: navigationTrigger - Class: button button-icon button-borderless - Type: button
Button: [{"printName":"Lernmodule – Tutorials und mehr","url":"/help-learning/index.html","targetFrame":"_blank","id":"help","type":"help","webApplication":false},{"printName":"Hilfe","url":"/help/","targetFrame":"_blank","id":"help","type":"help","webApplication":false},{"printName":"Hilfe zu Classic","url":"/help-classic/","targetFrame":"_blank","id":"help-classic","type":"help-classic","webApplication":false}] - ID: No ID - Class: d-help-menu button button-icon button-borderless d-topnav-dropdown - Type: button
Button: Admin [{"printName":"Profil","url":"/pages/access/userprofile.xhtml","targetFrame":"_self","id":"userprofile","type":"userprofile","webApplication":false},{"printName":"Abmelden","url":"/logout","targetFrame":"_self","id":"logout","type

# Image Processing

In [None]:
# Imports for Image Processing
from PIL import Image
import pytesseract
from typing import Union

# Utility functions
from src.utils.helpers import truncate_text
from src.utils.logger import setup_logger

# Setup Logger
logger = setup_logger(__name__, level='DEBUG')

# Define the function to extract text from image
def extract_text_from_image(image_path: str, max_length: Union[int, None] = 200) -> str:
    """ Extract text from an image file.

    :param max_length: The maximum length of the text.
    :param image_path: The path to the image file.
    :return: The extracted text from the image.
    """
    image = Image.open(image_path)
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    # Extract text from the image
    text = pytesseract.image_to_string(image)
    # Truncate the text to the maximum length
    if max_length:
        text = truncate_text(text, max_length=max_length)

    logger.debug(f"Text extracted from image successfully. - Characters: {len(text)}")

    return text

# Demonstrate the function with an example image
image_path = 'path/to/image/file.png'
extracted_text = extract_text_from_image(image_path, max_length=300)
print("Extracted Text from Image:", extracted_text)


# (Local) Pipeline: LLM UI Test Generation
Using smaller GPT-2 model

LLM Test Generation with the test from the database

1. First, extract the relevant information from the database for a specific test to be predicted:

In [10]:
def get_previous_id(id):
    test, step = map(int, id.split('.'))
    if step > 1:
        previous_id = f"{test}.{step - 1}"
    else:
        print(f"Test {test} has no more previous step. No context available.")

    return previous_id

In [11]:
def fetch_relevant_items(db_file, current_id):
    # Connect to SQLite database
    conn = sqlite3.connect(db_file)
    cursor = conn.cursor()

    # Get the previous ID
    previous_id = get_previous_id(current_id)

    # Prepare the SQL query to retrieve the desired rows
    query = 'SELECT * FROM tests WHERE id IN (?, ?)'
    cursor.execute(query, (current_id, previous_id))

    # Fetch all matching rows
    items = cursor.fetchall()

    # Close the connection
    conn.close()

    return items

In [12]:
def map_items_to_args(items):
    # The file paths from the test x-1 are used as context
    html_path = items[0][3]
    image_path = items[0][4]
    precondition_path = items[0][5]
    # The last step of the text x is used as prompt
    steps = items[1][1].split(']')
    description = steps[-1].strip()

    return {"html_path": html_path,
            "image_path": image_path,
            "precondition_path": precondition_path,
            "description": description}

In [13]:
# Select test ID and database file
db_file = '../data/raw/playwright_script.db'
current_id = '1.4'

In [14]:
# Get relevant data from the database
items = fetch_relevant_items(db_file, current_id)
args = map_items_to_args(items)

In [15]:
items

[('1.3',
  '[1.1] Öffne die Arbeitsmappe "Übersicht Messstellen" im Ordner "Gewässergüte". [1.2]  Öffnen der Tabellen-Sicht "Messstellenliste" über die Werkzeugliste der Arbeitsmappe. [1.3] Klicke auf den 3-Punkte-Button innerhalb der Tabellen-Sicht "Messstelleninformationen"',
  '[1.3] Expected result: Kontextmenü erscheint mit den Optionen: - Duplizieren - In Arbeitsblatt duplizieren - Exportieren - Designer öffnen - Löschen.',
  '.\\html\\1_3.html',
  '.\\screenshot\\1_3.png',
  '.\\test_script\\1_3.spec.ts'),
 ('1.4',
  '[1.1] Öffne die Arbeitsmappe "Übersicht Messstellen" im Ordner "Gewässergüte". [1.2]  Öffnen der Tabellen-Sicht "Messstellenliste" über die Werkzeugliste der Arbeitsmappe. [1.3] Klicke auf den 3-Punkte-Button innerhalb der Tabellen-Sicht "Messstelleninformationen". [1.4] Klicke auf "Duplizieren".',
  '[1.4] Expected result: Das Kontextmenü schließt sich. Die Sicht ist dupliziert. Die duplizierte Sicht erscheint neben der Original-Sicht. Beide Sichten teilen sich in

In [16]:
args

{'html_path': '.\\html\\1_3.html',
 'image_path': '.\\screenshot\\1_3.png',
 'precondition_path': '.\\test_script\\1_3.spec.ts',
 'description': 'Klicke auf "Duplizieren".'}

2. Now we can run the main function to sum it up with the extracted information, we will get the generated UI test code for the test with id 1.4. Prompt engineering is implemented in the main function.

In [36]:
%%time
# Generate UI test code
prompt = main(**args)

2024-06-25 13:32:45 [[34msrc.main:25[0m] [[32mINFO[0m] >>>> Loading context...[0m
2024-06-25 13:32:45 [[34msrc.data.html_processor:106[0m] [DEBUG[0m] >>>> HTML elements extracted successfully. - Number of Elements: 36 - Number of Characters: 3522[0m
2024-06-25 13:32:46 [[34msrc.data.image_processor:27[0m] [DEBUG[0m] >>>> Text extracted from image successfully. - Characters: 303[0m
2024-06-25 13:32:46 [[34msrc.data.python_processor:15[0m] [DEBUG[0m] >>>> Python code parsed successfully. - Lines of Code: 17[0m
2024-06-25 13:32:46 [[34msrc.main:31[0m] [[32mINFO[0m] >>>> Context loaded successfully.[0m
2024-06-25 13:32:46 [[34msrc.main:34[0m] [[32mINFO[0m] >>>> Creating input prompt...[0m
2024-06-25 13:32:46 [[34msrc.main:36[0m] [[32mINFO[0m] >>>> Input prompt created successfully.[0m
2024-06-25 13:32:46 [[34msrc.main:37[0m] [DEBUG[0m] >>>> Input prompt:
You are a test automation script writer. Your task is to create a Playwright test script for the given

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


ValueError: Input length of input_ids is 1024, but `max_length` is set to 500. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

# (Cluster) Pipeline: LLM UI Test Generation
Using Llava 1.5 model

1. Extract relevant information from the database for a specific test to be predicted:

In [166]:
# Select test ID and database file
db_file = '../data/raw/playwright_script.db'
current_id = '1.4'

In [167]:
items = fetch_relevant_items(db_file, current_id)
args = map_items_to_args(items)

In [168]:
items

[('1.3',
  '[1.1] Öffne die Arbeitsmappe "Übersicht Messstellen" im Ordner "Gewässergüte". [1.2]  Öffnen der Tabellen-Sicht "Messstellenliste" über die Werkzeugliste der Arbeitsmappe. [1.3] Klicke auf den 3-Punkte-Button innerhalb der Tabellen-Sicht "Messstelleninformationen"',
  '[1.3] Expected result: Kontextmenü erscheint mit den Optionen: - Duplizieren - In Arbeitsblatt duplizieren - Exportieren - Designer öffnen - Löschen.',
  '.\\html\\1_3.html',
  '.\\screenshot\\1_3.png',
  '.\\test_script\\1_3.spec.ts'),
 ('1.4',
  '[1.1] Öffne die Arbeitsmappe "Übersicht Messstellen" im Ordner "Gewässergüte". [1.2]  Öffnen der Tabellen-Sicht "Messstellenliste" über die Werkzeugliste der Arbeitsmappe. [1.3] Klicke auf den 3-Punkte-Button innerhalb der Tabellen-Sicht "Messstelleninformationen". [1.4] Klicke auf "Duplizieren".',
  '[1.4] Expected result: Das Kontextmenü schließt sich. Die Sicht ist dupliziert. Die duplizierte Sicht erscheint neben der Original-Sicht. Beide Sichten teilen sich in

In [169]:
# To traverse folders in cluster change path
args['image_path'] = args['image_path'].replace('\\', '/')

In [170]:
args

{'html_path': '.\\html\\1_3.html',
 'image_path': './screenshot/1_3.png',
 'precondition_path': '.\\test_script\\1_3.spec.ts',
 'description': 'Klicke auf "Duplizieren".'}

2. Setup access to the cluster:

Precondition:
- Adjust the paths in src file with your specific student credentials
- Make sure that playwright is installed on your local machine

In [171]:
access2cluster = Access2Cluster()
await access2cluster.login()

NotImplementedError: 

3. Init the Llava 1.5 model:

In [None]:
await access2cluster.start_llm()
args['model'] = access2cluster

In [None]:
args

4. Run the main function to sum it up with the extracted information, we will get the generated UI test code for the test with id 1.4. Prompt engineering is implemented in the main function.

In [None]:
await main_cluster_multimodal_model(**args)

# More Developing: Using modular source code
Possible to update function content in the following files, but no RENAMING or DELETING or changing function signature!:
* src/main.py: Main function for UI test generation.
    * Change constant Max length (max length of the input text for the model)
* src/ui_tests/test_generation.py: Generate the UI test code using the language model.
    * Expand model selection: Just add new case statements for different models with its code.
* src/data/html_processing.py: Extract relevant information from the HTML file.
* src/data/image_processing.py: Extract relevant information from the image.
* src/data/python_processing.py: Parse the given playwright test code for previous step as a precondition.


# Extract Information

In [None]:
# extract text from image
def extract_text_from_image(image_path: str, max_length: Union[int, None] = 200) -> str:
    """ Extract text from an image file.

    :param max_length: The maximum length of the text.
    :param image_path: The path to the image file.
    :return: The extracted text from the image.
    """
    image = Image.open(image_path)
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    # Extract text from the image
    text = pytesseract.image_to_string(image)
    # Truncate the text to the maximum length
    if max_length:
        text = truncate_text(text, max_length=max_length)

    logger.debug(f"Text extracted from image successfully. - Characters: {len(text)}")

    return text


# Create Input Prompt


In [None]:
# in config yaml ist die prompt 


# Finetuning


# Evaluation


Liste mit allen Evaluationen, die wir ausprobiert haben


In [None]:
"""  
##Evaluation
In this section, we evaluate the generated code by comparing it with a validation code.
We use several metrics such as weighted BLEU score, success rate, and Levenshtein distance.
"""

# Define the evaluation functions
def calculate_scores(generated_code: str, validation_code: str, precondition_code: str, programming_language: str) -> dict:
    """ This method returns the scores of the given generated code.

    generated_code: The generated code from the LLM as Python or TypeScript playwright script.
    validation_code: Examples for validation as Python or TypeScript playwright script.
    precondition: The precondition of the step as Python or TypeScript playwright script

    return: The scores of the given generated code as dictionary.
    """
    return {'weighted bleu': calculate_weighted_bleu_score(generated_code, validation_code, precondition_code),
            'success rate': calculate_success_rate(generated_code),
            'levensthein distance': calculate_levenshtein_distance(generated_code, validation_code)}


def calculate_weighted_bleu_score(generated_code: str, validation_code: str, precondition_code: str, alpha: float = 0.5) -> float:
    """ This method returns the BLEU score of the given generated code.

    :param generated_code: The generated code from the LLM as Python or TypeScript playwright script.
    :param validation_code: Examples for validation as Python or TypeScript playwright script.
    :param precondition_code: The precondition of the step as Python or TypeScript playwright script
    :param alpha: The weight of the second part of the BLEU score.
    :return: The BLEU score of the given generated code.
    """
    generated_code_tokens = esprima.tokenize(generated_code)
    validation_code_tokens = esprima.tokenize(validation_code)
    precondition_code_tokens = esprima.tokenize(precondition_code)

    # Convert tokens to string
    generated_code_tokens = [str(elem) for elem in generated_code_tokens]
    validation_code_tokens = [str(elem) for elem in validation_code_tokens]
    precondition_code_tokens = [str(elem) for elem in precondition_code_tokens]

    precondition_code_length = len(precondition_code_tokens)
    precondition_code_length_without_end_lines = -1
    for i in range(precondition_code_length):
        if validation_code_tokens[i] != precondition_code_tokens[i]:
            precondition_code_length_without_end_lines = i
            break

    # The first part: Has the LLM correctly copied the precondition code?
    first_bleu_score = sentence_bleu(references=[validation_code_tokens[:precondition_code_length_without_end_lines]],
                                     hypothesis=generated_code_tokens[:precondition_code_length_without_end_lines])

    # The second part: Has the LLM correctly added the new lines to reach the given goal?
    second_bleu_score = sentence_bleu(references=[validation_code_tokens[precondition_code_length_without_end_lines:]],
                                      hypothesis=generated_code_tokens[precondition_code_length_without_end_lines:])

    return (1 - alpha) * first_bleu_score + alpha * second_bleu_score

# Placeholder for actual implementation
def calculate_success_rate(generated_code: str):
    pass
    # try to run generated playwright code and if successful then return 1 as success rate

# Placeholder for actual implementation
def calculate_levenshtein_distance(generated_code, validation_code):
    pass
    # return the levenshtein distance between the generated code and the validation code

# Validate the generated code (if validation_path is provided)
if validation_path:
    validation_code = parse_code(validation_path)
    scores = calculate_scores(generated_code=generated_code, validation_code=validation_code, precondition_code=precondition_text, programming_language='Python')  # Adjust language as needed
    print("Validation Scores:", scores)
