In [1]:
from openai import OpenAI
import os
import json
import re
from key import get_key

### Preparing data

In [2]:
# getting the OPENAI key
client = OpenAI(
    api_key= get_key()
)

In [3]:
# folders for our application versions
""" OLD_XML_DIR = "../layouts/our.v.1"
NEW_XML_DIR = "../layouts/our.v.2"
OLD_TESTS_DIR = "../tests/our.v.1"
NEW_TESTS_DIR = "../tests/our.v.2" """

' OLD_XML_DIR = "../layouts/our.v.1"\nNEW_XML_DIR = "../layouts/our.v.2"\nOLD_TESTS_DIR = "../tests/our.v.1"\nNEW_TESTS_DIR = "../tests/our.v.2" '

In [4]:
# folders for Omni-Notes
OLD_XML_DIR = "../layouts/v6.1.0"
NEW_XML_DIR = "../layouts/v6.3.0"
OLD_TESTS_DIR = "../tests/v6.1.0"
NEW_TESTS_DIR = "../tests/v6.3.0"

In [5]:
# function used to read a content of a file
def read_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

# reading all files from different directories and store their content in a dictionary where the keys are the filenames and the values are the file contents.
old_xml_files = {f: read_file(os.path.join(OLD_XML_DIR, f)) for f in os.listdir(OLD_XML_DIR)}
new_xml_files = {f: read_file(os.path.join(NEW_XML_DIR, f)) for f in os.listdir(NEW_XML_DIR)}
old_test_files = {f: read_file(os.path.join(OLD_TESTS_DIR, f)) for f in os.listdir(OLD_TESTS_DIR)}

In [6]:
# converting XMLs into a single string, for LLM input
old_xml_data = "\n\n".join([f"File: {name}\n{content}" for name, content in old_xml_files.items()])
new_xml_data = "\n\n".join([f"File: {name}\n{content}" for name, content in new_xml_files.items()])

### First LLM call

In [7]:
# first LLM step, finding the XML differences between different GUI versions 
xml_differences = {}

for xml_name, old_xml_content in old_xml_files.items():
    new_xml_content = new_xml_files.get(xml_name, "")

    prompt = """
    You are an expert in analyzing Android UI XML layouts.
    Your task is to compare two XML layouts and find the differences between them.
    
    ## Comparison Task
    Here is the old UI XML layout:
    """+ old_xml_content + """

    Here is the updated UI XML layout:
    """+ new_xml_content +"""

    Please list all UI changes between these two versions.
    Format the response as structured text, including:
    - Elements that were **added** (with their IDs and attributes).
    - Elements that were **removed**.
    - Elements that were **renamed**.
    - Any **structural changes** (e.g., button moved inside another layout).
    - Any **attribute changes** (e.g., text color changed).
    - Any **value changes**, checking the text content, checking the text attribute in the every items (e.g., text content changed, for example if the button text changed or the title changed). For example if an item, in the old version, have the attribute "text="Item 1" and in the new version for the same item the attribute is "text="Item 2", then it is a value change, so you have to write that as a difference.

    Return the response in JSON format.
    
    ## Example 1: Simple Button Change
    Old XML:
    ```xml
    <Button
        android:id="@+id/submit_button"
        android:layout_width="wrap_content"
        android:text="Send"
        android:background="#FF0000" />
    ```

    New XML:
    ```xml
    <Button
        android:id="@+id/submit_button"
        android:layout_width="match_parent"
        android:text="Submit Now"
        android:background="#0000FF" />
    ```

    Expected Output:
    ```json
    {
        "changes": {
            "added": [],
            "removed": [],
            "renamed": [],
            "structural_changes": [],
            "attribute_changes": [
                {
                    "id": "@+id/submit_button",
                    "changes": [
                        {
                            "attribute": "android:layout_width",
                            "old_value": "wrap_content",
                            "new_value": "match_parent"
                        },
                        {
                            "attribute": "android:background",
                            "old_value": "#FF0000",
                            "new_value": "#0000FF"
                        }
                    ]
                }
            ],
            "value_changes": [
                {
                    "id": "@+id/submit_button",
                    "attribute": "android:text",
                    "old_value": "Send",
                    "new_value": "Submit Now"
                }
            ]
        }
    }
    ```

    ## Example 2: Layout Structural Change
    Old XML:
    ```xml
    <LinearLayout>
        <TextView android:id="@+id/title" />
        <EditText android:id="@+id/input" />
    </LinearLayout>
    ```

    New XML:
    ```xml
    <RelativeLayout>
        <EditText android:id="@+id/input" android:layout_centerInParent="true" />
        <TextView android:id="@+id/title" android:layout_above="@id/input" />
    </RelativeLayout>
    ```

    Expected Output:
    ```json
    {
        "changes": {
            "added": [
                {
                    "id": "@+id/input",
                    "attributes": {
                        "android:layout_centerInParent": "true"
                    }
                },
                {
                    "id": "@+id/title",
                    "attributes": {
                        "android:layout_above": "@id/input"
                    }
                }
            ],
            "removed": [],
            "renamed": [],
            "structural_changes": [
                {
                    "description": "Root layout changed from LinearLayout to RelativeLayout",
                    "old_layout": "LinearLayout",
                    "new_layout": "RelativeLayout"
                },
                {
                    "description": "Element order and positioning changed",
                    "details": "Elements repositioned within the layout"
                }
            ],
            "attribute_changes": [],
            "value_changes": []
        }
    }
    ```
    """

    response = client.chat.completions.create(
        messages=[{"role": "system", "content": "You are an expert in Android UI structure analysis."},
                  {"role": "user", "content": prompt}],
        model="gpt-4o-mini"
    )

    xml_differences[xml_name] = response.choices[0].message.content

    print(f"XML differences detected for {xml_name}")

XML differences detected for screen7.xml
XML differences detected for screen6.xml
XML differences detected for screen5.xml
XML differences detected for screen1.xml
XML differences detected for screen2.xml
XML differences detected for screen3.xml
XML differences detected for screen12.xml
XML differences detected for screen11.xml
XML differences detected for screen10.xml
XML differences detected for screen14.xml
XML differences detected for screen15.xml
XML differences detected for screen8.xml
XML differences detected for screen9.xml


In [8]:
xml_differences['screen5.xml']

'```json\n{\n    "changes": {\n        "added": [\n            {\n                "id": "android.widget.FrameLayout",\n                "attributes": {\n                    "pane-title": "Note"\n                }\n            },\n            {\n                "id": "android.widget.Button",\n                "attributes": {\n                    "text": "",\n                    "content-desc": "Cerca",\n                    "resource-id": "it.feio.android.omninotes:id/menu_search",\n                    "checkable": "false",\n                    "checked": "false",\n                    "clickable": "true",\n                    "enabled": "true",\n                    "focusable": "true",\n                    "focused": "false",\n                    "long-clickable": "false",\n                    "password": "false",\n                    "scrollable": "false",\n                    "selected": "false",\n                    "bounds": "[672,97][816,241]",\n                    "displayed": "true"

In [9]:
# function used to clean the LLM-generated code, that starts with ```java and end with ```
def clean_java_code(java_code):
    """ Remove the code block markers (```java ... ```) """
    return re.sub(r'```java\n|\n```', '', java_code)

In [10]:
# function used to clean the LLM-generated code, that starts with ```kotlin and end with ```
def clean_kotlin_code(kotlin_code):
    """ Remove the code block markers (```kotlin ... ```) """
    return re.sub(r'```kotlin\n|\n```', '', kotlin_code)

### Second LLM call

In [11]:
# second LLM step, updating the test cases, starting from the old ones, analyzing all the xml changes
for test_name, old_test in old_test_files.items():
    prompt = """
    You are an expert in Android UI test automation using Espresso.
    The Android app's UI has changed, and test cases need to be updated accordingly.

    ## Actual Test Case Update Task
    Here are all the detected UI changes:
    """+ json.dumps(xml_differences, indent=4) +"""

    Here is an old test case:
    """+ old_test +"""

    Please update the test case to ensure it remains valid with the new UI structure.
    Ensure that:
    - It still verifies the original functionality.
    - It interacts with the correct elements based on the updated XML.
    - Any removed elements are handled gracefully.
    - Any renamed or moved elements are adjusted accordingly.
    - It follows best practices for Android UI test automation.

    Return only the updated Java test code without any explanation.
    
    ## Example 1: Simple Button Change
    XML differences:
    ```json
    {
        "changes": {
            "added": [],
            "removed": [],
            "renamed": [],
            "structural_changes": [],
            "attribute_changes": [
                {
                    "id": "@+id/submit_button",
                    "changes": [
                        {
                            "attribute": "android:layout_width",
                            "old_value": "wrap_content",
                            "new_value": "match_parent"
                        },
                        {
                            "attribute": "android:background",
                            "old_value": "#FF0000",
                            "new_value": "#0000FF"
                        }
                    ]
                }
            ],
            "value_changes": [
                {
                    "id": "@+id/submit_button",
                    "attribute": "android:text",
                    "old_value": "Send",
                    "new_value": "Submit Now"
                }
            ]
        }
    }
    ```
    
    Old Test Case:
    ```java
    @Test
    public void testSubmitButton() {{
        onView(withId(R.id.submit_button))
            .check(matches(withText("Send")))
            .perform(click());
    }}
    ```
    
    Updated Test Case:
    ```java
    @Test
    public void testSubmitButton() {{
        onView(withId(R.id.submit_button))
            .check(matches(withText("Submit Now")))
            .check(matches(isFullWidth()))
            .perform(click());
    }}
    ```
    
    ## Example 2: Layout Structural Redesign
    XML differences:
    ```json
    {
        "changes": {
            "added": [
                {
                    "id": "@+id/input",
                    "attributes": {
                        "android:layout_centerInParent": "true"
                    }
                },
                {
                    "id": "@+id/title",
                    "attributes": {
                        "android:layout_above": "@id/input"
                    }
                }
            ],
            "removed": [],
            "renamed": [],
            "structural_changes": [
                {
                    "description": "Root layout changed from LinearLayout to RelativeLayout",
                    "old_layout": "LinearLayout",
                    "new_layout": "RelativeLayout"
                },
                {
                    "description": "Element order and positioning changed",
                    "details": "Elements repositioned within the layout"
                }
            ],
            "attribute_changes": [],
            "value_changes": []
        }
    }
    ```
    
    Old Test Case:
    ```java
    @Test
    public void testLoginFlow() {
        onView(withId(R.id.title))
            .check(matches(isDisplayed()));
        
        onView(withId(R.id.input))
            .perform(typeText("username"));
    }
    ```
    
    Updated Test Case:
    ```java
    @Test
    public void testLoginFlow() {
        onView(withId(R.id.input))
            .check(matches(isDisplayedAtCenterOfScreen()))
            .perform(typeText("username"));
        
        onView(withId(R.id.title))
            .check(matches(isAbove(withId(R.id.input))));
    }
    ```
    
    Use the old test as a structure, writing all the import and all the java structure, then apply all the updates on the tests
    """

    response = client.chat.completions.create(
        messages=[{"role": "system", "content": "You are an expert in Android UI test automation."},
                  {"role": "user", "content": prompt}],
        model="gpt-4o-mini"
    )

    updated_code = response.choices[0].message.content.strip()
    cleaned_java_code = clean_java_code(updated_code)
    # cleaned_kotlin_code = clean_kotlin_code(updated_code) # this used in case of our GUI, bacause we did our tests in Kotlin

    file_path = f"{NEW_TESTS_DIR}/{test_name}"
    with open(file_path, "w") as f:
        f.write(cleaned_java_code)
        # f.write(cleaned_kotlin_code) # this used in case of our GUI, bacause we did our tests in Kotlin
    print(f"Saved {file_path}")

Saved ../tests/v6.3.0/CategoryLifecycleTest.java
Saved ../tests/v6.3.0/SearchTagBackArrowTest.java
Saved ../tests/v6.3.0/NoteLifecycleTest.java
Saved ../tests/v6.3.0/FabCameraNoteTest.java
Saved ../tests/v6.3.0/AutoBackupTest.java
Saved ../tests/v6.3.0/RecurrenceRuleTest.java
Saved ../tests/v6.3.0/SettingsActivityTest.java
Saved ../tests/v6.3.0/RemindersLifecycleTest.java
Saved ../tests/v6.3.0/BaseEspressoTest.java
Saved ../tests/v6.3.0/MrJingleLifecycleTest.java
Saved ../tests/v6.3.0/NoteListMenuTest.java
Saved ../tests/v6.3.0/DrawerMenusEspressoTest.java
Saved ../tests/v6.3.0/FabLifecycleTest.java


### Evaluation

In [12]:
# folder for the goal test cases
GOAL_TESTS_DIR = "../tests/v6.3.0goal"

In [13]:
# reading all files from different directories and store their content in a dictionary where the keys are the filenames and the values are the file contents.
new_test_files = {f: read_file(os.path.join(NEW_TESTS_DIR, f)) for f in os.listdir(NEW_TESTS_DIR)}
goal_test_files = {f: read_file(os.path.join(GOAL_TESTS_DIR, f)) for f in os.listdir(GOAL_TESTS_DIR)}

In [14]:
from collections import Counter

# function used to compute precision and recall between AI-generated and human-written test cases
def compute_precision_recall(ai_test, human_test):
    
    # function to remove comments from code (single-line and multi-line comments)
    def remove_comments(code):
        # Remove single-line comments (//)
        code = re.sub(r'//.*', '', code)
        # Remove multi-line comments (/* ... */)
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
        return code
    
    # function used to tokenize Java code by splitting words, removing special characters, and ignoring case
    def tokenize(code):
        code = remove_comments(code)  # Remove comments first
        return re.findall(r'\w+', code.lower())  # Extract words (ignoring special chars)
    
    ai_tokens = tokenize(ai_test)
    human_tokens = tokenize(human_test)

    ai_counter = Counter(ai_tokens)
    human_counter = Counter(human_tokens)

    # compute intersection: words that appear in both tests
    common_tokens = sum((ai_counter & human_counter).values())

    precision = common_tokens / sum(ai_counter.values()) if ai_counter else 0
    recall = common_tokens / sum(human_counter.values()) if human_counter else 0

    return precision, recall, ai_counter, human_counter

# function used to compute overall precision and recall across all test cases
def compute_overall_precision_recall(total_ai_counter, total_human_counter, total_common_tokens):
    overall_precision = total_common_tokens / sum(total_ai_counter.values()) if total_ai_counter else 0
    overall_recall = total_common_tokens / sum(total_human_counter.values()) if total_human_counter else 0
    return overall_precision, overall_recall

total_ai_counter = Counter()
total_human_counter = Counter()
total_common_tokens = 0

# loop through all test cases and compare AI-generated vs. human-written tests
test_count = 0
for test_name, ai_test in new_test_files.items():
    if test_name not in goal_test_files:
        print(f"Goal test for {test_name} not found, skipping...")
        continue  

    goal_test = goal_test_files[test_name]

    precision, recall, ai_counter, human_counter = compute_precision_recall(ai_test, goal_test)
    
    print(f"Test: {test_name}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print("------------------------------------")

    total_ai_counter.update(ai_counter)
    total_human_counter.update(human_counter)
    total_common_tokens += sum((ai_counter & human_counter).values())

    test_count += 1

overall_precision, overall_recall = compute_overall_precision_recall(
    total_ai_counter, total_human_counter, total_common_tokens
)

print("----- OVERALL METRICS -----")
print(f"Overall Precision: {overall_precision:.4f}")
print(f"Overall Recall: {overall_recall:.4f}")
print("---------------------------")

Test: CategoryLifecycleTest.java
Precision: 0.9986
Recall: 0.9893
------------------------------------
Test: SearchTagBackArrowTest.java
Precision: 0.8793
Recall: 1.0000
------------------------------------
Test: NoteLifecycleTest.java
Precision: 0.9958
Recall: 0.9958
------------------------------------
Test: FabCameraNoteTest.java
Precision: 0.9870
Recall: 0.9870
------------------------------------
Test: AutoBackupTest.java
Precision: 1.0000
Recall: 1.0000
------------------------------------
Test: RecurrenceRuleTest.java
Precision: 0.9959
Recall: 0.9959
------------------------------------
Test: SettingsActivityTest.java
Precision: 1.0000
Recall: 1.0000
------------------------------------
Test: RemindersLifecycleTest.java
Precision: 0.9948
Recall: 0.9095
------------------------------------
Test: BaseEspressoTest.java
Precision: 0.9485
Recall: 0.9984
------------------------------------
Test: MrJingleLifecycleTest.java
Precision: 0.9859
Recall: 0.8108
-----------------------------