In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time
import openai
from dotenv import load_dotenv

from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs

from method.ours.utils import create_driver
from method.ours.parse import (
    initial_parse,
    parse_after_feedback,
)

In [3]:
# Load environment variables via python-dotenv, then hand the OpenAI key
# (if one is set) to the client library.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Notebook-wide settings
HEADLESS = False  # passed to create_driver
TEXT_EMBEDDING_METHOD = 'ADA'  # options: 'ADA', 'WORD2VEC', 'SPACY'; passed to the parse functions
GRAPH_EMBEDDING_METHOD = 'NODE2VEC'  # options: 'NODE2VEC', 'GCN'

In [4]:
# Create the Selenium driver (HEADLESS is forwarded to create_driver) and open
# the page that contains the form under test.
driver = create_driver(HEADLESS)
driver.get('https://www.aircanada.com/ca/en/aco/home.html')

In [5]:
# Alternative selectors (disabled):
# form = driver.find_elements(By.TAG_NAME, 'form')[48]
# form = driver.find_element(By.ID, 'register')
# Active selector: the second <form> element on the page (index 1).
form = driver.find_elements(By.TAG_NAME, 'form')[1]

In [30]:
# Snapshot of the page <body> markup. It is later diffed against `html2`, which is
# captured after the form has been submitted.
# NOTE(review): this cell's execution count (30) is out of sequence with its position;
# confirm it runs before the form is filled on a fresh "Restart & Run All".
html1 = driver.find_element(By.TAG_NAME, 'body').get_attribute('outerHTML')

# Processing

In [6]:
# Parse the selected form. `relation_graph` is reused when re-parsing after feedback;
# `input_groups` is iterated by the generation and filling cells below.
# NOTE(review): GRAPH_EMBEDDING_METHOD is defined in the config cell but not passed here — confirm whether it is used.
relation_graph, input_groups = initial_parse(driver, form, TEXT_EMBEDDING_METHOD)

Computing transition probabilities:   0%|          | 0/340 [00:00<?, ?it/s]

Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 651.22it/s]
Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 637.33it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 747.21it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 780.40it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 34/34 [00:10<00:00,  3.37it/s]
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


# Generation

In [8]:
from method.ours.constraints import generate_constraints_from_string
from method.ours.prompts import (
    get_form_context,
    create_constraint_generation_user_prompt,
    create_value_generation_user_prompt,
)
from method.ours.generation import (
    generate_constraints_with_llm,
    generate_values_with_llm,
)

In [9]:
# Form-wide context built from all input groups; passed to every constraint and
# value prompt created below.
form_context = get_form_context(input_groups)

In [10]:
class FillTable:
    """Registry of form-field rows keyed by field id.

    Rows are created by ``add_entry`` and stored in ``entries`` in insertion order.
    """

    def __init__(self):
        # field id -> row
        self.entries = dict()

    def add_entry(self, field_id, input_group, constraints):
        """Store a new ``FillTableRow`` under ``field_id``, replacing any row already there."""
        row = FillTableRow(input_group, constraints)
        self.entries[field_id] = row

    def get_entry_by_field_id(self, field_id):
        """Return the row stored under ``field_id`` (``KeyError`` if it is absent)."""
        return self.entries[field_id]

    def get_entry_by_input_group(self, input_group):
        """Return the first row whose input group's node xpath equals that of
        ``input_group``'s node, or ``None`` when no row matches."""
        matches = [
            row
            for row in self.entries.values()
            if row.input_group.node.xpath == input_group.node.xpath
        ]
        return matches[0] if matches else None


class FillTableRow:
    """State kept for one form field: its input group, its constraints, and the
    most recent value and feedback (both ``None`` until set)."""

    def __init__(self, input_group, constraints):
        # A new row has no value and no feedback yet.
        self.value = None
        self.feedback = None
        self.constraints = constraints
        self.input_group = input_group

    def set_feedback(self, feedback):
        """Store ``feedback`` on this row."""
        self.feedback = feedback

    def set_value(self, value):
        """Store ``value`` on this row."""
        self.value = value

In [11]:
# Empty table; rows are added by the constraint-generation loop below.
fill_table = FillTable()

In [12]:
# Ask the LLM for constraints on every non-button input group and register the
# parsed result in the fill table under the field name found in the reply.
for input_group in input_groups:
    # TODO: set more conditions on <button> to be sure that buttons here are inputs
    is_button = input_group.node.element.name == 'button'
    if not is_button:
        generated_constraints = generate_constraints_with_llm(
            create_constraint_generation_user_prompt(form_context, input_group),
            openai_api_key=openai.api_key,
        )

        field_name, constraints = generate_constraints_from_string(generated_constraints)
        fill_table.add_entry(field_name, input_group, constraints)

 [0m Prompt: [
  {
    "role": "system",
    "content": "Instructions:\nYour task is to generate a set of constraints for web form fields. Your decisions must be made independently without seeking user assistance or additional information. If there are multiple ways to express constraints, use the least number of constraints to describe them. Only generate the constraints and refrain from explaining your answers. Only generate constraints for the input field in question, not those in the relevant information section. You must choose your constraints in the format of our modified version of the Jest library in JavaScript. The list of functions in this modified format are:\n1. toBeEqual(value) # the input field value is exactly equal to the given value\n2. toHaveLengthCondition(condition, value) # the length of the input field value matches the given condition\n3. toBeTruthy() # the input field value is truthy and not empty (not false, 0, '', null, undefined, or NaN)\n4. toHaveCompareCo

 [0m Response: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "# sample\nexpect(field('bkmgFlights_tripTypeSelector_M'))\n.toBeEqual('M')\n# end of the sample",
        "role": "assistant"
      }
    }
  ],
  "created": 1688154949,
  "id": "chatcmpl-7XEVZCiDCr8DnbwVetBmghVEUZLJK",
  "model": "gpt-4-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 25,
    "prompt_tokens": 585,
    "total_tokens": 610
  }
}
 [0m Total running cost: $0.057
 [0m Prompt: [
  {
    "role": "system",
    "content": "Instructions:\nYour task is to generate a set of constraints for web form fields. Your decisions must be made independently without seeking user assistance or additional information. If there are multiple ways to express constraints, use the least number of constraints to describe them. Only generate the constraints and refrain from explaining your answers. Only generate constraints for the input field in

 [0m Response: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "expect(field('bkmgFlights_destination_trip_1'))\n.toBeTruthy()\n.not.toBeEqual(field('bkmgFlights_origin_trip_1'))",
        "role": "assistant"
      }
    }
  ],
  "created": 1688154960,
  "id": "chatcmpl-7XEVklzLE0mbGppMUUGG7P1VgXoCC",
  "model": "gpt-4-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 29,
    "prompt_tokens": 613,
    "total_tokens": 642
  }
}
 [0m Total running cost: $0.114
 [0m Prompt: [
  {
    "role": "system",
    "content": "Instructions:\nYour task is to generate a set of constraints for web form fields. Your decisions must be made independently without seeking user assistance or additional information. If there are multiple ways to express constraints, use the least number of constraints to describe them. Only generate the constraints and refrain from explaining your answers. Only generate constraints for

In [13]:
def split_constant_and_field_constraints(constraints):
    """Partition ``constraints`` by each item's ``is_field`` flag.

    Returns a ``(constants, fields)`` tuple of lists: ``constants`` holds the items
    whose ``is_field`` is falsy, ``fields`` those whose ``is_field`` is truthy.
    Relative order within each list follows the input.
    """
    constants = [item for item in constraints if not item.is_field]
    fields = [item for item in constraints if item.is_field]
    return constants, fields

In [14]:
# Value generation loop: for every non-button input group that has a row in the
# fill table, ask the LLM for a value and store it on that row.

for input_group in input_groups:
    if input_group.node.element.name == 'button':
        continue

    fill_entry = fill_table.get_entry_by_input_group(input_group)
    if fill_entry is None:
        continue

    constraints = fill_entry.constraints
    constant_constraints, field_constraints = split_constant_and_field_constraints(constraints)

    # Constant constraints are always included. A field constraint is included only
    # when the field it references is in the table and already has a value.
    including_constraints = list(constant_constraints)
    relevant_field_values = []
    for field_constraint in field_constraints:
        field_arg = field_constraint.get_field_args()[0]
        if field_arg in fill_table.entries:
            value = fill_table.get_entry_by_field_id(field_arg).value
            if value is not None:
                including_constraints.append(field_constraint)
                relevant_field_values.append((field_arg, value))

    value_user_prompt = create_value_generation_user_prompt(
        form_context,
        fill_entry.input_group,
        including_constraints,
        # An empty list is passed on as None.
        relevant_field_values=relevant_field_values or None,
    )

    fill_entry.set_value(
        generate_values_with_llm(
            value_user_prompt,
            openai_api_key=openai.api_key,
        )
    )

 [0m Prompt: [
  {
    "role": "system",
    "content": "Your task is to generate a value for a web form field based on the form field information and a set of constraints on the field. Your decisions must be made independently without seeking user assistance or additional information. For each user prompt, you need to generate one value that satisfies the constraints while keeping in mind the nature of the input from the available information. Only generate value and refrain from explaining your answers. Only generate value for the input field in question, and not the ones in the relevant information section."
  },
  {
    "role": "user",
    "content": "The following are all the labels in the form, which provide context for the functionality of the form:  Round-trip ,  One-way ,  Multi-city/Stopover ,  Book with points , From, To, Departure, Return\n    We are generating filling values for the following input field:\n    input: <input id=\"bkmgFlights_tripTypeSelector_R\" name=\"tri

 [0m Response: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "New York",
        "role": "assistant"
      }
    }
  ],
  "created": 1688155024,
  "id": "chatcmpl-7XEWmHcL3KLG8tOxImgV90gope1Mi",
  "model": "gpt-4-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 2,
    "prompt_tokens": 239,
    "total_tokens": 241
  }
}
 [0m Total running cost: $0.205
 [0m Prompt: [
  {
    "role": "system",
    "content": "Your task is to generate a value for a web form field based on the form field information and a set of constraints on the field. Your decisions must be made independently without seeking user assistance or additional information. For each user prompt, you need to generate one value that satisfies the constraints while keeping in mind the nature of the input from the available information. Only generate value and refrain from explaining your answers. Only generate value for the input field in

In [15]:
from method.ours.utils import interact_with_input

In [16]:
# Print each row's node and generated value, then enter the value into the
# matching element on the live page.
for input_group in input_groups:
    entry = fill_table.get_entry_by_input_group(input_group)
    if entry is not None:
        print(entry.input_group.node)
        print(entry.value)

        element = driver.find_element(By.XPATH, entry.input_group.node.xpath)

        # Elements of type submit/radio/checkbox are located but not interacted with here.
        skipped_types = ['submit', 'radio', 'checkbox']
        if element.get_attribute('type') not in skipped_types:
            interact_with_input(element, entry.value)

<input>value is R</input> at y: (567, 597), x: (219, 319)
R
<input>value is O</input> at y: (567, 597), x: (334, 421)
O
<input>value is M</input> at y: (567, 597), x: (437, 591)
M
<input></input> at y: (574, 594), x: (652, 905)
true
<input></input> at y: (628, 678), x: (198, 466)
Vancouver
<input></input> at y: (628, 678), x: (536, 804)
New York
<input>DD/MM</input> at y: (628, 678), x: (833, 958)
15/07
<input>DD/MM</input> at y: (628, 678), x: (978, 1093)
20/07


In [17]:
# Locate the submit control among the parsed input groups.
# `.get('type')` is used instead of `['type']` so that elements without a `type`
# attribute are skipped rather than raising KeyError.
# (assumes `element.attrs` is a dict-like attribute mapping, as on a BeautifulSoup Tag — TODO confirm)
submit_candidates = [
    group
    for group in input_groups
    if group.node.element.attrs.get('type') == 'submit'
]
if not submit_candidates:
    # Indexing an empty list would raise IndexError anyway; raise it with a useful message.
    raise IndexError("no input group with type='submit' was found in the form")
submit = submit_candidates[0]

In [18]:
# Look up the submit control on the live page by its xpath and interact with it
# (True is passed as the value argument).
interact_with_input(driver.find_element(By.XPATH, submit.node.xpath), True)

# Feedback with Diff

In [20]:
# Snapshot of the page <body> markup after the submit interaction; diffed against `html1` below.
html2 = driver.find_element(By.TAG_NAME, 'body').get_attribute('outerHTML')

In [21]:
from bs4 import BeautifulSoup
from deepdiff import DeepDiff

# Build a parse tree for each page snapshot (html1 and html2).
soup1 = BeautifulSoup(markup=html1, features='html.parser')
soup2 = BeautifulSoup(markup=html2, features='html.parser')

# Compare the two trees.
# NOTE(review): the recorded result of `diff` below is `{}` — confirm that DeepDiff on
# BeautifulSoup objects actually detects markup changes.
diff = DeepDiff(soup1, soup2)

# List every change category followed by its indented entries.
for change in diff:
    print(change)
    for detail in diff[change]:
        print("   ", detail)

In [22]:
# Lower-case phrases that mark a piece of changed page text as validation/error
# feedback. `has_feedback_keyword` checks whether any of them occurs as a
# substring of a sentence.
feedback_keywords = [
    'invalid',
    'required',
    'error',
    'not possible',
    'missing',
    'failed',
    'incorrect',
    'retry',
    'verify',
    'exceeds',
    'denied',
    'please enter',
    'format',
    'does not match',
    'try again',
    'unavailable',
    'warning',
    'out of range',
    'unable to',
    'not recognized',
    'not allowed',
    'incomplete',
    'cannot process',
    'duplicate',
    'too short',
    'too long',
    'doesn\'t exist',
    'does not exist',
    'already exist',
    'authentication failed',
    'expired',
    'update failed',
    'connection',
    'server error',
    'system error',
    'not valid',
    'timed out',
    'access denied',
    'please confirm',
    'not acceptable',
    'submission failed',
    'unexpected',
    'conflict',
    'not applicable',
    'not active',
    'not found',
    'not secure',
    'forbidden',
    'disconnect',
    'prohibited',
]

In [23]:
# Display the raw diff result.
diff

{}

In [24]:
# Gather the "new" side of every difference reported in `diff`.
changes = []

# Entries under type_changes / values_changed carry their new content in 'new_value'.
for section in ('type_changes', 'values_changed'):
    if section in diff:
        changes.extend(delta['new_value'] for delta in diff[section].values())

# Entries under iterable_item_added are taken as they are.
if 'iterable_item_added' in diff:
    changes.extend(diff['iterable_item_added'].values())

In [25]:
def _change_to_text(change):
    """Return the stripped text of one collected change.

    Objects exposing a ``text`` attribute contribute that text. Anything else
    (for example a plain string, which is what this cell itself produces) is
    converted with ``str``. ``None`` yields an empty string.
    """
    text = getattr(change, 'text', change)
    if text is None:
        return ''
    return str(text).strip()


# Reduce every change to its non-empty stripped text. Because plain strings are
# accepted, re-running this cell on its own output is safe (calling `.text` on a
# plain string would raise AttributeError).
changes = [text for text in map(_change_to_text, changes) if text != '']

In [26]:
def has_feedback_keyword(sentence, keywords=None):
    """Tell whether ``sentence`` contains any feedback keyword.

    Matching is a case-insensitive substring test, so "Invalid date" matches the
    keyword 'invalid'.

    Args:
        sentence: Text to inspect.
        keywords: Optional iterable of keyword phrases. Defaults to the
            notebook-level ``feedback_keywords`` list.

    Returns:
        True if at least one keyword occurs in ``sentence``, otherwise False.
    """
    if keywords is None:
        keywords = feedback_keywords
    lowered = sentence.lower()
    return any(keyword.lower() in lowered for keyword in keywords)


# Keep only the changed texts that contain a feedback keyword.
changes = [sentence for sentence in changes if has_feedback_keyword(sentence)]

In [27]:
# Display the page-level feedback texts that survived the keyword filter.
changes

[]

# Feedback

In [28]:
# Re-locate the form after submission, using the same index (1) as the initial selection.
new_form = driver.find_elements(By.TAG_NAME, 'form')[1]

In [29]:
# Re-parse the form after submission; the previous relation graph is passed in
# alongside the re-located form.
new_relation_graph, new_input_groups = parse_after_feedback(driver, new_form, relation_graph, TEXT_EMBEDDING_METHOD)

Computing transition probabilities:   0%|          | 0/366 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 762.00it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 769.68it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 680.25it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 705.44it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38/38 [00:11<00:00,  3.26it/s]
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [31]:
# Compare the graph before and after submission; the displayed result has
# 'added' and 'removed' keys.
relation_graph.diff(new_relation_graph)

{'added': [<span>New York </span> at y: (732, 754), x: (536, 611),
  <span> All airports </span> at y: (732, 754), x: (611, 692),
  <span>Sat Jul 15</span> at y: (732, 754), x: (833, 901),
  <span>Thu Jul 20</span> at y: (732, 754), x: (978, 1051)],
 'removed': []}

In [32]:
# Re-generate constraints for every non-button field that has feedback after
# the last submission.
for input_group in new_input_groups:
    # TODO: set more conditions on <button> to be sure that buttons here are inputs
    if input_group.node.element.name == 'button':
        continue

    last_entry = fill_table.get_entry_by_input_group(input_group)
    if last_entry is None:
        # No row in the fill table for this group, so there is no previous value
        # or constraint set to refine. Skip it, as the value-generation and
        # filling loops do.
        continue

    # Feedback linked to this input through P_FEEDBACK edges of its node.
    local_feedback = [
        edge.target.element.text
        for edge in input_group.node.edges.values()
        if edge.type.value == 'P_FEEDBACK'
    ]
    # Page-level feedback from the HTML diff is applied to every field.
    global_feedback = changes

    feedback_string = '\n'.join([*local_feedback, *global_feedback]).strip()
    if feedback_string == '':
        # Nothing to learn from for this field; keep its current constraints.
        continue

    last_try = {
        "value": last_entry.value,
        "feedback": feedback_string
    }

    constraint_user_prompt = create_constraint_generation_user_prompt(
        form_context,
        input_group,
        last_try=last_try,
        constraints=last_entry.constraints
    )

    generated_constraints = generate_constraints_with_llm(
        constraint_user_prompt,
        openai_api_key=openai.api_key
    )

    field_name, constraints = generate_constraints_from_string(generated_constraints)
    # add_entry stores a fresh row, so the value is regenerated by the next
    # value-generation pass.
    fill_table.add_entry(field_name, input_group, constraints)

 [0m Prompt: [
  {
    "role": "system",
    "content": "Instructions:\nYour task is to generate a set of constraints for web form fields. Your decisions must be made independently without seeking user assistance or additional information. If there are multiple ways to express constraints, use the least number of constraints to describe them. Only generate the constraints and refrain from explaining your answers. Only generate constraints for the input field in question, not those in the relevant information section. You must choose your constraints in the format of our modified version of the Jest library in JavaScript. The list of functions in this modified format are:\n1. toBeEqual(value) # the input field value is exactly equal to the given value\n2. toHaveLengthCondition(condition, value) # the length of the input field value matches the given condition\n3. toBeTruthy() # the input field value is truthy and not empty (not false, 0, '', null, undefined, or NaN)\n4. toHaveCompareCo

In [33]:
# Value generation loop (second pass, after feedback): for every non-button input
# group that has a row in the fill table, ask the LLM for a value and store it.
# NOTE(review): this cell repeats the earlier value-generation cell and iterates
# `input_groups`, not `new_input_groups` — consider extracting a shared function.

for input_group in input_groups:
    if input_group.node.element.name == 'button':
        continue

    fill_entry = fill_table.get_entry_by_input_group(input_group)
    if fill_entry is None:
        continue

    constraints = fill_entry.constraints
    constant_constraints, field_constraints = split_constant_and_field_constraints(constraints)

    # Constant constraints are always included. A field constraint is included only
    # when the field it references is in the table and already has a value.
    including_constraints = list(constant_constraints)
    relevant_field_values = []
    for field_constraint in field_constraints:
        field_arg = field_constraint.get_field_args()[0]
        if field_arg in fill_table.entries:
            value = fill_table.get_entry_by_field_id(field_arg).value
            if value is not None:
                including_constraints.append(field_constraint)
                relevant_field_values.append((field_arg, value))

    value_user_prompt = create_value_generation_user_prompt(
        form_context,
        fill_entry.input_group,
        including_constraints,
        # An empty list is passed on as None.
        relevant_field_values=relevant_field_values or None,
    )

    fill_entry.set_value(
        generate_values_with_llm(
            value_user_prompt,
            openai_api_key=openai.api_key,
        )
    )

 [0m Prompt: [
  {
    "role": "system",
    "content": "Your task is to generate a value for a web form field based on the form field information and a set of constraints on the field. Your decisions must be made independently without seeking user assistance or additional information. For each user prompt, you need to generate one value that satisfies the constraints while keeping in mind the nature of the input from the available information. Only generate value and refrain from explaining your answers. Only generate value for the input field in question, and not the ones in the relevant information section."
  },
  {
    "role": "user",
    "content": "The following are all the labels in the form, which provide context for the functionality of the form:  Round-trip ,  One-way ,  Multi-city/Stopover ,  Book with points , From, To, Departure, Return\n    We are generating filling values for the following input field:\n    input: <input id=\"bkmgFlights_tripTypeSelector_R\" name=\"tri

 [0m Response: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "New York",
        "role": "assistant"
      }
    }
  ],
  "created": 1688155111,
  "id": "chatcmpl-7XEYBVd9edR5jWICvcccj7RlQaOJC",
  "model": "gpt-4-0613",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 2,
    "prompt_tokens": 252,
    "total_tokens": 254
  }
}
 [0m Total running cost: $0.293
 [0m Prompt: [
  {
    "role": "system",
    "content": "Your task is to generate a value for a web form field based on the form field information and a set of constraints on the field. Your decisions must be made independently without seeking user assistance or additional information. For each user prompt, you need to generate one value that satisfies the constraints while keeping in mind the nature of the input from the available information. Only generate value and refrain from explaining your answers. Only generate value for the input field in

In [34]:
# Print each row's node and regenerated value, then enter the value into the
# matching element on the live page.
for input_group in input_groups:
    entry = fill_table.get_entry_by_input_group(input_group)
    if entry is not None:
        print(entry.input_group.node)
        print(entry.value)

        element = driver.find_element(By.XPATH, entry.input_group.node.xpath)

        # Elements of type submit/radio/checkbox are located but not interacted with here.
        skipped_types = ['submit', 'radio', 'checkbox']
        if element.get_attribute('type') not in skipped_types:
            interact_with_input(element, entry.value)

<input>value is R</input> at y: (567, 597), x: (219, 319)
R
<input>value is O</input> at y: (567, 597), x: (334, 421)
O
<input>value is M</input> at y: (567, 597), x: (437, 591)
M
<input></input> at y: (574, 594), x: (652, 905)
true
<input></input> at y: (628, 678), x: (198, 466)
Vancouver
<input></input> at y: (710, 760), x: (536, 804)
New York
<input>DD/MM</input> at y: (628, 678), x: (833, 958)
15/07
<input>DD/MM</input> at y: (628, 678), x: (978, 1093)
20/07


In [35]:
# Locate the submit control among the parsed input groups.
# `.get('type')` is used instead of `['type']` so that elements without a `type`
# attribute are skipped rather than raising KeyError.
# (assumes `element.attrs` is a dict-like attribute mapping, as on a BeautifulSoup Tag — TODO confirm)
submit_candidates = [
    group
    for group in input_groups
    if group.node.element.attrs.get('type') == 'submit'
]
if not submit_candidates:
    # Indexing an empty list would raise IndexError anyway; raise it with a useful message.
    raise IndexError("no input group with type='submit' was found in the form")
submit = submit_candidates[0]

In [36]:
# Look up the submit control on the live page by its xpath and interact with it
# again (True is passed as the value argument).
interact_with_input(driver.find_element(By.XPATH, submit.node.xpath), True)