In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported modules (such as the local `method.ours.*` helpers used below) are
# picked up before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import openai
from dotenv import load_dotenv

from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs


from method.ours.utils import create_driver, embed_properties_into_html
from method.ours.preprocessing import get_processable_nodes
from method.ours.graph_embedding import create_node2vec_model
from method.ours.relation_graph import create_relation_graph
from method.ours.node_linking import create_base_links
from method.ours.similarity import add_similarity_scores_to_graph
from method.ours.edge_pruning import (
    prune_relation_graph_extra_edges,
    prune_low_score_uncertain_edges
)
from method.ours.input_group import (
    create_input_groups,
    prune_low_score_group_relations
)

In [3]:
# Load variables from a local .env file into the environment, then hand the
# OpenAI key to the SDK. `os.getenv` returns None when the variable is unset,
# so a missing key is not reported by this cell.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Global Variables
# HEADLESS is forwarded to create_driver() when the browser session is started.
HEADLESS = False
# Text embedding backend handed to create_relation_graph().
TEXT_EMBEDDING_METHOD = 'ADA' # ['ADA', 'WORD2VEC', 'SPACY']
# NOTE(review): not referenced by any other cell visible in this notebook.
GRAPH_EMBEDDING_METHOD = 'NODE2VEC' # ['NODE2VEC', 'GCN']

In [4]:
# Start a browser session through the project helper (HEADLESS is its only
# argument here) and open the page whose forms are analysed in the next cells.
# NOTE(review): the driver is never closed in the visible cells — confirm
# whether a driver.quit() belongs at the end of the notebook.
driver = create_driver(HEADLESS)
driver.get('https://www.aircanada.com/ca/en/aco/home.html')

In [5]:
# Choose the form to analyse. The two commented-out selectors are alternative
# ways of picking a form (presumably left from runs on other pages — TODO
# confirm or delete).
# form = driver.find_elements(By.TAG_NAME, 'form')[48]
# form = driver.find_element(By.ID, 'register')
# Index 1 takes the second <form> element found on the current page.
form = driver.find_elements(By.TAG_NAME, 'form')[1]
# `form` is rebound to the helper's return value, which is what gets
# serialised below.
form = embed_properties_into_html(driver, form)

# Parse the form's outer HTML into a BeautifulSoup document for later stages.
form_doc = bs(form.get_attribute('outerHTML'), 'html.parser')

In [6]:
# Both inputs for the relation graph are derived from the parsed form document.
form_processable_nodes = get_processable_nodes(form_doc)
# NOTE(review): no seed is passed here; if the helper's node2vec walks are
# random, results may differ between runs — confirm inside create_node2vec_model.
node2vec_model = create_node2vec_model(form_doc)

Computing transition probabilities:   0%|          | 0/340 [00:00<?, ?it/s]

Generating walks (CPU: 1):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 3):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 2):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 4):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 681.18it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 672.99it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 648.38it/s]
Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 634.14it/s]


In [7]:
# Build the relation graph in stages; every stage takes the previous result
# and rebinds `relation_graph`, so the cell must be run as a whole.
relation_graph = create_relation_graph(form_processable_nodes, TEXT_EMBEDDING_METHOD)

# Add the base links between nodes.
relation_graph = create_base_links(relation_graph)

# Score the graph using the node2vec model.
# NOTE(review): the saved output of this cell ends with a warning that points
# at a cosine-similarity expression (dot product divided by the product of
# norms) — presumably a zero-norm vector; confirm which stage emits it.
relation_graph = add_similarity_scores_to_graph(node2vec_model, relation_graph)

# Two pruning passes; `factor=0.5` is passed to the second one.
relation_graph = prune_relation_graph_extra_edges(relation_graph)
relation_graph = prune_low_score_uncertain_edges(relation_graph, factor=0.5)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:10<00:00,  3.53it/s]
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [8]:
# Derive the input groups from the pruned relation graph, then prune their
# relations using the node2vec model and `factor=0.5`.
input_groups = create_input_groups(relation_graph)
input_groups = prune_low_score_group_relations(input_groups, node2vec_model, factor=0.5)

# Constraint Generation

In [45]:
# System message for the chat model: lists the assertion vocabulary and shows
# the expected output format through two samples.
# The samples must only use assertions that appear in the numbered list, so the
# cross-field sample uses `toBeEqual` (assertion 1) with a field reference as
# its value.
system_prompt = '''
Your task is to generate a set of assertions for form fields. The list of the assertions and their signatures is as following:

1. toBeEqual(value) # for any input type
2. toHaveLengthCondition(condition, value) # for textual inputs
3. toBeTruthy() # for boolean inputs
4. toHaveCondition(condition, value) # for numeric or date inputs
5. toBeEmpty() # for any input type
6. toMatch(regexPattern) # for textual inputs

generate conditions in the same format as the sample:

# sample
expect(field('username'))
.toHaveLengthCondition('>', 8)
.toHaveLengthCondition('<', 50)
.not.toBeEmpty()
# end of sample

if there are multiple inputs that should have the relation with each other, you can use a format such as:

# sample
expect(field('password'))
.toBeEqual(field('confirm password'))
# end of sample

Only generate the assertions and nothing else. Only generate assertions for the inputs in question, and not the ones in the relevant information section.
'''

In [62]:
# Select the group to generate assertions for; 4 is a hand-picked position in
# `input_groups` (the next cell displays which input element it holds).
input_group = input_groups[4]

In [63]:
# Display the element of the selected group as a sanity check.
input_group.node.element

<input id="bkmgFlights_origin_trip_1" name="bkmgFlights_origin_trip_1" type="text"/>

In [64]:
# Build the user prompt for the selected field (`input_group`).
#
# A stray bare reference to `relevant_input_groups_str` used to sit before its
# assignment; on a fresh kernel it raised NameError, so it was removed.

# Full textual description of the field being filled.
input_data_str = str(input_group)

# Neighbouring groups are described without their text tags: everything from
# this marker onwards in str(group) is dropped.
TEXT_TAGS_MARKER = 'with the following relevant text tags'
# Number of neighbouring groups included in the prompt.
MAX_RELEVANT_GROUPS = 3

# Each edge is indexed as (neighbour, score); order by score, highest first.
relevant_input_groups = [
    str(edge[0]).split(TEXT_TAGS_MARKER)[0]
    for edge in sorted(input_group.edges, key=lambda edge: edge[1], reverse=True)
]

# Number the kept neighbours from 1, one "N.\n<description>" entry each.
relevant_input_groups_str = "\n".join(
    f'''{position}.\n{group_text}'''.strip()
    for position, group_text in enumerate(
        relevant_input_groups[:MAX_RELEVANT_GROUPS], start=1
    )
)


input_prompt = f'''
We are filling the following field in the form:
{input_data_str}

The relevant information available in the form are (in order of relevance):
{relevant_input_groups_str}
'''

In [65]:
# print() rather than a bare expression so the newlines in the prompt are
# rendered instead of shown as escaped characters in the string repr.
print(input_prompt)


We are filling the following field in the form:
input: <input id="bkmgFlights_origin_trip_1" name="bkmgFlights_origin_trip_1" type="text"/>
with label: From
with the following relevant text tags:
<span>Vancouver </span>
<span>From</span>

The relevant information available in the form are (in order of relevance):
1.
input: <input id="bkmgFlights_destination_trip_1" name="bkmgFlights_destination_trip_1" type="text"/>
with label: To
2.
input: <input id="bkmgFlights_travelDates_1-formfield-2" name="bkmgFlights_travelDates_1-formfield-2" type="text"/>
with label: Return
3.
input: <input id="bkmgFlights_travelDates_1-formfield-1" name="bkmgFlights_travelDates_1-formfield-1" type="text"/>
with label: Departure



In [66]:
# Conversation sent to the chat model: the fixed instructions as the system
# turn, followed by the field-specific prompt as the user turn.
messages = [
    {'role': speaker, 'content': text}
    for speaker, text in (
        ('system', system_prompt),
        ('user', input_prompt),
    )
]

In [67]:
# Send the system + user messages to the chat completion endpoint.
# NOTE(review): `openai.ChatCompletion` is the interface of the pre-1.0 OpenAI
# Python SDK — confirm the pinned `openai` version before upgrading.
response = openai.ChatCompletion.create(
    # Alternative model, currently disabled:
    # model='gpt-3.5-turbo',
    model='gpt-4',
    messages=messages,
    # Non-zero temperature: repeated runs may return different assertions.
    temperature=0.5,
)

In [68]:
# Take the message content of the first returned choice and print it.
response_text = response.choices[0].message.content
print(response_text)

expect(field('bkmgFlights_origin_trip_1'))
.toHaveLengthCondition('>', 1)
.toHaveLengthCondition('<', 100)
.not.toBeEmpty()
.toMatch(/^[a-zA-Z\s]+$/)
