In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# (e.g. the `method.ours` package imported below) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import time
import openai
from dotenv import load_dotenv

from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs


from method.ours.utils import create_driver, embed_properties_into_html
from method.ours.preprocessing import get_processable_nodes
from method.ours.graph_embedding import create_node2vec_model
from method.ours.relation_graph import create_relation_graph
from method.ours.node_linking import create_base_links
from method.ours.similarity import add_similarity_scores_to_graph
from method.ours.edge_pruning import (
    prune_relation_graph_extra_edges,
    prune_low_score_uncertain_edges
)
from method.ours.input_group import (
    create_input_groups,
    prune_low_score_group_relations
)

In [3]:
# Read variables from a local .env file into the process environment, then
# hand the OpenAI key to the client (the key stays None if the variable is unset).
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# --- Notebook-wide settings ---
# Forwarded to create_driver() when the browser is started.
HEADLESS = False
# Text-embedding backend passed to create_relation_graph().
# Options listed by the author: 'ADA', 'WORD2VEC', 'SPACY'.
TEXT_EMBEDDING_METHOD = 'ADA'
# Graph-embedding backend; options listed by the author: 'NODE2VEC', 'GCN'.
# NOTE(review): not referenced by any cell visible in this notebook.
GRAPH_EMBEDDING_METHOD = 'NODE2VEC'

In [4]:
# Start the browser (HEADLESS is forwarded to create_driver) and open the page
# whose form is analysed in the following cells.
start_url = 'https://www.aircanada.com/ca/en/aco/home.html'
driver = create_driver(HEADLESS)
driver.get(start_url)

In [5]:
# Selectors tried earlier, kept for reference (presumably for other pages --
# confirm before reuse):
# form = driver.find_elements(By.TAG_NAME, 'form')[48]
# form = driver.find_element(By.ID, 'register')

# Use the second <form> element found on the current page (index 1).
page_forms = driver.find_elements(By.TAG_NAME, 'form')
form = page_forms[1]

In [6]:
def parse_form(form, uncertain_edge_factor=0.5, group_relation_factor=0.5):
    """Run the full form-analysis pipeline and return the pruned input groups.

    Steps, in order: embed properties into the form's HTML, parse the form's
    ``outerHTML`` with BeautifulSoup, collect the processable nodes, create a
    node2vec model from the parsed document, build the relation graph, add
    base links and similarity scores, prune edges, and finally build and
    prune the input groups.

    Relies on the notebook globals ``driver`` and ``TEXT_EMBEDDING_METHOD``.

    Args:
        form: Form element handed to ``embed_properties_into_html``; the value
            that helper returns must expose ``get_attribute('outerHTML')``.
        uncertain_edge_factor: ``factor`` forwarded to
            ``prune_low_score_uncertain_edges``. Defaults to 0.5, the value
            that used to be hard-coded.
        group_relation_factor: ``factor`` forwarded to
            ``prune_low_score_group_relations``. Defaults to 0.5, the value
            that used to be hard-coded.

    Returns:
        The result of ``prune_low_score_group_relations``; later cells index
        it like a sequence of input groups.
    """
    # The helper returns the element to read from, so rebind `form` to it.
    form = embed_properties_into_html(driver, form)
    form_doc = bs(form.get_attribute('outerHTML'), 'html.parser')

    form_processable_nodes = get_processable_nodes(form_doc)
    node2vec_model = create_node2vec_model(form_doc)

    # Build the graph, then enrich it step by step.
    relation_graph = create_relation_graph(form_processable_nodes, TEXT_EMBEDDING_METHOD)
    relation_graph = create_base_links(relation_graph)
    relation_graph = add_similarity_scores_to_graph(node2vec_model, relation_graph)

    # Edge pruning: structural pass first, then the score-based pass.
    relation_graph = prune_relation_graph_extra_edges(relation_graph)
    relation_graph = prune_low_score_uncertain_edges(
        relation_graph, factor=uncertain_edge_factor
    )

    input_groups = create_input_groups(relation_graph)
    input_groups = prune_low_score_group_relations(
        input_groups, node2vec_model, factor=group_relation_factor
    )

    return input_groups

In [7]:
# Time one end-to-end run of the form-parsing pipeline.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by system clock adjustments the way time.time() can.
t0 = time.perf_counter()
input_groups = parse_form(form)
print('running time:', time.perf_counter() - t0)

Computing transition probabilities:   0%|          | 0/340 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 519.05it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 499.41it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 462.07it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 466.34it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:08<00:00,  4.13it/s]
  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


running time: 42.94207978248596


# Constraint Generation

In [98]:
# Field under study: the group at position 8, which the cell output shows to be
# the "Return" travel-date input on this page.
group_index = 8
input_group = input_groups[group_index]
input_group.node.element

<input id="bkmgFlights_travelDates_1-formfield-2" name="bkmgFlights_travelDates_1-formfield-2" type="text"/>

In [99]:
# Text description of the field being filled.
input_data_str = str(input_group)

# Neighbouring groups, ordered from highest to lowest on the second element of
# each edge (used here as the relevance score). Each description is cut right
# before its "relevant text tags" section, so only the leading lines remain.
ranked_edges = sorted(input_group.edges, key=lambda edge: edge[1], reverse=True)
relevant_input_groups = [
    str(edge[0]).split('with the following relevant text tags')[0]
    for edge in ranked_edges
]

# Number the three top-ranked neighbours, starting from 1.
numbered_entries = []
for position, description in enumerate(relevant_input_groups[:3]):
    numbered_entries.append(f'''{position + 1}.\n{description}'''.strip())
relevant_input_groups_str = "\n".join(numbered_entries)


# User prompt shared by the constraint-generation and value-generation requests.
input_prompt = f'''
We are filling the following field in the form:
{input_data_str}

The relevant information available in the form are (in order of relevance):
{relevant_input_groups_str}
'''

In [100]:
# print() renders the prompt's line breaks; a bare expression would display the
# escaped repr instead.
print(input_prompt)


We are filling the following field in the form:
input: <input id="bkmgFlights_travelDates_1-formfield-2" name="bkmgFlights_travelDates_1-formfield-2" type="text"/>
with label: Return
with the following relevant text tags:
<span>Return</span>

The relevant information available in the form are (in order of relevance):
1.
input: <input id="bkmgFlights_travelDates_1-formfield-1" name="bkmgFlights_travelDates_1-formfield-1" type="text"/>
with label: Departure
2.
input: <input id="bkmgFlights_tripTypeSelector_M" name="tripType" type="radio"/>
with label: Multi-city/Stopover
3.
input: <input id="bkmgFlights_origin_trip_1" name="bkmgFlights_origin_trip_1" type="text"/>
with label: From



In [112]:
# System prompt for the constraint-generation request: it lists the assertion
# vocabulary and shows one sample of the expected `expect(field(...))` chain.
# NOTE(review): the sentence "generate conditions in the as the sample:" is
# garbled. It is left untouched because rewording the prompt may change the
# model output recorded in this notebook.
# NOTE(review): the sample uses `.toBeExactlyEqual(...)` and the `.not.`
# modifier, neither of which appears in the numbered assertion list -- confirm
# that the constraint parser supports them.
system_prompt = '''
Your task is to generate a set of assertions for form fields. Your decisions must be made independently without seeking user assistance or additional information.

The list of the assertions and their signatures is as follows:

1. toBe(value) # the input field value exactly matches the given value
2. toHaveLengthCondition(condition, value) # the length of the input field value matches the given condition
3. toBeTruthy() # the input field value is truthy and not empty (not false, 0, '', null, undefined, or NaN)
4. toHaveCompareCondition(condition, value) # the input field value has the given condition to the given value
5. toContainSubstr(value) # the input field value contains a specific string
6. toContainChar(value) # the input field value contains a specific character
7. toBeAlpha() # the input field value is alphabetic
8. toBeNumeric() # the input field value is numeric
9. toBeAlphaNumeric() # the input value contains both alphabetical and numeric characters
10. toHaveUpperCase() # the input value field contains upper case characters
11. toHaveSpecialChars() # the input value field contains special characters
12. toHaveWhiteSpace() # the input value field contains whitespace
13. toStartWith(value) # the input value field must start with a certain value
14. toEndWith(value) # the input value field must end with a certain value
15. toMatch(regex) # the input value field must match a certain regex pattern

generate conditions in the as the sample:

# sample
expect(field('password'))
.toHaveLengthCondition('>', 8)
.toHaveLengthCondition('<', 50)
.toBeAlphaNumeric()
.toHaveUpperCase()
.toHaveSpecialChars()
.not.toBeTruthy()
.toBeExactlyEqual(field('confirm password'))
# end of sample

If there are multiple ways to express assertions, use the one with the least number of assertions to describe it.
Only generate the assertions and don't explain your answers.
Only generate assertions for the inputs in question, not those in the relevant information section.
'''


# Conversation for the constraint-generation request: the instructions go in
# the system turn, the field description in the user turn.
messages = [
    {'role': 'system', 'content': system_prompt},
    {'role': 'user', 'content': input_prompt},
]

In [113]:
# Ask the chat model for the assertions. Temperature 0 is used to keep the
# sampling as repeatable as the API allows.
# Alternative model kept from the original cell: 'gpt-3.5-turbo'.
# NOTE(review): `openai.ChatCompletion` is the pre-1.0 SDK interface; this call
# needs an `openai` package older than 1.0.
assertion_request = {
    'model': 'gpt-4',
    'messages': messages,
    'temperature': 0,
}
response = openai.ChatCompletion.create(**assertion_request)

In [114]:
# Take the text of the first completion choice and show it.
first_choice = response.choices[0]
response_text = first_choice.message.content
print(response_text)

expect(field('bkmgFlights_travelDates_1-formfield-2'))
.toBeTruthy()
.toHaveCompareCondition('>', field('bkmgFlights_travelDates_1-formfield-1'))


In [115]:
from method.ours.constraints import ConstraintFactory

In [116]:
# Pull the chained assertions out of the model reply: every line that starts
# with '.' (after trimming surrounding whitespace) is one assertion, returned
# without the leading dot. Lines that do not start with '.', such as the
# `expect(...)` head, are skipped.
# Working line by line instead of splitting on '\n.' also copes with CRLF line
# endings and indented replies, and keeps trailing newlines or code fences out
# of the last assertion.
constraint_strings = [
    line.strip()[1:]
    for line in response_text.splitlines()
    if line.strip().startswith('.')
]

In [118]:
# Build one constraint object per assertion string.
constraints = [ConstraintFactory.create(text) for text in constraint_strings]

In [119]:
# One prompt line per constraint, in the order the constraints were parsed.
# NOTE(review): in the printed prompt below, the toHaveCompareCondition line
# reads "input field should be  field(...)" -- the comparison operator appears
# to be missing. Check `to_prompt_string` / `ConstraintFactory.create`.
constraint_lines = [constraint.to_prompt_string() for constraint in constraints]
input_constraints_string = '\n'.join(constraint_lines)


# User prompt for the value-generation request: the field description followed
# by the constraints the generated values must satisfy.
value_generation_prompt = f'''
{input_prompt}

generate values based on the following constraints:
{input_constraints_string}
'''

In [120]:
# print() renders the prompt's line breaks; a bare expression would display the
# escaped repr instead.
print(value_generation_prompt)



We are filling the following field in the form:
input: <input id="bkmgFlights_travelDates_1-formfield-2" name="bkmgFlights_travelDates_1-formfield-2" type="text"/>
with label: Return
with the following relevant text tags:
<span>Return</span>

The relevant information available in the form are (in order of relevance):
1.
input: <input id="bkmgFlights_travelDates_1-formfield-1" name="bkmgFlights_travelDates_1-formfield-1" type="text"/>
with label: Departure
2.
input: <input id="bkmgFlights_tripTypeSelector_M" name="tripType" type="radio"/>
with label: Multi-city/Stopover
3.
input: <input id="bkmgFlights_origin_trip_1" name="bkmgFlights_origin_trip_1" type="text"/>
with label: From


generate values based on the following constraints:
input field should be non-empty
input field should be  field('bkmgFlights_travelDates_1-formfield-1')



In [121]:
# System prompt for the value-generation request: asks for five distinct values
# that satisfy the constraints, returned as a single array.
# NOTE(review): the prompt asks for a "Python array" that must parse with
# json.loads; a Python-style list with single-quoted strings is not valid JSON.
# No cell visible in this notebook actually parses the reply.
system_value_prompt = '''
Your task is to generate a set of values for a form field based on the form field information and a set of constraints on the field. Your decisions must always be made independently without seeking user assistance or additional information.
For each user prompt, you need to generate five distinct values that satisfy the constraints while keeping in mind the nature of the input from the available information.
Only generate the values and don't explain your answers.
Generate the values in a Python array. We must be able to parse your generation with json.loads.
Only generate values for the inputs in question, and not the ones in the relevant information section.
'''

# Conversation for the value-generation request: the instructions go in the
# system turn, the field description plus constraints in the user turn.
filling_messages = [
    {'role': 'system', 'content': system_value_prompt},
    {'role': 'user', 'content': value_generation_prompt},
]

In [122]:
# Ask the chat model for candidate values. Temperature 0 is used to keep the
# sampling as repeatable as the API allows.
# Alternative model kept from the original cell: 'gpt-3.5-turbo'.
# NOTE(review): `openai.ChatCompletion` is the pre-1.0 SDK interface; this call
# needs an `openai` package older than 1.0.
value_request = {
    'model': 'gpt-4',
    'messages': filling_messages,
    'temperature': 0,
}
response = openai.ChatCompletion.create(**value_request)

In [123]:
# Take the text of the first completion choice and show it.
# NOTE(review): this overwrites `response_text` from the constraint-generation
# step. Re-running the constraint-parsing cell after this one would parse the
# generated values instead of the assertions.
first_choice = response.choices[0]
response_text = first_choice.message.content
print(response_text)

["2022-10-15", "2022-11-20", "2022-12-25", "2023-01-10", "2023-02-14"]
