In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os

import numpy as np
import openai
from bs4 import BeautifulSoup as bs
from dotenv import load_dotenv
from selenium.webdriver.common.by import By

from method.ours.utils import create_driver, embed_properties_into_html
from method.ours.preprocessing import get_processable_nodes
from method.ours.graph_embedding import create_node2vec_model
from method.ours.relation_graph import create_relation_graph
from method.ours.node_linking import create_base_links
from method.ours.similarity import add_similarity_scores_to_graph

In [4]:
# Populate the environment first (presumably from a local .env file) so the
# OpenAI key is read from the environment instead of being hard-coded here.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Global Variables
HEADLESS = False  # forwarded to create_driver() when the browser is started
TEXT_EMBEDDING_METHOD = 'ADA' # ['ADA', 'WORD2VEC', 'SPACY']
# NOTE(review): GRAPH_EMBEDDING_METHOD is not referenced by any visible cell.
GRAPH_EMBEDDING_METHOD = 'NODE2VEC' # ['NODE2VEC', 'GCN']

In [5]:
# Start a browser session and open the page whose forms are analysed below.
driver = create_driver(HEADLESS)
driver.get('https://www.aircanada.com/ca/en/aco/home.html')

In [6]:
# Alternative selectors, left disabled (presumably used for other pages -- confirm):
# form = driver.find_elements(By.TAG_NAME, 'form')[48]
# form = driver.find_element(By.ID, 'register')
# Take the second <form> element on the page; the index is hard-coded.
form = driver.find_elements(By.TAG_NAME, 'form')[1]
# `form` is rebound to whatever embed_properties_into_html returns.
form = embed_properties_into_html(driver, form)

# Parse the element's outer HTML with BeautifulSoup's built-in 'html.parser'.
form_doc = bs(form.get_attribute('outerHTML'), 'html.parser')

In [None]:
# Both results are derived from the same parsed form document; they are
# combined further down when the relation graph is scored.
form_processable_nodes = get_processable_nodes(form_doc)
node2vec_model = create_node2vec_model(form_doc)

Computing transition probabilities:   0%|          | 0/340 [00:00<?, ?it/s]

Generating walks (CPU: 1):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 2):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 3):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 4):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 496.30it/s]Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 483.93it/s]
Generating walks (CPU: 2):  98%|█████████▊| 49/50 [00:00<00:00, 483.05it/s]Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 461.33it/s]
Generating walks (CPU: 3):  92%|█████████▏| 46/50 [00:00<00:00, 451.62it/s]Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 435.07it/s]
Generating walks (CPU: 4):  90%|█████████ | 45/50 [00:00<00:00, 441.78it/s]Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 421.23it/s]


In [13]:
# Build the relation graph in three passes. `relation_graph` is rebound to the
# return value of each call, and that value is passed to the next call.
relation_graph = create_relation_graph(form_processable_nodes, TEXT_EMBEDDING_METHOD)
relation_graph = create_base_links(relation_graph)
relation_graph = add_similarity_scores_to_graph(node2vec_model, relation_graph)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [00:08<00:00,  4.14it/s]


  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [92]:
def is_input(element):
    """Return True when the element's tag name is one of the form-control tags."""
    return element.name in ('input', 'textarea', 'select', 'option', 'button')


def is_label(element):
    """Return True when the element's tag name is exactly 'label'."""
    tag = element.name
    return tag == 'label'


def is_text(element):
    """Return True for every element that is neither a form control nor a label."""
    return not (is_input(element) or is_label(element))

In [93]:
# Remove the <span-wrap> nodes (and the edges pointing at them); they are no
# longer needed from this point on.
# Each loop walks a snapshot of the edge list: the body removes edges and
# nodes, and mutating a collection while iterating it directly can silently
# skip entries.
for edge in list(relation_graph.edges()):
    child = edge.target
    if child.element.name == 'span-wrap':
        relation_graph.remove_edge(edge)
        relation_graph.remove_node(child)


# remove input-input edges (for now)
for edge in list(relation_graph.edges()):
    if is_input(edge.source.element) and is_input(edge.target.element):
        relation_graph.remove_edge(edge)

In [94]:
from method.ours.relation_graph import RelationEdge, EdgeType


# For every label keep only its strongest edge to an input, mark that edge as
# PROBABLE_FOR unless it is already an explicit FOR link, and add the reverse
# (input -> label) edge with the same type and weight.
for node in relation_graph.nodes():
    if not is_label(node.element):
        continue

    # sorted() materialises the candidates before any edge is removed below.
    input_edges = sorted(
        (edge for edge in node.edges.values() if is_input(edge.target.element)),
        key=lambda edge: edge.weight,
        reverse=True,
    )
    if not input_edges:
        # A label with no edge to any input has nothing to link; without this
        # guard the `[0]` lookup raised IndexError and aborted the whole cell.
        continue

    probable_for_edge, *weaker_edges = input_edges
    for weaker_edge in weaker_edges:
        relation_graph.remove_edge(weaker_edge)

    if probable_for_edge.type.value != 'FOR':
        probable_for_edge.set_type(EdgeType.PROBABLE_FOR)

    relation_graph.add_edge(RelationEdge(
        probable_for_edge.target,
        probable_for_edge.source,
        probable_for_edge.type,
        probable_for_edge.weight
    ))

In [95]:
from method.ours.relation_graph import EdgeType


def is_label_input(edge):
    """Return True when the edge type value is 'FOR' or 'PROBABLE_FOR'."""
    return edge.type.value in ('FOR', 'PROBABLE_FOR')


def is_child(edge):
    """Return True when the edge type value is 'CHILD'."""
    kind = edge.type.value
    return kind == 'CHILD'


# Pair every edge that is neither a label/input link (FOR, PROBABLE_FOR) nor a
# CHILD edge with its weight; only these edges take part in the thresholding.
scores = [
    (edge, edge.weight)
    for edge in relation_graph.edges()
    if not (is_label_input(edge) or is_child(edge))
]

# Extract the weights once instead of re-mapping `scores` for each statistic.
weights = [weight for _, weight in scores]
mean = np.mean(weights)
std_dev = np.std(weights)

# The cutoff sits HALF a standard deviation above the mean (factor 0.5).
cutoff = mean + 0.5 * std_dev

In [96]:
cutoff

0.7093221149170539

In [97]:
# Drop every edge below the cutoff, except label/input and CHILD edges.
# Iterate over a snapshot because the loop body removes edges from the graph.
for edge in list(relation_graph.edges()):
    if is_label_input(edge) or is_child(edge):
        continue
    if edge.weight < cutoff:
        relation_graph.remove_edge(edge)

In [98]:
relation_graph.edges()

[edge FOR weight 1 from <label> Round-trip </label> at y: (572, 592), x: (242, 312) to <input>value is R</input> at y: (567, 597), x: (212, 312),
 edge FOR weight 1 from <label> One-way </label> at y: (572, 592), x: (357, 414) to <input>value is O</input> at y: (567, 597), x: (327, 414),
 edge FOR weight 1 from <label> Multi-city/Stopover </label> at y: (572, 592), x: (459, 583) to <input>value is M</input> at y: (567, 597), x: (429, 583),
 edge FOR weight 1 from <label> Book with points </label> at y: (574, 594), x: (675, 898) to <input></input> at y: (574, 594), x: (645, 898),
 edge FOR weight 1 from <label>From</label> at y: (622, 672), x: (190, 229) to <input></input> at y: (628, 678), x: (190, 458),
 edge FOR weight 1 from <label>To</label> at y: (628, 678), x: (528, 546) to <input></input> at y: (628, 678), x: (528, 796),
 edge FOR weight 1 from <label>Departure</label> at y: (628, 678), x: (826, 903) to <input>DD/MM</input> at y: (628, 678), x: (826, 951),
 edge FOR weight 1 fro

# Create Global Connections

In [197]:
'''def append_if_not_in_list(groups, node_to_index, node):
    idx = max(groups.keys()) + 1 if len(groups.keys()) > 0 else 0
    if node_to_index[node] == -1:
        groups[idx] = [node]
        node_to_index[node] = idx


def merge_subgraphs(groups, node_to_index, idx1, idx2):
    merge_at = min(idx1, idx2)
    remove_at = max(idx1, idx2)
    sub1, sub2 = groups[idx1], groups[idx2]
    merged = [*sub1, *sub2]
    groups[merge_at] = merged
    groups.pop(remove_at)
    for node in merged:
        node_to_index[node] = merge_at
    return groups


def create_subgraph_groups(nodes, edges):
    groups = {}
    node_to_index = {}
    
    for node in nodes:
        node_to_index[node] = -1
    
    for node, idx in node_to_index.items():
        append_if_not_in_list(groups, node_to_index, node)
    
    for edge in edges:
        source_idx = node_to_index[edge.source]
        target_idx = node_to_index[edge.target]
        if source_idx == target_idx:
            continue
        groups = merge_subgraphs(groups, node_to_index, source_idx, target_idx)    
    
    return groups'''


from method.ours.embedding_distance import get_text_similarity, combined_similarity


class InputGroupNode:
    """An input node bundled with its label and the other nodes attached to it.

    Attributes (all set in ``__init__``):
        node:  the input node this group is built around.
        label: target of the first FOR / PROBABLE_FOR edge that points at a
               label, or ``None`` when there is no such edge.
        group: non-label nodes reachable from the input, in first-seen order.
        edges: ``(other_group, weight)`` tuples added through ``add_edge``.
    """

    def __init__(self, input_node):
        self.node = input_node
        self.label = self._get_label(input_node)
        self.group = self._form_group(input_node)
        self.edges = []

    def add_edge(self, other_group, weight):
        """Record a weighted connection from this group to ``other_group``."""
        self.edges.append((other_group, weight))

    def _get_label(self, node):
        """Return the first label linked by an edge whose type value contains
        'FOR', or ``None`` when the node has no such edge."""
        return next(
            (
                edge.target
                for edge in node.edges.values()
                if is_label(edge.target.element) and 'FOR' in edge.type.value
            ),
            None,
        )

    def _form_group(self, node):
        """Collect the non-label nodes related to ``node``.

        Every direct edge target is a candidate. When a target is a label, the
        label's own non-input targets are candidates as well. Labels are then
        dropped and duplicates removed.
        """
        candidates = []
        for edge in node.edges.values():
            candidates.append(edge.target)
            if is_label(edge.target.element):
                candidates.extend(
                    label_edge.target
                    for label_edge in edge.target.edges.values()
                    if not is_input(label_edge.target.element)
                )
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # group is identical on every run (list(set(...)) ordering was not).
        return list(dict.fromkeys(
            member for member in candidates if not is_label(member.element)
        ))

    def get_similarity(self, other_group, model=None):
        """Return the larger of the input-to-input and label-to-label similarity.

        Args:
            other_group: the ``InputGroupNode`` to compare against.
            model: embedding model passed to ``combined_similarity``. Defaults
                to the notebook-level ``node2vec_model`` when omitted.

        The label similarity is only computed when both groups have a label;
        otherwise it counts as 0.
        """
        if model is None:
            model = node2vec_model
        input_sim = combined_similarity(model, self.node, other_group.node)
        label_sim = None
        if self.label is not None and other_group.label is not None:
            label_sim = combined_similarity(model, self.label, other_group.label)
        return max(input_sim, label_sim or 0)

In [198]:
# Wrap every input node of the relation graph in an InputGroupNode.
input_elements = (
    candidate for candidate in relation_graph.nodes() if is_input(candidate.element)
)
input_groups = [InputGroupNode(input_element) for input_element in input_elements]

In [199]:
# Similarity of every unordered pair of input groups (i < j), in row-major order.
scores = [
    input_groups[i].get_similarity(input_groups[j])
    for i in range(len(input_groups))
    for j in range(i + 1, len(input_groups))
]

In [200]:
scores

[0.8185127966654819,
 0.8354747215453431,
 0.5337760715473836,
 0.560251043192448,
 0.5383677950013455,
 0.6122490231564008,
 0.5915530685951673,
 0.502083232511667,
 0.565117916446097,
 0.5544095638014083,
 0.8302621913108069,
 0.504845711190959,
 0.5684613510159877,
 0.5697884795999151,
 0.603912465162541,
 0.6122736259176985,
 0.5213759457482057,
 0.5625488987136568,
 0.5505762469818876,
 0.509140754933852,
 0.5406334210058701,
 0.5416171712432807,
 0.6111800703822929,
 0.613000440937217,
 0.5234091647431388,
 0.5773531638109726,
 0.5579346358077801,
 0.5505653863470682,
 0.5235944696053949,
 0.5633576102544491,
 0.5226595328755418,
 0.36182138,
 0.4272264,
 0.42160055,
 0.6535252319982527,
 0.6097240846528329,
 0.6168321374500301,
 0.37536594,
 0.4153875,
 0.35677257,
 0.5837818129688241,
 0.6172661234631519,
 0.4204767,
 0.34201366,
 0.3214908,
 0.6814585477113723,
 0.5327481134583556,
 0.5599033121597641,
 0.5468414459180814,
 0.5815896644045913,
 0.5526980195772522,
 0.519042361

In [201]:
import numpy as np

In [202]:
# Keep only the pair similarities that are not exactly 0. The previous
# map(lambda x: x, ...) wrapper was an identity no-op and has been dropped.
non_zero_scores = [score for score in scores if score != 0]

In [203]:
# Summary statistics of the non-zero pairwise similarities.
mean = np.mean(non_zero_scores)
std_dev = np.std(non_zero_scores)

# Set the cutoff to be HALF a standard deviation above the mean (factor 0.5).
cutoff = mean + 0.5 * std_dev

In [204]:
# Show the three statistics, one per line.
for statistic in (mean, std_dev, cutoff):
    print(statistic)

0.5493230788020325
0.10224211897106411
0.6004441382875645


In [205]:
# Start from a clean slate so re-running this cell does not append the same
# connections a second time.
for group in input_groups:
    group.edges = []

# Connect every pair of input groups whose similarity reaches the cutoff;
# the connection is recorded on both groups with the same score.
for i in range(len(input_groups)):
    for j in range(i + 1, len(input_groups)):
        score = input_groups[i].get_similarity(input_groups[j])
        if score < cutoff:
            continue
        input_groups[i].add_edge(input_groups[j], score)
        input_groups[j].add_edge(input_groups[i], score)

In [208]:
# List the label and score of every group connected to the first input group.
for neighbour, weight in input_groups[0].edges:
    print(neighbour.label, weight)

<label> One-way </label> at y: (572, 592), x: (357, 414) 0.8185127966654819
<label> Multi-city/Stopover </label> at y: (572, 592), x: (459, 583) 0.8354747215453431
<label>Departure</label> at y: (628, 678), x: (826, 903) 0.6122490231564008


In [96]:
# Chat prompt: the system message lists the available assertions and the
# expected output format; the user message supplies the field being filled and
# the related form elements.
# NOTE(review): the second sample in the system message uses `.toBe(...)`,
# which is not one of the six listed assertions (the closest is `toBeEqual`) --
# confirm which name is intended.
# NOTE(review): "generate conditions in the as the sample" reads like a typo in
# the prompt text; left untouched here because it is sent to the model.
# NOTE(review): the user message is a hard-coded registration-form example and
# is not built from `input_groups` -- confirm that this is intentional.
messages = [
    {
        'role': 'system',
        'content': '''
        Your task is to generate a set of assertions for form fields. The list of the assertions and their signatures is as following:

        1. toBeEqual(value) # for any input type
        2. toHaveLengthCondition(condition, value) # for textual inputs
        3. toBeTruthy() # for boolean inputs
        4. toHaveCondition(condition, value) # for numeric or date inputs
        5. toBeEmpty() # for any input type
        6. toMatch(regexPattern) # for textual inputs

        generate conditions in the as the sample:

        # sample
        expect(field('username'))
        .toHaveLengthCondition('>', 8)
        .toHaveLengthCondition('<', 50)
        .not.toBeEmpty()
        # end of sample

        if there are multiple inputs that should have the relation with each other, you can use a format such as:

        # sample
        expect(field('password'))
        .toBe(field('confirm password'))
        # end of sample

        Only generate the assertions and nothing else. Only generate assertions for the inputs in question, and not the ones in the relevant information section.
        '''
    },
    {
        'role': 'user',
        'content': """
            We are filling the following field in the form:
            <input id="register_password" aria-required="true" type="password" class="ant-input css-dfjnss">

            The relevant information available in the form are (in order of relevance):
            1. <label for="register_password" class="ant-form-item-required" title="Password">Password</label>
            2. <input id="register_confirm" aria-required="true" type="password" class="ant-input css-dfjnss">
            3. <label for="register_confirm" class="ant-form-item-required" title="Confirm Password">Confirm Password</label>
        """
    }
]

In [97]:
# Send the prompt to the chat model and keep the raw response.
# NOTE(review): `openai.ChatCompletion` is the pre-1.0 interface of the openai
# package -- confirm the version pinned for this project.
response = openai.ChatCompletion.create(
    # model='gpt-3.5-turbo',
    model='gpt-4',
    messages=messages,
    temperature=0.5,
)

In [98]:
# Take the message text of the first returned choice and print it.
response_text = response.choices[0].message.content
print(response_text)

expect(field('register_password'))
.toHaveLengthCondition('>', 8)
.toHaveLengthCondition('<', 50)
.not.toBeEmpty()
.toMatch(/^(?=.*[a-z])(?=.*[A-Z])(?=.*\d)[a-zA-Z\d]{8,50}$/)

expect(field('register_password'))
.toBe(field('register_confirm'))
