In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup as bs, NavigableString, Comment

In [2]:
# Chrome configuration. The two options below are left disabled, so the
# browser window stays visible; uncomment them to run headless at a fixed
# window size.
chrome_options = Options()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--window-size=3072x1920");

# Start Chrome. The value returned by ChromeDriverManager().install() is what
# gets handed to Service() as the driver to launch.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

In [3]:
# Open the Ant Design "Form" component page; its <form> elements are what the
# rest of the notebook analyses.
driver.get('https://ant.design/components/form')

In [4]:
# JavaScript snippet run in the page through driver.execute_script; the element
# to describe is passed as arguments[0].
# getPathTo() climbs parentNode links until it reaches document.body. Every
# step is written as TAGNAME[n], where n is the 1-based position of the element
# among the sibling elements that share its tag name. The final value is the
# path prefixed with '//'.
# The id-based shortcut inside getPathTo is commented out, so the
# `path.startsWith('id(')` branch at the end is currently never taken.
# NOTE(review): for an element that is not a descendant of <body> the climb
# would reach a node whose parentNode is null and throw -- confirm callers only
# pass elements inside <body>.
xpath_script = """
    function getPathTo(element) {
        // if (element.id !== '')
        //     return 'id(\"'+element.id+'\")';
        if (element === document.body)
            return element.tagName;
        var ix= 0;
        var siblings= element.parentNode.childNodes;
        for (var i= 0; i<siblings.length; i++) {
            var sibling= siblings[i];
            if (sibling===element)
                return getPathTo(element.parentNode) + '/' + element.tagName + '[' + (ix + 1) + ']';
            if (sibling.nodeType===1 && sibling.tagName===element.tagName)
                ix++;
        }
    }
    const path = getPathTo(arguments[0]);
    if (path.startsWith('id(')) {
        return path;
    }
    return '//' + path;
"""


def embed_properties(root):
    '''
    Embed the xpath, x_start/x_end and y_start/y_end properties of a
    WebElement -- and, recursively, of all of its descendant elements --
    into the live DOM as HTML attributes. We do this because the next
    steps convert the WebElement into a BeautifulSoup document, which
    would otherwise lose the rendered geometry.

    Relies on the notebook-level `driver` and `xpath_script`.

    Args:
        root: the selenium WebElement to annotate.

    Returns:
        The same `root` element that was passed in.
    '''

    # Get the location and size of the root element
    location = root.location
    size = root.size

    # Calculate the boundaries of the root element
    x_start = location['x']
    x_end = x_start + size['width']
    y_start = location['y']
    y_end = y_start + size['height']

    # Get the xpath of the element
    xpath = driver.execute_script(xpath_script, root)

    # Embed all five attributes with a single browser round-trip instead of
    # one execute_script call per attribute; every call is a request to the
    # driver, and this function runs once per DOM element.
    driver.execute_script(
        """
        const el = arguments[0];
        el.setAttribute('xpath', arguments[1]);
        el.setAttribute('x_start', arguments[2]);
        el.setAttribute('x_end', arguments[3]);
        el.setAttribute('y_start', arguments[4]);
        el.setAttribute('y_end', arguments[5]);
        """,
        root, xpath, x_start, x_end, y_start, y_end
    )

    # '*' relative to `root` selects its direct child elements only; deeper
    # levels are reached through the recursion.
    for child in root.find_elements(By.XPATH, '*'):
        embed_properties(child)

    return root

In [5]:
# Pick the demo form by its id instead of by its position among all <form>
# elements on the page: a positional index silently selects a different form
# as soon as the page gains, loses or lazily renders another demo.
form = driver.find_element(By.ID, 'complex-form')

# Annotate the form and every descendant with xpath / geometry attributes.
form = embed_properties(form)

In [6]:
# Parse the annotated markup of the form (read back from the live DOM through
# its outerHTML attribute) into a BeautifulSoup document for offline processing.
form_doc = bs(form.get_attribute('outerHTML'), 'html.parser')

In [7]:
# Print the parsed markup indented, to check that the xpath / x_* / y_*
# attributes were embedded.
print(form_doc.prettify())

<form class="ant-form ant-form-horizontal css-k7429z" id="complex-form" style="max-width:600px" x_end="939" x_start="339" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]" y_end="8501" y_start="8301">
 <div class="ant-form-item css-k7429z" x_end="939" x_start="339" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]" y_end="8333" y_start="8301">
  <div class="ant-row ant-form-item-row css-k7429z" x_end="939" x_start="339" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]/DIV[1]" y_end="8333" y_start="8301">
   <div class="ant-col ant-col-8 ant-form-item-label css-k7429z" x_end="539" x_start="339" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]/DIV[1]/DIV[1]" y_end="8333" y_start="8301">
    <label class="" title="Username" x_end

# Get Processable Nodes

In [8]:
# Disabled helper: the definition is wrapped in a string literal, so
# is_displayed is never actually defined. As written it would treat text nodes
# as always displayed, and an element as displayed when both of its embedded
# x and y spans have a non-zero extent.
'''def is_displayed(element):
    if isinstance(element, NavigableString):
        return True
    x_span = (
            float(element.attrs['x_start']),
            float(element.attrs['x_end'])
    )
    y_span = (
        float(element.attrs['y_start']),
        float(element.attrs['y_end'])
    )
    return x_span[0] != x_span[1] and y_span[0] != y_span[1]'''


def should_skip_processing(element):
    '''
    Tell whether a parse-tree node should be ignored altogether.

    Skipped nodes are HTML comments and text nodes that contain nothing
    but whitespace. (A visibility check is not part of the decision; the
    helper for it is disabled.)
    '''
    if isinstance(element, Comment):
        return True
    return isinstance(element, NavigableString) and element.strip() == ''


def is_force_not_keep(element):
    '''
    Tell whether `element` must be excluded from the processable nodes.

    Placeholder: no exclusion list exists yet, so nothing is ever excluded.
    '''
    # TODO: list of elements not to keep, like children of svg
    excluded = False
    return excluded


def is_force_keep(element):
    '''
    Tell whether `element` is a form-related tag that is always kept as a
    node, whatever its children look like.

    The decision is based only on `element.name`.
    '''
    return element.name in ('label', 'input', 'textarea', 'select', 'option', 'button')
    


def has_action_listener(element):
    '''
    Tell whether `element` carries JavaScript action listeners.

    Placeholder: detection is not implemented, so the answer is always False.
    '''
    # TODO: check if element has JavaScript action listeners
    # such as the case of <div> instead of <button> for submit
    has_listener = False
    return has_listener


def is_processable(element):
    '''
    Decide whether a parse-tree node becomes a node of the form graph.

    Text nodes qualify only when they have a parent with more than one
    child. Tags qualify when they are not force-excluded and are either
    childless, wrap exactly one text node, are force-kept, or have an
    action listener.
    '''
    # Text nodes: only the "has siblings" rule applies.
    if isinstance(element, NavigableString):
        parent = element.parent
        return parent is not None and len(parent.contents) > 1

    # Tags from here on.
    if is_force_not_keep(element):
        return False

    children = element.contents
    if not children:
        return True
    if len(children) == 1 and isinstance(children[0], NavigableString):
        return True

    return is_force_keep(element) or has_action_listener(element)
        


def get_processable_nodes(soup):
    '''
    Collect, in document order, every descendant of `soup` that is not
    skipped by should_skip_processing and is accepted by is_processable.

    Args:
        soup: a BeautifulSoup document or tag.

    Returns:
        A list of the selected nodes (tags and/or text nodes).
    '''
    # `.descendants` is the supported spelling of the deprecated
    # `recursiveChildGenerator()` alias and walks the tree in the same order.
    return [
        element
        for element in soup.descendants
        if not should_skip_processing(element) and is_processable(element)
    ]

In [9]:
# Extract the nodes of the parsed form that will become graph nodes.
form_bs_nodes = get_processable_nodes(form_doc)

In [10]:
# Inspect the first extracted node.
form_bs_nodes[0]

<label class="" title="Username" x_end="539" x_start="459" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]/DIV[1]/DIV[1]/LABEL[1]" y_end="8333" y_start="8301">Username</label>

# Create a Graph From Nodes

In [11]:
from enum import Enum


class EdgeDir(Enum):
    '''Direction of an edge as seen from one of its two end nodes.'''
    IN = 'IN'
    OUT = 'OUT'


class EdgeType(Enum):
    '''Kinds of relationship an edge of the form graph can express.'''
    # connection via for attribute
    FOR = 'FOR'
    
    # neighbor connections
    CHILD = 'CHILD'
    NLEFT = 'NLEFT'
    NRIGHT = 'NRIGHT'
    NTOP = 'NTOP'
    NBOTTOM = 'NBOTTOM'

In [12]:
from math import floor


class Graph:
    '''
    Minimal directed multigraph.

    `nodes` maps a node id (as returned by the node's get_id()) to the node
    object; `edges` lists every edge in the order it was added.
    '''

    def __init__(self):
        self.nodes = dict()
        self.edges = list()

    def add_node(self, node):
        '''Register `node` under its id, replacing any node with the same id.'''
        node_id = node.get_id()
        self.nodes[node_id] = node

    def add_edge(self, source_id, target_id, conn_type):
        '''
        Connect two registered nodes with an edge of type `conn_type`.

        The edge is recorded on the graph, as an outgoing link of the source
        and as an incoming link of the target. Raises KeyError when either
        id is not registered.
        '''
        src_node, dst_node = self.nodes[source_id], self.nodes[target_id]

        link = Edge(src_node, dst_node, conn_type)
        src_node.add_edge(link, EdgeDir.OUT)
        dst_node.add_edge(link, EdgeDir.IN)

        self.edges.append(link)


class Node:
    '''
    Graph node wrapping one parsed element.

    The element must expose `.attrs` containing the embedded 'xpath',
    'x_start', 'x_end', 'y_start' and 'y_end' entries, and `.name`.
    Spans are stored as (start, end) tuples of coordinates rounded down
    to integers.
    '''

    def __init__(self, bs_element):
        attrs = bs_element.attrs

        self.element = bs_element
        self.xpath = attrs['xpath']
        self.x_span = tuple(floor(float(attrs[key])) for key in ('x_start', 'x_end'))
        self.y_span = tuple(floor(float(attrs[key])) for key in ('y_start', 'y_end'))

        # Edges pointing at this node / leaving this node.
        self.in_links = []
        self.out_links = []

    def add_edge(self, edge, direction):
        '''Record `edge` as incoming when direction is EdgeDir.IN, else as outgoing.'''
        bucket = self.in_links if direction == EdgeDir.IN else self.out_links
        bucket.append(edge)

    def get_area(self):
        '''Return width times height of the node's bounding box.'''
        width = self.x_span[1] - self.x_span[0]
        height = self.y_span[1] - self.y_span[0]
        return width * height

    def get_id(self):
        '''Return the element's id attribute when it has one, otherwise its xpath.'''
        return self.element.attrs.get('id', self.xpath)

    def __str__(self):
        return f'<{self.element.name}> at y: {str(self.y_span)}, x: {str(self.x_span)}'

    def __repr__(self):
        return self.__str__()


class Edge:
    '''Directed, typed connection from a `source` node to a `target` node.'''

    def __init__(self, source, target, conn_type):
        self.source, self.target = source, target
        self.conn_type = conn_type

    def __str__(self):
        return f'edge {self.conn_type.value} from {self.source} to {self.target}'

    def __repr__(self):
        return self.__str__()

In [13]:
# Build the graph: one Node per extracted element, keyed by Node.get_id()
# (two elements sharing an id would overwrite each other in graph.nodes).
# NOTE(review): is_processable can also accept text nodes, while Node() reads
# `.attrs` of what it wraps -- text nodes presumably lack the embedded
# attributes, so confirm none reach this loop.
graph = Graph()

for n in form_bs_nodes:
    node = Node(n)
    graph.add_node(node)

In [14]:
# Inspect the registered nodes (id -> node).
graph.nodes

{'//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]/DIV[1]/DIV[1]/LABEL[1]': <label> at y: (8301, 8333), x: (459, 539),
 'complex-form_username': <input> at y: (8301, 8333), x: (539, 699),
 '//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]/DIV[1]/DIV[2]/DIV[1]/DIV[1]/DIV[1]/DIV[2]/A[1]': <a> at y: (8309, 8326), x: (707, 782),
 '//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[2]/DIV[1]/DIV[1]/LABEL[1]': <label> at y: (8357, 8389), x: (472, 539),
 'complex-form_address_province': <input> at y: (8358, 8388), x: (551, 736),
 '//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[2]/DIV[1]/DIV[2]/DIV[1]/DIV[1]/DIV[1]/DIV[1]/DIV[1]/SPAN[2]': <span> at y: (8358, 8388), x: (551, 736),
 '//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION

# Add Proximity Links

In [15]:
def tuple_avg(t):
    '''Return the mean of the first two members of `t` (the midpoint of a span).'''
    first, second = t[0], t[1]
    return (first + second) / 2


def binary_search_span(span_list, item_span):
    '''
    Performs a binary search on a span list, which is a list of tuples (start, end).
    The list is ordered by the first member of the tuple.
    The search key is (start + end) / 2
    '''
    if len(span_list) == 0:
        return -1
    
    if len(span_list) == 1:
        return 0
    
    middle = len(span_list) // 2
    if tuple_avg(item_span) >= tuple_avg(span_list[middle]):
        return middle + binary_search_span(span_list[middle:], item_span)
    
    return binary_search_span(span_list[:middle], item_span)


class SpanOrderedDict:
    '''
    Groups items into buckets keyed by non-overlapping (start, end) spans.

    Attributes:
        spans: the bucket spans, sorted by position; no two of them overlap
            (spans that merely touch at an end point stay separate).
        items: dict mapping each span to its bucket. set_item stores a list
            of the items filed under the span; force_set_item may replace
            that value with any object. After every set_item the dict
            iterates in the same order as `spans`.
    '''

    def __init__(self):
        self.spans = []
        self.items = {}

    def force_set_item(self, span, item):
        '''Store `item` under `span` verbatim, replacing whatever was there.'''
        self.items[span] = item

    def set_item(self, span, item):
        '''
        File `item` under the bucket that absorbs `span`, creating or
        merging buckets as needed.
        '''
        bucket_span = self.find_span_to_add(span)
        self.items.setdefault(bucket_span, []).append(item)
        # A new or merged key lands at the end of the dict; restore the
        # positional order that users iterating `items` rely on.
        self._sync_item_order()

    def find_span_to_add(self, span):
        '''
        Return the bucket span that `span` belongs to, updating `spans`
        (and re-keying `items` when buckets are merged).

        - If an existing span encloses `span`, that span is returned.
        - If `span` overlaps existing spans, all of them are merged with it
          into one span covering their union, which is returned.
        - Otherwise `span` is inserted at its sorted position and returned.
        '''
        # Case 1: already enclosed by an existing bucket. Every span is
        # examined, not just one neighbour, so a span lying inside the
        # *following* bucket is found as well.
        for existing in self.spans:
            if existing[0] <= span[0] and span[1] <= existing[1]:
                return existing

        # Case 2: shares some extent with one or more buckets -> merge them
        # all, so that `spans` never contains overlapping entries.
        absorbed = [
            existing for existing in self.spans
            if span[0] < existing[1] and existing[0] < span[1]
        ]
        if absorbed:
            new_span = (
                min([span[0]] + [old[0] for old in absorbed]),
                max([span[1]] + [old[1] for old in absorbed]),
            )
            first_idx = self.spans.index(absorbed[0])
            self.spans = [s for s in self.spans if s not in absorbed]
            self.spans.insert(first_idx, new_span)

            # Carry the absorbed buckets over to the merged key, in order.
            buckets = [self.items.pop(old) for old in absorbed if old in self.items]
            if buckets:
                merged = buckets[0]
                for extra in buckets[1:]:
                    merged.extend(extra)
                self.items[new_span] = merged
            return new_span

        # Case 3: disjoint from everything -> insert after every span that
        # ends at or before the start of the new one.
        insert_idx = sum(1 for existing in self.spans if existing[1] <= span[0])
        self.spans.insert(insert_idx, span)
        return span

    def _sync_item_order(self):
        '''Reorder `items` in place so that it iterates in the order of `spans`.'''
        ordered = {s: self.items[s] for s in self.spans if s in self.items}
        # Keep keys that were force-set without ever being registered in `spans`.
        for key, value in self.items.items():
            ordered.setdefault(key, value)
        self.items.clear()
        self.items.update(ordered)

    def __getitem__(self, key):
        return self.items[key]

In [16]:
# Bucket the graph nodes into rows by their vertical span.
sod = SpanOrderedDict()

for node in graph.nodes.values():
    sod.set_item(node.y_span, node)

# Replace each row's node list with a nested SpanOrderedDict that buckets the
# same nodes into columns by their horizontal span.
for y_span, row_nodes in list(sod.items.items()):
    subsod = SpanOrderedDict()
    for node in row_nodes:
        subsod.set_item(node.x_span, node)

    for x_span, column_nodes in list(subsod.items.items()):
        # Actually store the sorted list (the sort result used to be
        # computed and then thrown away). Largest area first: the first
        # node of a bucket acts as its representative and the others are
        # linked to it as CHILD, so the enclosing element should lead.
        # sorted() is stable, so equally sized nodes keep document order.
        ordered_nodes = sorted(column_nodes, key=lambda n: n.get_area(), reverse=True)
        subsod.force_set_item(x_span, ordered_nodes)

    sod.force_set_item(y_span, subsod)

# Add "For" links

In [17]:
# Link every element carrying a `for` attribute to the node whose id it names.
for node_id, node in graph.nodes.items():
    for_id = node.element.attrs.get('for')
    # The referenced element is not necessarily one of the graph nodes;
    # Graph.add_edge raises KeyError for unknown ids, so skip those.
    if for_id is not None and for_id in graph.nodes:
        graph.add_edge(node_id, for_id, EdgeType.FOR)

In [18]:
# Inspect the edges created from `for` attributes.
graph.edges

[]

# List Neighbors for Each Node

## Parent-Child, and Left-Right

In [19]:
# Walk rows and, inside each row, buckets in positional order. `spans` is the
# list each SpanOrderedDict keeps sorted, whereas its `items` dict is
# insertion-ordered and therefore not a reliable left-to-right sequence.
# NOTE: re-running this cell adds the same edges again.
for y_span in sod.spans:
    row = sod[y_span]
    row_nodes = [row[x_span] for x_span in row.spans]

    for node_idx, bucket in enumerate(row_nodes):
        # if there are multiple items in certain location, then we
        # take the first item as the representative item for the loc
        repr_node = bucket[0]

        # add multi-nodes in the same location as children of repr node
        for child in bucket[1:]:
            graph.add_edge(repr_node.get_id(), child.get_id(), EdgeType.CHILD)

        # add left and right elements as neighbors
        if node_idx > 0:
            left_repr = row_nodes[node_idx - 1][0]
            graph.add_edge(repr_node.get_id(), left_repr.get_id(), EdgeType.NLEFT)
        if node_idx + 1 < len(row_nodes):
            right_repr = row_nodes[node_idx + 1][0]
            graph.add_edge(repr_node.get_id(), right_repr.get_id(), EdgeType.NRIGHT)

In [20]:
# Inspect the edges after adding CHILD / NLEFT / NRIGHT links.
graph.edges

[edge NRIGHT from <label> at y: (8301, 8333), x: (459, 539) to <input> at y: (8301, 8333), x: (539, 699),
 edge NLEFT from <input> at y: (8301, 8333), x: (539, 699) to <label> at y: (8301, 8333), x: (459, 539),
 edge NRIGHT from <input> at y: (8301, 8333), x: (539, 699) to <a> at y: (8309, 8326), x: (707, 782),
 edge NLEFT from <a> at y: (8309, 8326), x: (707, 782) to <input> at y: (8301, 8333), x: (539, 699),
 edge NRIGHT from <label> at y: (8357, 8389), x: (472, 539) to <input> at y: (8358, 8388), x: (551, 736),
 edge CHILD from <input> at y: (8358, 8388), x: (551, 736) to <span> at y: (8358, 8388), x: (551, 736),
 edge CHILD from <input> at y: (8358, 8388), x: (551, 736) to <path> at y: (8370, 8376), x: (726, 736),
 edge NLEFT from <input> at y: (8358, 8388), x: (551, 736) to <label> at y: (8357, 8389), x: (472, 539),
 edge NRIGHT from <input> at y: (8358, 8388), x: (551, 736) to <input> at y: (8357, 8389), x: (747, 860),
 edge NLEFT from <input> at y: (8357, 8389), x: (747, 860) to

## Top-Bottom

In [21]:
def spans_overlap(span1, span2):
    '''
    Tell whether two (start, end) spans overlap.

    `span2` counts as separate when it starts at or after the end of
    `span1` and ends strictly after it, or when it ends at or before the
    start of `span1` and starts strictly before it. Everything else is
    reported as overlapping.
    '''
    return not (
        (span2[0] >= span1[1] and span2[1] > span1[1])
        or (span2[1] <= span1[0] and span2[0] < span1[0])
    )


# Rows from top to bottom. `sod.spans` is the sorted list of row spans; the
# `items` dict is insertion-ordered, so indexing its values would not
# reliably make rows[i - 1] / rows[i + 1] the rows directly above / below.
# NOTE: re-running this cell adds the same edges again.
rows = [sod[y_span] for y_span in sod.spans]

for row_idx, row in enumerate(rows):
    # Adjacent rows to compare against, each with the edge type to emit.
    adjacent_rows = []
    if row_idx > 0:
        adjacent_rows.append((rows[row_idx - 1], EdgeType.NTOP))
    if row_idx + 1 < len(rows):
        adjacent_rows.append((rows[row_idx + 1], EdgeType.NBOTTOM))

    for x_span in row.spans:
        # the first node of a bucket represents the whole bucket
        repr_node = row[x_span][0]

        for other_row, edge_type in adjacent_rows:
            for other_x_span in other_row.spans:
                other_repr = other_row[other_x_span][0]

                # link only representatives that share horizontal extent
                if spans_overlap(repr_node.x_span, other_repr.x_span):
                    graph.add_edge(
                        repr_node.get_id(),
                        other_repr.get_id(),
                        edge_type
                    )

In [22]:
# Inspect the edges after adding NTOP / NBOTTOM links.
graph.edges

[edge NRIGHT from <label> at y: (8301, 8333), x: (459, 539) to <input> at y: (8301, 8333), x: (539, 699),
 edge NLEFT from <input> at y: (8301, 8333), x: (539, 699) to <label> at y: (8301, 8333), x: (459, 539),
 edge NRIGHT from <input> at y: (8301, 8333), x: (539, 699) to <a> at y: (8309, 8326), x: (707, 782),
 edge NLEFT from <a> at y: (8309, 8326), x: (707, 782) to <input> at y: (8301, 8333), x: (539, 699),
 edge NRIGHT from <label> at y: (8357, 8389), x: (472, 539) to <input> at y: (8358, 8388), x: (551, 736),
 edge CHILD from <input> at y: (8358, 8388), x: (551, 736) to <span> at y: (8358, 8388), x: (551, 736),
 edge CHILD from <input> at y: (8358, 8388), x: (551, 736) to <path> at y: (8370, 8376), x: (726, 736),
 edge NLEFT from <input> at y: (8358, 8388), x: (551, 736) to <label> at y: (8357, 8389), x: (472, 539),
 edge NRIGHT from <input> at y: (8358, 8388), x: (551, 736) to <input> at y: (8357, 8389), x: (747, 860),
 edge NLEFT from <input> at y: (8357, 8389), x: (747, 860) to

# Calculate Embedding Distances from Nodes

## Heuristic

In [23]:
# TODO: improve this section

### Textual Distance

In [24]:
from openai.embeddings_utils import get_embedding, cosine_similarity


def get_text_similarity(text1, text2):
    '''
    Return the cosine similarity between the embeddings of two texts.

    Both texts are lower-cased before being passed to get_embedding.
    '''
    vectors = [get_embedding(text.lower()) for text in (text1, text2)]
    return cosine_similarity(vectors[0], vectors[1])

In [25]:
# Example: the "Username" label against the input whose id is
# complex-form_username.
get_text_similarity(
    """<label class="" title="Username">Username</label>""",
    """<input id="complex-form_username" placeholder="Please input" type="text" value="" />"""
)

0.8746199468854565

In [26]:
# Example: the same "Username" label against a different input
# (id complex-form_address_province), for comparison with the previous score.
get_text_similarity(
    """<label class="" title="Username">Username</label>""",
    """<input id="complex-form_address_province" placeholder="Select province" type="search" value=""/>"""
)

0.7977355290887229

### Structural Distance

In [27]:
# !pip install lxml
# !pip install networkx
# !pip install node2vec

In [28]:
import networkx as nx
from bs4 import BeautifulSoup
from node2vec import Node2Vec


# Function to create a graph from the BeautifulSoup object
def create_graph(soup):
    '''
    Build a directed parent -> child graph of the tags in `soup`.

    Nodes are the values of the tags' 'xpath' attribute; every tag found in
    `soup` must carry one. Text nodes and comments are ignored.
    '''
    dom_graph = nx.DiGraph()

    for tag in soup.find_all():
        parent_xpath = tag.attrs['xpath']
        # Adding a node that already exists leaves the graph unchanged.
        dom_graph.add_node(parent_xpath)

        element_children = (
            child for child in tag.children
            if not isinstance(child, (Comment, NavigableString))
        )
        for child in element_children:
            # add_edge also creates the child node when it is not present yet.
            dom_graph.add_edge(parent_xpath, child.attrs.get('xpath'))

    return dom_graph


# Function to generate embeddings
def generate_embeddings(G, dimensions=1024, walk_length=30, num_walks=200, workers=4):
    '''
    Train Node2Vec embeddings for the nodes of graph `G`.

    `dimensions`, `walk_length`, `num_walks` and `workers` are forwarded to
    the Node2Vec constructor; the model is then fitted with window=10,
    min_count=1 and batch_words=4. No seed is passed, so results may differ
    between runs.

    Returns:
        The fitted model returned by Node2Vec.fit.
    '''
    walker = Node2Vec(
        G,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
    )
    return walker.fit(window=10, min_count=1, batch_words=4)


# Create graph
# Build the parent -> child graph of the annotated form, keyed by xpath.
G = create_graph(form_doc)

# Train the structural embeddings. `model` is read as a notebook-level global
# by get_structure_similarity.
model = generate_embeddings(G)

Computing transition probabilities:   0%|          | 0/59 [00:00<?, ?it/s]

Generating walks (CPU: 1):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 2):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 3):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 4):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 1444.35it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 1366.30it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 1503.04it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 1472.10it/s]


In [29]:
def get_structure_similarity(xpath1, xpath2):
    '''
    Return the cosine similarity between the structural embeddings of two
    xpaths, looked up in the notebook-level `model`.

    When either xpath is unknown to the model a message is printed and
    None is returned.
    '''
    vectors = model.wv

    if not (xpath1 in vectors and xpath2 in vectors):
        print("One or both of the XPaths are not in the model.")
        return None

    return cosine_similarity(vectors[xpath1], vectors[xpath2])

In [32]:
# Example: structural similarity between the first and the third extracted node.
get_structure_similarity(
    form_bs_nodes[0].attrs['xpath'],
    form_bs_nodes[2].attrs['xpath']
)

0.94508326

### Combination

In [33]:
def combined_similarity(node1, node2, alpha=0.5):
    '''
    Blend textual and structural similarity of two parsed elements.

    Args:
        node1, node2: parsed elements exposing `.attrs['xpath']`; their
            string form (the full markup) is what gets embedded as text.
        alpha: weight of the textual similarity; the structural similarity
            is weighted with (1 - alpha).

    Returns:
        alpha * text_sim + (1 - alpha) * structure_sim, or the textual
        similarity alone when no structural similarity is available.
    '''
    # NOTE(review): str(node) includes the embedded xpath / x_* / y_*
    # attributes, unlike the hand-written markup used in the text-similarity
    # examples -- confirm whether they should be stripped first.
    text_sim = get_text_similarity(str(node1), str(node2))
    structure_sim = get_structure_similarity(
        node1.attrs['xpath'],
        node2.attrs['xpath']
    )

    # get_structure_similarity returns None for xpaths unknown to the model;
    # multiplying None would raise TypeError, so fall back to text only.
    if structure_sim is None:
        return text_sim

    return alpha * text_sim + (1 - alpha) * structure_sim

In [34]:
# Inspect the fourth extracted node.
form_bs_nodes[3]

<label class="" title="Address" x_end="539" x_start="472" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[2]/DIV[1]/DIV[1]/LABEL[1]" y_end="8389" y_start="8357">Address</label>

In [35]:
# Inspect the second extracted node.
form_bs_nodes[1]

<input aria-required="true" class="ant-input css-k7429z" id="complex-form_username" placeholder="Please input" style="width:160px" type="text" value="" x_end="699" x_start="539" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]/DIV[1]/DIV[2]/DIV[1]/DIV[1]/DIV[1]/DIV[1]/INPUT[1]" y_end="8333" y_start="8301"/>

In [41]:
# Example: combined similarity (default alpha) between the first and the
# eleventh extracted node.
combined_similarity(form_bs_nodes[0], form_bs_nodes[10])

0.5269933791017302

## Train GCN for Direct Embedding with Features

# Pseudocode

In [None]:
def rank_neighbors(node):
    '''
    Placeholder for ranking the neighbors of `node`.

    Not implemented yet: does nothing and returns None.
    '''
    return None


def form_relationship_graaph(form):
    '''
    Placeholder for building the relationship graph of `form`.

    Not implemented yet: does nothing and returns None.
    NOTE(review): the name looks like a typo of "graph"; it is kept as is
    so that existing references keep working.
    '''
    return None