In [23]:
import os
from enum import Enum
from math import floor
from dotenv import load_dotenv

import numpy as np
from tqdm.notebook import tqdm

import spacy
import openai
from openai.embeddings_utils import get_embedding as get_embedding_openai, cosine_similarity

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup as bs, NavigableString, Comment

In [27]:
# !python3 -m spacy download en_core_web_lg

In [2]:
# Load environment variables through python-dotenv, then hand the OpenAI key
# to the client library (os.getenv yields None when the variable is unset).
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [16]:
# Global Variables
# Read by get_text_embedding: only 'SPACY' returns early there; every other
# value ends at the OpenAI embedding call.
TEXT_EMBEDDING_METHOD = 'SPACY' # ['ADA', 'WORD2VEC', 'SPACY']
# Not read anywhere in the visible part of this notebook.
GRAPH_EMBEDDING_METHOD = 'NODE2VEC' # ['NODE2VEC', 'GCN']

In [4]:
# Browser options; the headless / window-size switches are kept below for
# reference but are currently disabled.
chrome_options = Options()
# chrome_options.add_argument("--headless")
# chrome_options.add_argument("--window-size=3072x1920");

# Module-level WebDriver used by get_xpath and set_attribute. The driver
# binary is resolved through webdriver_manager.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

In [5]:
# Open the page whose <form> elements are analysed below.
driver.get('https://ant.design/components/form')

In [6]:
def get_xpath(element):
    '''
    Build a position-indexed XPath for a live WebElement.

    The path is computed inside the browser: the script walks from the
    element up to document.body and emits TAG[n] for each step, where n is
    the 1-based position among siblings with the same tagName. The id-based
    shortcut in the script is commented out, so the result is the tag path
    prefixed with '//'.

    Uses the module-level `driver` to execute the script.
    '''
    path_builder_js = """
    function getPathTo(element) {
        // if (element.id !== '')
        //     return 'id(\"'+element.id+'\")';
        if (element === document.body)
            return element.tagName;
        var ix= 0;
        var siblings= element.parentNode.childNodes;
        for (var i= 0; i<siblings.length; i++) {
            var sibling= siblings[i];
            if (sibling===element)
                return getPathTo(element.parentNode) + '/' + element.tagName + '[' + (ix + 1) + ']';
            if (sibling.nodeType===1 && sibling.tagName===element.tagName)
                ix++;
        }
    }
    const path = getPathTo(arguments[0]);
    if (path.startsWith('id(')) {
        return path;
    }
    return '//' + path;
    """
    computed_path = driver.execute_script(path_builder_js, element)
    return computed_path


def get_visual_spans(element):
    '''
    Return the horizontal and vertical extent of an element.

    Reads `element.location` (keys 'x', 'y') and `element.size` (keys
    'width', 'height') and returns `(x_span, y_span)`, where each span is a
    `(start, start + length)` tuple.
    '''
    origin = element.location
    extent = element.size

    left = origin['x']
    horizontal = (left, left + extent['width'])

    top = origin['y']
    vertical = (top, top + extent['height'])

    return horizontal, vertical


def set_attribute(element, key, value):
    '''Set attribute `key` to `value` on `element` in the page, via the module-level `driver`.'''
    setter_js = "arguments[0].setAttribute(arguments[1], arguments[2])"
    driver.execute_script(setter_js, element, key, value)


def embed_properties(root):
    '''
    Write geometry and location onto `root` and all of its descendants.

    For each element the attributes `xpath`, `x_start`, `x_end`, `y_start`
    and `y_end` are set in the live DOM, so they are still available once
    the WebElement's markup is converted into a BeautifulSoup document.

    Returns the same `root` element that was passed in.
    '''
    x_span, y_span = get_visual_spans(root)
    computed = {
        'xpath': get_xpath(root),
        'x_start': x_span[0],
        'x_end': x_span[1],
        'y_start': y_span[0],
        'y_end': y_span[1],
    }
    for attr_name, attr_value in computed.items():
        set_attribute(root, attr_name, attr_value)

    # Recurse into the element children matched by the relative XPath '*'.
    for child_element in root.find_elements(By.XPATH, '*'):
        embed_properties(child_element)

    return root

In [8]:
# Locate the demo form by its DOM id instead of by its position among all
# <form> elements on the page: a positional index silently selects a different
# form whenever the page gains or loses a demo. The markup printed in the next
# cell shows that the form analysed here carries id="complex-form".
form = driver.find_element(By.ID, 'complex-form')
form = embed_properties(form)

# Snapshot the annotated markup into BeautifulSoup for offline processing.
form_doc = bs(form.get_attribute('outerHTML'), 'html.parser')

In [9]:
# Pretty-print the parsed markup to check the embedded xpath / span attributes.
print(form_doc.prettify())

<form class="ant-form ant-form-horizontal css-dfjnss" id="complex-form" style="max-width: 600px;" x_end="939" x_start="339" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]" y_end="8501" y_start="8301">
 <div class="ant-form-item css-dfjnss" x_end="939" x_start="339" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]" y_end="8333" y_start="8301">
  <div class="ant-row ant-form-item-row css-dfjnss" x_end="939" x_start="339" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]/DIV[1]" y_end="8333" y_start="8301">
   <div class="ant-col ant-col-8 ant-form-item-label css-dfjnss" x_end="539" x_start="339" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]/DIV[1]/DIV[1]" y_end="8333" y_start="8301">
    <label class="" title="Username" x_e

# Get Processable Nodes

In [10]:
def is_displayed(element):
    '''
    Tell whether a parsed node occupies any area on the page.

    Text nodes carry no geometry and are always reported as displayed.
    Tags are displayed when both their x and y spans (read from the
    embedded `x_start`/`x_end`/`y_start`/`y_end` attributes) are non-empty.
    '''
    if isinstance(element, NavigableString):
        return True

    attrs = element.attrs
    x_start, x_end, y_start, y_end = (
        float(attrs[key]) for key in ('x_start', 'x_end', 'y_start', 'y_end')
    )

    has_width = x_start != x_end
    has_height = y_start != y_end
    return has_width and has_height


def should_skip_processing(element):
    '''
    Return True for nodes that must be ignored: comments, whitespace-only
    text nodes, and anything `is_displayed` reports as not displayed.
    '''
    if isinstance(element, Comment):
        return True

    if isinstance(element, NavigableString) and element.strip() == '':
        return True

    return not is_displayed(element)


def is_force_not_keep(element):
    '''
    Placeholder for a deny-list of elements that must never be kept.

    Currently always returns False, so no element is excluded by this check.
    '''
    # TODO: list of elements not to keep, like children of svg
    return False


def is_force_keep(element):
    '''Return True when the element's tag name is one that is always kept.'''
    always_kept_tags = ('label', 'input', 'textarea', 'select', 'option', 'button')
    tag_name = element.name
    return tag_name in always_kept_tags
    


def has_action_listener(element):
    '''
    Placeholder for detecting elements that act as controls through script.

    Currently always returns False.
    '''
    # TODO: check if element has JavaScript action listeners
    # such as the case of <div> instead of <button> for submit
    return False


def is_processable(element):
    '''
    Decide whether a parsed node should become a graph node.

    Text nodes qualify only when their parent has more than one child.
    Tags qualify when they are not force-excluded and are either empty,
    wrap a single text node, are force-kept, or have an action listener.
    '''
    if isinstance(element, NavigableString):
        parent = element.parent
        return parent is not None and len(parent.contents) > 1

    if is_force_not_keep(element):
        return False

    # Leaf tags, and tags whose only child is a text node.
    if len(element.contents) == 0:
        return True
    if len(element.contents) == 1 and isinstance(element.contents[0], NavigableString):
        return True

    if is_force_keep(element):
        return True
    if has_action_listener(element):
        return True

    return False
        


def get_processable_nodes(soup):
    '''
    Collect, in document order, every descendant of `soup` that is not
    skipped by `should_skip_processing` and is accepted by `is_processable`.

    Iterates `soup.descendants`, the documented BeautifulSoup 4 generator;
    `recursiveChildGenerator` is only a deprecated alias for it.
    '''
    return [
        element
        for element in soup.descendants
        if not should_skip_processing(element) and is_processable(element)
    ]

In [11]:
# Select the nodes of the annotated form that will become graph nodes.
form_processable_nodes = get_processable_nodes(form_doc)

In [12]:
# Peek at the first processable node.
form_processable_nodes[0]

<label class="" title="Username" x_end="539" x_start="459" xpath="//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[17]/SECTION[1]/FORM[1]/DIV[1]/DIV[1]/DIV[1]/LABEL[1]" y_end="8333" y_start="8301">Username</label>

# Create a Graph From Nodes

In [17]:
class EdgeDir(Enum):
    '''
    Direction under which an edge is stored on a node.

    Only OUT is used by the code visible in this notebook.
    '''
    IN = 'IN'
    OUT = 'OUT'


class EdgeType(Enum):
    '''Kinds of relation an edge can express between two nodes.'''
    # connection via for attribute
    FOR = 'FOR'
    
    # neighbor connections
    CHILD = 'CHILD'
    # TODO: if no use, change all neighbour types to simple 'NEIGHBOUR'
    NLEFT = 'NLEFT'
    NRIGHT = 'NRIGHT'
    NTOP = 'NTOP'
    NBOTTOM = 'NBOTTOM'

In [18]:
class RelationGraph:
    '''
    Container for relation nodes and the directed edges between them.

    Nodes and edges are stored in dictionaries keyed by their `get_id()`
    value, so adding an object whose id already exists replaces the old one.
    '''

    def __init__(self):
        self._nodes = {}
        self._edges = {}


    def nodes(self):
        '''Return a new list with all nodes, in insertion order.'''
        return list(self._nodes.values())


    def get_node(self, node_id):
        '''Return the node stored under `node_id`; raises KeyError if absent.'''
        return self._nodes[node_id]


    def edges(self):
        '''Return a new list with all edges, in insertion order.'''
        return list(self._edges.values())


    def add_node(self, node):
        '''Store `node` under its id.'''
        self._nodes[node.get_id()] = node


    def remove_node(self, node):
        '''
        Remove `node` from the graph; raises KeyError if it is not present.

        Edges that reference the node are left untouched.
        '''
        del self._nodes[node.get_id()]


    def add_edge(self, edge):
        '''Register `edge` on its source node (as outgoing) and in the graph.'''
        edge.source.add_edge(edge, EdgeDir.OUT)
        self._edges[edge.get_id()] = edge


    def remove_edge(self, edge):
        '''Unregister `edge` from its source node and from the graph.'''
        edge.source.remove_edge(edge, EdgeDir.OUT)
        del self._edges[edge.get_id()]


    def get_node_features(self):
        '''
        Map every node id to that node's feature vector.

        Reads the `features` attribute, which is what RelationNode exposes;
        the node class defines no `get_features()` method.
        '''
        return {
            node_id: node.features for node_id, node in self._nodes.items()
        }

In [19]:
def get_input_visible_text(element):
    '''
    Describe what a user sees in a form control, based on its attributes.

    Combines, in this order and when present: the `placeholder`, the
    non-blank `value` (as "value is ..."), and the `alt` text (as
    "alt is ..."). Parts are joined with ", " so no stray space is left
    before a comma. Returns '' when nothing applies.
    '''
    attrs = element.attrs
    parts = []

    if 'placeholder' in attrs:
        parts.append(attrs['placeholder'])
    if 'value' in attrs and attrs['value'].strip() != '':
        parts.append(f"value is {attrs['value']}")
    if 'alt' in attrs:
        parts.append(f"alt is {attrs['alt']}")

    # Drop parts that are empty after trimming (e.g. an empty placeholder).
    trimmed = [part.strip() for part in parts]
    text = ', '.join(part for part in trimmed if part)

    # Keep the historical trimming of leading/trailing commas and spaces.
    return text.strip(', ')


def get_visible_text(element):
    '''
    Return the text a user would see for `element`.

    Form controls (`input`, `select`, `textarea`) are described through
    their attributes; any other element yields its `.string` as a plain
    `str`, or '' when `.string` is None.
    '''
    if element.name in ('input', 'select', 'textarea'):
        return get_input_visible_text(element)

    text = element.string
    # Identity test: None is a singleton and must not be compared with ==.
    if text is None:
        return ''

    return str(text)


def get_null_embedding(dim=12288):
    '''Return a list of `dim` integer zeros, used as the embedding of empty text.'''
    return [0] * dim


# Loaded spaCy pipelines keyed by model name, so that each model is loaded
# once per kernel session instead of once per embedded string.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_lg"):
    '''Return the spaCy pipeline `model_name`, loading it on first use only.'''
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def get_text_embedding(text, dim=12288):
    '''
    Embed `text` with the method selected by TEXT_EMBEDDING_METHOD.

    Blank text yields a null embedding of `dim` zeros. With 'SPACY' the
    document vector of the `en_core_web_lg` pipeline is returned. Any other
    setting (including the not yet implemented 'WORD2VEC') ends at the
    OpenAI embedding call on the lower-cased text.

    NOTE(review): the null embedding always has `dim` entries regardless of
    the selected method; confirm that this matches the length of the vectors
    returned by that method.
    '''
    if text.strip() == '':
        return get_null_embedding(dim)

    if TEXT_EMBEDDING_METHOD == 'WORD2VEC':
        # TODO
        # average/something word2vec embeddings for each word
        pass
    elif TEXT_EMBEDDING_METHOD == 'SPACY':
        # TODO
        # https://spacy.io/usage/linguistic-features#vectors-similarity
        nlp = _get_spacy_pipeline("en_core_web_lg")
        return nlp(text).vector
    return get_embedding_openai(text.lower())


class RelationNode:
    '''
    Graph node wrapping one parsed element.

    On construction the node copies the element's embedded xpath and x/y
    spans, derives its visible text and computes a text embedding for it.
    Outgoing edges are kept in `edges[EdgeDir.OUT]`, keyed by edge id.
    '''

    def __init__(self, element):
        self.element = element

        # Location and geometry come from the attributes embedded earlier.
        self.xpath = self._find_xpath(element)
        self.x_span = self._find_span(element, 'x')
        self.y_span = self._find_span(element, 'y')

        self.visible_text = self._find_visible_text(element)
        self.features = self._calculate_features(self.visible_text)

        self.edges = {EdgeDir.OUT: {}}

        # Number of outgoing CHILD edges currently registered.
        self.children_count = 0


    def _find_xpath(self, element):
        '''Read the embedded `xpath` attribute.'''
        return element.attrs['xpath']


    def _find_span(self, element, axis):
        '''Return `(start, end)` for `axis` ('x' or 'y'), floored to integers.'''
        return tuple(
            floor(float(element.attrs[f'{axis}_{bound}']))
            for bound in ('start', 'end')
        )


    def _find_visible_text(self, element):
        '''Delegate to the module-level `get_visible_text`.'''
        return get_visible_text(element)


    def _calculate_features(self, visible_text):
        '''Embed the visible text with `get_text_embedding`.'''
        return get_text_embedding(visible_text)


    def _change_children_count(self, edge, direction, increase=1):
        '''Adjust the child counter when `edge` is an outgoing CHILD edge.'''
        is_outgoing_child = edge.type == EdgeType.CHILD and direction == EdgeDir.OUT
        if is_outgoing_child:
            self.children_count += increase


    def has_children(self):
        '''True when at least one outgoing CHILD edge is registered.'''
        return self.children_count > 0


    def get_children(self):
        '''Return the outgoing edges of type CHILD (the edges, not the nodes).'''
        return [
            edge
            for edge in self.edges[EdgeDir.OUT].values()
            if edge.type == EdgeType.CHILD
        ]


    def add_edge(self, edge, direction):
        '''Store `edge` under `direction` and update the child counter.'''
        self.edges[direction][edge.get_id()] = edge
        self._change_children_count(edge, direction, 1)


    def remove_edge(self, edge, direction):
        '''Drop `edge` from `direction` (KeyError if absent) and update the counter.'''
        del self.edges[direction][edge.get_id()]
        self._change_children_count(edge, direction, -1)


    def get_visible_area(self):
        '''Width times height of the node's bounding box.'''
        (x_start, x_end), (y_start, y_end) = self.x_span, self.y_span
        return (x_end - x_start) * (y_end - y_start)


    def get_id(self):
        '''Return the element's `id` attribute when present, else its xpath.'''
        return self.element.attrs.get('id', self.xpath)


    def __repr__(self):
        return self.__str__()


    def __str__(self):
        return f'<{self.element.name}>{self.visible_text}</{self.element.name}> at y: {str(self.y_span)}, x: {str(self.x_span)}'


In [20]:
class RelationEdge:
    '''
    Directed, typed and weighted connection from `source` to `target`.

    The weight starts at 0 and is assigned later through `set_weight`.
    '''

    def __init__(self, source, target, conn_type):
        self.source, self.target = source, target
        self.type = conn_type
        self.weight = 0


    def set_weight(self, weight):
        '''Replace the edge weight.'''
        self.weight = weight


    def get_id(self):
        '''Identifier built from the edge type and the string forms of both ends.'''
        return f'edge {self.type.value} from {self.source} to {self.target}'


    def __str__(self):
        return f'edge {self.type.value} weight {self.weight} from {self.source} to {self.target}'


    def __repr__(self):
        return self.__str__()

In [21]:
def create_relations_graph(bs_doc):
    '''
    Build a RelationGraph holding one RelationNode per item of `bs_doc`.

    `bs_doc` is iterated directly, so it is the collection of parsed
    elements to turn into nodes. No edges are created here.
    '''
    relations = RelationGraph()

    for parsed_element in tqdm(bs_doc):
        relations.add_node(RelationNode(parsed_element))

    return relations

In [28]:
# Build the graph nodes from the processable elements (no edges yet).
relation_graph = create_relations_graph(form_processable_nodes)

  0%|          | 0/14 [00:00<?, ?it/s]

# Add Proximity Links

In [31]:
def tuple_avg(t):
    '''Return the midpoint of the first two entries of `t`.'''
    start, end = t[0], t[1]
    return (start + end) / 2


def span_binary_search(span_list, item_span):
    '''
    Binary-search a list of `(start, end)` spans by midpoint.

    The list is expected to be ordered so that span midpoints are
    non-decreasing. The search key is the midpoint of `item_span`.

    Returns -1 for an empty list. Otherwise returns the index of the last
    span whose midpoint is less than or equal to the key, or 0 when every
    midpoint is greater than the key.

    The search narrows an index window instead of recursing on list slices,
    so no copies of the list are made.
    '''
    if len(span_list) == 0:
        return -1

    key = tuple_avg(item_span)
    low, high = 0, len(span_list)  # candidate window is [low, high)

    while high - low > 1:
        middle = low + (high - low) // 2
        if key >= tuple_avg(span_list[middle]):
            low = middle
        else:
            high = middle

    return low

In [32]:
class SpanOrderedDict:
    '''
    Buckets items by `(start, end)` span and iterates them in span order.

    `spans` is the ordered list of bucket spans and `items_dict` maps each
    span to its bucket. A span added inside an existing bucket joins that
    bucket, a span that partially overlaps its closest bucket widens it,
    and a span outside its closest bucket opens a new one.

    `keys()`, `values()` and `items()` follow the order of `spans`, not the
    insertion order of `items_dict`: widening a bucket re-keys it in the
    dictionary, which would otherwise move it to the end of the iteration.
    '''

    def __init__(self):
        self.spans = []
        self.items_dict = {}


    def force_set_item(self, span, item):
        '''Overwrite the bucket stored under `span` without touching `spans`.'''
        self.items_dict[span] = item


    def add_item(self, span, item):
        '''Append `item` to the bucket that `span` belongs to.'''
        span_to_add = self.find_span_to_add(span)
        self.items_dict.setdefault(span_to_add, []).append(item)


    def find_span_to_add(self, span):
        '''
        Return the bucket span that `span` belongs to, updating `spans`
        (and re-keying `items_dict` on a merge) as needed.
        '''
        closest_span_idx = span_binary_search(self.spans, span)

        # if this is the first span that we are adding something to
        if closest_span_idx == -1:
            self.spans = [span]
            return span

        closest_span = self.spans[closest_span_idx]

        # if new span is inside some other span
        if span[0] >= closest_span[0] and span[1] <= closest_span[1]:
            return closest_span

        # if new span is before some other span: list.insert(i, x) already
        # places x in front of index i, so the closest index is the target
        if span[0] <= closest_span[0] and span[1] <= closest_span[0]:
            self.spans.insert(closest_span_idx, span)
            return span

        # if new span is after some other span
        if span[0] >= closest_span[1] and span[1] >= closest_span[1]:
            self.spans.insert(closest_span_idx + 1, span)
            return span

        # otherwise, the spans collide with each other: widen the bucket
        new_span = (
            min(closest_span[0], span[0]),
            max(closest_span[1], span[1]),
        )

        # swap span with new created span
        self.spans[closest_span_idx] = new_span
        self.items_dict[new_span] = self.items_dict.pop(closest_span)

        return new_span


    def _ordered_keys(self):
        '''
        Bucket keys in `spans` order, followed by any key that exists only
        in `items_dict` (possible through `force_set_item`).
        '''
        known = set(self.spans)
        ordered = [span for span in self.spans if span in self.items_dict]
        extras = [key for key in self.items_dict if key not in known]
        return ordered + extras


    def keys(self):
        '''Return the bucket spans as a list, in span order.'''
        return self._ordered_keys()


    def values(self):
        '''Return the buckets as a list, in span order.'''
        return [self.items_dict[key] for key in self._ordered_keys()]


    def items(self):
        '''Return `(span, bucket)` pairs as a list, in span order.'''
        return [(key, self.items_dict[key]) for key in self._ordered_keys()]


    def __getitem__(self, key):
        return self.items_dict[key]

In [33]:
def create_span_ordered_dict(relation_nodes, axis):
    '''
    Bucket `relation_nodes` by their span along `axis` ('x' or 'y').

    The span is read from the node attribute named `<axis>_span`.
    '''
    buckets = SpanOrderedDict()
    span_attribute = f'{axis}_span'

    for relation_node in tqdm(relation_nodes):
        buckets.add_item(getattr(relation_node, span_attribute), relation_node)

    return buckets


def create_2d_span_ordered_dict(relation_graph):
    '''
    Arrange the graph's nodes into rows (y spans) of columns (x spans).

    Returns a SpanOrderedDict keyed by y span whose values are
    SpanOrderedDicts keyed by x span. Each innermost value is a list of
    nodes sorted by visible area, largest first.
    '''
    rows = create_span_ordered_dict(relation_graph.nodes(), 'y')

    for row_span, row_nodes in rows.items():
        columns = create_span_ordered_dict(row_nodes, 'x')

        # Put the largest node first in every column: the first entry is
        # treated as the representative item of a multi-item list.
        for column_span, column_nodes in columns.items():
            largest_first = sorted(
                column_nodes,
                key=lambda candidate: candidate.get_visible_area(),
                reverse=True,
            )
            columns.force_set_item(column_span, largest_first)

        rows.force_set_item(row_span, columns)

    return rows

In [34]:
# Group the graph nodes into rows (y spans) and, within each row, columns (x spans).
spans_2d = create_2d_span_ordered_dict(relation_graph)

  0%|          | 0/14 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [35]:
# Row spans mapped to their per-row SpanOrderedDict.
spans_2d.items_dict

{(8301, 8333): <__main__.SpanOrderedDict at 0x159bc1840>,
 (8357, 8389): <__main__.SpanOrderedDict at 0x159bc13f0>,
 (8413, 8445): <__main__.SpanOrderedDict at 0x159bc2a70>,
 (8469, 8501): <__main__.SpanOrderedDict at 0x159bc28f0>}

In [36]:
# Inspect the columns of one row; the y-span key is taken from the output above.
spans_2d[(8357, 8389)].items_dict

{(472, 539): [<label>Address</label> at y: (8357, 8389), x: (472, 539)],
 (551, 736): [<input></input> at y: (8358, 8388), x: (551, 736),
  <span>Select province</span> at y: (8358, 8388), x: (551, 736),
  <path></path> at y: (8370, 8376), x: (726, 736)],
 (747, 860): [<input>Input street</input> at y: (8357, 8389), x: (747, 860)]}

# Add "For" links

In [37]:
def add_for_links(relation_graph):
    '''
    Add a FOR edge from every node that has a `for` attribute to the node
    whose id equals that attribute's value.

    A `for` value that does not match any node id in the graph is skipped
    instead of raising KeyError.

    Returns the same graph that was passed in.
    '''
    for node in relation_graph.nodes():
        if 'for' not in node.element.attrs:
            continue

        for_id = node.element.attrs['for']
        try:
            target = relation_graph.get_node(for_id)
        except KeyError:
            # The referenced element is not part of the graph.
            continue

        relation_graph.add_edge(RelationEdge(node, target, EdgeType.FOR))

    return relation_graph

In [38]:
# Link nodes that carry a `for` attribute to the nodes they reference.
relation_graph = add_for_links(relation_graph)

In [39]:
# Edges after the "for" pass.
relation_graph.edges()

[]

# Add Neighbor Links for Each Node

## Parent-Child

In [40]:
def add_parent_child_links(spans_2d, relation_graph):
    '''
    Within every x-span bucket, add a CHILD edge from the first node (the
    representative) to each of the remaining nodes.

    Returns the same graph that was passed in.
    '''
    for x_spans in spans_2d.values():
        for bucket in list(x_spans.values()):
            for member in bucket[1:]:
                relation_graph.add_edge(
                    RelationEdge(bucket[0], member, EdgeType.CHILD)
                )

    return relation_graph

In [41]:
# Connect each bucket's representative node to the other nodes in its bucket.
relation_graph = add_parent_child_links(spans_2d, relation_graph)

In [42]:
# Edges after the parent-child pass.
relation_graph.edges()

[edge CHILD weight 0 from <input></input> at y: (8358, 8388), x: (551, 736) to <span>Select province</span> at y: (8358, 8388), x: (551, 736),
 edge CHILD weight 0 from <input></input> at y: (8358, 8388), x: (551, 736) to <path></path> at y: (8370, 8376), x: (726, 736),
 edge CHILD weight 0 from <button>Submit</button> at y: (8469, 8501), x: (539, 617) to <span>Submit</span> at y: (8474, 8496), x: (555, 601)]

## Left-Right

In [43]:
def add_left_right_links(spans_2d, relation_graph):
    '''
    Link each column's representative node to the representatives of the
    columns immediately before (NLEFT) and after (NRIGHT) it in the same row.

    Returns the same graph that was passed in.
    '''
    for x_spans in spans_2d.values():
        columns = list(x_spans.values())

        for position, bucket in enumerate(columns):
            representative = bucket[0]

            # Collect the neighbours that exist, left one first.
            neighbours = []
            if position - 1 >= 0:
                neighbours.append((columns[position - 1][0], EdgeType.NLEFT))
            if position + 1 < len(columns):
                neighbours.append((columns[position + 1][0], EdgeType.NRIGHT))

            for neighbour, link_type in neighbours:
                relation_graph.add_edge(
                    RelationEdge(representative, neighbour, link_type)
                )

    return relation_graph

In [44]:
# Add NLEFT / NRIGHT edges between adjacent columns of each row.
relation_graph = add_left_right_links(spans_2d, relation_graph)

In [45]:
# Edges after the left-right pass.
relation_graph.edges()

[edge CHILD weight 0 from <input></input> at y: (8358, 8388), x: (551, 736) to <span>Select province</span> at y: (8358, 8388), x: (551, 736),
 edge CHILD weight 0 from <input></input> at y: (8358, 8388), x: (551, 736) to <path></path> at y: (8370, 8376), x: (726, 736),
 edge CHILD weight 0 from <button>Submit</button> at y: (8469, 8501), x: (539, 617) to <span>Submit</span> at y: (8474, 8496), x: (555, 601),
 edge NRIGHT weight 0 from <label>Username</label> at y: (8301, 8333), x: (459, 539) to <input>Please input</input> at y: (8301, 8333), x: (539, 699),
 edge NLEFT weight 0 from <input>Please input</input> at y: (8301, 8333), x: (539, 699) to <label>Username</label> at y: (8301, 8333), x: (459, 539),
 edge NRIGHT weight 0 from <input>Please input</input> at y: (8301, 8333), x: (539, 699) to <a>Need Help?</a> at y: (8309, 8326), x: (707, 782),
 edge NLEFT weight 0 from <a>Need Help?</a> at y: (8309, 8326), x: (707, 782) to <input>Please input</input> at y: (8301, 8333), x: (539, 699

## Top-Bottom

In [46]:
def spans_overlap(span1, span2):
    '''
    Return False when `span2` lies entirely after or entirely before
    `span1` (touching at an endpoint counts as not overlapping), and True
    otherwise.
    '''
    entirely_after = span2[0] >= span1[1] and span2[1] > span1[1]
    entirely_before = span2[1] <= span1[0] and span2[0] < span1[0]
    return not (entirely_after or entirely_before)


def add_row_neighbor_links(relation_graph, node, row, edge_type):
    '''
    Add an `edge_type` edge from `node` to the representative node of every
    bucket in `row` whose x span overlaps the x span of `node`.

    Returns the same graph that was passed in.
    '''
    representatives = (bucket[0] for bucket in row.values())

    for candidate in representatives:
        if spans_overlap(node.x_span, candidate.x_span):
            relation_graph.add_edge(RelationEdge(node, candidate, edge_type))

    return relation_graph


def add_top_bottom_links(spans_2d, relation_graph):
    '''
    Link every column representative to the horizontally overlapping
    representatives of the previous row (NTOP) and the next row (NBOTTOM).

    Returns the graph handed back by the last `add_row_neighbor_links` call,
    or the graph passed in when no call was made.
    '''
    rows = list(spans_2d.values())
    row_count = len(rows)

    for y_idx, x_spans in enumerate(rows):
        # Rows adjacent to the current one, top first.
        adjacent_rows = []
        if y_idx - 1 >= 0:
            adjacent_rows.append((rows[y_idx - 1], EdgeType.NTOP))
        if y_idx + 1 < row_count:
            adjacent_rows.append((rows[y_idx + 1], EdgeType.NBOTTOM))

        for bucket in x_spans.values():
            representative = bucket[0]
            for neighbour_row, link_type in adjacent_rows:
                relation_graph = add_row_neighbor_links(
                    relation_graph,
                    representative,
                    neighbour_row,
                    link_type,
                )

    return relation_graph

In [47]:
# Add NTOP / NBOTTOM edges between overlapping columns of adjacent rows.
relation_graph = add_top_bottom_links(spans_2d, relation_graph)

In [48]:
# Edges after the top-bottom pass.
relation_graph.edges()

[edge CHILD weight 0 from <input></input> at y: (8358, 8388), x: (551, 736) to <span>Select province</span> at y: (8358, 8388), x: (551, 736),
 edge CHILD weight 0 from <input></input> at y: (8358, 8388), x: (551, 736) to <path></path> at y: (8370, 8376), x: (726, 736),
 edge CHILD weight 0 from <button>Submit</button> at y: (8469, 8501), x: (539, 617) to <span>Submit</span> at y: (8474, 8496), x: (555, 601),
 edge NRIGHT weight 0 from <label>Username</label> at y: (8301, 8333), x: (459, 539) to <input>Please input</input> at y: (8301, 8333), x: (539, 699),
 edge NLEFT weight 0 from <input>Please input</input> at y: (8301, 8333), x: (539, 699) to <label>Username</label> at y: (8301, 8333), x: (459, 539),
 edge NRIGHT weight 0 from <input>Please input</input> at y: (8301, 8333), x: (539, 699) to <a>Need Help?</a> at y: (8309, 8326), x: (707, 782),
 edge NLEFT weight 0 from <a>Need Help?</a> at y: (8309, 8326), x: (707, 782) to <input>Please input</input> at y: (8301, 8333), x: (539, 699

# Calculate Embedding Distances from Nodes

## Heuristic

In [49]:
# TODO: improve this section

### Textual Distance

In [50]:
def get_text_similarity(node1, node2):
    '''
    Cosine similarity between the text embeddings of two nodes.

    Returns 0 when the similarity is NaN or when computing it fails, so a
    single problematic pair of embeddings does not abort the scoring pass.
    '''
    embedding1 = node1.features
    embedding2 = node2.features
    try:
        sim = cosine_similarity(embedding1, embedding2)
        return 0 if np.isnan(sim) else sim
    except Exception:
        # Best effort: treat any failure as "no similarity". `Exception`
        # (rather than a bare except) lets KeyboardInterrupt and SystemExit
        # propagate.
        return 0

In [51]:
# List the nodes; the indices used in the cells below refer to this order.
relation_graph.nodes()

[<label>Username</label> at y: (8301, 8333), x: (459, 539),
 <input>Please input</input> at y: (8301, 8333), x: (539, 699),
 <a>Need Help?</a> at y: (8309, 8326), x: (707, 782),
 <label>Address</label> at y: (8357, 8389), x: (472, 539),
 <input></input> at y: (8358, 8388), x: (551, 736),
 <span>Select province</span> at y: (8358, 8388), x: (551, 736),
 <path></path> at y: (8370, 8376), x: (726, 736),
 <input>Input street</input> at y: (8357, 8389), x: (747, 860),
 <label>BirthDate</label> at y: (8413, 8445), x: (464, 539),
 <input>Input birth year</input> at y: (8413, 8445), x: (539, 731),
 <input>Input birth month</input> at y: (8413, 8445), x: (739, 931),
 <label> </label> at y: (8469, 8501), x: (529, 539),
 <button>Submit</button> at y: (8469, 8501), x: (539, 617),
 <span>Submit</span> at y: (8474, 8496), x: (555, 601)]

In [52]:
# Text similarity between the first two nodes of the listing above.
get_text_similarity(
    relation_graph.nodes()[0],
    relation_graph.nodes()[1]
)

0.45188928

In [53]:
# Text similarity between the nodes at indices 3 and 5 of the listing above.
get_text_similarity(
    relation_graph.nodes()[3],
    relation_graph.nodes()[5]
)

0.41530555

### Structural Distance

In [54]:
# !pip install lxml
# !pip install networkx
# !pip install node2vec

In [55]:
import networkx as nx
from bs4 import BeautifulSoup
from node2vec import Node2Vec


def create_graph(soup):
    '''
    Build a directed graph of the parsed document's tag hierarchy.

    Every tag found by `soup.find_all()` becomes a node named by its
    `xpath` attribute, with an edge to each of its tag children. Text and
    comment children are ignored. Each tag is assumed to carry an `xpath`
    attribute.
    '''
    dom_graph = nx.DiGraph()

    for tag in soup.find_all():
        parent_xpath = tag.attrs['xpath']
        if parent_xpath not in dom_graph:
            dom_graph.add_node(parent_xpath)

        tag_children = (
            child
            for child in tag.children
            if not isinstance(child, (Comment, NavigableString))
        )
        for child in tag_children:
            child_xpath = child.attrs.get('xpath')
            if child_xpath not in dom_graph:
                dom_graph.add_node(child_xpath)
            dom_graph.add_edge(parent_xpath, child_xpath)

    return dom_graph


def generate_embeddings(G, dimensions=1024, walk_length=30, num_walks=200, workers=4):
    '''
    Train Node2Vec on graph `G` and return the fitted model.

    `dimensions`, `walk_length`, `num_walks` and `workers` are forwarded to
    the Node2Vec constructor; training uses window=10, min_count=1 and
    batch_words=4.
    '''
    walker = Node2Vec(
        G,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
    )
    return walker.fit(window=10, min_count=1, batch_words=4)


# Create graph
# Built from the full parsed form, not only from the processable nodes.
G = create_graph(form_doc)

# Generate embeddings
# `model` is read as a module-level global by get_structure_similarity.
model = generate_embeddings(G)

Computing transition probabilities:   0%|          | 0/59 [00:00<?, ?it/s]

Generating walks (CPU: 1):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 2):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 3):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 4):   0%|          | 0/50 [00:00<?, ?it/s]Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 1966.74it/s]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 2026.10it/s]
Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 1860.23it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 1909.19it/s]


In [56]:
def get_structure_similarity(xpath1, xpath2):
    '''
    Cosine similarity between the Node2Vec vectors of two XPaths.

    Looks both XPaths up in the module-level `model`. When either one is
    unknown to the model, a message is printed and None is returned.
    '''
    vectors = model.wv

    if not (xpath1 in vectors and xpath2 in vectors):
        print("One or both of the XPaths are not in the model.")
        return None

    return cosine_similarity(vectors[xpath1], vectors[xpath2])

In [57]:
# Structural similarity between the first two graph nodes.
get_structure_similarity(
    relation_graph.nodes()[0].xpath,
    relation_graph.nodes()[1].xpath
)

0.97428304

### Combination

In [58]:
def combined_similarity(node1, node2, alpha=0.5):
    '''
    Weighted mix of text and structure similarity between two nodes.

    Returns `alpha * text_sim + (1 - alpha) * structure_sim`. When the
    structure similarity is unavailable (get_structure_similarity returns
    None for XPaths missing from the model) it counts as 0, matching the
    fallback used for text similarity, instead of raising TypeError.
    '''
    text_sim = get_text_similarity(node1, node2)
    structure_sim = get_structure_similarity(node1.xpath, node2.xpath)
    if structure_sim is None:
        structure_sim = 0
    return alpha * text_sim + (1 - alpha) * structure_sim

In [59]:
# Combined score for the first two graph nodes, with the default alpha.
combined_similarity(
    relation_graph.nodes()[0],
    relation_graph.nodes()[1]
)

0.7130861580371857

## Train GCN for Direct Embedding with Features

In [60]:
# TODO

# Rank Neighbors

For each element, we go through all of its neighbors and calculate the combined similarity score for each one. If a neighbor has children, its score is the maximum of its own score and the scores of its children.

In [61]:
# Weight every non-CHILD outgoing edge with the combined similarity between
# its source and its target.
for node in tqdm(relation_graph.nodes()):
    print(node)
    
    edges = list(node.edges[EdgeDir.OUT].values())
    
    for edge in edges:
        # CHILD edges are not scored; their weight stays at its initial 0.
        if edge.type == EdgeType.CHILD:
            continue
        sim = combined_similarity(edge.source, edge.target)
        # A target with children is scored by the best match among the
        # target itself and the nodes its CHILD edges point to.
        if edge.target.has_children():
            for target_child_edge in edge.target.get_children():
                sim = max(
                    sim,
                    combined_similarity(edge.source, target_child_edge.target)
                )
        edge.set_weight(sim)

  0%|          | 0/14 [00:00<?, ?it/s]

<label>Username</label> at y: (8301, 8333), x: (459, 539)
<input>Please input</input> at y: (8301, 8333), x: (539, 699)
<a>Need Help?</a> at y: (8309, 8326), x: (707, 782)
<label>Address</label> at y: (8357, 8389), x: (472, 539)
<input></input> at y: (8358, 8388), x: (551, 736)
<span>Select province</span> at y: (8358, 8388), x: (551, 736)
<path></path> at y: (8370, 8376), x: (726, 736)
<input>Input street</input> at y: (8357, 8389), x: (747, 860)
<label>BirthDate</label> at y: (8413, 8445), x: (464, 539)
<input>Input birth year</input> at y: (8413, 8445), x: (539, 731)
<input>Input birth month</input> at y: (8413, 8445), x: (739, 931)
<label> </label> at y: (8469, 8501), x: (529, 539)
<button>Submit</button> at y: (8469, 8501), x: (539, 617)
<span>Submit</span> at y: (8474, 8496), x: (555, 601)


  return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [66]:
def cutoff_low_score_edges(relation_graph):
    '''
    Remove weakly scored neighbour edges from the graph.

    The cutoff is the mean plus one standard deviation of all non-zero edge
    weights. Every non-CHILD edge whose weight is below the cutoff is
    removed; CHILD edges are always kept.

    When no edge has a non-zero weight there is nothing to base a cutoff
    on, so the graph is returned unchanged (this also avoids NaN statistics
    and numpy warnings on an empty list).

    Returns the same graph that was passed in.
    '''
    scores = [edge.weight for edge in relation_graph.edges() if edge.weight != 0]

    if not scores:
        print("No scored edges; nothing to cut off.")
        return relation_graph

    mean = np.mean(scores)
    std_dev = np.std(scores)

    # Set the cutoff to be one standard deviation above the mean
    cutoff = mean + std_dev

    print("Mean:", mean)
    print("Standard Deviation:", std_dev)
    print("Cutoff:", cutoff)

    # edges() returns a fresh list, so removing while iterating is safe.
    for edge in relation_graph.edges():
        if edge.type != EdgeType.CHILD and edge.weight < cutoff:
            relation_graph.remove_edge(edge)
    return relation_graph

In [68]:
# Drop neighbour edges scoring below mean + one standard deviation.
relation_graph = cutoff_low_score_edges(relation_graph)

Mean: 0.3964956261625048
Standard Deviation: 0.1948220535972591
Cutoff: 0.5913176797597639


In [69]:
# Edges that survived the cutoff.
relation_graph.edges()

[edge CHILD weight 0 from <input></input> at y: (8358, 8388), x: (551, 736) to <span>Select province</span> at y: (8358, 8388), x: (551, 736),
 edge CHILD weight 0 from <input></input> at y: (8358, 8388), x: (551, 736) to <path></path> at y: (8370, 8376), x: (726, 736),
 edge CHILD weight 0 from <button>Submit</button> at y: (8469, 8501), x: (539, 617) to <span>Submit</span> at y: (8474, 8496), x: (555, 601),
 edge NRIGHT weight 0.7130861580371857 from <label>Username</label> at y: (8301, 8333), x: (459, 539) to <input>Please input</input> at y: (8301, 8333), x: (539, 699),
 edge NLEFT weight 0.7130861580371857 from <input>Please input</input> at y: (8301, 8333), x: (539, 699) to <label>Username</label> at y: (8301, 8333), x: (459, 539),
 edge NRIGHT weight 0.6220728829503059 from <input>Please input</input> at y: (8301, 8333), x: (539, 699) to <a>Need Help?</a> at y: (8309, 8326), x: (707, 782),
 edge NLEFT weight 0.6220728829503059 from <a>Need Help?</a> at y: (8309, 8326), x: (707, 