In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import openai
from dotenv import load_dotenv

from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs

from method.ours import (
    create_driver,
    embed_properties,
    get_processable_nodes,
    create_relations_graph,
    create_2d_span_ordered_dict,
    add_for_links,
    add_parent_child_links,
    add_left_right_links,
    add_top_bottom_links,
)

In [3]:
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

In [4]:
# Global Variables
HEADLESS = False
TEXT_EMBEDDING_METHOD = 'ADA' # ['ADA', 'WORD2VEC', 'SPACY']
GRAPH_EMBEDDING_METHOD = 'NODE2VEC' # ['NODE2VEC', 'GCN']

In [5]:
driver = create_driver(HEADLESS)
driver.get('https://ant.design/components/form')

In [6]:
# form = driver.find_elements(By.TAG_NAME, 'form')[48]
form = driver.find_element(By.ID, 'register')
form = embed_properties(driver, form)

form_doc = bs(form.get_attribute('outerHTML'), 'html.parser')

In [7]:
form_processable_nodes = get_processable_nodes(form_doc)

In [8]:
relation_graph = create_relations_graph(form_processable_nodes, TEXT_EMBEDDING_METHOD)

100%|███████████████████████████████████████████| 39/39 [00:08<00:00,  4.56it/s]


In [9]:
spans_2d = create_2d_span_ordered_dict(relation_graph)

100%|████████████████████████████████████████| 39/39 [00:00<00:00, 77525.05it/s]
100%|██████████████████████████████████████████| 2/2 [00:00<00:00, 26715.31it/s]
100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 9903.91it/s]
100%|██████████████████████████████████████████| 2/2 [00:00<00:00, 25970.92it/s]
100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 8128.50it/s]
100%|██████████████████████████████████████████| 3/3 [00:00<00:00, 13457.66it/s]
100%|███████████████████████████████████████████| 4/4 [00:00<00:00, 9636.54it/s]
100%|██████████████████████████████████████████| 4/4 [00:00<00:00, 38479.85it/s]
100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 6311.97it/s]
100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 11915.64it/s]
100%|██████████████████████████████████████████| 2/2 [00:00<00:00, 24966.10it/s]
100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 10645.44it/s]
100%|███████████████████████

In [10]:
relation_graph = add_for_links(relation_graph)
relation_graph = add_parent_child_links(spans_2d, relation_graph)
relation_graph = add_left_right_links(spans_2d, relation_graph)
relation_graph = add_top_bottom_links(spans_2d, relation_graph)

In [11]:
import numpy as np
from tqdm import tqdm

import networkx as nx

from bs4 import NavigableString, Comment

from method.ours.relation_graph import EdgeDir, EdgeType


xpath_to_node = {
    node.xpath: node for node in relation_graph.nodes()
}

feature_len = len(list(xpath_to_node.values())[0].features)


def get_node_feature(xpath):
    if xpath in xpath_to_node:
        return xpath_to_node[xpath].features
    return [0 for i in range(feature_len)]


def create_graph(soup):
    G = nx.DiGraph()

    for tag in soup.find_all():
        xpath = tag.attrs['xpath']
        
        if xpath not in G:
            G.add_node(xpath, x=get_node_feature(xpath))
        for child in tag.children:
            if isinstance(child, Comment) or isinstance(child, NavigableString):
                continue
            child_xpath = child.attrs.get('xpath')
            if child_xpath not in G:
                G.add_node(child_xpath, x=get_node_feature(child_xpath))
            G.add_edge(xpath, child_xpath)
        
        if xpath in xpath_to_node:
            for edge in xpath_to_node[xpath].edges[EdgeDir.OUT].values():
                target_xpath = edge.target.xpath
                if target_xpath not in G:
                    G.add_node(target_xpath, x=get_node_feature(target_xpath))
                G.add_edge(xpath, target_xpath)

    return G

In [12]:
G = create_graph(form_doc)

In [13]:
from torch_geometric.utils import from_networkx


data = from_networkx(G)

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

class GraphConvolution(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GraphConvolution, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.weight = nn.Parameter(torch.FloatTensor(input_dim, output_dim))

    def forward(self, adjacency, input_feature):
        support = torch.mm(input_feature, self.weight)
        output = torch.mm(adjacency, support)
        return output

class GCNEmbedder(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout):
        super(GCNEmbedder, self).__init__()
        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)
        self.dropout = dropout

    def forward(self, adjacency, feature):
        x = F.relu(self.gc1(adjacency, feature))
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(adjacency, x)
        return x

class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nembed, nout, dropout):
        super(GCN, self).__init__()
        self.embedding = GCNEmbedder(nfeat, nhid, nembed, dropout)
        self.linear = nn.Linear(nembed, nout)
    
    def forward(self, adjacency, feature):
        x = self.embedding(adjacency, feature)
        x = self.linear(x)
        return x
    

# assuming you have loaded your graph into edge_index, and node features into x
# we create an adjacency matrix from edge_index
adjacency = torch.tensor(nx.adjacency_matrix(G).todense()).to_sparse().to(torch.float)

# creating the GCN model
model = GCN(nfeat=data.x.size(1), nhid=32, nembed=128, nout=adjacency.shape[0], dropout=0.5).to('cpu')
optimizer = Adam(model.parameters(), lr=0.01)

# creating the binary target tensor (assuming the adjacency matrix is binary)
target = adjacency.to_dense().flatten()

for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    
    # we feed the adjacency matrix and node features into the model
    out = model(adjacency, data.x)
    # the output is also a adjacency-like matrix, so we flatten it to match the target's shape
    out = out.flatten()
    
    # computing the binary cross entropy loss
    loss = F.binary_cross_entropy_with_logits(out, target)
    print('Epoch', epoch, 'Loss', loss.item())

    # backpropagation
    loss.backward()
    optimizer.step()

  adjacency = torch.tensor(nx.adjacency_matrix(G).todense()).to_sparse().to(torch.float)


Epoch 0 Loss 0.6929193139076233
Epoch 1 Loss 0.6880123615264893
Epoch 2 Loss 0.6831310391426086
Epoch 3 Loss 0.6782757043838501
Epoch 4 Loss 0.6734469532966614
Epoch 5 Loss 0.6686450242996216
Epoch 6 Loss 0.6638705134391785
Epoch 7 Loss 0.6591235399246216
Epoch 8 Loss 0.6544047594070435
Epoch 9 Loss 0.6497142910957336
Epoch 10 Loss 0.6450526714324951
Epoch 11 Loss 0.6404200792312622
Epoch 12 Loss 0.6358168721199036
Epoch 13 Loss 0.6312434077262878
Epoch 14 Loss 0.6266998648643494
Epoch 15 Loss 0.6221866011619568
Epoch 16 Loss 0.6177037954330444
Epoch 17 Loss 0.6132516860961914
Epoch 18 Loss 0.6088303923606873
Epoch 19 Loss 0.6044402122497559
Epoch 20 Loss 0.6000813245773315
Epoch 21 Loss 0.5957538485527039
Epoch 22 Loss 0.5914579033851624
Epoch 23 Loss 0.587193489074707
Epoch 24 Loss 0.5829610228538513
Epoch 25 Loss 0.5787602663040161
Epoch 26 Loss 0.5745913982391357
Epoch 27 Loss 0.5704545378684998
Epoch 28 Loss 0.5663496255874634
Epoch 29 Loss 0.5622767210006714
Epoch 30 Loss 0.55823

In [38]:
model(adjacency, data.x)

tensor([[-1.6531, -1.5911, -1.5284,  ..., -1.5305, -1.6413, -1.5513],
        [-1.6531, -1.5911, -1.5284,  ..., -1.5305, -1.6413, -1.5513],
        [-1.6531, -1.5911, -1.5284,  ..., -1.5305, -1.6413, -1.5513],
        ...,
        [-1.6531, -1.5911, -1.5284,  ..., -1.5305, -1.6413, -1.5513],
        [-1.6531, -1.5911, -1.5284,  ..., -1.5305, -1.6413, -1.5513],
        [-1.6531, -1.5911, -1.5284,  ..., -1.5305, -1.6413, -1.5513]],
       grad_fn=<AddmmBackward0>)