# Imports

In [1]:
import os
import sys
# Put the notebook's working directory on the import path (once) so that
# project-local packages can be imported by later cells.
module_path = os.path.abspath('.')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import time
from enum import Enum
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

# Driver

In [26]:
# Let webdriver-manager provide the chromedriver binary, then start the Chrome
# session that every later cell shares through the global `driver`.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path))

In [6]:
# Base address of the application under test; GoToRootAction.perform loads it
# again whenever a route has to be replayed from the start.
URL = "http://localhost:8080"
# Open the start page in the shared browser session.
driver.get(URL)

# Utils

In [7]:
# JavaScript snippet meant to be run through driver.execute_script with a DOM
# element as arguments[0]. getPathTo walks up the parentNode links until it
# reaches document.body and emits one TAG[n] segment per level, where n is the
# element's 1-based position among the siblings that share its tag name. The
# snippet returns that path prefixed with '//'.
# NOTE(review): the id-based shortcut inside getPathTo is commented out, so the
# `path.startsWith('id(')` branch looks unreachable — confirm before removing.
# NOTE(review): an element that is not a descendant of document.body is not
# handled (parentNode eventually becomes null); callers in this notebook appear
# to pass body descendants only.
xpath_script = """
function getPathTo(element) {
    // if (element.id !== '')
    //     return 'id(\"'+element.id+'\")';
    if (element === document.body)
        return element.tagName;
    var ix= 0;
    var siblings= element.parentNode.childNodes;
    for (var i= 0; i<siblings.length; i++) {
        var sibling= siblings[i];
        if (sibling===element)
            return getPathTo(element.parentNode) + '/' + element.tagName + '[' + (ix + 1) + ']';
        if (sibling.nodeType===1 && sibling.tagName===element.tagName)
            ix++;
    }
}
const path = getPathTo(arguments[0]);
if (path.startsWith('id(')) {
    return path;
}
return '//' + path;
"""

In [8]:
def embed_xpath_into_elements(html_element):
    """Annotate an element and all of its descendants with their XPath.

    The subtree rooted at ``html_element`` is walked breadth-first. For every
    element visited, ``xpath_script`` is evaluated in the browser and its
    result is written onto the element as an ``XPATH`` attribute.

    Args:
        html_element: Selenium element to start from.

    Returns:
        The same ``html_element`` that was passed in.
    """
    # Local import: a deque pops from the left in O(1), whereas rebuilding the
    # whole list for every visited element made the traversal quadratic.
    from collections import deque

    pending = deque([html_element])
    while pending:
        current = pending.popleft()
        xpath = driver.execute_script(xpath_script, current)
        driver.execute_script("arguments[0].setAttribute('XPATH',arguments[1])", current, xpath)
        # The relative XPath '*' selects the direct child elements only.
        pending.extend(current.find_elements(By.XPATH, '*'))
    return html_element

In [9]:
class Models(Enum):
    """Selectable form-handling modes; every member's value equals its name."""

    # Baseline mode without a GPT model.
    SIMPLE = 'SIMPLE'

    # GPT-3 modes.
    GPT3_PLAIN = 'GPT3_PLAIN'
    GPT3_WITH_PARSER = 'GPT3_WITH_PARSER'

    # GPT-3.5 modes.
    GPT35_PLAIN = 'GPT35_PLAIN'
    GPT35_WITH_PARSER = 'GPT35_WITH_PARSER'

    # GPT-4 modes.
    GPT4_PLAIN = 'GPT4_PLAIN'
    GPT4_WITH_PARSER = 'GPT4_WITH_PARSER'

# Crawl Objects

In [10]:
class Action:
    """Base class for an interaction the crawler can perform on a page element.

    The only state is ``element_xpath``, the XPath of the element the action
    targets. Subclasses override ``perform``.
    """

    def __init__(self, action_element_xpath):
        """Remember the XPath of the element this action targets."""
        self.element_xpath = action_element_xpath

    def perform(self):
        """Execute the action; the base class has no behaviour of its own.

        Raises:
            NotImplementedError: always, unless a subclass overrides this.
        """
        raise NotImplementedError("Action not implemented")

    def __str__(self):
        return f"Action {type(self)} at XPATH: {self.element_xpath}"

    def __eq__(self, other):
        """Two actions are equal when they share concrete type and XPath."""
        return type(self) is type(other) and self.element_xpath == other.element_xpath

    def __hash__(self):
        # Defining __eq__ alone sets __hash__ to None, which makes every action
        # (and every subclass instance) unhashable. Hash the same fields that
        # __eq__ compares so that equal actions hash alike.
        return hash((type(self), self.element_xpath))

In [11]:
class GoToRootAction(Action):
    """Action that navigates the browser back to the application's root URL."""

    def __init__(self):
        # No element is targeted, but the base initializer must still run:
        # without it ``element_xpath`` is never set and the inherited
        # __str__ / __eq__ raise AttributeError for this action.
        super().__init__(None)

    def perform(self):
        """Load ``URL`` in the shared ``driver`` session."""
        driver.get(URL)

In [12]:
class ClickAction(Action):
    """Action that clicks the element located at the stored XPath."""

    def __init__(self, click_element_xpath):
        super().__init__(click_element_xpath)

    def perform(self):
        """Look the element up in the current page via ``driver`` and click it."""
        driver.find_element(By.XPATH, self.element_xpath).click()

In [13]:
from forms.page_transformer import driver_to_doc, simplify_doc

from forms.form_finder.simple_form_finder import find_form_by_tag as simple_find_form
from forms.form_parser.simple_form_parser import parse_form_inputs_without_labels as simple_parse_form
from forms.form_filler.simple_form_filler import fill_form_with_fixed_values as simple_fill_and_submit_form

from forms.form_parser.gpt_form_parser import gpt3_parse_form, gpt35_parse_form, gpt4_parse_form
from forms.form_filler.gpt_form_filler import gpt3_fill_form_plain, gpt35_fill_form_plain, gpt4_fill_form_plain


class FormAction(Action):
    """Action that handles a form element according to the global ``model``."""

    def __init__(self, form_element_xpath):
        super().__init__(form_element_xpath)

    def perform(self):
        """Locate the form and process it with the handler chosen by ``model``.

        * ``Models.SIMPLE`` runs the simple parser and hands its result to the
          simple fill-and-submit helper.
        * ``*_WITH_PARSER`` models run the matching GPT form parser.
        * ``*_PLAIN`` models run the matching GPT plain form filler.

        The parser / filler result is printed, followed by a 5 second pause.

        Raises:
            ValueError: if ``model`` is not one of the ``Models`` members.
        """
        print("FormAction found")
        form = driver.find_element(By.XPATH, self.element_xpath)

        # One entry per model, so every handler runs exactly once (the GPT-3
        # parser branch used to be duplicated and therefore ran twice).
        parsers = {
            Models.GPT3_WITH_PARSER: gpt3_parse_form,
            Models.GPT35_WITH_PARSER: gpt35_parse_form,
            Models.GPT4_WITH_PARSER: gpt4_parse_form,
        }
        fillers = {
            Models.GPT3_PLAIN: gpt3_fill_form_plain,
            Models.GPT35_PLAIN: gpt35_fill_form_plain,
            Models.GPT4_PLAIN: gpt4_fill_form_plain,
        }

        if model == Models.SIMPLE:
            # The simple helpers are imported under ``simple_`` aliases; the
            # un-prefixed names used here before were never defined.
            result = simple_parse_form(form)
            simple_fill_and_submit_form(result)
        elif model in parsers:
            result = parsers[model](form)
        elif model in fillers:
            result = fillers[model](form)
        else:
            raise ValueError(f"Unsupported model: {model!r}")

        # Print whichever result this model produced; printing a filler-only
        # variable used to fail for the SIMPLE and *_WITH_PARSER models.
        print(result)
        time.sleep(5)

In [14]:
class State:
    """A crawled page state: a URL plus the actions available on that page.

    Attributes:
        url: URL passed in when the state was captured.
        source: HTML string passed to the constructor.
        neighbors: Maps an action's XPath to the state reached through it.
        actions: Maps an element XPath to the action created for that element.
        action_route: Actions that are performed in order to reach this state.
    """

    def __init__(self, url, html, action_route=None):
        """Capture the state of the page currently loaded in ``driver``.

        Args:
            url: URL of the page.
            html: HTML of the page; stored as ``source``.
            action_route: Actions leading to this state. ``None`` or an empty
                list means the state is reached by a single GoToRootAction.
        """
        self.url = url
        self.source = html
        self.neighbors = {}
        # Actions are discovered from the live page in ``driver``, not from ``html``.
        self.actions = self._evaluate_actions()
        self.action_route = self._evaluate_action_route(action_route)

    def _evaluate_actions(self):
        """Collect click actions for ``a`` tags and form actions for ``form`` tags."""
        click_actions = self._evaluate_actions_by_tag_name("a", ClickAction)
        form_actions = self._evaluate_actions_by_tag_name("form", FormAction)
        return {**click_actions, **form_actions}

    def _evaluate_actions_by_tag_name(self, tag_name, ActionCreator):
        """Build ``{xpath: action}`` for every displayed element with ``tag_name``.

        Args:
            tag_name: HTML tag to search for in the current page.
            ActionCreator: Callable that turns an XPath into an action.
        """
        found = driver.find_elements(By.TAG_NAME, tag_name)
        displayed = [tag for tag in found if tag.is_displayed()]
        xpaths = [driver.execute_script(xpath_script, tag) for tag in displayed]
        created = [ActionCreator(xpath) for xpath in xpaths]
        return {action.element_xpath: action for action in created}

    def _evaluate_action_route(self, action_route):
        """Return ``action_route``, or a go-to-root route when none is given."""
        if not action_route:
            return [GoToRootAction()]
        return action_route

    def add_neighbor(self, neighbor_state, action):
        """Record ``neighbor_state`` as reachable from here through ``action``."""
        self.neighbors[action.element_xpath] = neighbor_state

    def get_to_state(self):
        """Bring the browser to this state by replaying ``action_route``."""
        for action in self.action_route:
            action.perform()

    def perform_actions(self, max_attempts=5, retry_delay=1):
        """Try every action of this state and record the states they lead to.

        Each action is attempted up to ``max_attempts`` times, sleeping
        ``retry_delay`` seconds after a failed attempt. The page loaded
        afterwards is captured as a new State and stored as a neighbor when it
        differs from this state. The browser is then returned to this state
        before the next action is tried.

        Args:
            max_attempts: Upper bound on attempts per action.
            retry_delay: Seconds to wait after each failed attempt.
        """
        for action in self.actions.values():
            last_error = None
            for _ in range(max_attempts):
                try:
                    action.perform()
                    last_error = None
                    break
                except Exception as error:
                    # Only Exception is caught, so KeyboardInterrupt and
                    # SystemExit still stop the crawl.
                    last_error = error
                    time.sleep(retry_delay)
            if last_error is not None:
                # Best effort: report the failure, then still snapshot the page.
                print("Could not perform", action)
                print(last_error)

            new_state = State(
                driver.current_url,
                driver.find_element(By.TAG_NAME, 'body').get_attribute('outerHTML'),
                [*self.action_route, action]
            )
            if self != new_state:
                self.add_neighbor(new_state, action)

            self.get_to_state()

    def __eq__(self, other):
        """States are equal when URL and the set of action XPaths both match."""
        if type(other) != State:
            return False
        return self.url == other.url and self.actions.keys() == other.actions.keys()

    def __hash__(self):
        # Defining __eq__ alone would leave State unhashable; hash exactly the
        # fields that __eq__ compares.
        return hash((self.url, frozenset(self.actions)))

    def __str__(self):
        action_str = '\n'.join(self.actions)
        neighbor_str = '\n'.join(neighbor.url for neighbor in self.neighbors.values())
        return f"""
            State: {self.url}
            Actions:
            {action_str}
            Neighbors:
            {neighbor_str}
        """

# Model Selection

In [35]:
# Module-level switch read by FormAction.perform to decide how a form is handled.
model = Models.GPT4_PLAIN

# Execution

In [16]:
# Start from the root page, then click two navbar links in sequence: the second
# top-level item and the first entry of its submenu.
driver.get(URL)
for nav_xpath in (
    '//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]',
    '//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/UL[1]/LI[1]/A[1]',
):
    driver.find_element(By.XPATH, nav_xpath).click()

# Snapshot the page that is loaded now.
current_url = driver.current_url
body_html = driver.find_element(By.TAG_NAME, 'body').get_attribute('outerHTML')
state = State(current_url, body_html)

# Run the action registered for the form on this page.
form_action = state.actions['//BODY/APP-ROOT[1]/APP-OWNER-LIST[1]/DIV[1]/DIV[1]/FORM[1]']
form_action.perform()

FormAction found
We want to fill inputs of an HTML form in the following format:
    HTML:
    <form>
        <label for="fname">First name:</label>
        <input type="text" id="fname" name="fname" xpath="a" />
        
        <label for="lname">Last name:</label>
        <input type="text" id="lname" name="lname" xpath="b" />
    </form>
    Values:
    [
        {
            "xpath": "a",
            "value": "John",
        },
        {
            "xpath": "b",
            "value": "Doe",
        }
    ]
    
    Generate values for the following form:
    <form _ngcontent-gxh-c36="" novalidate="" method="get" id="search-owner-form" class="form-horizontal ng-untouched ng-pristine ng-valid"><div _ngcontent-gxh-c36="" class="form-group"><div _ngcontent-gxh-c36="" id="lastNameGroup" class="control-group"><label _ngcontent-gxh-c36="" class="col-sm-2 control-label">Last name </label><div _ngcontent-gxh-c36="" class="col-sm-10"><input _ngcontent-gxh-c36="" size="30" maxlength="80" id

In [17]:
# Start from the root page, then click two navbar links in sequence: the second
# top-level item and the second entry of its submenu.
driver.get(URL)
for nav_xpath in (
    '//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]',
    '//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/UL[1]/LI[2]/A[1]',
):
    driver.find_element(By.XPATH, nav_xpath).click()

# Snapshot the page that is loaded now.
current_url = driver.current_url
body_html = driver.find_element(By.TAG_NAME, 'body').get_attribute('outerHTML')
state = State(current_url, body_html)

# Run the action registered for the form on this page.
form_action = state.actions['//BODY/APP-ROOT[1]/APP-OWNER-ADD[1]/DIV[1]/DIV[1]/FORM[1]']
form_action.perform()

FormAction found
We want to fill inputs of an HTML form in the following format:
    HTML:
    <form>
        <label for="fname">First name:</label>
        <input type="text" id="fname" name="fname" xpath="a" />
        
        <label for="lname">Last name:</label>
        <input type="text" id="lname" name="lname" xpath="b" />
    </form>
    Values:
    [
        {
            "xpath": "a",
            "value": "John",
        },
        {
            "xpath": "b",
            "value": "Doe",
        }
    ]
    
    Generate values for the following form:
    <form _ngcontent-qcv-c38="" novalidate="" class="form-horizontal ng-untouched ng-pristine ng-invalid"><div _ngcontent-qcv-c38="" hidden="true" class="form-group"><input _ngcontent-qcv-c38="" type="text" hidden="true" id="id" name="id" class="form-control ng-untouched ng-pristine ng-valid"></div><div _ngcontent-qcv-c38="" class="form-group has-feedback"><label _ngcontent-qcv-c38="" for="firstName" class="col-sm-2 control-label

In [27]:
# Load an external page and snapshot it as a State.
driver.get('https://ant.design/components/form')

current_url = driver.current_url
body_html = driver.find_element(By.TAG_NAME, 'body').get_attribute('outerHTML')
state = State(current_url, body_html)

In [28]:
state.actions

{'//BODY/DIV[1]/DIV[1]/HEADER[1]/DIV[1]/DIV[1]/H1[1]/A[1]': <__main__.ClickAction at 0x13ce2ae90>,
 '//BODY/DIV[1]/DIV[1]/HEADER[1]/DIV[1]/DIV[2]/UL[1]/LI[1]/SPAN[1]/A[1]': <__main__.ClickAction at 0x13d2228f0>,
 '//BODY/DIV[1]/DIV[1]/HEADER[1]/DIV[1]/DIV[2]/UL[1]/LI[2]/SPAN[1]/A[1]': <__main__.ClickAction at 0x13de503d0>,
 '//BODY/DIV[1]/DIV[1]/HEADER[1]/DIV[1]/DIV[2]/UL[1]/LI[3]/SPAN[1]/A[1]': <__main__.ClickAction at 0x13de51090>,
 '//BODY/DIV[1]/DIV[1]/HEADER[1]/DIV[1]/DIV[2]/UL[1]/LI[4]/SPAN[1]/A[1]': <__main__.ClickAction at 0x13de52860>,
 '//BODY/DIV[1]/DIV[1]/HEADER[1]/DIV[1]/DIV[2]/UL[1]/LI[5]/SPAN[1]/A[1]': <__main__.ClickAction at 0x13de52890>,
 '//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[1]/SECTION[1]/UL[1]/LI[1]/SPAN[1]/A[1]': <__main__.ClickAction at 0x13de50e50>,
 '//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[1]/SECTION[1]/UL[1]/LI[2]/UL[1]/LI[1]/SPAN[1]/A[1]': <__main__.ClickAction at 0x13cef4f40>,
 '//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[1]/SECTION[1]/UL[1]/LI[2]/UL[1]/LI[2]/SPAN[1]/A[1]': <__mai

In [36]:
# Run the action registered for one specific form of the page held in `state`.
target_form_xpath = '//BODY/DIV[1]/DIV[1]/MAIN[1]/DIV[2]/ARTICLE[1]/DIV[2]/DIV[1]/SECTION[1]/SECTION[23]/SECTION[1]/FORM[1]'
state.actions[target_form_xpath].perform()


FormAction found
We want to fill inputs of an HTML form in the following format:
    HTML:
    <form>
        <label for="fname">First name:</label>
        <input type="text" id="fname" name="fname" xpath="a" />
        
        <label for="lname">Last name:</label>
        <input type="text" id="lname" name="lname" xpath="b" />
    </form>
    Values:
    [
        {
            "xpath": "a",
            "value": "John",
        },
        {
            "xpath": "b",
            "value": "Doe",
        }
    ]
    
    Generate values for the following form:
    <form id="advanced_search" style="max-width:none;background:rgba(0, 0, 0, 0.02);border-radius:8px;padding:24px" class="ant-form ant-form-horizontal css-yp8pcc"><div class="ant-row css-yp8pcc" style="margin-left:-12px;margin-right:-12px"><div style="padding-left:12px;padding-right:12px" class="ant-col ant-col-8 css-yp8pcc"><div class="ant-form-item css-yp8pcc"><div class="ant-row ant-form-item-row css-yp8pcc"><div class="ant-c

Here's the values for the given form:

[
    {
        "id": "advanced_search_field-0",
        "type": "text",
        "value": "YourValueHere"
    },
    {
        "id": "advanced_search_field-1",
        "type": "search",
        "value": "YourValueHere"
    },
    {
        "id": "advanced_search_field-2",
        "type": "text",
        "value": "YourValueHere"
    },
    {
        "id": "advanced_search_field-3",
        "type": "text",
        "value": "YourValueHere"
    },
    {
        "id": "advanced_search_field-4",
        "type": "search",
        "value": "YourValueHere"
    },
    {
        "id": "advanced_search_field-5",
        "type": "text",
        "value": "YourValueHere"
    }
]


# Crawl

In [15]:
# Begin the crawl at the application's root page.
driver.get(URL)

# Iteration budget for the crawl loop and the number of iterations used so far.
MAX_COUNTER = 30
counter = 0

# Snapshot the start page; it is the first state to expand.
root_url = driver.current_url
root_html = driver.find_element(By.TAG_NAME, 'body').get_attribute('outerHTML')
initial_state = State(root_url, root_html)

# `crawl_queue` holds states still to expand; `state_list` groups every known
# state under the URL it was recorded for.
crawl_queue = [initial_state]
state_list = {driver.current_url: [initial_state]}

In [16]:
# Breadth-first crawl: expand the state at the front of the queue until the
# queue runs empty or MAX_COUNTER states have been expanded.
while crawl_queue and counter < MAX_COUNTER:
    counter += 1

    next_state = crawl_queue[0]
    print(next_state)

    # Replay the route to the state, then try every action available there.
    next_state.get_to_state()
    next_state.perform_actions()

    for neighbor_state in next_state.neighbors.values():
        # A neighbor is new when no equal state is recorded under its URL yet.
        known_states = state_list.setdefault(neighbor_state.url, [])
        if neighbor_state not in known_states:
            known_states.append(neighbor_state)
            crawl_queue.append(neighbor_state)

    # The expanded state leaves the queue only after its neighbors are queued.
    del crawl_queue[0]


            State: http://localhost:8080/petclinic/
            Actions:
            //BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/DIV[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[3]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[4]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[5]/A[1]
            Neighbors:
            
        

            State: http://localhost:8080/petclinic#
            Actions:
            
            Neighbors:
            
        

            State: http://localhost:8080/petclinic/welcome
            Actions:
            //BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/DIV[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[3]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[4]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/N


            State: http://localhost:8080/petclinic/vets
            Actions:
            //BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/DIV[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[3]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[4]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[5]/A[1]
            Neighbors:
            
        

            State: http://localhost:8080/petclinic/vets/add
            Actions:
            //BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/DIV[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[3]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[4]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[5]/A[1]
//BODY/APP-ROOT[1]/APP-VET-ADD[1]/DIV[1]/DIV[1]/FORM[1]
            Neighbors:
            
     

FormAction found
[{'label': None, 'tag': 'input', 'attributes': {'_ngcontent-qlp-c36': '', 'size': '30', 'maxlength': '80', 'id': 'lastName', 'name': 'lastName', 'value': '', 'class': 'form-control ng-untouched ng-pristine ng-valid', 'type': 'text'}, 'element': <selenium.webdriver.remote.webelement.WebElement (session="b94faec0614af381d25db1fa5e7a0bd5", element="ada67d63-ed6f-4fe2-a3f2-2945863ca258")>}, {'tag': 'button', 'attributes': {'_ngcontent-qlp-c36': '', 'type': 'submit', 'class': 'btn btn-default'}, 'element': <selenium.webdriver.remote.webelement.WebElement (session="b94faec0614af381d25db1fa5e7a0bd5", element="d914d358-ddb1-4db7-9623-6449cbb23cd6")>}]

            State: http://localhost:8080/petclinic/owners/add
            Actions:
            //BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/DIV[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/UL[1]/LI[1]/A[1]



            State: http://localhost:8080/petclinic/vets
            Actions:
            //BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/DIV[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[3]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[3]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[3]/UL[1]/LI[2]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[4]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[5]/A[1]
            Neighbors:
            
        

            State: http://localhost:8080/petclinic/vets/add
            Actions:
            //BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/DIV[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/UL[1]/LI[


            State: http://localhost:8080/petclinic/owners/3
            Actions:
            //BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/DIV[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[3]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[4]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[5]/A[1]
            Neighbors:
            
        

            State: http://localhost:8080/petclinic/owners/4
            Actions:
            //BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/DIV[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[1]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[2]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[3]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[4]/A[1]
//BODY/APP-ROOT[1]/DIV[1]/NAV[1]/DIV[1]/UL[1]/LI[5]/A[1]
            Neighbors:
            
        


In [17]:
# URL coverage: number of distinct URLs recorded.
url_coverage = len(state_list)
print(url_coverage)
# State coverage: number of states recorded across all URLs.
state_coverage = sum(len(states) for states in state_list.values())
print(state_coverage)

19
44
