In [1]:
import os
import pandas as pd
import networkx as nx
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
driver = webdriver.Chrome('drivers/chromedriver108')

In [3]:
BASE_URL = 'https://www.google.com/'
NAME = 'google'
MAX_NODES = 1

In [4]:
INLINE_ELEMENTS = [
    'a',
    'abbr',
    'acronym',
    'b',
    'bdo',
    'big',
    'br',
    'cite',
    'code',
    'dfn',
    'em',
    'i',
    'input',
    'kbd',
    'map',
    'object',
    'output',
    'q',
    'samp',
    'script',
    'small',
    'span',
    'strong',
    'style',
    'sub',
    'sup',
    'svg',
    'time',
    'tt',
    'var'
]

In [5]:
def get_connected_urls(body_element):
    return list(map(lambda x: x.get_attribute('href'), body_element.find_elements(By.TAG_NAME, 'a')))

In [6]:
def urls_have_same_origin(url1, url2):
    get_origin = lambda url: url.split('://')[1].split('/')[0]
    return get_origin(url1) == get_origin(url2)

In [7]:
def unchecked_bfs_app_traverse(driver, start_url, max_nodes):
    traversal_graph = nx.Graph()
    traversal_queue = [start_url]
    
    traversal_graph.add_node(start_url)
    
    while len(traversal_graph.nodes) < max_nodes and len(traversal_queue) > 0:
        driver.get(traversal_queue[0])
        
        body = driver.find_element(By.TAG_NAME, 'body')
        connected_urls = get_connected_urls(body)
        
        for url in connected_urls:
            if url == traversal_queue[0] \
                    or url.split('#')[0] in traversal_graph.nodes \
                    or not urls_have_same_origin(url, start_url):
                continue
            traversal_queue.append(url)
            traversal_graph.add_node(url)
            traversal_graph.add_edge(traversal_queue[0], url)
        
        traversal_queue = traversal_queue[1:]
    
    return traversal_graph

In [8]:
def create_data_directory(name):
    if not os.path.exists('data/{0}'.format(name)):
        os.mkdir('data/{0}'.format(name))
        os.mkdir('data/{0}/screenshots'.format(name))

In [9]:
def crawl_features(driver, url, name):
    body = driver.find_element(By.TAG_NAME, 'body')
    
    counter = 1
    traverse_list = [body]
    id_dict = { body: 0 }
    adjacency_list = []
    feature_dict = {}
    screenshot_directories = { body: 'data/{0}/screenshots/0'.format(name) }
    
    while len(traverse_list) > 0:
        element = traverse_list[0]
        element_id = id_dict[element]
        
        try:
            if element.tag_name not in INLINE_ELEMENTS:
                if element in screenshot_directories:
                    if not os.path.exists(screenshot_directories[element]):
                        os.mkdir(screenshot_directories[element])
                    element.screenshot('{0}/{1}.png'.format(screenshot_directories[element], element_id))
                else:
                    parent_element = element.find_element(By.XPATH, './..')
                    parent_directory = screenshot_directories[element.parent]
                    new_directory = '{0}/{1}'.format(parent_directory, element_id)
                    if not os.exists(new_directory):
                        os.mkdir(new_directory)
                    element.screenshot('{0}/{1}.png'.format(new_directory, element_id))
                    screenshot_directories[element] = element_directory
                
                children = element.find_elements(By.XPATH, '*')
                
                feature_dict[element_id] = driver.execute_script(
                    '''
                        const style = getComputedStyle(arguments[0]);
                        const transformed = Object.keys(style).reduce((acc, key) => {
                            console.log(key, !Number.isNaN(+key))
                            if (Number.isNaN(+key)) {
                                acc[key] = style[key];
                            }
                            return acc;
                        }, {});
                        return transformed;
                    ''',
                    element
                )
                feature_dict[element_id]['tagName'] = element.tag_name
                
                for child in children:
                    if child.tag_name == 'script' or child.tag_name == 'style':
                        continue
                    id_dict[child] = counter
                    adjacency_list.append((element_id, counter))
                    traverse_list.append(child)
                    counter += 1
        except Exception as e:
            print(e)

        traverse_list = traverse_list[1:]
    
    dom_graph = nx.Graph()
    dom_graph.add_edges_from(adjacency_list)
    nx.write_adjlist(dom_graph, 'data/{0}/{1}.adj'.format(name, url))
    
    feature_df = pd.DataFrame(feature_dict).swapaxes(0, 1)
    feature_df.to_csv('data/{0}/{1}.csv'.format(name, url))

In [10]:
traversal_graph = unchecked_bfs_app_traverse(driver, BASE_URL, MAX_NODES)

In [11]:
create_data_directory(NAME)

In [12]:
for idx, url in enumerate(traversal_graph.nodes):
    driver.get(url)
    crawl_features(driver, idx, NAME)

<selenium.webdriver.remote.webelement.WebElement (session="36924d111b2b7b04391e0355d4ea2844", element="9ea897e5-a0d2-43aa-be97-e4f2888d4926")> body
<selenium.webdriver.remote.webelement.WebElement (session="36924d111b2b7b04391e0355d4ea2844", element="52c00275-9343-44c5-863d-26e5bacae9a9")> div
<selenium.webdriver.chrome.webdriver.WebDriver (session="36924d111b2b7b04391e0355d4ea2844")>
<selenium.webdriver.remote.webelement.WebElement (session="36924d111b2b7b04391e0355d4ea2844", element="df234134-cc99-4f13-857e-53cb230fd4d8")> div
<selenium.webdriver.chrome.webdriver.WebDriver (session="36924d111b2b7b04391e0355d4ea2844")>
<selenium.webdriver.remote.webelement.WebElement (session="36924d111b2b7b04391e0355d4ea2844", element="f678581a-4e08-41a5-850c-ac9d77537e25")> textarea
<selenium.webdriver.chrome.webdriver.WebDriver (session="36924d111b2b7b04391e0355d4ea2844")>
<selenium.webdriver.remote.webelement.WebElement (session="36924d111b2b7b04391e0355d4ea2844", element="eb04ecb5-f29d-4a47-9f8a-