In [1]:
import re
from scipy.spatial import KDTree
from webcolors import CSS3_HEX_TO_NAMES, hex_to_rgb

from bs4 import BeautifulSoup as bs, NavigableString, Comment
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

In [2]:
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

In [3]:
driver.get('http://localhost:3000/antd')

# Get Form

In [4]:
form = driver.find_element(By.ID, 'form')

# Embed CSS Properties Into Object

In [5]:
IMPORTANT_PROPERTIES = ['backgroundColor', 'color', 'borderColor', 'fontSize', 'fontWeight']
FONT_WEIGHT_NAMES = {
        '100': 'thin',
        '200': 'extra light',
        '300': 'light',
        '400': 'normal',
        '500': 'medium',
        '600': 'semi bold',
        '700': 'bold',
        '800': 'extra bold',
        '900': 'heavy',
}


def convert_rgb_to_names(rgb_tuple):
    css3_db = CSS3_HEX_TO_NAMES
    names = []
    rgb_values = []
    for color_hex, color_name in css3_db.items():
        names.append(color_name)
        rgb_values.append(hex_to_rgb(color_hex))
    
    kdt_db = KDTree(rgb_values)
    distance, index = kdt_db.query(rgb_tuple)
    return names[index]


def convert_color(rgba_string):
    rgba_tuple = list(
        map(
            lambda x: int(x),
            re.findall(r'\d+', rgba_string)
        )
    )
    alpha = 1 if len(rgba_tuple) == 3 else rgba_tuple[3]
    color_name = convert_rgb_to_names(rgba_tuple[:3])
    return '{0} {1}'.format(color_name, alpha)


def convert_font_weight(font_weight):
    global FONT_WEIGHT_NAMES
    return FONT_WEIGHT_NAMES[font_weight]


def embed_properties_into_element(element):
    global IMPORTANT_PROPERTIES
    
    properties = driver.execute_script('''
        const style = window.getComputedStyle(arguments[0]);
        console.log(style);
        const properties = {};
        Object.keys(style).forEach((key) => {
            if (isNaN(key)) {
                properties[key] = style[key];
            }
        });
        return properties;
    ''', element)
    
    property_converters = {
        'color': convert_color,
        'backgroundColor': convert_color,
        'borderColor': convert_color,
        'fontWeight': convert_font_weight
    }
    
    for p in IMPORTANT_PROPERTIES:
        value = properties[p]
        if p in property_converters:
            value = property_converters[p](value)
        driver.execute_script("arguments[0].setAttribute(arguments[1], arguments[2])", element, p, value)

In [6]:
'''
queue = [form]

while len(queue) > 0:
    element = queue[0]
    children = element.find_elements(By.XPATH, '*')
    queue = [*queue, *children]
    embed_properties_into_element(element)
    queue = queue[1:]
'''

"\nqueue = [form]\n\nwhile len(queue) > 0:\n    element = queue[0]\n    children = element.find_elements(By.XPATH, '*')\n    queue = [*queue, *children]\n    embed_properties_into_element(element)\n    queue = queue[1:]\n"

# Remove Zero-Size Elements

In [7]:
'''
queue = [form]

while len(queue) > 0:
    element = queue[0]
    size = element.size
    if size['height'] == 0 or size['width'] == 0:
        driver.execute_script("return arguments[0].remove();", element);
    else:
        children = element.find_elements(By.XPATH, '*')
        queue = [*queue, *children]
        embed_properties_into_element(element)
    queue = queue[1:]
'''

'\nqueue = [form]\n\nwhile len(queue) > 0:\n    element = queue[0]\n    size = element.size\n    if size[\'height\'] == 0 or size[\'width\'] == 0:\n        driver.execute_script("return arguments[0].remove();", element);\n    else:\n        children = element.find_elements(By.XPATH, \'*\')\n        queue = [*queue, *children]\n        embed_properties_into_element(element)\n    queue = queue[1:]\n'

# Convert to Beautiful Soup

In [8]:
'''
doc = bs(form.get_attribute('innerHTML'))
'''

"\ndoc = bs(form.get_attribute('innerHTML'))\n"

# Remove Unnecessary Attributes from Elements

In [9]:
'''
for element in doc.find_all():
    if type(element) != NavigableString or type(element) != Comment:
        del element['class']
        del element['style']
        # for icons: remove unnecessary data of the icon
        del element['d']
'''

"\nfor element in doc.find_all():\n    if type(element) != NavigableString or type(element) != Comment:\n        del element['class']\n        del element['style']\n        # for icons: remove unnecessary data of the icon\n        del element['d']\n"

# Remove Unnecesary Elements

In [10]:
'''
for element in doc.find_all():
    if type(element) == Comment:
        element.extract()
'''

UNNECESSARY_TAGS = [
    'style', 'script', 'noscript', 'link'
]

'''
for tag in doc.find_all:
    if tag.name in UNNECESSARY_TAGS:
        element.extract()'''

'\nfor element in doc.find_all():\n    if type(element) == Comment:\n        element.extract()\n'

# Merge Tags With a Single Child

In [12]:
def merge_attributes(parent_attributes, child_attributes):
    merged = parent_attributes.copy()
    for key, value in child_attributes.items():
        merged[key] = value
    return merged


def merge_single_child_parents(root):
    children = list(root.children)
    if len(children) > 1:
        for child in children:
            merge_single_child_parents(child)
    if len(children) == 1 and type(children[0]) != NavigableString:
        only_child = children[0]
        root.attrs = merge_attributes(root.attrs, only_child.attrs)
        childs_children = list(only_child.children)
        for i in range(len(childs_children)):
            root.insert(i + 1, childs_children[i])
        only_child.extract()
        merge_single_child_parents(root)

In [13]:
# merge_single_child_parents(doc.form)

# Delete Non-Informative CSS Attributes

In [14]:
INFORMATION_CUTOFF = 0.8

def find_attribute_count(doc, attribute):
    counter = {}
    for element in doc.find_all():
        attr_value = element.attrs[attribute]
        if attr_value not in counter:
            counter[attr_value] = 0
        counter[attr_value] += 1
    counter = dict(sorted(counter.items(), key=lambda item: item[1], reverse=True))
    
    max_value = list(counter.values())[0]
    for key, value in counter.items():
        counter[key] /= max_value
    return counter


# you can do this with colors as well, but the dark-mode/light-mode can cause problems in understanding
def map_font_sizes(doc):
    font_count = find_attribute_count(doc, 'fontsize')
    max_repeated_font_size = int(list(font_count.keys())[0].replace('px', ''))
    for element in doc.find_all():
        if type(element) != NavigableString:
            element.attrs['fontsize'] = int(element.attrs['fontsize'].replace('px', '')) / max_repeated_font_size
            if 'fontsize' in element.attrs and element.attrs['fontsize'] == 1:
                del element.attrs['fontsize']


def map_font_weight(doc):
    for element in doc.find_all():
        if type(element) != NavigableString:
            if 'fontweight' in element.attrs and element.attrs['fontweight'] == 'normal':
                del element.attrs['fontweight']

In [15]:
# map_font_sizes(doc)
# map_font_weight(doc)

# All in One

In [16]:
def simplify_form():
    form = driver.find_element(By.ID, 'form')
    
    queue = [form]

    while len(queue) > 0:
        element = queue[0]
        children = element.find_elements(By.XPATH, '*')
        queue = [*queue, *children]
        embed_properties_into_element(element)
        queue = queue[1:]
    
    queue = [form]

    while len(queue) > 0:
        element = queue[0]
        size = element.size
        if size['height'] == 0 or size['width'] == 0:
            driver.execute_script("return arguments[0].remove();", element);
        else:
            children = element.find_elements(By.XPATH, '*')
            queue = [*queue, *children]
            embed_properties_into_element(element)
        queue = queue[1:]
    
    doc = bs(form.get_attribute('innerHTML'))
    
    for element in doc.find_all():
        if type(element) != NavigableString or type(element) != Comment:
            del element['class']
            del element['style']
            # for icons: remove unnecessary data of the icon
            del element['d']
    
    for element in doc.find_all():
        if type(element) == Comment:
            element.extract()
    
    merge_single_child_parents(doc.form)
    
    map_font_sizes(doc)
    map_font_weight(doc)
    
    return str(doc)

In [17]:
form = simplify_form()

In [18]:
form

'<form backgroundcolor="black 0" bordercolor="black 0" color="black 0"><div backgroundcolor="black 0" bordercolor="black 0" color="black 0"></div><div backgroundcolor="black 0" bordercolor="black 0" color="black 0"><div backgroundcolor="black 0" bordercolor="black 0" color="black 0"><div backgroundcolor="black 0" bordercolor="gainsboro 1" color="black 0"><span aria-activedescendant="rc_select_1_list_0" aria-autocomplete="list" aria-controls="rc_select_1_list" aria-expanded="false" aria-haspopup="listbox" aria-owns="rc_select_1_list" autocomplete="off" backgroundcolor="black 0" bordercolor="black 0" color="black 0" disabled="" id="rc_select_1" readonly="" role="combobox" type="search" unselectable="on" value=""></span><span backgroundcolor="black 0" bordercolor="black 0" color="black 0"></span></div><span aria-hidden="true" backgroundcolor="black 0" bordercolor="black 0" color="black 0" fontsize="0.8571428571428571" unselectable="on"><span aria-hidden="true" aria-label="down" background

In [19]:
len(form)

1634