# Feature extraction

Turn raw data into features.

## Extract HTML features

The following features are extracted from the web page HTML:

- **LineOfCode**: Number of lines of code in the HTML.
- **LargestLineLength**: Length of the longest line of code in the HTML. Very long lines are a common sign of minified or obfuscated code.
- **HasTitle**: Whether the HTML has a title tag.
- **Title**: The title of the page.
- **DomainTitleMatchScore**: How closely the page title matches the domain name, as a score out of 100.
- **URLTitleMatchScore**: How closely the page title matches the URL, as a score out of 100.
- **HasFavicon**: Whether the page has a favicon.
- **Robots**: Whether the website has a robots.txt file or a robots meta tag.
- **IsResponsive**: Whether the website is responsive.
- **NoOfURLRedirect**: Number of URL redirects.
- **NoOfSelfRedirect**: Number of redirects to the same domain.
- **HasDescription**: Whether the page has a meta description.
- **NoOfPopup**: Number of popups.
- **NoOfiFrame**: Number of iframes.
- **HasExternalFormSubmit**: Whether the page has a form that submits its data to an external domain.
- **HasSocialNet**: Whether the page has social network links.
- **HasSubmitButton**: Whether the page has a submit button.
- **HasHiddenFields**: Whether the page has hidden fields.
- **HasPasswordField**: Whether the page has password fields.
- **Bank**: Whether the page is a bank page.
- **Pay**: Whether the page is a payment page.
- **Crypto**: Whether the page is a cryptocurrency page.
- **HasCopyrightInfo**: Whether the page has copyright information.
- **NoOfImage**: Number of images.
- **NoOfCSS**: Number of CSS files.
- **NoOfJS**: Number of JS files.
- **NoOfSelfRef**: Number of links to the same domain.
- **NoOfEmptyRef**: Number of empty links.
- **NoOfExternalRef**: Number of links to external domains.

### Install libraries

In [None]:
%pip install beautifulsoup4 requests

Note: you may need to restart the kernel to use updated packages.


### Import libraries

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

### Extract features

In [None]:
def LineOfCode(html):
    """Return the number of newline characters in ``html``.

    This counts line breaks rather than lines, so a document that does not
    end with a newline reports one fewer than its number of lines.
    """
    return html.count('\n')

In [None]:
def LargestLineLength(html: str) -> int:
    """Return the length of the longest line in ``html``.

    Lines are delimited by ``'\\n'`` only, so a trailing ``'\\r'`` from
    Windows line endings is counted as part of the line.

    Args:
        html: Raw HTML source of the page.

    Returns:
        The number of characters in the longest line; 0 for an empty string.
    """
    # str.split always yields at least one element (possibly ''), so max()
    # never receives an empty sequence. Using the builtin also avoids
    # shadowing ``max`` with a local variable.
    return max(map(len, html.split('\n')))

In [None]:
def HasFavicon(url: str, soup: BeautifulSoup, timeout: float = 10.0) -> int:
    """Return 1 if the page appears to have a favicon, otherwise 0.

    The parsed HTML is checked first for a ``<link rel="icon">`` element.
    Only when none is found is ``/favicon.ico`` requested from the site root.

    Args:
        url: URL of the page; its scheme and host are used to build the
            ``/favicon.ico`` address.
        soup: Parsed HTML of the page.
        timeout: Seconds to wait for the favicon request before giving up.

    Returns:
        1 if a favicon link was found in the HTML or ``/favicon.ico``
        answered with HTTP 200, otherwise 0.
    """
    if soup.find('link', rel='icon') is not None:
        return int(True)

    # Keep only scheme and host: the page's own parameters, query string and
    # fragment must not leak into the favicon request.
    favicon_url = urlparse(url)._replace(
        path='/favicon.ico', params='', query='', fragment=''
    ).geturl()

    try:
        # Without a timeout a single unresponsive host can stall extraction.
        response = requests.get(favicon_url, timeout=timeout)
    except requests.RequestException:
        # An unreachable favicon is treated the same as a missing one.
        return int(False)

    # Always return an int so the feature has one consistent type.
    return int(response.status_code == 200)

In [None]:
def HasRobots(url: str, soup: BeautifulSoup, timeout: float = 10.0) -> int:
    """Return 1 if the site has a robots meta tag or a robots.txt, otherwise 0.

    The parsed HTML is inspected first so that the network request for
    ``/robots.txt`` is only made when no ``<meta name="robots">`` exists.

    Args:
        url: URL of the page; its scheme and host are used to build the
            ``/robots.txt`` address. A falsy value skips the request.
        soup: Parsed HTML of the page, or ``None`` to skip the meta check.
        timeout: Seconds to wait for the robots.txt request before giving up.

    Returns:
        1 if a robots meta tag was found or ``/robots.txt`` answered with
        HTTP 200, otherwise 0.
    """
    # Check the meta tag before making a request.
    if soup is not None and soup.find('meta', attrs={'name': 'robots'}) is not None:
        return int(True)

    if url:
        # Keep only scheme and host: the page's own parameters, query string
        # and fragment must not leak into the robots.txt request.
        robots_url = urlparse(url)._replace(
            path='/robots.txt', params='', query='', fragment=''
        ).geturl()
        try:
            # Without a timeout a single unresponsive host can stall extraction.
            response = requests.get(robots_url, timeout=timeout)
        except requests.RequestException:
            # An unreachable robots.txt is treated the same as a missing one.
            return int(False)
        if response.status_code == 200:
            return int(True)

    return int(False)

In [None]:
def IsResponsive(soup: BeautifulSoup):
    """Heuristically report (1 or 0) whether the page looks responsive.

    Three markers are tried in order, stopping at the first one found:

    1. a ``<meta name="viewport">`` tag,
    2. a ``<link rel="stylesheet" media="screen">`` element,
    3. an inline ``<style>`` block containing ``@media``.

    Detecting responsiveness reliably is not trivial, and these checks do
    not cover every case: a page can be responsive without any of the
    markers, so this function may return false negatives.
    """
    responsive_marker = (
        soup.find('meta', attrs={'name': 'viewport'})
        or soup.find('link', attrs={'rel': 'stylesheet', 'media': 'screen'})
        or soup.find('style', string=re.compile('@media'))
    )
    return int(bool(responsive_marker))

In [None]:
def NoOfPopup(soup: BeautifulSoup) -> int:
    """Estimate the number of popups the page can show.

    The estimate is the number of ``<dialog>`` elements plus the number of
    inline ``<script>`` elements whose text mentions ``window.open``. A
    script is counted once however many times it mentions ``window.open``.

    Args:
        soup: Parsed HTML of the page.

    Returns:
        The combined count of dialogs and popup-opening scripts.
    """
    dialogs = soup.find_all('dialog')

    # The dot is escaped so that only the literal text "window.open" matches;
    # an unescaped dot would also accept e.g. "window_open" or "windowsopen".
    opener_scripts = soup.find_all('script', string=re.compile(r'window\.open'))

    return len(dialogs) + len(opener_scripts)

In [None]:
def HasExternalFormSubmit(soup: BeautifulSoup, url: 'str | None' = None) -> int:
    """Return 1 if any form on the page submits to another host, otherwise 0.

    A form action is considered external when it names a host explicitly
    (an absolute URL such as ``https://host/path`` or a protocol-relative
    one such as ``//host/path``). Relative actions (``login.php``,
    ``/submit``, ``?q=1``, ``#``) and actions without a host
    (``javascript:``, ``mailto:``) are not counted.

    Args:
        soup: Parsed HTML of the page.
        url: Optional URL of the page. When given, actions that point at the
            page's own host are not counted as external. When omitted, every
            action that names a host is counted.

    Returns:
        1 if at least one external form action was found, otherwise 0.
    """
    page_host = urlparse(url).hostname if url else None

    for form in soup.find_all('form'):
        action = form.get('action')
        if not action:
            continue

        try:
            action_host = urlparse(action.strip()).hostname
        except ValueError:
            # Malformed action URL: it cannot be attributed to any host.
            continue

        if action_host is None:
            # No host in the action, so the form submits to the current site.
            continue

        if page_host is None or action_host != page_host:
            return int(True)

    return int(False)

In [None]:
# Host-name labels (the dot-separated parts of a domain) that identify a
# social network, e.g. the "facebook" in "www.facebook.com".
_SOCIAL_MEDIA_LABELS = frozenset({
    'facebook', 'twitter', 'linkedin', 'instagram', 'youtube',
    'pinterest', 'tumblr', 'snapchat', 'reddit', 'tiktok', 'whatsapp',
    'wechat', 'qq', 'telegram', 'viber', 'line', 'vk', 'odnoklassniki',
    'myspace', 'flickr', 'meetup', 'mix', 'deviantart', 'livejournal',
    'badoo', 'stumbleupon', 'digg', 'friendster', 'classmates', 'xing',
    'renren', 'douban', 'vkontakte', 'qzone', 'baidu', 'weibo', 'kakao',
    'naver', 'skype', 'discord', 'slack', 'signal', 'mastodon', 'parler',
    'gab', 'clubhouse', 'ello', 'peach', 'plurk', 'mewe', 'minds', 'diaspora',
})

# Social networks that can only be recognised by their full domain, because
# their name alone ("x") is too short to be a meaningful label.
_SOCIAL_MEDIA_DOMAINS = ('x.com',)


def HasSocialNet(soup: BeautifulSoup) -> int:
    """Return 1 if the page links to a known social network, otherwise 0.

    Only the host name of each ``<a href>`` is inspected. Matching whole
    host-name labels instead of searching the full href for substrings
    avoids false positives such as "line" in ``/online`` or "mix" in
    ``/mixed-content``. Links without a host (relative links, fragments,
    ``mailto:``) never count.

    Args:
        soup: Parsed HTML of the page.

    Returns:
        1 as soon as one social network link is found, otherwise 0.
    """
    for link in soup.find_all('a', href=True):
        try:
            # ``hostname`` is already lower-cased by urllib.
            host = urlparse(link['href'].strip()).hostname
        except ValueError:
            # Malformed URL: skip it rather than abort the whole extraction.
            continue

        if not host:
            continue

        if not _SOCIAL_MEDIA_LABELS.isdisjoint(host.split('.')):
            return int(True)

        if any(host == domain or host.endswith('.' + domain)
               for domain in _SOCIAL_MEDIA_DOMAINS):
            return int(True)

    return int(False)

In [None]:
def HasCopyrightInfo(soup: BeautifulSoup) -> int:
    """Return 1 if the page text contains a copyright notice, otherwise 0.

    The text nodes of the document are searched case-insensitively for
    "©", "(c)", "copyright" or "all rights reserved". Text inside
    ``<script>`` and ``<style>`` elements is ignored, because a copyright
    notice is page text and code such as ``f(c)`` would otherwise match.

    Args:
        soup: Parsed HTML of the page.

    Returns:
        1 if a copyright marker was found, otherwise 0.
    """
    copyright_variants = ['©', '(c)', 'copyright', 'all rights reserved']

    # Each variant is escaped so it is matched literally. Unescaped, "(c)" is
    # a capture group that matches any single letter "c", which made nearly
    # every page look like it carried a copyright notice.
    copyright_regex = re.compile(
        '|'.join(re.escape(variant) for variant in copyright_variants),
        re.IGNORECASE,
    )

    for text in soup.find_all(string=copyright_regex):
        parent = text.parent
        if parent is None or parent.name not in ('script', 'style'):
            return int(True)

    return int(False)

In [None]:
# Count self-referencing links
def NoOfSelfRef(soup: BeautifulSoup) -> int:
    """Count the ``<a>`` links that point back into the current site.

    A link counts when its ``href`` is a root-relative path (``/about``) or
    a fragment (``#top``). Protocol-relative links (``//host/path``) are
    not counted: although they start with a slash, they name a host
    explicitly and usually point at another site.

    Args:
        soup: Parsed HTML of the page.

    Returns:
        The number of self-referencing links.
    """
    count = 0

    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('//'):
            # Network-path reference: the text after "//" is a host name.
            continue
        if href.startswith(('/', '#')):
            count += 1

    return count

# Count empty links
def NoOfEmptyRef(soup: BeautifulSoup):
    """Return how many ``<a>`` elements have no ``href`` or an empty one."""
    return sum(
        1
        for anchor in soup.find_all('a')
        if anchor.get('href') in (None, '')
    )

# Count external links
def NoOfExternalRef(url: str, soup: BeautifulSoup) -> int:
    """Count the ``<a>`` links that point to a host other than the page's.

    Only links that name a host explicitly can be external. Relative links,
    fragments and ``mailto:`` / ``javascript:`` links have no host part and
    are therefore not counted.

    Args:
        url: URL of the page the links were found on.
        soup: Parsed HTML of the page.

    Returns:
        The number of links whose host differs from the host of ``url``.
    """
    # Host names are case-insensitive, so compare them lower-cased.
    page_host = urlparse(url).netloc.lower()
    count = 0

    for link in soup.find_all('a', href=True):
        try:
            link_host = urlparse(link['href'].strip()).netloc.lower()
        except ValueError:
            # Malformed URL: skip it rather than abort the whole extraction.
            continue

        # An empty host means the link resolves against the current page, so
        # it must not be compared with (and differ from) the page host.
        if link_host and link_host != page_host:
            count += 1

    return count

In [None]:
# Sample URL used to try out the extractor defined below.
test_url = 'https://www.google.com/search?q=alan+turing' # Link with robots.txt
# Alternative sample URLs; uncomment one to exercise other code paths.
# test_url = 'https://shorturl.at/qzDIE' # Link with redirects
# test_url = 'https://example.com'

def HTMLFeatures(url: str, timeout: float = 10.0) -> dict:
    """Download a web page and extract its HTML features.

    The page is fetched with redirects followed, parsed with BeautifulSoup,
    and every feature is computed from the final response.

    Args:
        url: URL of the page to analyse.
        timeout: Seconds to wait for the page request before giving up.

    Returns:
        A dict mapping feature names to values. Presence features are 1 or
        0, counting features are ints, ``Title`` is a string, and features
        that are not implemented yet (``DomainTitleMatchScore``,
        ``URLTitleMatchScore``, ``Bank``, ``Pay``, ``Crypto``) are ``None``.

    Raises:
        requests.RequestException: If the page itself cannot be fetched.
    """
    # Without a timeout a single unresponsive host can stall extraction.
    response = requests.get(url, allow_redirects=True, timeout=timeout)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')

    # The HTML belongs to the page reached after all redirects, so checks
    # that depend on the page's host must use the final URL, not the
    # requested one.
    page_url = response.url

    # response.history holds the responses that issued a redirect, oldest
    # first; its first entry answers the requested URL itself. The redirect
    # targets are therefore every later entry plus the final response.
    redirect_targets = [hop.url for hop in response.history[1:]]
    if response.history:
        redirect_targets.append(response.url)
    origin_host = urlparse(url).hostname
    self_redirects = sum(
        1 for target in redirect_targets
        if urlparse(target).hostname == origin_host
    )

    # ``.string`` is None when <title> is empty or contains nested markup.
    # Convert to a plain str so the result does not reference the parse tree.
    title_tag = soup.title
    title = str(title_tag.string or '') if title_tag is not None else ''

    # Both <input type="submit"> and <button type="submit"> submit a form.
    has_submit_button = (
        soup.find('input', type='submit') is not None
        or soup.find('button', type='submit') is not None
    )

    return {
        'LineOfCode': LineOfCode(html),
        'LargestLineLength': LargestLineLength(html),
        'HasTitle': int(title_tag is not None),
        'Title': title,
        'DomainTitleMatchScore': None,
        'URLTitleMatchScore': None,
        'HasFavicon': HasFavicon(page_url, soup),
        'Robots': HasRobots(page_url, soup),
        'IsResponsive': IsResponsive(soup),
        'NoOfURLRedirect': len(response.history),
        'NoOfSelfRedirect': self_redirects,
        'HasDescription': int(soup.find('meta', attrs={'name': 'description'}) is not None),
        'NoOfPopup': NoOfPopup(soup),
        'NoOfiFrame': len(soup.find_all('iframe')),
        'HasExternalFormSubmit': HasExternalFormSubmit(soup),
        'HasSocialNet': HasSocialNet(soup),
        'HasSubmitButton': int(has_submit_button),
        'HasHiddenFields': int(soup.find('input', type='hidden') is not None),
        'HasPasswordField': int(soup.find('input', type='password') is not None),
        'Bank': None,
        'Pay': None,
        'Crypto': None,
        'HasCopyrightInfo': HasCopyrightInfo(soup),
        'NoOfImage': len(soup.find_all('img')),
        'NoOfCSS': len(soup.find_all('link', rel='stylesheet')),
        'NoOfJS': len(soup.find_all('script')),
        'NoOfSelfRef': NoOfSelfRef(soup),
        'NoOfEmptyRef': NoOfEmptyRef(soup),
        'NoOfExternalRef': NoOfExternalRef(page_url, soup),
    }

# Run the extractor on the sample URL (this performs live HTTP requests).
HTMLFeatures(test_url)

{'LineOfCode': 30,
 'LargestLineLength': 45695,
 'HasTitle': 1,
 'Title': 'alan turing - Recherche Google',
 'DomainTitleMatchScore': None,
 'URLTitleMatchScore': None,
 'HasFavicon': 1,
 'Robots': 1,
 'IsResponsive': 0,
 'NoOfURLRedirect': 0,
 'NoOfSelfRedirect': 0,
 'HasDescription': 0,
 'NoOfPopup': 0,
 'NoOfiFrame': 0,
 'HasExternalFormSubmit': 0,
 'HasSocialNet': 1,
 'HasSubmitButton': 0,
 'HasHiddenFields': 1,
 'HasPasswordField': 0,
 'Bank': None,
 'Pay': None,
 'Crypto': None,
 'HasCopyrightInfo': 1,
 'NoOfImage': 9,
 'NoOfCSS': 0,
 'NoOfJS': 9,
 'NoOfSelfRef': 54,
 'NoOfEmptyRef': 0,
 'NoOfExternalRef': 56}