# Webscraping

Expected output format:

```
TLD: value
DOMAIN: value
HOSTNAME: value
PATH: /value
LINKS:
    Same hostname: 
    Same domain:
    Different domain:

```



In [1]:
import re #regex library
from bs4 import BeautifulSoup # work with html
import requests 
import sys # work with extra cmd argument
from pprint import pprint

In [2]:
def hostname(url):
    '''
        Find the HOSTNAME of a URL.

        One leading "http://" or "https://" is skipped when present, and the
        host is everything that follows up to the first "/", "?" or "#"
        (or the end of the string).

        Parameters:
            url (str): URL such as "https://edition.cnn.com/world".
                A URL without a scheme ("edition.cnn.com/world") is accepted.

        Returns:
            str: the host part, e.g. "edition.cnn.com". A port or userinfo
            written in the URL is kept as is. Empty string when there is no host.
    '''

    # The scheme is only matched at the very start of the string, so an
    # "http://" or "https://" that appears later (e.g. inside a query string)
    # is left alone and cannot corrupt the result.
    # Every part of the pattern is optional, hence re.match always succeeds.
    fqdn_pattern = r'(?:https?://)?([^/?#]*)'
    fqdn = re.match(fqdn_pattern, url).group(1)

    return fqdn
    

In [3]:
# dotted domain suffixes seen so far; domain() adds every suffix it returns
known_domains = {'.com', '.net', '.org', '.co', '.us'}


def domain(hostname):
    '''
        Return the DOMAIN suffix of a hostname and record it in known_domains.

        The hostname is cut at its first two dots only, and the last piece
        is the result:
            "edition.cnn.com" -> "com"
            "www.bbc.co.uk"   -> "co.uk"
        The suffix, prefixed with a dot, is also added to the module-level
        known_domains set.
    '''
    # at most two cuts, so whatever follows the second dot stays in one piece
    suffix = hostname.split('.', 2)[-1]
    known_domains.add('.' + suffix)
    return suffix

In [4]:
def main_path(url):
    '''
        Find the PATH of URL

        The path is whatever follows the scheme and host: the optional leading
        "http://" or "https://" and the host characters up to the first "/",
        "?" or "#" are skipped, and the remainder is returned unchanged.

        Parameters:
            url (str): URL such as "https://www.bbc.com/news".

        Returns:
            str: e.g. "/news"; "/" for "https://edition.cnn.com/"; an empty
            string when nothing follows the host.
    '''
    # Match scheme + host once, anchored at the start of the string, and slice
    # it off. This treats http and https the same way and never touches a
    # later occurrence of the same prefix inside the path or query string.
    # Every part of the pattern is optional, hence re.match always succeeds.
    prefix = re.match(r'(?:https?://)?[^/?#]*', url)
    pth = url[prefix.end():]

    return pth

In [5]:
def sub_links(main_url, timeout=10):
    '''
        Find the hyperlinks from the given URL

        Downloads the page, re-serialises it through BeautifulSoup and collects
        every double-quoted string that starts with "https://" or "http://".

        Parameters:
            main_url (str): address of the page to download.
            timeout (float or tuple): passed to requests.get so that an
                unresponsive server raises an error instead of blocking
                forever. Defaults to 10.

        Returns:
            list of str: the matches, each still wrapped in its double quotes;
            all "https://" matches come first, followed by all "http://" ones.
    '''
    # open and get all the texts in the website
    page = requests.get(main_url, timeout=timeout)
    texts = str(BeautifulSoup(page.text, 'html.parser'))

    # general pattern: non-greedy, so each match ends at the next double quote
    pattern1, pattern2 = r'"https://.*?"', r'"http://.*?"'

    # find all the links
    links = re.findall(pattern1, texts) + re.findall(pattern2, texts)

    return links

In [6]:
def grouping(sub_links):
    '''
        Group the URL based on DOMAIN, HOSTNAME

        Each link is cleaned, then appended to exactly one of the module-level
        lists same_hname, same_domain or diff_domain, depending on how its
        hostname and domain compare with main_hname and main_domain.
    '''
    for raw_link in sub_links:
        # strip the quote signs, then keep only the text before the first
        # space (some matches carry extra attributes, e.g. www.abc.com width=100)
        cleaned = raw_link.replace('"', "").split(' ', 1)[0]

        # extract info of the found link
        link_hname = hostname(cleaned)
        link_domain = domain(link_hname)

        # pick the target list, most specific match first
        if link_hname == main_hname:
            bucket = same_hname
        elif link_domain == main_domain:
            bucket = same_domain
        else:
            bucket = diff_domain
        bucket.append(cleaned)

In [7]:
def display():
    '''
        Print the report: the parts of the main URL first, then the three
        link groups (read from the module-level variables).
    '''
    print(f"TLD: {main_domain}")
    print(f"HOSTNAME: {main_hname}")
    print(f"PATH: {path}")
    print("LINKS:")

    # heading text (including its blank lines) paired with the list to show
    sections = (
        ("Same hostname:\n", same_hname),
        ("\nSame domain:\n", same_domain),
        ("\nDifferent domain:\n", diff_domain),
    )
    for heading, found in sections:
        print(heading)
        pprint(found)

Test with URL = https://edition.cnn.com/

In [8]:
# page to analyse
url = "https://edition.cnn.com/"

# fresh, empty result lists; grouping() appends to them
same_hname, same_domain, diff_domain = [], [], []

# split the start URL into the parts that display() reports
main_hname = hostname(url)
main_domain = domain(main_hname)
path = main_path(url)

# collect the links found on the page and sort them into the three lists
hidden_links = sub_links(url)
grouping(hidden_links)

display()

TLD: com
HOSTNAME: edition.cnn.com
PATH: /
LINKS:
Same hostname:

['https://edition.cnn.com',
 'https://edition.cnn.com',
 'https://edition.cnn.com',
 'https://edition.cnn.com',
 'https://edition.cnn.com',
 'https://edition.cnn.com',
 'https://edition.cnn.com',
 'https://edition.cnn.com/2020/08/15/politics/trump-house-analysis/index.html\\',
 'https://edition.cnn.com/2020/08/16/politics/biden-conventions-analysis/index.html\\',
 'https://edition.cnn.com/2020/05/22/perspectives/homeowners-renters-cities-zillow-coronavirus/index.html\\',
 'https://edition.cnn.com/2020/08/02/motorsport/lewis-hamilton-formula-one-british-grand-prix-motorsport-spt-intl/index.html\\',
 'https://edition.cnn.com',
 'https://edition.cnn.com',
 'https://edition.cnn.com/business',
 'https://edition.cnn.com/business']

Same domain:

['https://www.googletagservices.com/tag/js/gpt.js',
 'https://c.amazon-adsystem.com/aax2/apstag.js',
 'https://www.cnn.com',
 'https://www.cnn.com',
 'https://plus.google.com/+cnn/post

Test with https://www.bbc.com/news/world-europe-53795871

In [9]:
# page to analyse
url = "https://www.bbc.com/news/world-europe-53795871"

# fresh, empty result lists; grouping() appends to them
same_hname, same_domain, diff_domain = [], [], []

# split the start URL into the parts that display() reports
main_hname = hostname(url)
main_domain = domain(main_hname)
path = main_path(url)

# collect the links found on the page and sort them into the three lists
hidden_links = sub_links(url)
grouping(hidden_links)

display()

TLD: com
HOSTNAME: www.bbc.com
PATH: /news/world-europe-53795871
LINKS:
Same hostname:

['https://www.bbc.com/news/world-europe-53795871',
 'https://www.bbc.com/news/amp/world-europe-53795871',
 'https://www.bbc.com/news/world-europe-53795871',
 'https://www.bbc.com/news/world-europe-53795871',
 'https://www.bbc.com/',
 'https://www.bbc.com/news',
 'https://www.bbc.com/sport',
 'https://www.bbc.com/reel',
 'https://www.bbc.com/worklife',
 'https://www.bbc.com/travel',
 'https://www.bbc.com/future',
 'https://www.bbc.com/culture',
 'https://www.bbc.com/culture/music',
 'https://www.bbc.com/weather',
 'https://www.bbc.com/wwscripts/data',
 'https://www.bbc.com/wwscripts/flag',
 'https://www.bbc.com/news/world-europe-53795871',
 'https://www.bbc.com/news/world-europe-53795871',
 'https://www.bbc.com/news/world-europe-53795871',
 'https://www.bbc.com/news/world-europe-53795871',
 'https://www.bbc.com/russian/live/news-53698604',
 'https://www.bbc.com/news/world-europe-53795871',
 'https://

Test with https://www.f-secure.com/en/about-us/careers/job-openings

In [10]:
# page to analyse
url = "https://www.f-secure.com/en/about-us/careers/job-openings"

# fresh, empty result lists; grouping() appends to them
same_hname, same_domain, diff_domain = [], [], []

# split the start URL into the parts that display() reports
main_hname = hostname(url)
main_domain = domain(main_hname)
path = main_path(url)

# collect the links found on the page and sort them into the three lists
hidden_links = sub_links(url)
grouping(hidden_links)

display()

TLD: com
HOSTNAME: www.f-secure.com
PATH: /en/about-us/careers/job-openings
LINKS:
Same hostname:

['https://www.f-secure.com/en/about-us/careers/job-openings',
 'https://www.f-secure.com/content/dam/f-secure/en/about/careers/og/og-job-opening.jpg',
 'https://www.f-secure.com/en/about-us/careers/job-openings',
 'https://www.f-secure.com/gb-en/about-us/careers/job-openings',
 'https://www.f-secure.com/us-en/about-us/careers/job-openings',
 'https://www.f-secure.com/en/about-us/careers/job-openings',
 'https://www.f-secure.com/en',
 'https://www.f-secure.com/en/web/investors_global/home',
 'https://www.f-secure.com/en/home',
 'https://www.f-secure.com/en/home',
 'https://www.f-secure.com/en/home/products',
 'https://www.f-secure.com/en/home/download',
 'https://www.f-secure.com/en/home/buy-or-renew',
 'https://www.f-secure.com/en/business',
 'https://www.f-secure.com/en/business',
 'https://www.f-secure.com/en/business/solutions',
 'https://www.f-secure.com/en/business/support-and-downlo