<a href="https://colab.research.google.com/github/livxy/Colab-Notebooks/blob/main/notebooks/ExtractAllWebsiteLinks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Let's install the dependencies:
!pip3 install requests bs4 colorama

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting colorama
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.5


In [2]:
#import modules
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

In [3]:
# init the colorama module
colorama.init()
GREEN = colorama.Fore.GREEN
GRAY = colorama.Fore.LIGHTBLACK_EX
RESET = colorama.Fore.RESET
YELLOW = colorama.Fore.YELLOW

In [4]:
# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()

In [5]:
#Function to validate URLs:
def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

In [6]:
#Function to return all the valid URLs of a webpage
def get_all_website_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    #Get all HTML a tags (anchor tags that contains all the links of the web page):
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"{GRAY}[!] External link: {href}{RESET}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls


In [7]:
# number of urls visited so far will be stored here
total_urls_visited = 0

def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"{YELLOW}[*] Crawling: {url}{RESET}")
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)

In [8]:
#Test time!
if __name__ == "__main__":
    crawl("https://benny.fun/")
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total External links:", len(external_urls))
    print("[+] Total URLs:", len(external_urls) + len(internal_urls))

[*] Crawling: https://benny.fun/
[*] Internal link: https://benny.fun/
[*] Internal link: https://benny.fun/portfolio/
[!] External link: https://github.com/bentettmar
[!] External link: https://instagram.com/ben.tettmar
[!] External link: https://twitter.com/bentettmar
[!] External link: https://www.youtube.com/channel/UCNoWhqTYBrC3gFWDYNW8j5w
[*] Internal link: https://benny.fun/cdn-cgi/l/email-protection
[*] Internal link: https://ipinfo.benny.fun/
[!] External link: https://menunote.app/
[!] External link: https://github.com/FrostChat
[!] External link: https://github.com/bentettmar/discord-img-host
[!] External link: https://github.com/bentettmar/py-cookie-clicker
[*] Crawling: https://benny.fun/portfolio/
[*] Crawling: https://benny.fun/
[*] Crawling: https://ipinfo.benny.fun/
[!] External link: https://github.com/bentettmar/ipinfo
[!] External link: https://github.com/bentettmar/ipinfo/releases/latest/download/IPInfo.zip
[*] Crawling: https://benny.fun/cdn-cgi/l/email-protection

In [57]:
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from IPython.display import HTML as html_print
from IPython.display import display

def cstr(s, color='black'):
    return "<text style=color:{}>{}</text>".format(color, s)

def print_color(t):
    display(html_print(' '.join([cstr(ti, color=ci) for ti,ci in t])))

# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0


def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                message = print_color(((f'[*] Internal link: ', 'blue'),))
                print_color((('[!] External link: ', 'green'),(f'{href}', '')))
                external_urls.add(href)
            continue
        message = print_color(((f'[*] Internal link: ', 'blue'),))
        print(f'{message}'.join({href})) 
        urls.add(href)
        internal_urls.add(href)
    return urls


def crawl(url, max_urls=30):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    print(f"[*] Crawling: {url}")
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    crawl("https://benny.fun/")
    print_color(((f"[+] Total Internal links:  {len(internal_urls)}", "red"),))
    print_color(((f"[+] Total External links: {len(external_urls)}", "red"),))
    print_color(((f"[+] Total URLs: {len(external_urls)  + len(internal_urls)}", "red"),))

[*] Crawling: https://benny.fun/


https://benny.fun/


https://benny.fun/portfolio/


https://benny.fun/cdn-cgi/l/email-protection


https://ipinfo.benny.fun/


[*] Crawling: https://benny.fun/portfolio/
[*] Crawling: https://benny.fun/
[*] Crawling: https://ipinfo.benny.fun/


[*] Crawling: https://benny.fun/cdn-cgi/l/email-protection


In [22]:
from IPython.display import HTML as html_print
from IPython.display import display

def cstr(s, color='black'):
    return "<text style=color:{}>{}</text>".format(color, s)

def print_color(t):
    display(html_print(' '.join([cstr(ti, color=ci) for ti,ci in t])))

print_color((('hello my name is', 'black'),('jhjfd','red')))
print_color((('hello my name is', 'green'),))