In [1]:
import requests
from bs4 import BeautifulSoup, NavigableString, Comment
import re
from urllib.parse import urljoin
from IPython.display import display, HTML

In [2]:
def extract_text_from_dom(soup, class_name, elements=None):
    # Define a function to filter visible tags
    def tag_visible(element):
        if isinstance(element, Comment):
            return False
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if class_name and class_name not in element.get('class', []):
            return False
        if elements and element.name not in elements:
            return False
        return True if isinstance(element, NavigableString) else tag_visible(element.parent)

    # Extract text from visible tags
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    visible_text = u" ".join(t.strip() for t in visible_texts)

    # Split text into sentences using regular expressions
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', visible_text)
    return '\n'.join(sentences)

def extract_text_from_website(url, class_name=None, elements=None, visited_urls=None, max_depth=1):
    # initialize visited_urls as an empty set if its not provided
    if visited_urls is None:
        visited_urls = set()

    # check if the URL has been visited to avoid infinite recursion
    if url in visited_urls:
        return ""

    # add the current URL to visited URLs
    visited_urls.add(url)

    extracted_text = ""
    try:
        # fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()

        # parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # extract text from the current page
        extracted_text = extract_text_from_dom(soup, class_name, elements)

        # find all links on the page
        links = soup.find_all('a', href=True, text=True)

        # extract text from linked pages recursively
        for link in links:
            subpage_url = link['href']
            # Convert relative links to absolute links
            subpage_url = urljoin(url, subpage_url)
            if subpage_url not in visited_urls: # and max_depth > 0
                extracted_text += extract_text_from_website(subpage_url, class_name, elements, visited_urls) #, max_depth - 1
                
    except Exception as e:
        print(f"Error processing URL '{url}': {e}")

    return extracted_text

def extract_text_to_file(url, file_path, visited_urls=None, max_depth=1):
    # init visited_urls as an empty set if its not provided
    if visited_urls is None:
        visited_urls = set()

    # check if the URL has been visited to avoid infinite recursion
    if url in visited_urls:
        return ""

    # ddd the current URL to visited URLs
    visited_urls.add(url)

    extracted_text = ""
    try:
        # fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()

        # parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # extract text from the current page
        extracted_text = extract_text_from_dom(soup)

        # find all links on the page
        links = soup.find_all('a', href=True, text=True)

        # extract text from linked pages recursively
        for link in links:
            subpage_url = link['href']
            # convert relative links to absolute links
            subpage_url = urljoin(url, subpage_url)
            if subpage_url not in visited_urls: # and max_depth > 0
                extracted_text += extract_text_to_file(subpage_url, file_path, visited_urls) #, max_depth - 1

    except Exception as e:
        print(f"Error processing URL '{url}': {e}")

    return extracted_text


def display_text_in_new_tab(text):
    html_content = f"""
    <script type="text/javascript">
        var newWindow = window.open("", "_blank");
        newWindow.document.write(`<pre>{text}</pre>`);
        newWindow.document.close();
    </script>
    """
    display(HTML(html_content))

In [None]:
# Example usage
url = "https://en.wikipedia.org/wiki/Snake"
class_name = "group_content__20JQH elements_fontFamilyMontserrat__3GCpc"  # specify the class name
elements = ['p','h2','h3','h4']
extracted_text = extract_text_from_website(url)
display_text_in_new_tab(extracted_text)
file_path = "C:\\Users\\lukas\\OneDrive\\Desktop\\skola\\UKF\\bakalarka\\output\\outputEARTH.txt"
extract_text_to_file(url, file_path)
print(extracted_text)
# write the extracted_text to the file, had to use utf-8 for some characters
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(extracted_text)

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Error processing URL 'https://opensource.org/programs#content': 429 Client Error: Too Many Requests for url: https://opensource.org/programs#content
Error processing URL 'https://www.cdc.gov/spanish/az/a.html': 404 Client Error: Not Found for url: https://www.cdc.gov/spanish/az/a.html
Error processing URL 'https://www.cdc.gov//search.cdc.gov/search/spanish/index.html#advanced': 404 Client Error: Not Found for url: https://www.cdc.gov//search.cdc.gov/search/spanish/index.html#advanced
Error processing URL 'https://www.cdc.gov/az/a.html': 404 Client Error: Not Found for url: https://www.cdc.gov/az/a.html
Error processing URL 'https://www.cdc.gov/az/b.html': 404 Client Error: Not Found for url: https://www.cdc.gov/az/b.html
Error processing URL 'https://www.cdc.gov/az/c.html': 404 Client Error: Not Found for url: https://www.cdc.gov/az/c.html
Error processing URL 'https://www.cdc.gov/az/d.html': 404 Client Error: Not Found for url: https://www.cdc.gov/az/d.html
Error processing URL 'https

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Error processing URL 'javascript:setupNavLinks('featured');': No connection adapters were found for "javascript:setupNavLinks('featured');"
Error processing URL 'javascript:setupNavLinks('subscribe');': No connection adapters were found for "javascript:setupNavLinks('subscribe');"
Error processing URL 'javascript:setupNavLinks('manage');': No connection adapters were found for "javascript:setupNavLinks('manage');"
Error processing URL 'mailto:emailupdates@cdc.gov': No connection adapters were found for 'mailto:emailupdates@cdc.gov'
Error processing URL 'mailto:subscriptions@cdc.gov': No connection adapters were found for 'mailto:subscriptions@cdc.gov'
Error processing URL 'https://www.cdc.gov/DataStatistics/ /health-topics.html': 404 Client Error: Not Found for url: https://www.cdc.gov/DataStatistics/%20/health-topics.html
Error processing URL 'https://www.cdc.gov/DataStatistics/ /datastatistics/index.html': 404 Client Error: Not Found for url: https://www.cdc.gov/DataStatistics/%20/da

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Error processing URL 'https://github.com/en/site-policy': 404 Client Error: Not Found for url: https://github.com/en/site-policy
Error processing URL 'https://github.com/en/site-policy/github-terms': 404 Client Error: Not Found for url: https://github.com/en/site-policy/github-terms
Error processing URL 'https://github.com/en/site-policy/github-terms/github-terms-of-service': 404 Client Error: Not Found for url: https://github.com/en/site-policy/github-terms/github-terms-of-service
Error processing URL 'https://github.com/en/site-policy/github-terms/github-corporate-terms-of-service': 404 Client Error: Not Found for url: https://github.com/en/site-policy/github-terms/github-corporate-terms-of-service
Error processing URL 'https://github.com/en/site-policy/github-terms/github-terms-for-additional-products-and-features': 404 Client Error: Not Found for url: https://github.com/en/site-policy/github-terms/github-terms-for-additional-products-and-features
Error processing URL 'https://githu

Error processing URL 'https://github.com/en/site-policy/other-site-policies/github-government-takedown-policy': 404 Client Error: Not Found for url: https://github.com/en/site-policy/other-site-policies/github-government-takedown-policy
Error processing URL 'https://github.com/en/site-policy/other-site-policies/github-username-policy': 404 Client Error: Not Found for url: https://github.com/en/site-policy/other-site-policies/github-username-policy
Error processing URL 'https://github.com/en/site-policy/other-site-policies/guidelines-for-legal-requests-of-user-data': 404 Client Error: Not Found for url: https://github.com/en/site-policy/other-site-policies/guidelines-for-legal-requests-of-user-data
Error processing URL 'https://github.com/en/site-policy/other-site-policies/github-account-recovery-policy': 404 Client Error: Not Found for url: https://github.com/en/site-policy/other-site-policies/github-account-recovery-policy
Error processing URL 'https://github.com/en/site-policy/conten