In [1]:
import html
import json
import re
from pathlib import Path
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup, NavigableString, Comment
from IPython.display import display, HTML

In [3]:
def extract_text_from_dom(soup, class_name=None):
    """Return the visible text of a parsed page, one sentence per line.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed document to read text nodes from.
    class_name : str, optional
        When given, only text located inside an element carrying this CSS
        class (directly or through any ancestor) is kept. ``None`` (the
        default) keeps every visible text node.

    Returns
    -------
    str
        The visible text, split into sentences and joined with newlines.
    """
    # Text whose direct parent is one of these tags is never rendered.
    hidden_parents = {'style', 'script', 'head', 'title', 'meta', '[document]'}

    def inside_class(element):
        # Text nodes carry no attributes, so the class has to be looked up
        # on the enclosing tags instead of on the text node itself.
        node = element.parent
        while node is not None:
            classes = node.get('class') or []
            if isinstance(classes, str):
                # Some parsers hand back the raw attribute string.
                classes = classes.split()
            if class_name in classes:
                return True
            node = node.parent
        return False

    def tag_visible(element):
        if isinstance(element, Comment):
            return False
        parent = element.parent
        if parent is None or parent.name in hidden_parents:
            return False
        if class_name and not inside_class(element):
            return False
        return True

    # Collect every text node, then keep only the visible ones.
    texts = soup.find_all(string=True)
    visible_text = u" ".join(t.strip() for t in texts if tag_visible(t))

    # Split after '.' or '?' followed by whitespace, while trying not to break
    # on abbreviations such as "e.g." or "Mr.".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', visible_text)
    return '\n'.join(sentences)

def extract_text_from_website(url, class_name=None, visited_urls=None, max_depth=1, timeout=10):
    """Fetch a page and, up to ``max_depth`` link hops away, its linked pages.

    Parameters
    ----------
    url : str
        Page to start from. A trailing ``#fragment`` is ignored.
    class_name : str, optional
        Passed through to ``extract_text_from_dom`` to restrict extraction.
    visited_urls : set of str, optional
        URLs that were already processed; updated in place so that recursive
        calls never fetch the same page twice.
    max_depth : int
        How many link hops to follow from ``url`` (0 = this page only).
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    str
        Text of all successfully processed pages, separated by newlines.
        Pages that fail are reported on stdout and contribute nothing.
    """
    if visited_urls is None:
        visited_urls = set()

    # "page#a" and "page#b" are the same document; drop the fragment so the
    # page is downloaded only once.
    url = urldefrag(url).url
    if url in visited_urls:
        return ""
    visited_urls.add(url)

    page_texts = []
    try:
        # Without a timeout a single unresponsive host stalls the whole crawl.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        page_texts.append(extract_text_from_dom(soup, class_name))

        if max_depth > 0:
            for link in soup.find_all('a', href=True, string=True):
                # Resolve relative links against the current page.
                subpage_url = urldefrag(urljoin(url, link['href'])).url
                # mailto:, javascript:, tel: ... cannot be fetched.
                if urlparse(subpage_url).scheme not in ('http', 'https'):
                    continue
                if subpage_url in visited_urls:
                    continue
                page_texts.append(
                    extract_text_from_website(subpage_url, class_name, visited_urls, max_depth - 1, timeout)
                )
    except Exception as e:
        # Best effort: one broken page must not abort the rest of the crawl.
        print(f"Error processing URL '{url}': {e}")

    # Separate pages with a newline so the last sentence of one page is not
    # glued to the first sentence of the next.
    return "\n".join(text for text in page_texts if text)

def extract_text_to_file(url, file_path, visited_urls=None, max_depth=1):
    """Crawl ``url`` and save the extracted text to ``file_path``.

    The crawl itself is delegated to ``extract_text_from_website`` (without a
    class filter), so both entry points share one implementation.

    Parameters
    ----------
    url : str
        Page to start from.
    file_path : str or path-like
        Destination file; it is overwritten and written as UTF-8.
    visited_urls : set of str, optional
        URLs that were already processed; updated in place by the crawl.
    max_depth : int
        How many link hops to follow from ``url``.

    Returns
    -------
    str
        The text that was written, or ``""`` when ``url`` had already been
        visited (in which case the file is left untouched).
    """
    # Nothing to do for a page that was already handled; do not clobber an
    # existing output file with an empty string.
    if visited_urls is not None and url in visited_urls:
        return ""

    extracted_text = extract_text_from_website(url, None, visited_urls, max_depth)

    # UTF-8 is required: scraped pages routinely contain non-ASCII characters.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    return extracted_text


def display_text_in_new_tab(text):
    """Open a new browser tab showing ``text`` inside a ``<pre>`` block.

    The text comes from scraped pages and is therefore untrusted: it is
    HTML-escaped so it renders literally, and handed to JavaScript as a JSON
    string literal so that backticks, ``${...}``, backslashes or a stray
    ``</script>`` cannot break out of the script.

    Parameters
    ----------
    text : str
        Text to show (any object is converted with ``str``).
    """
    markup = "<pre>" + html.escape(str(text)) + "</pre>"
    # A JSON string is a valid JavaScript string literal; "</" is additionally
    # written as "<\/" so the HTML parser never sees a closing tag inside the
    # script element.
    js_literal = json.dumps(markup).replace("</", "<\\/")
    html_content = f"""
    <script type="text/javascript">
        var newWindow = window.open("", "_blank");
        if (newWindow) {{
            newWindow.document.write({js_literal});
            newWindow.document.close();
        }}
    </script>
    """
    display(HTML(html_content))

In [None]:
# Example usage
url = "https://en.wikipedia.org/wiki/Snake"
# Optional CSS class to restrict extraction to; None keeps all visible text.
# (The earlier draft assigned "mw-content-text" here but never passed it on.)
class_name = None
extracted_text = extract_text_from_website(url, class_name)
display_text_in_new_tab(extracted_text)

# Write the result next to the notebook instead of a machine-specific
# absolute path, so the cell runs on any computer.
file_path = Path("output") / "outputWIKI.txt"
file_path.parent.mkdir(parents=True, exist_ok=True)
# write the extracted_text to the file, had to use utf-8 for some characters
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(extracted_text)

# Show only a short preview; the full text is in the file and the new tab.
print(extracted_text[:2000])

Error processing URL 'https://en.wikipedia.org/w/index.php?title=Jeffrey_L.Weinell&action=edit&redlink=1': 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Jeffrey_L.Weinell
Error processing URL 'https://en.wikipedia.org/w/index.php?title=Rafe_M.Brown&action=edit&redlink=1': 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Rafe_M.Brown
Error processing URL 'https://en.wikipedia.org/w/index.php?title=Monique_Bourgeois&action=edit&redlink=1': 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Monique_Bourgeois
Error processing URL 'https://en.wikipedia.org/w/index.php?title=Herndon_G._Dowling&action=edit&redlink=1': 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Herndon_G._Dowling
Error processing URL 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4438441': 403 Client Error: Forbidden for url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4438441/
Error processing URL 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC43725

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Error processing URL 'https://www.science.org/content/article/fossils-oldest-known-snakes-unearthed': 403 Client Error: Forbidden for url: https://www.science.org/content/article/fossils-oldest-known-snakes-unearthed
Error processing URL 'https://www.science.org': 403 Client Error: Forbidden for url: https://www.science.org/
Error processing URL 'https://doi.org/10.1126%2Fscience.adh2449': 403 Client Error: Forbidden for url: https://www.science.org/doi/10.1126/science.adh2449
Error processing URL 'http://www.wildlifenews.co.uk/articles2000/march/march2500a.htm': HTTPSConnectionPool(host='wildlifenews.co.uk', port=443): Max retries exceeded with url: /articles2000/march/march2500a.htm (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1129)')))
Error processing URL 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4681343': 403 Client Error: Forbidden for url: https://www.ncbi.nlm.nih.gov/pmc/articles

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Error processing URL 'https://doi.org/10.1080%2F00222938400770131': 403 Client Error: Forbidden for url: https://www.tandfonline.com/doi/abs/10.1080/00222938400770131
Error processing URL 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3760860': 403 Client Error: Forbidden for url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3760860/
Error processing URL 'https://doi.org/10.1111%2Fbrv.12056': 403 Client Error: Forbidden for url: https://onlinelibrary.wiley.com/doi/10.1111/brv.12056
Error processing URL 'https://doi.org/10.1093%2Ficb%2F27.1.5': 403 Client Error: Forbidden for url: https://academic.oup.com/icb/article-lookup/doi/10.1093/icb/27.1.5
Error processing URL 'https://www.ias.ac.in/article/fulltext/secb/020/01/0001-0029': 403 Client Error: Forbidden for url: https://www.ias.ac.in/article/fulltext/secb/020/01/0001-0029


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Error processing URL 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9185726': 403 Client Error: Forbidden for url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9185726/
Error processing URL 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9748774': 403 Client Error: Forbidden for url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9748774/
Error processing URL 'https://doi.org/10.1098%2Frspb.2022.1702': 403 Client Error: Forbidden for url: https://royalsocietypublishing.org/doi/10.1098/rspb.2022.1702
Error processing URL 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3497136': 403 Client Error: Forbidden for url: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3497136/
Error processing URL 'https://doi.org/10.1098%2Frsbl.2012.0666': 403 Client Error: Forbidden for url: https://royalsocietypublishing.org/doi/10.1098/rsbl.2012.0666
Error processing URL 'https://doi.org/10.1093%2Fjhered%2Fesr080': 403 Client Error: Forbidden for url: http://academic.oup.com/jhered/article/102/6/759/836740/Consecuti