In [1]:
import requests
from bs4 import BeautifulSoup, NavigableString, Comment
import re
from urllib.parse import urljoin
from IPython.display import display, HTML

In [5]:
def extract_text_from_dom(soup, class_name): #, elements=None
     # try to find the <main> element
    main = soup.find('main')

    # if <main> is not found, fall back to the <body> element
    if main is None:
        main = soup.find('body')
        
    # Define a function to filter visible tags
    def tag_visible(element):
        if isinstance(element, Comment):
            return False
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        if class_name and class_name not in element.get('class', []):
            return False
  #      if elements and element.name not in elements:
  #          return False
        if element.name == 'sup':  # Ignore superscript text
            return False
        return True if isinstance(element, NavigableString) else tag_visible(element.parent)

    # Extract text from visible tags
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    visible_text = u" ".join(t.strip() for t in visible_texts)

    # Split text into sentences using regular expressions
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', visible_text)
    return '\n'.join(sentences)

def extract_text_from_website(url, class_name=None, visited_urls=None, max_depth=1): #, elements=None
    # initialize visited_urls as an empty set if its not provided
    if visited_urls is None:
        visited_urls = set()

    # check if the URL has been visited to avoid infinite recursion
    if url in visited_urls:
        return ""

    # add the current URL to visited URLs
    visited_urls.add(url)

    extracted_text = ""
    try:
        # fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()

        # parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Stop extraction if 'See also' section is found
        see_also = soup.find(id='See_also')
        if see_also is not None:
            soup = BeautifulSoup(str(soup).split(str(see_also))[0], 'html.parser')

        # extract text from the current page
        extracted_text = extract_text_from_dom(soup, class_name) #, elements

        # find all links on the page
        links = soup.find_all('a', href=True, text=True)

        # extract text from linked pages recursively
        for link in links:
            subpage_url = link['href']
            # Convert relative links to absolute links
            subpage_url = urljoin(url, subpage_url)
            if subpage_url not in visited_urls and max_depth > 0: 
                extracted_text += extract_text_from_website(subpage_url, class_name, visited_urls, max_depth - 1) #, elements
                
    except Exception as e:
        print(f"Error processing URL '{url}': {e}")
        
    print(extracted_text)
    return extracted_text

def extract_text_to_file(url, file_path, visited_urls=None, max_depth=1):
    # init visited_urls as an empty set if its not provided
    if visited_urls is None:
        visited_urls = set()

    # check if the URL has been visited to avoid infinite recursion
    if url in visited_urls:
        return ""

    # ddd the current URL to visited URLs
    visited_urls.add(url)

    extracted_text = ""
    try:
        # fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()

        # parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # extract text from the current page
        extracted_text = extract_text_from_dom(soup)

        # find all links on the page
        links = soup.find_all('a', href=True, text=True)

        # extract text from linked pages recursively
        for link in links:
            subpage_url = link['href']
            # convert relative links to absolute links
            subpage_url = urljoin(url, subpage_url)
            if subpage_url not in visited_urls and max_depth > 0:
                extracted_text += extract_text_to_file(subpage_url, file_path, visited_urls, max_depth - 1) 

    except Exception as e:
        print(f"Error processing URL '{url}': {e}")

    return extracted_text


def display_text_in_new_tab(text):
    html_content = f"""
    <script type="text/javascript">
        var newWindow = window.open("", "_blank");
        newWindow.document.write(`<pre>{text}</pre>`);
        newWindow.document.close();
    </script>
    """
    display(HTML(html_content))

In [None]:
# Example usage
url = "https://en.wikipedia.org/wiki/Mathematical_proof"
class_name = "mw-content-text"  # specify the class name
#elements = ['p','h2','h3','h4']
extracted_text = extract_text_from_website(url)
display_text_in_new_tab(extracted_text)
file_path = "C:\\Users\\lukas\\OneDrive\\Desktop\\skola\\UKF\\bakalarka\\output\\outputWIKI.txt"
#extract_text_to_file(url, file_path)
#print(extracted_text)
# write the extracted_text to the file, had to use utf-8 for some characters
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(extracted_text)

   Jump to content         Main menu       Main menu  move to sidebar  hide    Navigation    Main page Contents Current events Random article About Wikipedia Contact us Donate      Contribute    Help Learn to edit Community portal Recent changes Upload file                     Search             Search                               Create account   Log in          Personal tools        Create account  Log in      Pages for logged out editors learn more     Contributions Talk                              Contents  move to sidebar  hide      (Top)       1 History and etymology         2 Nature and purpose         3 Methods of proof     Toggle Methods of proof subsection       3.1 Direct proof         3.2 Proof by mathematical induction         3.3 Proof by contraposition         3.4 Proof by contradiction         3.5 Proof by construction         3.6 Proof by exhaustion         3.7 Probabilistic proof         3.8 Combinatorial proof         3.9 Nonconstructive proof         3.10 Statisti