In [3]:
import requests
from bs4 import BeautifulSoup, NavigableString, Comment
import re
from IPython.display import display, HTML

In [4]:
def extract_text_from_dom(soup):
    # Define a function to filter visible tags
    def tag_visible(element):
        if isinstance(element, Comment):
            return False
        if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
            return False
        return True if isinstance(element, NavigableString) else tag_visible(element.parent)

    # Extract text from visible tags
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    visible_text = u" ".join(t.strip() for t in visible_texts)

    # Split text into sentences using regular expressions
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', visible_text)
    return '\n'.join(sentences)

def extract_text_from_website(url, visited_urls=None):
    # Initialize visited_urls as an empty set if it's not provided
    if visited_urls is None:
        visited_urls = set()

    # Check if the URL has been visited to avoid infinite recursion
    if url in visited_urls:
        return ""

    # Add the current URL to visited URLs
    visited_urls.add(url)

    try:
        # Fetch the webpage content
        response = requests.get(url)
        response.raise_for_status()

        # Parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract text from the current page
        extracted_text = extract_text_from_dom(soup)

        # Find all links on the page
        links = soup.find_all('a', href=True, text=True)

        # Extract text from linked pages recursively
        for link in links:
            subpage_url = link['href']
            if subpage_url.startswith('/') or subpage_url.startswith('.') or subpage_url.startswith('#'):
                subpage_url = urljoin(url, subpage_url)
            if subpage_url.startswith(url):
                extracted_text += extract_text_from_website(subpage_url, visited_urls)

        return extracted_text

    except Exception as e:
        print(f"Error processing URL '{url}': {e}")
        return ""

def display_text_in_new_tab(text):
    html_content = f"""
    <script type="text/javascript">
        var newWindow = window.open("", "_blank");
        newWindow.document.write(`<pre>{text}</pre>`);
        newWindow.document.close();
    </script>
    """
    display(HTML(html_content))

In [5]:
# Example usage
url = "https://medium.com/"
extracted_text = extract_text_from_website(url)
display_text_in_new_tab(extracted_text)

Error processing URL 'https://medium.com/': name 'urljoin' is not defined
