In [7]:
import html
import json
import os
import re
from collections import Counter
from urllib.parse import urldefrag, urljoin, urlparse

import nltk
import requests
from bs4 import BeautifulSoup, NavigableString, Comment
from IPython.display import display, HTML
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [14]:
def extract_text_from_dom(soup):
    """Extract the readable text of a parsed page, one sentence per line.

    Text is taken from the ``<main>`` element when present, otherwise from
    ``<body>``, otherwise from the parsed document itself. Comments and any
    text located inside an ignored element (style, script, head, title, meta,
    table, figure) are dropped, no matter how deeply the text is nested.

    Args:
        soup: Parsed document exposing ``find`` and ``find_all`` (a
            BeautifulSoup object or tag).

    Returns:
        str: The visible text split into sentences and joined with newlines.
    """
    # Elements whose text is never treated as visible content.
    ignored_tags = {'style', 'script', 'head', 'title', 'meta', 'table', 'figure'}

    # Prefer <main>, then <body>, then the document itself, so that an HTML
    # fragment without <body> does not fail with an AttributeError.
    root = soup.find('main')
    if root is None:
        root = soup.find('body')
    if root is None:
        root = soup

    def tag_visible(element):
        """Return True when the text node should be kept."""
        if isinstance(element, Comment):
            return False
        parent = element.parent
        # Text attached directly to the document object is not page content.
        if parent is not None and parent.name == '[document]':
            return False
        # Check every ancestor, not only the direct parent: text in a <td>
        # still belongs to its enclosing <table> and must be ignored.
        while parent is not None:
            if parent.name in ignored_tags:
                return False
            parent = parent.parent
        return True

    # Collect every text node below the root and keep the visible ones.
    stripped = (node.strip() for node in root.find_all(string=True) if tag_visible(node))
    # Skip whitespace-only nodes so they do not produce runs of spaces, which
    # would otherwise end up as leading blanks on the split sentences.
    visible_text = " ".join(piece for piece in stripped if piece)

    # Split after '.' or '?' followed by whitespace, except after patterns
    # that look like abbreviations (e.g. "e.g." or "Mr.").
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', visible_text)
    return '\n'.join(sentences)


def extract_text_from_website(url, visited_urls=None, max_depth=1, timeout=10):
    """Extract text from a page and, recursively, from the pages it links to.

    Args:
        url: Address of the page to fetch.
        visited_urls: Set of URLs already processed; it is updated in place.
            A new set is created when omitted.
        max_depth: How many levels of links to follow (0 = only this page).
        timeout: Seconds to wait for the server before giving up on a request.

    Returns:
        str: Text of this page followed by the text of each linked page,
        separated by newlines. An empty string when the URL was already
        visited or nothing could be extracted.
    """
    if visited_urls is None:
        visited_urls = set()

    # "page.html" and "page.html#section" are the same document; drop the
    # fragment so in-page anchors do not cause the page to be fetched again.
    url = urldefrag(url).url

    # Skip pages that were already processed to avoid infinite recursion.
    if url in visited_urls:
        return ""
    visited_urls.add(url)

    page_texts = []
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        page_texts.append(extract_text_from_dom(soup))

        if max_depth > 0:
            # Only anchors that have both an href and text are followed.
            for link in soup.find_all('a', href=True, string=True):
                # Resolve relative links against the current page.
                subpage_url = urldefrag(urljoin(url, link['href'])).url
                # mailto:, javascript:, tel: ... links cannot be fetched.
                if urlparse(subpage_url).scheme not in ('http', 'https'):
                    continue
                if subpage_url in visited_urls:
                    continue
                page_texts.append(
                    extract_text_from_website(subpage_url, visited_urls, max_depth - 1, timeout)
                )

    except Exception as e:
        # Best effort: report the failure and keep whatever was collected.
        print(f"Error processing URL '{url}': {e}")

    # Separate pages with a newline so the last sentence of one page is not
    # fused with the first sentence of the next.
    return "\n".join(text for text in page_texts if text)

def extract_text_to_file(url, file_path, visited_urls=None, max_depth=1):
    """Extract text from a website and save it to ``file_path``.

    The crawl itself is delegated to ``extract_text_from_website``; this
    function adds the step of writing the result to disk as UTF-8.

    Args:
        url: Address of the page to start from.
        file_path: Destination file; it is overwritten if it exists and its
            directory is created when missing.
        visited_urls: Set of URLs already processed; updated in place.
        max_depth: How many levels of links to follow (0 = only this page).

    Returns:
        str: The extracted text (the same content that was written), or an
        empty string when ``url`` was already visited, in which case no file
        is written.

    Raises:
        OSError: If the output file cannot be created or written.
    """
    # Nothing to do for an already visited URL; leave any existing file alone.
    if visited_urls is not None and url in visited_urls:
        return ""

    extracted_text = extract_text_from_website(url, visited_urls, max_depth)

    # Make sure the target directory exists (dirname is '' for bare names).
    output_dir = os.path.dirname(file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # utf-8 is required for characters outside the platform default encoding.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(extracted_text)

    return extracted_text


def display_text_in_new_tab(text):
    """Open a new browser tab showing ``text`` inside a ``<pre>`` element.

    The text usually comes from scraped web pages, so it is treated as
    untrusted: it is HTML-escaped and passed to JavaScript as a JSON string
    literal, which keeps backticks, ``${...}``, quotes, backslashes or a
    literal ``</script>`` in the text from breaking or hijacking the script.

    Args:
        text: The text to show; it is converted with ``str()`` first.
    """
    # Escape &, <, > and quotes so the text is rendered literally.
    escaped_text = html.escape(str(text))
    # json.dumps yields a valid double-quoted JavaScript string literal.
    # "</" is additionally written as "<\/" so the literal can never close
    # the surrounding <script> element early.
    js_literal = json.dumps(f"<pre>{escaped_text}</pre>").replace("</", "<\\/")

    html_content = f"""
    <script type="text/javascript">
        var newWindow = window.open("", "_blank");
        newWindow.document.write({js_literal});
        newWindow.document.close();
    </script>
    """
    display(HTML(html_content))
    
def text_analytics(file_path, top_n=5):
    """Write a small analytics report for an extracted-text file.

    The report is stored next to the input as ``<name>_analytics<ext>`` and
    contains the ``top_n`` most frequent words of the whole text followed by
    the text's sentences with duplicates and boilerplate lines removed.

    Args:
        file_path: Path of the UTF-8 text file to analyse.
        top_n: Number of most common words to report.

    Returns:
        str: Path of the analytics file that was written.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Tokenize into sentences and drop duplicates while keeping their order.
    sentences = list(dict.fromkeys(sent_tokenize(text)))

    # Sentences that look like logs (several "DD , YYYY" dates in a row) or
    # like site boilerplate are excluded from the report.
    date_log = re.compile(r'(\d{2} , \d{4}){2,}')
    boilerplate = re.compile(r'(Follow us on|Subscribe to|About Terms|Privacy Help|About Terms Privacy Help|@|Help Desk|Help Support| Help Center)')
    sentences = [
        sent for sent in sentences
        if not date_log.search(sent) and not boilerplate.search(sent)
    ]

    # The stop-word list is lowercase, so tokens are lowercased before the
    # comparison; otherwise capitalised stop words ("The", "And") slip
    # through and "Web"/"web" would be counted as different words.
    stop_words = set(stopwords.words('english'))  # the, a, and, of...
    words = (word.lower() for word in word_tokenize(text))
    words = [word for word in words if word.isalpha() and word not in stop_words]

    most_common_words = Counter(words).most_common(top_n)

    # The analytics file lives next to the input: "<name>_analytics<ext>".
    name, ext = os.path.splitext(os.path.basename(file_path))
    analytics_file_path = os.path.join(os.path.dirname(file_path), f"{name}_analytics{ext}")

    with open(analytics_file_path, 'w', encoding='utf-8') as f:
        f.write(f"{top_n} most common words in extracted text:\n")
        for word, frequency in most_common_words:
            f.write(f"{word}: {frequency}\n")
        f.write("\n\n")
        for sentence in sentences:
            f.write(sentence + "\n")

    return analytics_file_path

In [17]:
# Example usage
#url = "https://medium.com/@sierraelman/the-future-of-poetry-26dabfc2f50a"
url = "https://www.york.ac.uk/teaching/cws/wws/webpage1.html"
# NOTE: machine-specific output location; adjust it for your environment.
file_path = "C:\\Users\\lukas\\OneDrive\\Desktop\\skola\\UKF\\bakalarka\\output\\output2.txt"

# Crawl the site once and reuse the result for both the preview and the file.
extracted_text = extract_text_from_website(url)
display_text_in_new_tab(extracted_text)

# write the extracted_text to the file, had to use utf-8 for some characters.
# This must happen before text_analytics, which reads the file back; otherwise
# a fresh run fails (or analyses stale text left over from a previous run).
with open(file_path, 'w', encoding='utf-8') as f:
    f.write(extracted_text)

text_analytics(file_path)
print("DONE!")

DONE!
