In [1]:
import requests
from bs4 import BeautifulSoup, NavigableString, Comment
import re
from urllib.parse import urljoin
from IPython.display import display, HTML
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
import os
from textblob import TextBlob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def get_unique_filename(file_path):
    """Return a path derived from ``file_path`` that does not exist on disk yet.

    When nothing exists at ``file_path`` it is returned unchanged. Otherwise a
    counter is inserted between the file name and its extension
    (``name_1.ext``, ``name_2.ext``, ...) and the first candidate that does
    not exist is returned. The file itself is not created.
    """
    folder, base_name = os.path.split(file_path)
    stem, suffix = os.path.splitext(base_name)

    # begin with the requested path; numbered variants are only tried while
    # the current candidate is already taken
    candidate = file_path
    counter = 0
    while os.path.exists(candidate):
        counter += 1
        candidate = os.path.join(folder, f"{stem}_{counter}{suffix}")
    return candidate

def extract_text_from_dom(soup, id_name):
    """Extract the visible text of a parsed page, one sentence per line.

    Args:
        soup: parsed document (BeautifulSoup object) to read the text from.
        id_name: ``id`` attribute of the element that holds the main content.
            When it is ``None`` or no such element exists, the ``<body>``
            element is used; if that is missing as well, the whole document.

    Returns:
        str: the visible text with bracketed spans (``[...]``) removed and
        the sentences joined by newline characters.
    """
    # tags whose content must never reach the output, however deeply the
    # text is nested inside them (a table cell's text sits in <td>, not <table>)
    hidden_tags = {'style', 'script', 'head', 'title', 'meta', 'table', 'figure', 'sup'}

    # without an id to look for, go straight to the <body> fallback instead of
    # handing find() a None filter
    root = soup.find(id=id_name) if id_name is not None else None
    if root is None:
        root = soup.find('body')
    if root is None:
        # fragment without <body>: search the whole document rather than crash
        root = soup

    def tag_visible(element):
        """Tell whether a text node belongs to the readable page content."""
        if isinstance(element, Comment):
            return False
        parent = element.parent
        # text that hangs directly off the document object is not inside any tag
        if parent is None or parent.name == '[document]':
            return False
        # walk up the tree, but no further than the element being extracted
        for ancestor in element.parents:
            if ancestor.name in hidden_tags:
                return False
            if ancestor is root:
                break
        return True

    # collect the visible strings; whitespace-only nodes are dropped so they
    # do not turn into runs of spaces in the joined text
    texts = root.find_all(string=True)
    stripped = (t.strip() for t in texts if tag_visible(t))
    visible_text = u" ".join(piece for piece in stripped if piece)

    # remove text within square brackets (e.g. citation markers)
    visible_text = re.sub(r'\[.*?\]', '', visible_text)

    # split after '.' or '?' followed by whitespace; the look-behinds keep
    # patterns such as "e.g." or "Dr." from ending a sentence
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', visible_text)
    return '\n'.join(sentences)

def extract_text_from_website(url, file_path, id_name=None, visited_urls=None, max_depth=1, timeout=30):
    """Append the visible text of ``url`` and of the pages it links to, to a file.

    Args:
        url: address of the page to download.
        file_path: text file the extracted text is appended to (UTF-8).
        id_name: ``id`` of the element to extract text from; passed on to
            ``extract_text_from_dom``.
        visited_urls: set of URLs that were already processed; created on the
            first call and shared with the recursive calls.
        max_depth: how many levels of links to follow from this page
            (``0`` processes only ``url`` itself).
        timeout: seconds to wait for the server before the request is
            abandoned.

    Returns:
        None. Any error while processing a page is printed and that page is
        skipped, so one bad link does not stop the crawl.
    """
    # initialize visited_urls as an empty set if it is not provided
    if visited_urls is None:
        visited_urls = set()

    # a fragment only names a position inside a page; dropping it keeps
    # "page" and "page#section" from being downloaded as two different pages
    url = url.split('#', 1)[0]

    # check if the URL has been visited to avoid infinite recursion
    if url in visited_urls:
        return
    visited_urls.add(url)

    try:
        # fetch the webpage content; the timeout keeps a stalled server from
        # blocking the crawl forever
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # parse the HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # stop extraction where the 'See also' section starts
        see_also = soup.find(id='See_also')
        if see_also is not None:
            soup = BeautifulSoup(str(soup).split(str(see_also))[0], 'html.parser')

        # extract text from the current page
        extracted_text = extract_text_from_dom(soup, id_name)

        # append the text (utf-8 is needed for some characters); the trailing
        # newline keeps the last sentence of this page apart from the first
        # sentence of the next one
        with open(file_path, 'a', encoding='utf-8') as f:
            f.write(extracted_text)
            f.write("\n")

        # nothing more to do once the depth budget is used up
        if max_depth <= 0:
            return

        # follow the links of the page recursively
        for link in soup.find_all('a', href=True, string=True):
            # convert relative links to absolute ones and drop the fragment
            subpage_url = urljoin(url, link['href']).split('#', 1)[0]
            # mailto:, javascript: and similar links cannot be downloaded
            if not subpage_url.startswith(('http://', 'https://')):
                continue
            if subpage_url not in visited_urls:
                extract_text_from_website(subpage_url, file_path, id_name, visited_urls, max_depth - 1, timeout)

    except Exception as e:
        # best effort: report the problem and carry on with the other pages
        print(f"Error processing URL '{url}': {e}")
        
def text_analytics(file_path):
    """Write simple statistics about a text file to ``<name>_analytics<ext>``.

    The analytics file is created next to ``file_path`` and contains, in
    order: the 5 most common words, the rounded average word length, the
    TextBlob sentiment values as percentages, and the de-duplicated sentences
    that passed the boilerplate filter. Word statistics and sentiment are
    computed on the complete text, not only on the kept sentences.

    Args:
        file_path: path of the UTF-8 text file to analyse.
    """
    # open file and read the text
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # tokenize text into sentences and remove duplicates (first occurrence wins)
    sentences = list(dict.fromkeys(sent_tokenize(text)))

    banned_strings = ['submit a request','follow us on', 'subscribe to', 'about terms', 'privacy help', 'about terms privacy help', '@', 'help desk', 'help support', 'help center', 'homepage', 'navbar']
    # matches "<2 digits> , <4 digits>"; compiled once instead of per sentence
    date_pattern = re.compile(r'\b(\d{2} , \d{4})\b')

    def keep_sentence(sentence):
        """Reject sentences that match the date pattern or a banned phrase."""
        lowered = sentence.lower()
        if date_pattern.search(lowered):
            return False
        return not any(banned_string in lowered for banned_string in banned_strings)

    sentences = [sent for sent in sentences if keep_sentence(sent)]

    # tokenize text into words
    words = word_tokenize(text)

    # filter out non-alphabetic words and stopwords; the stop-word list is
    # lowercase, so compare case-insensitively or "The", "And", ... slip through
    stop_words = set(stopwords.words('english')) #the, a, and, of...
    words = [word for word in words if word.isalpha() and word.lower() not in stop_words]

    most_common_words = Counter(words).most_common(5)

    # compute average word length and round it; a text without any usable
    # word yields 0 instead of a division by zero
    avg_word_length = round(sum(len(word) for word in words) / len(words)) if words else 0

    # compute sentiment of the text and convert it to percentages
    sentiment = TextBlob(text).sentiment
    sentiment_dict = {
        key: round(value * 100, 1)
        for key, value in sentiment._asdict().items()
    }

    # create analytics file name next to the analysed file
    base_name = os.path.basename(file_path)
    name, ext = os.path.splitext(base_name)
    analytics_file_path = os.path.join(os.path.dirname(file_path), f"{name}_analytics{ext}")

    # write analytics to the file
    with open(analytics_file_path, 'w', encoding='utf-8') as f:
        f.write("5 most common words in extracted text:\n")
        for word, frequency in most_common_words:
            f.write(f"{word}: {frequency}\n")
        f.write("\n\n")
        f.write(f"Average word length: {avg_word_length}\n")
        f.write("Sentiment:\n")
        for key, value in sentiment_dict.items():
            f.write(f"{key}={value}%\n")
        f.write("\n\n")
        for sentence in sentences:
            f.write(sentence + "\n")

In [3]:
url = "https://en.wikipedia.org/wiki/Types_of_volcanic_eruptions"
id_name = "mw-content-text"  # id of the element to extract the text from
# NOTE(review): machine-specific absolute path; adjust it when running elsewhere
file_path = "C:\\Users\\lukas\\OneDrive\\Desktop\\skola\\UKF\\bakalarka\\output\\outputWIKI.txt"
file_path = get_unique_filename(file_path)

# make sure the output folder exists: a failed write is only reported as a
# URL error by the crawler, and text_analytics would then fail on the missing file
output_dir = os.path.dirname(file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

extract_text_from_website(url, file_path, id_name)
text_analytics(file_path)
print("DONE!")

Error processing URL 'https://en.wikipedia.org/w/index.php?title=Hjorleifshofdi&action=edit&redlink=1': 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Hjorleifshofdi
DONE!
