In [1]:
!pip install nltk



In [2]:
import nltk
import sys
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from typing import List


def summarize(text: str) -> str:
    # Tokenize the text by sentences
    sentences = nltk.sent_tokenize(text)

    # Calculate the scores of each sentence
    sentence_scores = _calculate_sentence_scores(
        sentences, _create_dictionary_table(text))

    # Get the average score
    average_scores = _calculate_average_score(sentence_scores)

    # Compare each sentence to the average score
    # and get the highest score sentences
    summary = _get_article_summary(sentences, sentence_scores, average_scores)

    return summary


def _create_dictionary_table(text: str) -> dict:

    # Removing stop words
    stop_words = set(stopwords.words("english"))

    words = nltk.word_tokenize(text)

    # Reducing words to their root form
    stem = PorterStemmer()

    # Creating dictionary for the word frequency table
    frequency_table = dict()
    for wd in words:
        wd = stem.stem(wd)
        if wd in stop_words:
            continue
        if wd in frequency_table:
            frequency_table[wd] += 1
        else:
            frequency_table[wd] = 1

    return frequency_table


def _calculate_sentence_scores(sentences: List[str], frequency_table: dict) -> dict:

    # Algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        # sentence_wordcount = (len(nltk.word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]
                                    ] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]
                                    ] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]
                                                        ] / sentence_wordcount_without_stop_words

    return sentence_weight


def _calculate_average_score(sentence_weight: dict) -> int:

    # Calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    # Getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score


def _get_article_summary(sentences: List[str], sentence_weight: dict, threshold: int) -> str:
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary


In [3]:
import requests
from bs4 import BeautifulSoup
from typing import List

def _get_html_text(url: str) -> str:
    # Fetch the HTML content and return it as text
    return requests.get(url).text

def parse_html_to_paragraphs(url: str, tags: List[str]) -> str:
    html_content = _get_html_text(url)

    # Parsing the URL content and storing in a variable
    parsed = BeautifulSoup(html_content, 'html.parser')

    # Get all the content in the tas
    paragraphs = parsed.find_all(tags)

    content = ''

    # Looping through the paragraphs and adding them to the variable
    for paragraph in paragraphs:
        content += paragraph.text

    return content

In [4]:
def summarize_url(url: str, tags: List[str] = ['p'], method: str = 'naive'):
    # Get the content from the url
    content = parse_html_to_paragraphs(url, tags)

    # If no content is fetched
    if content == "":
        print(f"No content of {tags} is fetched from {url}")
        sys.exit(0)

    summary = ""

    # Summarize the content
    if method == "naive":
        summary = summarize(content)

    # Print the summary to stdout
    print(summary)

In [5]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Marcus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
summarize_url("https://wikipedia.org/wiki/Rococo", ["h1", "p"])

 It was known as the style rocaille, or rocaille style. [2] It soon spread to other parts of Europe, particularly northern Italy, Austria, southern Germany, Central Europe and Russia. [3] It also came to influence the other arts, particularly sculpture, furniture, silverware, glassware, painting, music, and theatre. [5]
The word rococo was first used as a humorous variation of the word rocaille. [6][7] Rocaille was originally a method of decoration, using pebbles, seashells and cement, which was often used to decorate grottoes and fountains since the Renaissance. It was the first appearance in print of the term "rocaille" to designate the style. [10] The carved or molded seashell motif was combined with palm leaves or twisting vines to decorate doorways, furniture, wall panels and other architectural elements. It was used in 1828 for decoration "which belonged to the style of the 18th century, overloaded with twisting ornaments." "[12]
In the 19th century, the term was used to describe