In [1]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
Collecting docopt<0.7,>=0.6.1
  Downloading docopt-0.6.2.tar.gz (25 kB)
Collecting breadability>=0.1.20
  Downloading breadability-0.1.20.tar.gz (32 kB)
Collecting pycountry>=18.2.23
  Downloading pycountry-22.3.5.tar.gz (10.1 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Building wheels for collected packages: breadability, docopt, pycountry
  Building wheel for breadability (setup.py): started
  Building wheel for breadability (setup.py): finished with status 'done'
  Created wheel for breadability: filename=breadability-0.1.20-py2.py3-none-any.whl size=21712 sha256=262916ba3e35d7fc8058c0ec655898fe61e4cdb1765b3f8143b5078af9c30a47
  Stored in directo

In [2]:
!pip install reportlab

Collecting reportlab
  Downloading reportlab-4.0.5-py3-none-any.whl (1.9 MB)
Installing collected packages: reportlab
Successfully installed reportlab-4.0.5


In [4]:
import requests                           #used to make HTTP requests to the specified URL and retrieve the webpage content.

from bs4 import BeautifulSoup             # parse and navigate HTML or XML content for web scraping purposes in Python.
import nltk                               #NLTK helps analyze and process human language data using Python.
import re                                 #re is used to clean the text

#The mentioned lines import necessary components for text summarization using Latent Semantic Analysis (LSA) 
#through the sumy library.
from sumy.parsers.plaintext import PlaintextParser  #used to create a parser for plain text 

from sumy.nlp.tokenizers import Tokenizer           #used to tokenize text for text summarization
from sumy.nlp.stemmers import Stemmer               # used to perform stemming on text for text summarization
from sumy.utils import get_stop_words               #used to retrieve stop words for text summarization
from sumy.summarizers.lsa import LsaSummarizer      #used for Latent Semantic Analysis (LSA) based text summarization 

In [5]:
# Download NLTK punkt tokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91630\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Function to clean the text
def clean_text(text):
    # Remove numbers, spaces, symbols, and brackets
    cleaned_text = re.sub(r'\d', '', text)                   # Remove numbers
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)         # Remove extra spaces
    cleaned_text = re.sub(r'\([^)]*\)', '', cleaned_text)    # Remove content between parentheses
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)      # Remove symbols and brackets
    return cleaned_text.strip()

In [7]:
# Function to summarize text using extractive summarization
def summarize_text(text):
    cleaned_text = clean_text(text)                                          # Clean the input text
    
    parser = PlaintextParser.from_string(cleaned_text, Tokenizer('english')) # Create a parser for the cleaned text
    
    stemmer = Stemmer('english')                                  # Initialize a stemmer for English
    summarizer = LsaSummarizer(stemmer)                           # Initialize an LSA summarizer using the stemmer
    summarizer.stop_words = get_stop_words('english')             # Set stop words for the summarizer
    summary = summarizer(parser.document, 2)                      # Adjust the number of sentences in the summary as needed
    return ' '.join([str(sentence) for sentence in summary])      # Join the summary sentences and return

In [8]:
# Wikipedia URL to scrape
url = "https://en.wikipedia.org/wiki/Alexander_the_Great"

In [9]:
# Send a request and get the webpage content
response = requests.get(url)                       #send get request to the url and store it to response
soup = BeautifulSoup(response.text, 'html.parser') #Break down the webpage's structure and understand its components 

In [10]:
# Extract all headings and content
#Extract the text content from each found heading and store it in a list called headings.
headings = [heading.text for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])]

In [11]:
# Clean the headings by removing content between parentheses
cleaned_headings = [re.sub(r'\([^)]*\)', '', heading).strip() for heading in headings]
content = [p.text for p in soup.find_all(['p'])]        # Extract text from paragraphs

In [12]:
# Generate summaries for each section
section_summaries = [summarize_text(section_content) for section_content in content]

In [13]:
# Combine headings and summaries
section_headings_and_summaries = dict(zip(cleaned_headings, section_summaries))

In [14]:
# Print section headings and summaries with headings in bold
for heading, summary in section_headings_and_summaries.items():
    print(f"\033[1m{heading}\033[0m")  # \033[1m sets bold, \033[0m resets formatting
    print(summary)
    print()

[1mContents[0m


[1mAlexander the Great[0m
Alexander III of Macedon  commonly known as Alexander the Greata was a king of the ancient Greek kingdom of Macedona He succeeded his father Philip II to the throne in BC at the age of  and spent most of his ruling years conducting a lengthy military campaign throughout Western Asia and Egypt By the age of  he had created one of the largest empires in history stretching from Greece to northwestern India He was undefeated in battle and is widely considered to be one of historys greatest and most successful military commanders

[1mEarly life[0m
Until the age of  Alexander was tutored by Aristotle In BC shortly after his assumption of kingship over Macedon he campaigned in the Balkans and reasserted control over Thrace and parts of Illyria before marching on the city of Thebes which was subsequently destroyed in battle Alexander then led the League of Corinth and used his authority to launch the panHellenic project envisaged by his father a