### Importing the packages

In [1]:
# Load the packages
import requests
from bs4 import BeautifulSoup

### Making a get request

In [2]:
# Defining the url of the site
base_site = "https://en.wikipedia.org/wiki/Music"

# Making a get request.
# requests waits indefinitely by default, so set a timeout (in seconds) to keep
# the notebook from hanging if the server never answers.
response = requests.get(base_site, timeout=10)

# Raise an HTTPError on a 4xx/5xx status instead of silently parsing an error page below.
response.raise_for_status()
response

<Response [200]>

In [3]:
# Extracting the HTML: response.content is the raw response body as bytes
# (response.text would be the decoded string instead).
html = response.content

### Making the soup

In [4]:
# Convert the HTML to a BeautifulSoup object so that tags and their content can be searched and extracted easily.
# "html.parser" is Python's built-in parser, so no extra parser package has to be installed.
soup = BeautifulSoup(html, "html.parser")

### 1. Extract the titles of all links

In [5]:
# Collect every anchor (<a>) element that carries an href attribute
linked_href = soup.find_all('a', href=True)
linked_href

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=C13_en.wikipedia.org&amp;uselang=en" title="Support us by donating to the Wikimedia Foundation"><span>Donate</span></a>,
 <a href=

In [6]:
# Alternative approach: collect every <a> tag, then keep only those with an 'href' attribute.
links = soup.find_all('a')
# Tag.get returns None when the attribute is missing; test with `is not None`
# (identity) rather than `!= None`, and avoid the ambiguous loop name `l`.
clean_links = [link for link in links if link.get('href') is not None]
clean_links

[<a class="mw-jump-link" href="#bodyContent">Jump to content</a>,
 <a accesskey="z" href="/wiki/Main_Page" title="Visit the main page [z]"><span>Main page</span></a>,
 <a href="/wiki/Wikipedia:Contents" title="Guides to browsing Wikipedia"><span>Contents</span></a>,
 <a href="/wiki/Portal:Current_events" title="Articles related to current events"><span>Current events</span></a>,
 <a accesskey="x" href="/wiki/Special:Random" title="Visit a randomly selected article [x]"><span>Random article</span></a>,
 <a href="/wiki/Wikipedia:About" title="Learn about Wikipedia and how it works"><span>About Wikipedia</span></a>,
 <a href="//en.wikipedia.org/wiki/Wikipedia:Contact_us" title="How to contact Wikipedia"><span>Contact us</span></a>,
 <a href="https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&amp;utm_medium=sidebar&amp;utm_campaign=C13_en.wikipedia.org&amp;uselang=en" title="Support us by donating to the Wikimedia Foundation"><span>Donate</span></a>,
 <a href=

In [7]:
# Links without an 'href' attribute were already dropped above (see linked_href and clean_links).

In [8]:
# Getting all titles: one entry per link, None where a link has no 'title' attribute
title_list = [anchor.get('title') for anchor in linked_href]

print(title_list)

[None, 'Visit the main page [z]', 'Guides to browsing Wikipedia', 'Articles related to current events', 'Visit a randomly selected article [x]', 'Learn about Wikipedia and how it works', 'How to contact Wikipedia', 'Support us by donating to the Wikimedia Foundation', 'Guidance on how to use and edit Wikipedia', 'Learn how to edit Wikipedia', 'The hub for editors', 'A list of recent changes to Wikipedia [r]', 'Add images or other media for use on Wikipedia', None, 'Search Wikipedia [f]', 'You are encouraged to create an account and log in; however, it is not mandatory', "You're encouraged to log in; however, it's not mandatory. [o]", 'You are encouraged to create an account and log in; however, it is not mandatory', "You're encouraged to log in; however, it's not mandatory. [o]", None, 'A list of edits made from this IP address [y]', 'Discussion about edits from this IP address [n]', None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, N

In [9]:
# Removing the 'None' titles
# Build a new list instead of calling title_list.remove() inside a loop over
# title_list: removing while iterating shifts the remaining items left, so the
# element right after each removal is skipped and runs of consecutive None
# values are only partially removed. The comprehension also makes this cell
# idempotent (re-running it gives the same result).
title_list = [t for t in title_list if t is not None]
title_list

['Visit the main page [z]',
 'Guides to browsing Wikipedia',
 'Articles related to current events',
 'Visit a randomly selected article [x]',
 'Learn about Wikipedia and how it works',
 'How to contact Wikipedia',
 'Support us by donating to the Wikimedia Foundation',
 'Guidance on how to use and edit Wikipedia',
 'Learn how to edit Wikipedia',
 'The hub for editors',
 'A list of recent changes to Wikipedia [r]',
 'Add images or other media for use on Wikipedia',
 'Search Wikipedia [f]',
 'You are encouraged to create an account and log in; however, it is not mandatory',
 "You're encouraged to log in; however, it's not mandatory. [o]",
 'You are encouraged to create an account and log in; however, it is not mandatory',
 "You're encouraged to log in; however, it's not mandatory. [o]",
 'A list of edits made from this IP address [y]',
 'Discussion about edits from this IP address [n]',
 'Musiek – Afrikaans',
 'Musik – Swiss German',
 'ሙዚቃ – Amharic',
 'Muusik – Inari Sami',
 'Drēam – Old

### 2. Extract all heading 2 strings.

In [10]:
# Inspect all h2 tags.
# find_all returns every matching <h2> element, so despite its singular name
# `tag` holds the whole collection rather than a single tag.
tag = soup.find_all('h2')
tag

[<h2 class="vector-pinnable-header-label">Contents</h2>,
 <h2><span class="mw-headline" id="Etymology_and_terminology">Etymology and terminology</span></h2>,
 <h2><span class="mw-headline" id="History">History</span></h2>,
 <h2><span class="mw-headline" id="Creation">Creation</span></h2>,
 <h2><span class="mw-headline" id="Art_and_entertainment">Art and entertainment</span></h2>,
 <h2><span class="mw-headline" id="Elements">Elements</span></h2>,
 <h2><span class="mw-headline" id="Philosophy">Philosophy</span></h2>,
 <h2><span class="mw-headline" id="Psychology">Psychology</span></h2>,
 <h2><span class="mw-headline" id="Sociological_aspects">Sociological aspects</span></h2>,
 <h2><span class="mw-headline" id="Media_and_technology">Media and technology</span></h2>,
 <h2><span class="mw-headline" id="Education">Education</span></h2>,
 <h2><span class="mw-headline" id="Academic_study">Academic study</span></h2>,
 <h2><span class="mw-headline" id="Therapy">Therapy</span></h2>,
 <h2><span cl

In [11]:
# Get the text of each heading through its .string attribute
tags = [heading.string for heading in tag]
tags

['Contents',
 'Etymology and terminology',
 'History',
 'Creation',
 'Art and entertainment',
 'Elements',
 'Philosophy',
 'Psychology',
 'Sociological aspects',
 'Media and technology',
 'Education',
 'Academic study',
 'Therapy',
 'See also',
 'References',
 'Further reading',
 'External links']

### 3. Print the whole footer text.

In [12]:
# By inspection, the footer content sits inside a <footer> element.
# Print the text of every <footer> element found on the page.
footer_tags = soup.find_all('footer')
for footer_section in footer_tags:
    print(footer_section.get_text())




 This page was last edited on 6 May 2023, at 19:29 (UTC).
Text is available under the Creative Commons Attribution-ShareAlike License 3.0;
additional terms may apply.  By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.


Privacy policy
About Wikipedia
Disclaimers
Contact Wikipedia
Mobile view
Developers
Statistics
Cookie statement








In [13]:
# Alternative: find() returns only the first <footer> element, so no loop is needed
foot = soup.find('footer')
print(foot.get_text())



 This page was last edited on 6 May 2023, at 19:29 (UTC).
Text is available under the Creative Commons Attribution-ShareAlike License 3.0;
additional terms may apply.  By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.


Privacy policy
About Wikipedia
Disclaimers
Contact Wikipedia
Mobile view
Developers
Statistics
Cookie statement








In [14]:
# NOTE(review): leftover cell — print() with no arguments only emits a blank line; consider deleting it.
print()


