# Install the beautifulsoup library for parsing HTML content

Beautiful Soup, this Python library parses HTML content and HTML data into a tree-like structure, making it easy to navigate and extract specific information.

In [None]:
! pip install beautifulsoup4
! pip install pandas

# Extracting data with Beautiful Soup

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = " https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-PY0220EN-SkillsNetwork/labs/project/amazon_data_webpage.html"
html_data = requests.get(url)
soup = BeautifulSoup(html_data.content, 'html.parser')
title = soup.title
print(title)

# Get table columns
thead = soup.find_all("th")
for head in thead:
    print(head.text)

amazon_data = pd.DataFrame(columns=["Date", "Open", "High", "Low", "Close", "Volume"])

for row in soup.find("tbody").find_all("tr"):
    col = row.find_all("td")
    date = col[0].text
    Open = col[1].text
    high = col[2].text
    low = col[3].text
    close = col[4].text
    adj_close = col[5].text
    volume = col[6].text
    
    amazon_data = pd.concat([amazon_data, pd.DataFrame({"Date":[date], "Open":[Open], "High":[high], "Low":[low], "Close":[close], "Adj Close":[adj_close], "Volume":[volume]})], ignore_index=True)

amazon_data.head()

<title>Amazon.com, Inc. (AMZN) Stock Historical Prices &amp; Data - Yahoo Finance</title>
Date
Open
High
Low
Close*
Adj Close**
Volume


Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,"Jan 01, 2021",3270.0,3363.89,3086.0,3206.2,71528900,3206.2
1,"Dec 01, 2020",3188.5,3350.65,3072.82,3256.93,77556200,3256.93
2,"Nov 01, 2020",3061.74,3366.8,2950.12,3168.04,90810500,3168.04
3,"Oct 01, 2020",3208.0,3496.24,3019.0,3036.15,116226100,3036.15
4,"Sep 01, 2020",3489.58,3552.25,2871.0,3148.73,115899300,3148.73


In [6]:
# Send a request to the website
url = 'https://www.tiobe.com/tiobe-index/'
response = requests.get(url)

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Find the main headlines h1 tag
headlines = soup.find_all('h1')

# Print the headlines
for headline in headlines:
    print(headline.text)

TIOBE Index for January 2025


In [6]:
# Find the main headlines h2 tag
headlines = soup.find_all('h2')

# Print the headlines
for headline in headlines:
    print(headline.text)

Other programming languages
The Next 50 Programming Languages
This Month's Changes in the Index
Very Long Term History
Programming Language Hall of Fame
Bugs & Change Requests
Get your own proof of concept


In [7]:
# Find the tables in the website
tables = soup.find_all('table')

# Print the tables
for table in tables:
    print(table)

<table class="table table-striped table-top20" id="top20">
<thead><tr>
<th style="width: 15%">Jan 2025</th>
<th style="width: 15%">Jan 2024</th>
<th style="width: 15%" title="Difference compared to last year">Change</th>
<th colspan="2" style="width: 25%">Programming Language</th>
<th style="width: 15%">Ratings</th>
<th style="width: 15%" title="Difference compared to last year">Change</th>
</tr></thead>
<tbody>
<tr><td>1</td><td>1</td><td></td><td class="td-top20"><img alt="Python page" src="/wp-content/themes/tiobe/tiobe-index/images/Python.png" style="vertical-align:middle"/></td><td>Python</td><td>23.28%</td><td>+9.32%</td></tr><tr><td>2</td><td>3</td><td><img alt="change" src="/wp-content/themes/tiobe/tpci/images/up.png"/></td><td class="td-top20"><img alt="C++ page" src="/wp-content/themes/tiobe/tiobe-index/images/C__.png" style="vertical-align:middle"/></td><td>C++</td><td>10.29%</td><td>+0.33%</td></tr><tr><td>3</td><td>4</td><td><img alt="change" src="/wp-content/themes/tiobe/

In [5]:
# Find the first table in the website
table = soup.find('table')

# Print the table
print(table)

# Print table parent
print(table.parent)

# Print table attributes
print(table.attrs)

# Print table id
print(table['id'])
print(table.get('id'))

<table class="table table-striped table-top20" id="top20">
<thead><tr>
<th style="width: 15%">Jan 2025</th>
<th style="width: 15%">Jan 2024</th>
<th style="width: 15%" title="Difference compared to last year">Change</th>
<th colspan="2" style="width: 25%">Programming Language</th>
<th style="width: 15%">Ratings</th>
<th style="width: 15%" title="Difference compared to last year">Change</th>
</tr></thead>
<tbody>
<tr><td>1</td><td>1</td><td></td><td class="td-top20"><img alt="Python page" src="/wp-content/themes/tiobe/tiobe-index/images/Python.png" style="vertical-align:middle"/></td><td>Python</td><td>23.28%</td><td>+9.32%</td></tr><tr><td>2</td><td>3</td><td><img alt="change" src="/wp-content/themes/tiobe/tpci/images/up.png"/></td><td class="td-top20"><img alt="C++ page" src="/wp-content/themes/tiobe/tiobe-index/images/C__.png" style="vertical-align:middle"/></td><td>C++</td><td>10.29%</td><td>+0.33%</td></tr><tr><td>3</td><td>4</td><td><img alt="change" src="/wp-content/themes/tiobe/

In [9]:
# Find the table with the class table-top20 and id is top20
table = soup.find('table', class_='table-top20', id='top20')

# Print the table
print(table)

<table class="table table-striped table-top20" id="top20">
<thead><tr>
<th style="width: 15%">Jan 2025</th>
<th style="width: 15%">Jan 2024</th>
<th style="width: 15%" title="Difference compared to last year">Change</th>
<th colspan="2" style="width: 25%">Programming Language</th>
<th style="width: 15%">Ratings</th>
<th style="width: 15%" title="Difference compared to last year">Change</th>
</tr></thead>
<tbody>
<tr><td>1</td><td>1</td><td></td><td class="td-top20"><img alt="Python page" src="/wp-content/themes/tiobe/tiobe-index/images/Python.png" style="vertical-align:middle"/></td><td>Python</td><td>23.28%</td><td>+9.32%</td></tr><tr><td>2</td><td>3</td><td><img alt="change" src="/wp-content/themes/tiobe/tpci/images/up.png"/></td><td class="td-top20"><img alt="C++ page" src="/wp-content/themes/tiobe/tiobe-index/images/C__.png" style="vertical-align:middle"/></td><td>C++</td><td>10.29%</td><td>+0.33%</td></tr><tr><td>3</td><td>4</td><td><img alt="change" src="/wp-content/themes/tiobe/

In [10]:
# Print the table columns
table_columns = table.find_all('th')
for column in table_columns:
    print(column.text)

Jan 2025
Jan 2024
Change
Programming Language
Ratings
Change


In [18]:
# Print the table rows
# Navigate the DOM tree to find the table rows
table_body = table.find('tbody')
table_rows = table_body.find_all('tr')

# Print the table rows
for row in table_rows:
    print(row)

<tr><td>1</td><td>1</td><td></td><td class="td-top20"><img alt="Python page" src="/wp-content/themes/tiobe/tiobe-index/images/Python.png" style="vertical-align:middle"/></td><td>Python</td><td>23.28%</td><td>+9.32%</td></tr>
<tr><td>2</td><td>3</td><td><img alt="change" src="/wp-content/themes/tiobe/tpci/images/up.png"/></td><td class="td-top20"><img alt="C++ page" src="/wp-content/themes/tiobe/tiobe-index/images/C__.png" style="vertical-align:middle"/></td><td>C++</td><td>10.29%</td><td>+0.33%</td></tr>
<tr><td>3</td><td>4</td><td><img alt="change" src="/wp-content/themes/tiobe/tpci/images/up.png"/></td><td class="td-top20"><img alt="Java page" src="/wp-content/themes/tiobe/tiobe-index/images/Java.png" style="vertical-align:middle"/></td><td>Java</td><td>10.15%</td><td>+2.28%</td></tr>
<tr><td>4</td><td>2</td><td><img alt="change" src="/wp-content/themes/tiobe/tpci/images/down.png"/></td><td class="td-top20"><img alt="C page" src="/wp-content/themes/tiobe/tiobe-index/images/C.png" sty

In [19]:
# Print the table data
first_row = table_rows[0]
print(first_row)

<tr><td>1</td><td>1</td><td></td><td class="td-top20"><img alt="Python page" src="/wp-content/themes/tiobe/tiobe-index/images/Python.png" style="vertical-align:middle"/></td><td>Python</td><td>23.28%</td><td>+9.32%</td></tr>


In [20]:
# Navigate the DOM tree to find the table data
second_row = first_row.find_next('tr')
print(second_row)

<tr><td>2</td><td>3</td><td><img alt="change" src="/wp-content/themes/tiobe/tpci/images/up.png"/></td><td class="td-top20"><img alt="C++ page" src="/wp-content/themes/tiobe/tiobe-index/images/C__.png" style="vertical-align:middle"/></td><td>C++</td><td>10.29%</td><td>+0.33%</td></tr>


In [22]:
# Navigate the next using next_sibling
third_row = second_row.find_next_sibling('tr')
print(third_row)

<tr><td>3</td><td>4</td><td><img alt="change" src="/wp-content/themes/tiobe/tpci/images/up.png"/></td><td class="td-top20"><img alt="Java page" src="/wp-content/themes/tiobe/tiobe-index/images/Java.png" style="vertical-align:middle"/></td><td>Java</td><td>10.15%</td><td>+2.28%</td></tr>


In [25]:
# Find first row data
first_row_data = first_row.find_all('td')
for data in first_row_data:
    # If the data has children, print the children
    if data.findChildren():
        print(data.findChildren())
    else:
        print(data.text)
    print(data.text)

1
1
1
1


[<img alt="Python page" src="/wp-content/themes/tiobe/tiobe-index/images/Python.png" style="vertical-align:middle"/>]

Python
Python
23.28%
23.28%
+9.32%
+9.32%


In [34]:
# List of the top 20 programming languages
top_20_languages = []

# Loop through the table rows
for row in table_rows:
    # Find the table data
    row_data = row.find_all('td')
    # Extract the data from the table data
    language = row_data[4].text
    top_20_languages.append(language)

# Print the top 20 programming languages
print("Top 20 Programming Languages")
for rank, language in enumerate(top_20_languages, start=1):
    print(f"{rank}. {language}")

Top 20 Programming Languages
1. Python
2. C++
3. Java
4. C
5. C#
6. JavaScript
7. Go
8. SQL
9. Visual Basic
10. Fortran
11. Delphi/Object Pascal
12. Scratch
13. PHP
14. Rust
15. MATLAB
16. Ruby
17. Assembly language
18. R
19. Swift
20. COBOL
