In [1]:
import requests
from bs4 import BeautifulSoup

In [24]:
# This cell is to be skipped if you will be scraping data from a locally saved XML/HTTP file.
# Simply open the file and assign it to a variable i.e
# file = saved_file
# Then instantiate your soup with the file;
# soup = Beautifulsoup('file')

# URL link to the website being used in the article.
URL = 'https://devoxsoftware.com/blog/top-50-programming-languages-to-learn-in-2022/'

# Response request stored in response variable.
response = requests.get(URL)

In [3]:
# Extracted html content stored inside html_content variable.
html_content = response.content

# Site content Encoding stored in encoding variable.
encoding = response.encoding

# Request status code stored in status_code variable.
status_code = response.status_code

# Outputs
print('Status - ', status_code)
print('Content encoding -', encoding)
# Un-comment next line and re-run this cell to view the raw content of the HTML file.
# print('HTML CONTENT \n\n', html_content)

Status -  200
Content encoding - UTF-8


In [17]:
# Instantiation of beautifulsoup
soup = BeautifulSoup(html_content)


# In an event when your soup isn't working and in the error message theres a suggestion to download a parser
# You can simply enter 'pip install lxml' in your terminal and instantiate your soup as seen below
# You may have to restart your kernel before the parser will be recognized by your environment.
# ------------------------------------------------------------------------------------------------ #
# soup = BeautifulSoup(html_content, 'lxml')
# ------------------------------------------------------------------------------------------------ #

# Formatting / beautifying of HTML content.
formatted_view = soup.prettify()
print('#######  Formatted HTML content view #######\n\n')
# Un-comment next line and re-run this cell to view the formatted content of the HTML file.
# print(formatted_view)

#######  Formatted HTML content view #######




In [18]:
# NOTE: The '.text' method is what pulls out the text content or else you will
# have the entire line of code returned to you. Try experimenting and without the '.text'  and see
# the output that will be returned to you.
 

# Method 1, using find.
# This method will check for and return the first h1 tag found in the HTML file.
heading = soup.find('h1')
print("First h1 tag text content")
print("Heading using method 1 -", heading.text.strip())

# Method 2, using findAll or find_all, NB: They both perform the same function.
# This method will check for all h1 tags and store them as a list.
# In this website, there is only 1 h1 tag so it will be returning a list containing just 1 tag.
print("\n List containing all h1 tags")
headings = soup.findAll('h1')
# This will output a list which you have to loop through and apply '.text'
# Prints out list of headings.
print(headings, '\n')
print("all h1 tag text contents")
for head in headings:
    print(head.text.strip())

First h1 tag text content
Heading using method 1 - Top 50 Programming Languages to Learn in 2022

 List containing all h1 tags
[<h1 class="post-single__title">
        Top 50 Programming Languages to Learn in 2022    </h1>] 

all h1 tag text contents
Top 50 Programming Languages to Learn in 2022


In [19]:
# Extracting the first language alone.
language = soup.find('h3')
soup.find()
print("First language is -", language.text, '\n')

# Extracting all languages on the web page
# I'll be using find_all instead of findAll here just to show that they both work.
languages_list = soup.find_all('h3')
# Due to the fact that I made use of the find_all method here, I can't apply '.text' because the results are in a list
# I have to loop through the list before then applying the '.text' method.


print("Top 50 languges are:")
for languages in languages_list:
    print(languages.text)

First language is - C  

Top 50 languges are:
C 
Python
Java
C++
C#
Virtual Basic
JavaScript
PHP
Assembly Language
SQL
Groovy
Classic Visual Basic
Fortran
R
Ruby 
Swift
Matlab
Go
Prolog
Perl
SAS
Delphi/Object Pascal 
Objective-C
Rust
Scratch
Julia
Ada
Lisp
Dart
PL/SQL
(Visual) FoxPro
Scala
ABAP
COBOL
Logo
F#
Kotlin
Transact-SQL
Lua
Scheme
Ladder Logic
VBScript
D
Clojure
LabVIEW
Nim
VHDL
Apex
TypeScript
Bash
Final words


In [30]:
# Extracting all links on web page

# Finding all anchor tags. Anchor tags are used to create hyper links on web pages.
all_anchors = soup.findAll('a')
# All anchor tags are found and stored inside a list. Loop through and apply the get method to extract the links.
for anchor in all_anchors:
    print(anchor.get('href'))

https://devoxsoftware.com
https://devoxsoftware.com/expertise/
https://devoxsoftware.com/expertise/outsource-web-development/
https://devoxsoftware.com/expertise/outsource-web-design/
https://devoxsoftware.com/expertise/outsource-mobile-development/
https://devoxsoftware.com/expertise/qa-outsourcing/
https://devoxsoftware.com/expertise/devops-consulting/
https://devoxsoftware.com/expertise/
https://devoxsoftware.com/technology/
https://devoxsoftware.com/technology/angular-development-company/
https://devoxsoftware.com/technology/microsoft-net-development-company/
https://devoxsoftware.com/technology/java-development-company/
https://devoxsoftware.com/technology/python-development-company/
https://devoxsoftware.com/technology/react-js-development-services/
https://devoxsoftware.com/technology/vue-js-development-company/
https://devoxsoftware.com/technology/node-js-development-services/
https://devoxsoftware.com/technology/
https://devoxsoftware.com/services/
https://devoxsoftware.com/se

In [32]:
# Making use of the already existing all_anchors list
# Looping through and checking for anchor tag whose text content is 'Services'

for anchor in all_anchors:
    if anchor.text == 'Services':
        print('Services link is -', anchor.get('href'),'.')

Services link is - https://devoxsoftware.com/services/ .
Services link is - https://devoxsoftware.com/services/ .


In [23]:
# Searching for all spans
spans = soup.findAll('span')

# Looping through span list and checking for span whose text is 'the language of the year'
# Then extracting the href value of its parent class.

for span in spans:
    if span.text == 'the language of the year':
        print(span.parent.get('href'))

https://www.tiobe.com/tiobe-index/cplusplus/
