In [4]:
# import the requests library (1 line)
import requests
from bs4 import BeautifulSoup

You will start with the basics: how to do a simple request to an API endpoint.

You will use the requests external library through the import keyword. NOTE: external libraries need to be installed first. Check the request Quickstart section of the documentation to:

Use the get() method to connect to this endpoint: https://country-leaders.onrender.com/status
Check if the status_code is equal to 200, which means OK.
if OK, print() the `text`` of the response.
if not, print() the status_code.
Here is an explanation of HTTP status codes.

In [181]:
# assign the root url (without /status) to the root_url variable for ease of reference (1 line)
root_url = "https://country-leaders.onrender.com"

# assign the /status endpoint to another variable called status_url (1 line)
status_url = "/status"

# query the /status endpoint using the get() method and store it in the req variable (1 line)
base_url = requests.get(root_url + status_url)
# check the status_code using a condition and print appropriate messages (4 lines)

if base_url.status_code == 200:
    print(base_url)
else:
    print(f"Request failed with status code {base_url.status_code}")



<Response [200]>


Cookies anyone? -
It looks like the access to this API is restricted... Query the /cookie endpoint and extract the appropriate field to access your cookie.

You will need to use this cookie in each of the following API requests.

In [185]:
cookies_endpoint = "/cookie"
# Query the enpoint, set the cookies variable and display it (2 lines)
cookies_url = root_url + cookies_endpoint
req = requests.get(cookies_url)
cookie = req.cookies
print(cookie)

<RequestsCookieJar[<Cookie user_cookie=51c152af-0a36-40c7-8f6f-bc3606aa0f76 for country-leaders.onrender.com/>]>


Dealing with JSON:
JSON is the preferred format to deal with data over the web. You cannot avoid it so you would better get acquainted

In [182]:
#session = requests.Session()
countries_endpoint = "/countries"
countries_url = root_url + countries_endpoint
response = requests.get(countries_url, cookies = cookie)

if response.status_code == 200:
    countries = response.json()
    print(countries)
else:
    print(response.status_code, response.text)

['fr', 'be', 'ma', 'us', 'ru']


Getting the actual data from the API -> Query the /leaders endpoint.

In [186]:
# Set the leaders_url variable (1 line)
leaders_endpoint = "/leaders"
leaders_url = root_url + leaders_endpoint
# query the /leaders endpoint, assign the output to the leaders variable (1 line)
param = {"country": "fr"}
req = requests.get(leaders_url, params = param, cookies = cookie)
# display the leaders variable (1 line)
leaders = req.json()
leaders


[{'id': 'Q157',
  'first_name': 'François',
  'last_name': 'Hollande',
  'birth_date': '1954-08-12',
  'death_date': None,
  'place_of_birth': 'Rouen',
  'wikipedia_url': 'https://fr.wikipedia.org/wiki/Fran%C3%A7ois_Hollande',
  'start_mandate': '2012-05-15',
  'end_mandate': '2017-05-14'},
 {'id': 'Q329',
  'first_name': 'Nicolas',
  'last_name': 'Sarkozy',
  'birth_date': '1955-01-28',
  'death_date': None,
  'place_of_birth': 'Paris',
  'wikipedia_url': 'https://fr.wikipedia.org/wiki/Nicolas_Sarkozy',
  'start_mandate': '2007-05-16',
  'end_mandate': '2012-05-15'},
 {'id': 'Q2038',
  'first_name': 'François',
  'last_name': 'Mitterrand',
  'birth_date': '1916-10-26',
  'death_date': '1996-01-08',
  'place_of_birth': 'Jarnac',
  'wikipedia_url': 'https://fr.wikipedia.org/wiki/Fran%C3%A7ois_Mitterrand',
  'start_mandate': '1981-05-21',
  'end_mandate': '1995-05-17'},
 {'id': 'Q2042',
  'first_name': 'Charles',
  'last_name': 'de Gaulle',
  'birth_date': '1890-11-22',
  'death_date': '

 A sneak peak at the data (finally) -> Look inside a few examples. Notice the dictionary keys available for each entry. You have your first example of structured data. This data was sanitized for your benefit, meaning it is readily exploitable without modification.

You will also notice there is a Wikipedia link for each entry. You will need to extract additional information there. This will be a case of semi-structured data.

The /countries endpoint returns a list of several country codes.

You need to loop through this list and query the /leaders endpoint for each one. Save each json result in a dictionary called leaders_per_country.



In [176]:
def get_leaders():
    base_url = "https://country-leaders.onrender.com"
    cookie_url = base_url + cookies_endpoint
    countries_url = base_url + countries_endpoint
    leaders_url = base_url + leaders_endpoint

    # Get the cookie
    cookie = requests.get(cookie_url).cookies

    # Get the list of countries
    countries = requests.get(countries_url, cookies=cookie).json()

    leaders_per_country = {}
    for country in countries:
        # Query leaders for each country using cookie and params
        res = requests.get(leaders_url, cookies=cookie, params={"country": country})
        if res.status_code == 200:
            leaders_per_country[country] = res.json()
        else:
            leaders_per_country[country] = []

    return leaders_per_country

# Test the function


In [148]:
leaders_per_country = get_leaders()
print(leaders_per_country)

{'be': [{'id': 'Q12978', 'first_name': 'Guy', 'last_name': 'Verhofstadt', 'birth_date': '1953-04-11', 'death_date': None, 'place_of_birth': 'Dendermonde', 'wikipedia_url': 'https://nl.wikipedia.org/wiki/Guy_Verhofstadt', 'start_mandate': '1999-07-12', 'end_mandate': '2008-03-20'}, {'id': 'Q12981', 'first_name': 'Yves', 'last_name': 'Leterme', 'birth_date': '1960-10-06', 'death_date': None, 'place_of_birth': 'Wervik', 'wikipedia_url': 'https://nl.wikipedia.org/wiki/Yves_Leterme', 'start_mandate': '2009-11-25', 'end_mandate': '2011-12-06'}, {'id': 'Q12983', 'first_name': 'Herman', 'last_name': 'None', 'birth_date': '1947-10-31', 'death_date': None, 'place_of_birth': 'Etterbeek', 'wikipedia_url': 'https://nl.wikipedia.org/wiki/Herman_Van_Rompuy', 'start_mandate': '2008-12-30', 'end_mandate': '2009-11-25'}, {'id': 'Q14989', 'first_name': 'Léon', 'last_name': 'Delacroix', 'birth_date': '1867-12-27', 'death_date': '1929-10-15', 'place_of_birth': 'Saint-Josse-ten-Noode', 'wikipedia_url': 'htt

Extracting data from Wikipedia ->
Query one of the leaders' Wikipedia urls and display its text (not JSON).

In [187]:
import requests
from bs4 import BeautifulSoup

# 1. Query one Wikipedia URL and display the raw HTML text (not JSON)
wikipedia_url = "https://nl.wikipedia.org/wiki/Yves_Leterme"  # example URL
response = requests.get(wikipedia_url)
print(response.text[:120])  # print first 50 chars of raw HTML to check

# 2. Use BeautifulSoup to parse the HTML and prettify the output
soup = BeautifulSoup(response.text, "html.parser")
print(soup.prettify()[:120])  # print first 50 chars prettified

# 3. Extract all paragraphs and save to paragraphs variable
paragraphs = soup.find_all("p")
print(f"Found {len(paragraphs)} paragraphs")

# 4. Loop over paragraphs to find the first meaningful paragraph text
first_paragraph = ""
for p in paragraphs:
    text = p.get_text().strip()
    # Condition: paragraph must have enough length and not be empty or contain only references
    if len(text) > 50:
        first_paragraph = text
        break

print(first_paragraph)

# 5. Wrap the above into a reusable function
def get_first_paragraph(wikipedia_url):
    print(wikipedia_url)  # Keep this print for debugging

    res = requests.get(wikipedia_url)
    if res.status_code != 200:
        return "Failed to load Wikipedia page"

    soup = BeautifulSoup(res.text, "html.parser")
    paragraphs = soup.find_all("p")

    for p in paragraphs:
        text = p.get_text().strip()
        if len(text) > 50:
            return text
    return "No valid paragraph found."

# Test the function with an example
test_url = "https://nl.wikipedia.org/wiki/Yves_Leterme"
print(get_first_paragraph(test_url))

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-
Found 47 paragraphs
Yves Camille Désiré Leterme (Wervik, 6 oktober 1960) is een voormalig Belgisch politicus. Hij was onder meer Vlaams minister-president en premier van België. Hij was lang boegbeeld van de CD&V en boegbeeld van het toenmalige kartel CD&V/N-VA.
https://nl.wikipedia.org/wiki/Yves_Leterme
Yves Camille Désiré Leterme (Wervik, 6 oktober 1960) is een voormalig Belgisch politicus. Hij was onder meer Vlaams minister-president en premier van België. Hij was lang boegbeeld van de CD&V en boegbeeld van het toenmalige kartel CD&V/N-VA.


Regular expressions to the rescue ->
Now that you have extracted the content of the first paragraph, the only thing that remains to finish your Wikipedia scraper is to sanitize the output.

Indeed some Wikipedia references, HTML code, phonetic pronunciation etc. may linger. You might find regular expressions handy to get rid of them and obtain pristine text. You will find some useful documentation about regular expressions here

Once you have one of your regex working online, try it in the cell below.

In [189]:
import re
import requests
from bs4 import BeautifulSoup

def get_first_paragraph(wikipedia_url):

    print(wikipedia_url)
    res = requests.get(wikipedia_url)
    soup = BeautifulSoup(res.text, "html.parser")
    paragraphs = soup.find_all("p")

    for p in paragraphs:
        text = p.get_text().strip()
        if len(text) > 50:
            # Remove multiple spaces/newlines
            text = re.sub(r"\s+", " ", text)
            # Remove phonetic pronunciations within slashes or parentheses
            text = re.sub(r"\((?:IPA:)?[^)]*\)", "", text)
            # Remove curly braces or other brackets
            text = re.sub(r"\{[^}]*\}", "", text)
            # Remove multiple spaces/newlines
            text = re.sub(r"\s+", " ", text)
            return text.strip()
    return "No valid paragraph found."

# Test it
test_url = "https://nl.wikipedia.org/wiki/Yves_Leterme"
print(get_first_paragraph(test_url))

https://nl.wikipedia.org/wiki/Yves_Leterme
Yves Camille Désiré Leterme is een voormalig Belgisch politicus. Hij was onder meer Vlaams minister-president en premier van België. Hij was lang boegbeeld van de CD&V en boegbeeld van het toenmalige kartel CD&V/N-VA.


In [179]:
import requests
from bs4 import BeautifulSoup
import re
import time

def get_first_paragraph(wikipedia_url, session=None):
    if session is None:
        session = requests.Session()
    print(wikipedia_url)
    res = session.get(wikipedia_url)
    if res.status_code != 200:
        raise Exception(f"Failed to load Wikipedia page: {res.status_code}")

    soup = BeautifulSoup(res.text, "html.parser")
    paragraphs = soup.find_all("p")
    for p in paragraphs:
        text = p.get_text().strip()
        if len(text) > 50:
            # Remove phonetic pronunciations within slashes or parentheses
            text = re.sub(r"\((?:IPA:)?[^)]*\)", "", text)
            # Remove curly braces or other brackets
            text = re.sub(r"\{[^}]*\}", "", text)
            # Remove HTML character entities (&nbsp;, &amp; etc.)
            text = re.sub(r"&\w+?;", "", text)
            # Remove multiple spaces/newlines
            text = re.sub(r"\s+", " ", text)
            return text.strip()
    return "No valid paragraph found."

def get_cookie(base_url):
    res = requests.get(base_url + cookies_endpoint)
    if res.status_code != 200:
        raise Exception("Failed to get cookie")
    return res.cookies

def get_countries(base_url, cookies):
    res = requests.get(base_url + countries_endpoint, cookies=cookies)
    if res.status_code != 200:
        raise Exception("Failed to get countries")
    return res.json()

def get_leaders(base_url):
    cookie = get_cookie(base_url)
    countries = get_countries(base_url, cookie)
    leaders_per_country = {}
    session = requests.Session()

    for country in countries:
        res = requests.get(base_url + "/leaders", params={"country": country}, cookies=cookie)
        if res.status_code == 403:  # cookie expired or forbidden
            cookie = get_cookie(base_url)
            res = requests.get(base_url + "/leaders", params={"country": country}, cookies=cookie)
        res.raise_for_status()
        leaders = res.json()

        for leader in leaders:
            wiki_url = leader.get("wikipedia_url")
            if wiki_url:
                leader["first_paragraph"] = get_first_paragraph(wiki_url, session)

            else:
                leader["first_paragraph"] = "No Wikipedia URL available"
        leaders_per_country[country] = leaders

    return leaders_per_country

# Test
base_url = "https://country-leaders.onrender.com"
leaders_per_country = get_leaders(base_url)
print(next(iter(leaders_per_country.items())))  # print one sample entry


https://fr.wikipedia.org/wiki/Fran%C3%A7ois_Hollande
https://fr.wikipedia.org/wiki/Nicolas_Sarkozy
https://fr.wikipedia.org/wiki/Fran%C3%A7ois_Mitterrand
https://fr.wikipedia.org/wiki/Charles_de_Gaulle
https://fr.wikipedia.org/wiki/Jacques_Chirac
https://fr.wikipedia.org/wiki/Val%C3%A9ry_Giscard_d%27Estaing
https://fr.wikipedia.org/wiki/Georges_Pompidou
https://fr.wikipedia.org/wiki/Adolphe_Thiers
https://fr.wikipedia.org/wiki/Napol%C3%A9on_III
https://fr.wikipedia.org/wiki/Paul_Doumer
https://fr.wikipedia.org/wiki/Alain_Poher
https://fr.wikipedia.org/wiki/Albert_Lebrun
https://fr.wikipedia.org/wiki/Ren%C3%A9_Coty
https://fr.wikipedia.org/wiki/Vincent_Auriol
https://fr.wikipedia.org/wiki/Patrice_de_Mac_Mahon
https://fr.wikipedia.org/wiki/%C3%89mile_Loubet
https://fr.wikipedia.org/wiki/Raymond_Poincar%C3%A9
https://fr.wikipedia.org/wiki/Sadi_Carnot_(homme_d%27%C3%89tat)
https://fr.wikipedia.org/wiki/Alexandre_Millerand
https://fr.wikipedia.org/wiki/Gaston_Doumergue
https://fr.wikipedia.

Putting it all together ->
Let's go back to your get_leaders() function and update it with an inner loop over each leader. You will query the url provided and extract the first paragraph using the get_first_paragraph() function you just finished. You will then update that leader's dictionary and move on to the next one.

Notice, the rest of the code should not change since you modify the leader's data one by one.

In [170]:
import json

# Save the leaders_per_country dictionary to a JSON file (3 lines)
def save(leaders_per_country, filename="leaders.json"):
    with open(filename, "w", encoding="utf-8") as file:
        json.dump(leaders_per_country, file, indent = 4, ensure_ascii=False)

# Read the file back and check if data is consistent (3 lines)
def load(filename="leaders.json"):
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(file)
    return data

# Call the save function (1 line)
save(leaders_per_country)

# test reading and comparing
loaded_data = load()
print(loaded_data == leaders_per_country)


True


In [188]:
# src/leaders_scraper.py
import requests
from bs4 import BeautifulSoup
import json
import re


class CookieExpiredException(Exception):
    pass

class WikipediaScraper:
    def __init__(self):
        self.base_url = "https://country-leaders.onrender.com"
        self.country_endpoint = "/countries"
        self.leaders_endpoint = "/leaders"
        self.cookies_endpoint = "/cookie"
        self.leaders_data = {}
        self.cookie = self.refresh_cookie()

    def refresh_cookie(self):
        res = requests.get(self.base_url + self.cookies_endpoint)
        if res.status_code != 200:
            raise CookieExpiredException("Failed to refresh cookie")
        return res.cookies

    def get_countries(self):
        res = requests.get(self.base_url + self.country_endpoint, cookies=self.cookie)
        if res.status_code != 200:
            raise CookieExpiredException("Cookie expired  while getting countries")
        return res.json()

    def get_leaders(self, country):
        res = requests.get(self.base_url + self.leaders_endpoint, cookies=self.cookie, params={"country": country})
        if res.status_code != 200:
            raise CookieExpiredException("Cookie expired while getting leaders")
        leaders = res.json()

        for leader in leaders:
            wiki_url = leader.get("wikipedia_url")
            if wiki_url:
                try:
                    paragraph = self.get_first_paragraph(wiki_url)
                    leader["first_paragraph"] = paragraph
                except Exception as error:
                    leader["first_paragraph"] = f"Error: {str(error)}"
            else:
                leader["first_paragraph"] = "No Wikipedia URL available"
        self.leaders_data[country] = leaders

    def get_first_paragraph(self, wikipedia_url):
        print(f"Scraping: {wikipedia_url}")
        res = requests.get(wikipedia_url)
        if res.status_code != 200:
            raise Exception("Failed to load Wikipedia page")

        soup = BeautifulSoup(res.text, "html.parser")
        paragraphs = soup.find_all("p")

        for p in paragraphs:
            text = p.get_text().strip()
            if len(text) > 50:
                # Remove phonetic pronunciations within slashes or parentheses
                text = re.sub(r"\((?:IPA:)?[^)]*\)", "", text)
                # Remove curly braces or other brackets
                text = re.sub(r"\{[^}]*\}", "", text)
                # Remove HTML character entities (&nbsp;, &amp; etc.)
                text = re.sub(r"&\w+?;", "", text)
                # Remove multiple spaces/newlines
                text = re.sub(r"\s+", " ", text)
                return text

        return "No valid paragraph found."

    def to_json_file(self, filepath):
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(self.leaders_data, f, indent = 4, ensure_ascii = False)


In [None]:
leaders_per_country = {country['name']: requests.get(f"{root_url}/leaders?country={country['code']}").json() for country in countries}