In [1]:
import requests
import pandas as pd
import matplotlib.pyplot as plt
# For async requests
import aiohttp
import asyncio
import nest_asyncio


In [2]:
def _fetch_page(endpoint, query):
    """Issue one GET request and return the ``(metadata, data)`` pair from its JSON body."""
    response = requests.get(endpoint, params=query)
    # Surface HTTP errors as requests.exceptions.HTTPError (a RequestException)
    # instead of failing later while decoding an error page.
    response.raise_for_status()
    payload = response.json()
    if not isinstance(payload, list) or len(payload) != 2:
        # The body is expected to be exactly [metadata, data]; anything else
        # cannot be unpacked, so report it with the offending payload.
        raise ValueError(f"Unexpected response from {endpoint}: {payload!r}")
    metadata, data = payload
    return metadata, data


def get_all_data(endpoint, params=None):

    """
    Fetches and retrieves data from a specified API endpoint and returns it as a Pandas DataFrame.

    Args:
        endpoint (str): The URL of the API endpoint to retrieve data from.
        params (dict, optional): Parameters to include in the API request. Defaults to
            {"format": "json"} when omitted. The dict passed in is never modified.

    Returns:
        pd.DataFrame: A Pandas DataFrame containing the retrieved data.

    Raises:
        requests.exceptions.RequestException: If there is an issue with the HTTP request
            (including a 4xx/5xx status).
        ValueError: If the JSON body is not a two-item ``[metadata, data]`` list.

    Note:
        This function sends an initial request to the specified API endpoint and checks if there is more data available
        than can be retrieved in a single response (metadata "total" > metadata "per_page"). If so, it sets the
        'per_page' parameter to the total and sends a second request that returns everything at once. The data is
        then converted into a Pandas DataFrame.
    """

    # Work on a private copy: 'per_page' may be added below, and that must not
    # leak into the caller's dict or into later calls that rely on the default.
    query = {"format": "json"} if params is None else dict(params)

    metadata, data = _fetch_page(endpoint, query)
    if int(metadata["total"]) > int(metadata["per_page"]):
        query["per_page"] = metadata["total"]
        metadata, data = _fetch_page(endpoint, query)
    return pd.DataFrame(data)


In [3]:
# Load the GDP-per-capita indicator for all countries and keep three columns.
endpoint = "https://api.worldbank.org/v2/country/all/indicator/NY.GDP.PCAP.PP.KD"

gdp_percapita_df = (
    get_all_data(endpoint)
    # Each "country" entry is a mapping; pull out its "value" field as the name.
    .assign(Country_Name=lambda frame: frame["country"].map(lambda entry: entry["value"]))
    .rename(columns={"value": "GDP_Per_Capita"})
)
cols_to_drop = [
    name
    for name in gdp_percapita_df.columns
    if name not in {"Country_Name", "date", "GDP_Per_Capita"}
]
gdp_percapita_df = gdp_percapita_df.drop(columns=cols_to_drop)
gdp_percapita_df.shape

(16758, 3)

In [4]:
# Load the life-expectancy indicator for all countries and keep three columns.
endpoint = "https://api.worldbank.org/v2/country/all/indicator/SP.DYN.LE00.IN"

gdp_le_df = (
    get_all_data(endpoint)
    # Each "country" entry is a mapping; pull out its "value" field as the name.
    .assign(Country_Name=lambda frame: frame["country"].map(lambda entry: entry["value"]))
    .rename(columns={"value": "Life_Expectancy"})
)
cols_to_drop = [
    name
    for name in gdp_le_df.columns
    if name not in {"Country_Name", "date", "Life_Expectancy"}
]
gdp_le_df = gdp_le_df.drop(columns=cols_to_drop)
gdp_le_df.shape


(16758, 3)

In [5]:
# Join the two indicators on country and year, then discard rows where any
# column is missing.
merged_df = gdp_percapita_df.merge(gdp_le_df, on=["Country_Name", "date"]).dropna()

In [6]:
endpoint = "http://api.worldbank.org/v2/country"

# Keep only entries with a non-empty capitalCity, select the columns of
# interest, and rename "name" so it lines up with the indicator frames.
countries_df = (
    get_all_data(endpoint)
    .loc[
        lambda frame: frame["capitalCity"].astype(bool),
        ['iso2Code', 'name', 'capitalCity', 'longitude', 'latitude'],
    ]
    .rename(columns={"name": "Country_Name"})
)

In [7]:
# Attach the country details to every (country, year) row of the merged indicators.
triple_merged_df = countries_df.merge(merged_df, on=["Country_Name"], how="inner")
triple_merged_df.head()

Unnamed: 0,iso2Code,Country_Name,capitalCity,longitude,latitude,date,GDP_Per_Capita,Life_Expectancy
0,AW,Aruba,Oranjestad,-70.0167,12.5167,2021,38866.333486,74.626
1,AW,Aruba,Oranjestad,-70.0167,12.5167,2020,33155.243239,75.723
2,AW,Aruba,Oranjestad,-70.0167,12.5167,2019,40780.516485,76.248
3,AW,Aruba,Oranjestad,-70.0167,12.5167,2018,40706.749216,76.072
4,AW,Aruba,Oranjestad,-70.0167,12.5167,2017,38865.188195,75.903


In [8]:
# United States only, no date filter
endpoint = "http://api.worldbank.org/v2/country/us/indicator/SP.POP.TOTL"
df = get_all_data(endpoint=endpoint)

In [9]:
# United States only, restricted to the year 2021
endpoint = "http://api.worldbank.org/v2/country/us/indicator/SP.POP.TOTL"
params = dict(format="json", date=2021)

df = get_all_data(endpoint, params=params)
df

Unnamed: 0,indicator,country,countryiso3code,date,value,unit,obs_status,decimal
0,"{'id': 'SP.POP.TOTL', 'value': 'Population, to...","{'id': 'US', 'value': 'United States'}",USA,2021,332031554,,,0


In [10]:
# United States only, date range 2000:2021
endpoint = "http://api.worldbank.org/v2/country/us/indicator/SP.POP.TOTL"
params = dict(format="json", date="2000:2021")

df = get_all_data(endpoint, params=params)
df.shape

(22, 8)

In [11]:
# United States and Canada ("us;ca" in the path), date range 2000:2021
endpoint = "http://api.worldbank.org/v2/country/us;ca/indicator/SP.POP.TOTL"
params = dict(format="json", date="2000:2021")

df = get_all_data(endpoint, params=params)
df.shape

(44, 8)

In [12]:
# Synchronous solution: walk the paginated endpoint one request at a time.
response_data = []  # accumulates the records from every page
endpoint = "https://api.worldbank.org/v2/country/all/indicator/NY.GDP.PCAP.PP.KD"
params = dict(format="json", page=1)  # optionally add "date": "2010:2021"

# The first page's metadata reports how many pages exist in total.
response = requests.get(endpoint, params=params)
metadata, data = response.json()
response_data.extend(data)
total_pages = metadata["pages"]

page = 1  # stays 1 when there is only a single page
for page in range(2, total_pages + 1):
    params["page"] = page
    response = requests.get(endpoint, params=params)
    metadata, data = response.json()
    response_data.extend(data)
    # Report progress every 50 pages.
    if not page % 50:
        print(f"recieved page {page} of {total_pages}")



recieved page 50 of 336
recieved page 100 of 336
recieved page 150 of 336
recieved page 200 of 336
recieved page 250 of 336
recieved page 300 of 336


In [13]:
# Async solution: request the pages concurrently instead of one after another.

response_data = []  # accumulates the records received from every page
# NOTE(review): fetch_all assigns its own local `total_pages` (no `global`
# statement), so this module-level value appears to stay 0 — confirm whether
# it is still needed.
total_pages = 0

async def fetch_data(session, page):
    """
    Request a single page of the NY.GDP.PCAP.PP.KD indicator from the World Bank API.

    Args:
        session (aiohttp.ClientSession): Open session used to issue the GET request.
        page (int): Number of the page to request.

    Returns:
        The decoded JSON body of the response. The caller in this notebook
        unpacks it as a two-item ``[metadata, data]`` sequence.
    """
    url = "https://api.worldbank.org/v2/country/all/indicator/NY.GDP.PCAP.PP.KD"
    query = dict(format="json", page=page)  # optionally add "date": "2010:2021"
    async with session.get(url, params=query) as reply:
        return await reply.json()

async def fetch_all():
    """
    Download every page of the indicator and append all records to ``response_data``.

    Page 1 is requested first to learn the page count; the remaining pages are
    then requested together and their records appended in page order.

    Returns:
        None
    """
    async with aiohttp.ClientSession() as session:
        # Page 1 is fetched on its own because its metadata carries the page count.
        first_meta, first_rows = await fetch_data(session, 1)
        response_data.extend(first_rows)
        total_pages = first_meta["pages"]

        # One coroutine per remaining page (none when there is a single page),
        # all awaited together; results come back in the order they were passed.
        remaining = await asyncio.gather(
            *(fetch_data(session, number) for number in range(2, total_pages + 1))
        )
        print("All data recieved!....Please wait as I put it together.....")

        for _meta, rows in remaining:
            response_data.extend(rows)
        print('Done!')

# NOTE(review): nest_asyncio.apply() is presumably here so that asyncio.run()
# can be called while the notebook kernel's own event loop is already
# running — confirm.
nest_asyncio.apply()
# Run the coroutine to completion; it returns None and fills response_data.
asyncio.run(fetch_all())



All data recieved!....Please wait as I put it together.....
Done!


In [15]:
endpoint = "http://api.worldbank.org/v2/indicator"

indicators_df = get_all_data(endpoint)
# Keep the indicators whose name begins with the phrase of interest, then show their ids.
result = indicators_df.loc[
    lambda frame: frame["name"].str.startswith("Public Expenditure on Education")
]
result["id"]

16623    SE.XPD.EDUC.ZS
Name: id, dtype: object