# World Bank: Income Groups

## Parameters

In [1]:
# Directory in which the garden dataset is created at the end of this notebook.
# NOTE(review): presumably a default value that the notebook runner overrides — confirm.
dest_dir = "/tmp/wb_income"

## Imports & Paths
Import the required libraries and define the paths to the input files: the Meadow dataset and the JSON mappings used to standardise country and income group names.

In [2]:
import json
from pathlib import Path
import pandas as pd
from owid import catalog
from etl.paths import BASE_DIR as base_path

In [3]:
# Input locations, all built from `base_path`.
# Both JSON mappings live in the same garden step directory, so build it once.
garden_step_dir = base_path / "etl" / "steps" / "data" / "garden" / "wb" / "2021-07-01"

# Meadow dataset to load
path_dataset = base_path / "data" / "meadow" / "wb" / "2021-07-01" / "wb_income"
# Mapping used to harmonize the `entity` column
path_country_mapping = garden_step_dir / "wb_income.country_mapping.json"
# Mapping used to harmonize the `income_group` column
path_income_mapping = garden_step_dir / "wb_income.income_mapping.json"

## Load meadow datasets
In this step we load the required dataset from Meadow: the World Bank income classification (`wb_income`).

In [42]:
# Read datasets
# Open the meadow dataset located at `path_dataset`.
ds_meadow = catalog.Dataset(path_dataset)

In [43]:
# Load the "bulk" table from the meadow dataset
df = ds_meadow["bulk"]

In [44]:
# Print the table dimensions, then preview its first rows
print(df.shape)
df.head()

(265, 5)


Unnamed: 0_level_0,Code,Region,Income group,Lending category,Other (EMU or HIPC)
Economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aruba,ABW,Latin America & Caribbean,High income,,
Afghanistan,AFG,South Asia,Low income,IDA,HIPC
Angola,AGO,Sub-Saharan Africa,Lower middle income,IBRD,
Albania,ALB,Europe & Central Asia,Upper middle income,IBRD,
Andorra,AND,Europe & Central Asia,High income,,


## Clean dataset

### Drop rows and columns

In [45]:
# Drop supranational regions
# Rows with a missing "Region" value are removed; presumably these are the
# aggregate (non-country) entries — confirm against the source data.
df = df.dropna(subset=["Region"])

In [46]:
# Rename & drop columns
# Keys are the source column names to keep; values are the names used from here on.
column_keep_rename = {
    "Economy": "entity",
    "Income group": "income_group",
}
# The preview of the loaded table shows "Economy" as the index, so turn it
# into a regular column before selecting it.
df = df.reset_index()
# Select with an explicit list of labels: indexing a DataFrame with a dict is
# deprecated since pandas 1.5 and raises TypeError from pandas 2.0 onwards.
df = df[list(column_keep_rename)].rename(columns=column_keep_rename)

In [47]:
# Preview the first rows after selecting and renaming columns
df.head()

Unnamed: 0,entity,income_group
0,Aruba,High income
1,Afghanistan,Low income
2,Angola,Lower middle income
3,Albania,Upper middle income
4,Andorra,High income


### Harmonize entities

In [48]:
# Harmonize entities
# Load the mapping used to rewrite values of the `entity` column
# (presumably source country names -> standardised names; confirm in the JSON file).
# JSON is UTF-8 by specification, so pass the encoding explicitly: without it
# `open` uses the platform's locale codec, which can mangle non-ASCII names.
with open(path_country_mapping, encoding="utf-8") as f:
    country_mapping = json.load(f)
# `replace` leaves values that are not keys of the mapping unchanged.
df = df.assign(entity=df.entity.replace(country_mapping))

### Harmonize income group names

In [49]:
# Harmonize income group names
# Load the mapping used to rewrite values of the `income_group` column.
# JSON is UTF-8 by specification, so pass the encoding explicitly rather than
# relying on the platform's locale codec.
with open(path_income_mapping, encoding="utf-8") as f:
    income_mapping = json.load(f)
# `replace` leaves values that are not keys of the mapping unchanged.
df = df.assign(income_group=df.income_group.replace(income_mapping))

### Add extra countries

In [50]:
# Additional entities to append to the table. Every one of them is assigned
# the same income group label.
extra_income_group = "High-income countries"
extra_entities = (
    "Falkland Islands",
    "Guernsey",
    "Jersey",
    "Saint Helena",
    "Montserrat",
    "Northern Cyprus",
    "Wallis and Futuna",
    "Anguilla",
)
# One [entity, income_group] row per additional entity
extra = [[entity_name, extra_income_group] for entity_name in extra_entities]
df_extra = pd.DataFrame(extra, columns=["entity", "income_group"])

In [65]:
# Merge: append the manually defined entities to the harmonized table
# NOTE(review): later cells read `df.metadata`; confirm that concatenating with
# a plain DataFrame keeps the table metadata.
df = pd.concat([df, df_extra])
# Order rows alphabetically by entity name
df = df.sort_values("entity")
# Discard the old row labels and rebuild a consecutive index
df = df.reset_index(drop=True)

In [67]:
# Preview the first rows of the final table
df.head()

Unnamed: 0,entity,income_group
0,Aruba,High-income countries
1,Afghanistan,Low-income countries
2,Angola,Lower-middle-income countries
3,Albania,Upper-middle-income countries
4,Andorra,High-income countries


## Create Garden dataset

### Metadata
First, we retrieve the metadata for the new Garden dataset. It is taken from the table loaded from Meadow. The checksum field is left as `None`, as it is unclear what we should use here (TODO).

In [76]:
# Define metadata
# NOTE(review): `metadata` is not referenced by any later cell in this notebook
# (they use `df.metadata` and `ds_meadow.metadata` directly) — confirm whether it is still needed.
metadata = df.metadata

### Create dataset and add tables
Finally, we add the tables to the dataset.

In [96]:
# Create the garden dataset at `dest_dir`
ds_garden = catalog.Dataset.create_empty(dest_dir)

In [97]:
# Propagate metadata: the garden dataset takes the meadow dataset's metadata as-is.
# NOTE(review): this assigns the same object rather than a copy — confirm that
# sharing it with `ds_meadow` is intended.
ds_garden.metadata = ds_meadow.metadata
ds_garden.save()

In [99]:
# Add bulk table
# Name the table "bulk" (the same key used to read it from the meadow dataset)
# before adding it to the garden dataset.
df.metadata.short_name = "bulk"
ds_garden.add(df)

In [100]:
# Save the dataset again now that the table has been added
ds_garden.save()