# World Bank: Income Groups

## Parameters

In [1]:
# Notebook parameter: directory where the Garden dataset is created
# (passed to `catalog.Dataset.create_empty` further below).
dest_dir = "/tmp/wb_income"

## Imports & Paths
Import the required libraries and define paths to load files (including the Meadow dataset and the standardisation mappings for country and income group names).

In [2]:
import json
from pathlib import Path
import pandas as pd
from owid import catalog
from etl.paths import BASE_DIR, DATA_DIR

In [3]:
# Folder of this Garden step; the JSON mapping files are resolved relative to it.
HERE = BASE_DIR / "etl/steps/data/garden/wb/2021-07-01"

# Input: the Meadow dataset that this notebook cleans.
path_dataset = DATA_DIR / "meadow/wb/2021-07-01/wb_income"
# JSON mapping applied to the `country` column when harmonizing entities.
path_country_mapping = HERE / "wb_income.country_mapping.json"
# JSON mapping applied to the `income_group` column when harmonizing group names.
path_income_mapping = HERE / "wb_income.income_mapping.json"

## Load meadow datasets
In this step we load the required dataset from Meadow: `wb_income`, which contains a single table, `wb_income_group`.

In [4]:
# Show the resolved location of the Meadow dataset
path_dataset

PosixPath('/Users/mojmir/projects/etl/data/meadow/wb/2021-07-01/wb_income')

In [14]:
# Open the Meadow dataset stored at `path_dataset`
ds_meadow = catalog.Dataset(path_dataset)

In [15]:
# List the tables available in the Meadow dataset
ds_meadow.table_names

['wb_income_group']

In [16]:
# Load the income-group table (the only table listed in the dataset)
df = ds_meadow["wb_income_group"]

In [17]:
# Print the table dimensions and preview its first rows
print(df.shape)
df.head()

(265, 5)


Unnamed: 0_level_0,code,region,income_group,lending_category,other_emu_or_hipc
economy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aruba,ABW,Latin America & Caribbean,High income,,
Afghanistan,AFG,South Asia,Low income,IDA,HIPC
Angola,AGO,Sub-Saharan Africa,Lower middle income,IBRD,
Albania,ALB,Europe & Central Asia,Upper middle income,IBRD,
Andorra,AND,Europe & Central Asia,High income,,


## Clean dataset

### Drop rows and columns

In [18]:
# Drop supranational regions: keep only the rows that have a `region` value
# (presumably the aggregate rows are the ones without one; TODO confirm)
df = df.dropna(subset=["region"])

In [19]:
# Rename & drop columns
# Columns to keep, mapped to their new names (the `economy` index becomes `country`)
column_keep_rename = {
    "economy": "country",
    "income_group": "income_group",
}
# Turn the `economy` index into a regular column so it can be selected and renamed
df = df.reset_index()
# Select with a list of column names: indexing a DataFrame with a dict is
# deprecated in pandas and raises a TypeError in pandas >= 2.0
df = df[list(column_keep_rename)].rename(columns=column_keep_rename)

  df = df[column_keep_rename].rename(columns=column_keep_rename)


In [20]:
# Preview the table after keeping and renaming columns
df.head()

Unnamed: 0,country,income_group
0,Aruba,High income
1,Afghanistan,Low income
2,Angola,Lower middle income
3,Albania,Upper middle income
4,Andorra,High income


### Harmonize entities

In [21]:
# Harmonize entities: pass the names in `country` through the country mapping file.
# The file is read explicitly as UTF-8 (the standard JSON encoding) so that names
# with non-ASCII characters load identically on every platform, instead of
# depending on the locale default used by `open()`.
with open(path_country_mapping, encoding="utf-8") as f:
    country_mapping = json.load(f)
# Names that are absent from the mapping are left unchanged by `replace`
df = df.assign(country=df.country.replace(country_mapping))

### Harmonize income group names

In [22]:
# Harmonize income group names: pass the values in `income_group` through the
# income mapping file.
# The file is read explicitly as UTF-8 (the standard JSON encoding) rather than
# with the locale default used by `open()`.
with open(path_income_mapping, encoding="utf-8") as f:
    income_mapping = json.load(f)
# Values that are absent from the mapping are left unchanged by `replace`
df = df.assign(income_group=df.income_group.replace(income_mapping))

### Add extra countries

In [23]:
# Additional entities to append to the table; every one of them is assigned
# to the same income group.
extra_countries = [
    "Falkland Islands",
    "Guernsey",
    "Jersey",
    "Saint Helena",
    "Montserrat",
    "Northern Cyprus",
    "Wallis and Futuna",
    "Anguilla",
]
# One [country, income_group] row per additional entity
extra = [[name, "High-income countries"] for name in extra_countries]
df_extra = pd.DataFrame(data=extra, columns=["country", "income_group"])

In [24]:
# Append the additional entities, order the rows alphabetically by country
# and rebuild a clean 0..n-1 index
df = (
    pd.concat([df, df_extra])
    .sort_values("country")
    .reset_index(drop=True)
)

In [25]:
# Use `country` as the table index (modifies `df` in place)
df.set_index("country", inplace=True)

In [26]:
# Preview the final table, now indexed by country
df.head()

Unnamed: 0_level_0,income_group
country,Unnamed: 1_level_1
Afghanistan,Low-income countries
Albania,Upper-middle-income countries
Algeria,Lower-middle-income countries
American Samoa,Upper-middle-income countries
Andorra,High-income countries


## Create Garden dataset

### Metadata
First, we gather the metadata for the new Garden dataset. The table metadata is read from the cleaned table, and the dataset-level metadata is propagated from the Meadow dataset further below. The checksum field is left as `None`, as it is unclear what we should use here (TODO).

In [27]:
# Read the table-level metadata carried by `df`
# NOTE(review): `metadata` is not referenced again in the cells visible here;
# confirm whether this assignment is still needed
metadata = df.metadata

### Create dataset and add tables
Finally, we add the tables to the dataset.

In [28]:
# Create the Garden dataset at `dest_dir`; it starts out empty
ds_garden = catalog.Dataset.create_empty(dest_dir)

In [29]:
# Propagate metadata: reuse the Meadow dataset's metadata for the Garden dataset, then save
# NOTE(review): this appears to assign the same metadata object rather than a copy;
# confirm that sharing it between both datasets is intended
ds_garden.metadata = ds_meadow.metadata
ds_garden.save()

In [30]:
# Set the table's short name, then add the table to the Garden dataset
df.metadata.short_name = "wb_income_group"
ds_garden.add(df)

In [31]:
# Save the dataset again now that the table has been added
ds_garden.save()