# World Bank: Income Groups

## Parameters

In [None]:
# Directory in which the output dataset is created (passed to `catalog.Dataset.create_empty` below).
# NOTE(review): this sits under the "Parameters" heading, so it is presumably overridden
# by whatever executes the notebook — confirm.
dest_dir = "/tmp/wb_income"

## Imports & Paths
Import the required libraries and define the paths to the files we load (the data files and the standardisation mappings for country and income group names).

In [None]:
import json
from pathlib import Path
import pandas as pd
from owid import catalog
from etl.paths import BASE_DIR, DATA_DIR

In [None]:
# Folder of this garden step; the JSON mapping files below are resolved relative to it
HERE = BASE_DIR / "etl/steps/data/garden/wb/2021-07-01"

# Input dataset, read from the meadow folder of the data directory
path_dataset = DATA_DIR / "meadow/wb/2021-07-01/wb_income"
# JSON mapping applied to the `country` column
path_country_mapping = HERE / "wb_income.country_mapping.json"
# JSON mapping applied to the `income_group` column
path_income_mapping = HERE / "wb_income.income_mapping.json"

## Load meadow datasets
In this step we load the required dataset from Meadow: `wb_income`.

In [None]:
# Open the meadow dataset stored at `path_dataset`
ds_meadow = catalog.Dataset(path_dataset)

In [None]:
# Fetch the "wb_income_group" table from the meadow dataset
df = ds_meadow["wb_income_group"]

In [None]:
# Sanity check: print the table dimensions and preview the first rows
print(df.shape)
df.head()

## Clean dataset

### Drop rows and columns

In [None]:
# Drop supranational regions: every row with a missing "Region" value is removed.
# NOTE(review): presumably only aggregates lack a "Region" in the source data — confirm.
df = df.dropna(subset=["Region"])

In [None]:
# Rename & drop columns
# Maps each original column we keep to its new name; every other column is dropped.
column_keep_rename = {
    "Economy": "country",
    "Income group": "income_group",
}
# Move the index back into regular columns before selecting by column name
df = df.reset_index()
# Select with an explicit list of the dict keys: indexing a DataFrame with a dict
# is deprecated since pandas 1.5 and raises TypeError in pandas 2.x.
df = df[list(column_keep_rename)].rename(columns=column_keep_rename)

In [None]:
# Preview the table after the column selection and renaming
df.head()

### Harmonize entities

In [None]:
# Harmonize entities
# Load the country-name mapping. JSON is UTF-8 by specification, so the encoding is
# set explicitly rather than relying on the platform default (which would garble
# non-ASCII names on systems whose locale encoding is not UTF-8).
with open(path_country_mapping, encoding="utf-8") as f:
    country_mapping = json.load(f)
# Values found as keys in the mapping are replaced; all other values are kept as they are
df = df.assign(country=df.country.replace(country_mapping))

### Harmonize income group names

In [None]:
# Harmonize income group names
# Load the income-group mapping. JSON is UTF-8 by specification, so the encoding is
# set explicitly rather than relying on the platform default.
with open(path_income_mapping, encoding="utf-8") as f:
    income_mapping = json.load(f)
# Values found as keys in the mapping are replaced; all other values are kept as they are
df = df.assign(income_group=df.income_group.replace(income_mapping))

### Add extra countries

In [None]:
# Additional entities to append to the table. Each one is assigned to the same
# income group, so the [country, income_group] pairs are generated from the names.
_extra_income_group = "High-income countries"
_extra_countries = (
    "Falkland Islands",
    "Guernsey",
    "Jersey",
    "Saint Helena",
    "Montserrat",
    "Northern Cyprus",
    "Wallis and Futuna",
    "Anguilla",
)
extra = [[name, _extra_income_group] for name in _extra_countries]
# Two-column frame with the same layout as the main table
df_extra = pd.DataFrame(data=extra, columns=["country", "income_group"])

In [None]:
# Merge: append the additional entities, order the rows by country name and
# renumber the index from zero.
stacked = pd.concat([df, df_extra])
ordered = stacked.sort_values("country")
df = ordered.reset_index(drop=True)

In [None]:
# Use the country name as the table index (modifies `df` in place)
df.set_index("country", inplace=True)

In [None]:
# Preview the final table, now indexed by country
df.head()

## Create Garden dataset

### Metadata
First, we retrieve the metadata attached to the processed table. The dataset-level metadata is propagated from the Meadow dataset in the cells below. Checksum field is left to `None`, as it is unclear what we should use here (TODO).

In [None]:
# Define metadata
metadata = df.metadata

### Create dataset and add tables
Finally, we add the tables to the dataset.

In [None]:
# Create a new, empty dataset at `dest_dir`
ds_garden = catalog.Dataset.create_empty(dest_dir)

In [None]:
# Propagate metadata: the garden dataset reuses the meadow dataset's metadata object,
# and the dataset is saved right after the assignment.
ds_garden.metadata = ds_meadow.metadata
ds_garden.save()

In [None]:
# Add the income group table to the garden dataset.
# The short name is set first; presumably it is the name the table is stored under — confirm.
df.metadata.short_name = "wb_income_group"
ds_garden.add(df)

In [None]:
# Save the dataset again now that the table has been added
ds_garden.save()