# WPP: total population

## Parameters

In [None]:
# Output location: this path is handed to Dataset.create_empty() when the
# dataset is created further down.
dest_dir = "/tmp/wpp_2019_total_population"

## Walden

In [None]:
from owid import walden

In [None]:
# Fetch the single Walden catalog entry identified by these three values
# (presumably namespace, version and short name — TODO confirm against the
# walden Catalog API).
walden_ds = walden.Catalog().find_one("wpp", "2019", "standard_projections")

In [None]:
# Show the catalog entry as the cell output for a quick visual check.
walden_ds

## Unzip

In [None]:
import tempfile
import shutil

In [None]:
# Scratch directory that receives the extracted archive. mkdtemp() does not
# clean up after itself; the directory is removed explicitly with
# shutil.rmtree(temp_dir) in the final cell.
temp_dir = tempfile.mkdtemp()

In [None]:
import zipfile

In [None]:
# Extract every member of the Walden archive into the scratch directory.
# Opening the archive in a `with` block closes the underlying file handle as
# soon as extraction finishes instead of leaving it open until the object is
# garbage collected.
with zipfile.ZipFile(walden_ds.local_path) as archive:
    archive.extractall(temp_dir)

In [None]:
# Shell escape: list the WPP2019 folder inside the scratch directory to see
# which files the archive produced.
!ls {temp_dir}/WPP2019

## Make dataset

In [None]:
from owid.catalog import Dataset
from etl.steps.data import converters

In [None]:
# Create a new, empty dataset at dest_dir; tables are added to it below.
ds = Dataset.create_empty(dest_dir)

In [None]:
# Build the dataset metadata from the Walden catalog entry rather than
# writing it by hand.
ds.metadata = converters.convert_walden_metadata(walden_ds)

In [None]:
# Save the dataset now that its metadata has been assigned
# (presumably this writes to dest_dir — TODO confirm against owid.catalog).
ds.save()

## Add tables

In [None]:
from owid.catalog import Table
import pandas as pd

### Total population

In [None]:
# Load the total-population-by-sex CSV from the extracted archive.
df = pd.read_csv(f"{temp_dir}/WPP2019/WPP2019_TotalPopulationBySex.csv")

In [None]:
# Preview the first rows with the original CSV column headers.
df.head()

In [None]:
# Replace the CSV headers with snake_case names. The assignment is purely
# positional: the list must hold one name per column, in file order, and
# pandas raises if the lengths differ.
# NOTE(review): assumes the file has exactly these ten columns in this
# order — confirm against the df.head() output.
df.columns = [
    "loc_id",
    "location",
    "var_id",
    "variant",
    "year",
    "mid_period",
    "population_male",
    "population_female",
    "population_total",
    "population_density",
]

In [None]:
# Side table of the distinct (loc_id, location) pairs, keyed by loc_id, so
# the id column can be dropped from the main frame afterwards.
t = Table(
    df.loc[:, ["loc_id", "location"]]
    .drop_duplicates()
    .set_index("loc_id")
)
t.metadata.short_name = "location_codes"
ds.add(t)

In [None]:
# Side table of the distinct (var_id, variant) pairs, keyed by var_id, so
# the id column can be dropped from the main frame afterwards.
t = Table(
    df.loc[:, ["var_id", "variant"]]
    .drop_duplicates()
    .set_index("var_id")
)
t.metadata.short_name = "variant_codes"
ds.add(t)

In [None]:
# The id columns now live in the two code tables; keep only the names here.
df = df.drop(columns=["loc_id", "var_id"])

In [None]:
# Convert the label columns to the pandas "category" dtype, one at a time.
for label_col in ("location", "variant"):
    df[label_col] = df[label_col].astype("category")

In [None]:
# Key the rows by (variant, location, year).
df = df.set_index(["variant", "location", "year"])

In [None]:
# Display the indexed frame as the cell output.
df

In [None]:
# Inspect the values of the first index level, which is "variant" given the
# set_index call above.
df.index.levels[0]

In [None]:
# Wrap the indexed frame in a Table, give it its short name, and add it to
# the dataset.
t = Table(df)
t.metadata.short_name = "total_population"
ds.add(t)

### Fertility by age

In [None]:
# Load the fertility-by-age CSV from the extracted archive; this rebinds df,
# replacing the total-population frame.
df = pd.read_csv(f"{temp_dir}/WPP2019/WPP2019_Fertility_by_Age.csv")

In [None]:
# Preview the first rows with the original CSV column headers.
df.head()

In [None]:
# Remove the columns that are not carried into the fertility table. This has
# to happen before the positional rename that follows.
df = df.drop(columns=["LocID", "VarID", "MidPeriod", "AgeGrpStart", "AgeGrpSpan"])

In [None]:
# Rename the remaining columns to snake_case. The assignment is positional,
# so the order here must match the column order left after the drop above.
# NOTE(review): assumes exactly seven columns remain — confirm against the
# df.head() output.
df.columns = [
    "location",
    "variant",
    "year_range",
    "age_group",
    "asfr",
    "pasfr",
    "births",
]

In [None]:
# Check that the new column names line up with the values beneath them.
df.head()

In [None]:
# Convert the label columns to the pandas "category" dtype, one at a time.
for label_col in ("location", "variant", "year_range", "age_group"):
    df[label_col] = df[label_col].astype("category")

In [None]:
# Key the rows by (variant, location, year_range, age_group).
df = df.set_index(["variant", "location", "year_range", "age_group"])

In [None]:
# Wrap the indexed frame in a Table, give it its short name, and add it to
# the dataset.
t = Table(df)
t.metadata.short_name = "fertility_by_age"
ds.add(t)

### Population by age and sex

In [None]:
# Load the population-by-age-and-sex CSV from the extracted archive; this
# rebinds df, replacing the fertility frame.
# NOTE(review): the "_Medium" suffix suggests a single projection variant —
# confirm against the "variant" column.
df = pd.read_csv(f"{temp_dir}/WPP2019/WPP2019_PopulationByAgeSex_Medium.csv")

In [None]:
# Preview the first rows with the original CSV column headers.
df.head()

In [None]:
# Remove the columns that are not carried into the age/sex table. This has
# to happen before the positional rename that follows.
df = df.drop(columns=["LocID", "VarID", "MidPeriod", "AgeGrpStart", "AgeGrpSpan"])

In [None]:
# Rename the remaining columns to snake_case. The assignment is positional,
# so the order here must match the column order left after the drop above.
# NOTE(review): assumes exactly seven columns remain — confirm against the
# df.head() output.
df.columns = [
    "location",
    "variant",
    "year",
    "age_group",
    "population_male",
    "population_female",
    "population_total",
]

In [None]:
# Check that the new column names line up with the values beneath them.
df.head()

In [None]:
# Convert the label columns to the pandas "category" dtype, one at a time.
# "year" is left with the dtype read_csv gave it.
for label_col in ("location", "variant", "age_group"):
    df[label_col] = df[label_col].astype("category")

In [None]:
# Key the rows by (variant, location, year, age_group).
df = df.set_index(["variant", "location", "year", "age_group"])

In [None]:
# Preview the frame after indexing, before it is wrapped in a Table.
df.head()

In [None]:
# Wrap the indexed frame in a Table, give it its short name, and add it to
# the dataset.
t = Table(df)
t.metadata.short_name = "population_by_age_sex"
ds.add(t)

## Clean up

In [None]:
# Delete the scratch directory created with tempfile.mkdtemp(), including
# everything extracted into it.
shutil.rmtree(temp_dir)