In [1]:
import pandas as pd
import numpy as np
import os

# Read raw population data

In [2]:
def prepare_total_pop():
    total_pop = pd.read_csv("../data/raw/WPP2019_TotalPopulationBySex.csv", sep=",", na_values='')
    total_pop = total_pop[total_pop["Time"] < 2020]
    total_pop.drop(columns=["LocID", "VarID", "Variant", "MidPeriod"], inplace=True)

    total_pop.columns = ["Country", "Year", "PopMale", "PopFemale", "PopTotal"]
    total_pop.set_index(["Country", "Year"], inplace=True)

    total_pop[["PopMale", "PopFemale", "PopTotal"]] = (total_pop[["PopMale", "PopFemale", "PopTotal"]].fillna(0) * 1000).astype("int64")
    return total_pop

def prepare_age_pop():
    age_pop = pd.read_csv("../data/raw/population/WPP2019_PopulationByAgeSex_Medium.csv", sep=",", na_values='')
    age_pop = age_pop[age_pop["Time"] < 2020]
    age_pop.drop(columns=["LocID", "VarID", "Variant", "MidPeriod", "AgeGrpStart", "AgeGrpSpan"], inplace=True)

    age_pop.columns = ["Country", "Year", "AgeGrp", "PopMale", "PopFemale", "PopTotal"]
    age_pop.set_index(["Country", "Year"], inplace=True)

    age_pop[["PopMale", "PopFemale", "PopTotal"]] = (age_pop[["PopMale", "PopFemale", "PopTotal"]].fillna(0) * 1000).astype("int64")
    return age_pop

In [3]:
pop_total = prepare_total_pop()
pop_per_age = prepare_age_pop()

In [4]:
pop_total.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,PopMale,PopFemale,PopTotal
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,1950,4099242,3652874,7752116
Afghanistan,1951,4134755,3705395,7840151


In [5]:
pop_per_age.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,AgeGrp,PopMale,PopFemale,PopTotal
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,1950,0-4,630044,661578,1291622
Afghanistan,1950,5-9,516206,487335,1003541


# Rearrange Population data

## Aggregate the age groups in three larger groups, from 0-19, 20-59, and 60+ years

In [6]:
pop_per_age_young = pop_per_age.query('AgeGrp in ["0-4", "5-9", "10-14", "15-19"]').sum(level=['Country', 'Year'])
pop_per_age_young.rename(columns={"PopMale": "PopMale_0-19", "PopFemale": "PopFemale_0-19", "PopTotal" : "PopTotal_0-19"}, inplace=True)
pop_per_age_mid = pop_per_age.query('AgeGrp in ["20-24", "25-29", "30-34", "35-39", "40-44", "45-49","50-54", "55-59"]').sum(level=['Country', 'Year'])
pop_per_age_mid.rename(columns={"PopMale": "PopMale_20-59", "PopFemale": "PopFemale_20-59", "PopTotal" : "PopTotal_20-59"}, inplace=True)
pop_per_age_old = pop_per_age.query('AgeGrp in ["60-64", "65-69", "70-74", "75-79", "80-84", "85-89", "90-94", "95-99", "100+"]').sum(level=['Country', 'Year'])
pop_per_age_old.rename(columns={"PopMale": "PopMale_60+", "PopFemale": "PopFemale_60+", "PopTotal" : "PopTotal_60+"}, inplace=True)

## Merge age groups with total population

In [7]:
pop_with_groups = pop_per_age_young.merge(pop_per_age_mid, left_index=True, right_index=True)
pop_with_groups = pop_with_groups.merge(pop_per_age_old, left_index=True, right_index=True)
pop_total_with_groups = pop_total.merge(pop_with_groups, left_index=True, right_index=True)

# Load indicators dataset

In [8]:
indicators = pd.read_csv("../data/raw/WPP2019_Period_Indicators_Medium.csv", sep=",", na_values='')
indicators[["Births", "Deaths", "DeathsMale", "DeathsFemale", "NetMigrations"]] = (indicators[["Births", "Deaths", "DeathsMale", "DeathsFemale", "NetMigrations"]].fillna(0) * 1000 / 5).astype("int64")
indicators_pop = indicators.merge(pop_total_with_groups, left_on=['MidPeriod','Location'], right_on=['Year', 'Country'])
indicators_pop["RelMigrations"] = indicators_pop["NetMigrations"] / indicators_pop["PopTotal"]
indicators_pop.drop(columns=['VarID', 'Variant'], inplace=True)
indicators_pop.rename(columns={"Location" : "Country"}, inplace=True)

## Scale population columns by total population

In [9]:
scale_cols = ["Births", "Deaths", "DeathsMale", "DeathsFemale", "PopMale", "PopFemale", "PopMale_0-19", "PopFemale_0-19",
              "PopTotal_0-19", "PopMale_20-59", "PopFemale_20-59", "PopTotal_20-59", "PopMale_60+", "PopFemale_60+", "PopTotal_60+"]
indicators_pop[scale_cols] = indicators_pop[scale_cols].div(indicators_pop["PopTotal"], axis=0)

# Read fragile states data
## Merge it with `indicators_pop`

In [10]:
fragile_states = pd.read_csv("../data/clean/fragile_states_index.csv", sep=",", na_values='')
fragile_states["country"] = fragile_states["country"].str.strip()
full_set = indicators_pop.merge(fragile_states, left_on=['Country','MidPeriod'], right_on=['country','year'])

## Calculate the index change from the previous 5 year

In [11]:
query_result = full_set.query("Time == '2005-2010'")
full_set.loc[query_result.index, "change_from_previous_year"] = pd.Series(0, index=query_result.index)
previous = query_result
for years in ["2010-2015", "2015-2020"]:
    query_result = full_set.query("Time == '" + years + "'")
    full_set.loc[query_result.index, "change_from_previous_year"] = \
        (query_result.set_index("Country")["total"] - previous.set_index("Country")["total"]).round(1).fillna(0).to_numpy()
    previous = full_set.loc[query_result.index, :]

# Save all dataframes

In [12]:
pop_total.to_csv("../data/clean/population_total.csv")
pop_per_age.to_csv("../data/clean/population_per_age.csv")
indicators_pop.to_csv(index=False, path_or_buf="../data/clean/population_indicators.csv")
full_set.to_csv(index=False, path_or_buf="../data/clean/full_set.csv")