Skip to content

Commit

Permalink
Add script for global per-country population sizes
Browse files Browse the repository at this point in the history
To be used for weighted sampling in a future commit.
  • Loading branch information
victorlin committed May 3, 2024
1 parent 3ab3ed5 commit ba0d7ea
Showing 1 changed file with 67 additions and 0 deletions.
67 changes: 67 additions & 0 deletions scripts/get_population_sizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import argparse
import pandas as pd
import requests
import xlrd


def export_imf(output, year=2022):
    """Download IMF per-country population sizes and write them as a TSV.

    The IMF DataMapper "LP" indicator (population, in millions of people) is
    fetched as an Excel export, reduced to a single year, keyed by country
    name, and written to *output* with the columns ``country`` and ``weight``.

    Args:
        output: Path (or writable buffer) passed to ``DataFrame.to_csv``.
        year: Column of the IMF sheet to use as the weight. Defaults to 2022.

    Raises:
        requests.HTTPError: If the IMF server answers with an error status.
        requests.Timeout: If the IMF server does not respond in time.
        KeyError: If the expected country or year column is not in the sheet.
    """
    # Read data from the web
    # Data source web page: <https://www.imf.org/external/datamapper/LP@WEO/OEMDC/ADVEC/WEOWORLD>
    excel_url = "https://www.imf.org/external/datamapper//export/excel.php?indicator=LP"
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(excel_url, timeout=60)
    # Fail here on an HTTP error rather than feeding an error page to xlrd.
    response.raise_for_status()
    # NOTE(review): ignore_workbook_corruption is presumably needed because the
    # IMF export is not a strictly valid .xls file - confirm before removing.
    workbook = xlrd.open_workbook(
        file_contents=response.content,
        ignore_workbook_corruption=True,
    )
    df = pd.read_excel(
        workbook,
        sheet_name='LP',
        comment='©',  # Ignore last line "© IMF …"
        na_values='no data',
    )

    # Rename columns to match names in metadata
    column_name_map = {
        'Population (Millions of people)': 'country',
        year: 'weight',
    }
    df = df.rename(columns=column_name_map)

    # rename() silently skips labels it cannot find, so report a missing
    # source column explicitly instead of an opaque "not in index" error.
    missing = [
        source
        for source, target in column_name_map.items()
        if target not in df.columns
    ]
    if missing:
        raise KeyError(f"Column(s) {missing} not found in sheet 'LP' of {excel_url}")

    # Keep only the columns used above
    df = df[list(column_name_map.values())]

    # Set country as index
    df = df.set_index('country')

    # Rename countries to match values in metadata
    country_name_map = {
        "Brunei Darussalam": "Brunei",
        "China, People's Republic of": "China",
        "Hong Kong SAR": "Hong Kong",
        "Korea, Republic of": "South Korea",
        "Kyrgyz Republic": "Kyrgyzstan",
        "Lao P.D.R.": "Laos",
        "Macao SAR": "Macao",
        "Taiwan Province of China": "Taiwan",
        "West Bank and Gaza": "Palestine",
    }
    df = df.rename(index=country_name_map)

    # Remove rows without a weight or a country. dropna() only inspects
    # columns, and country is the index by now, so filter the index separately.
    df = df[df.index.notna()]
    df = df.dropna(how='any')

    # Add missing countries
    # Data sourced from <https://www.cia.gov/the-world-factbook/about/archives/2023/field/population/country-comparison/>
    # TODO: consider using this data source for everything - it doesn't seem to have any missing countries
    df.loc['Syria', 'weight'] = 22.933531

    # Export
    df.to_csv(output, index=True, sep='\t')


if __name__ == '__main__':
    # Command-line entry point: parse the required --output path and write
    # the population sizes file there.
    cli = argparse.ArgumentParser(description="Create population sizes file")
    cli.add_argument(
        '--output',
        type=str,
        metavar="FILE",
        required=True,
        help="Path to output population sizes file",
    )
    export_imf(cli.parse_args().output)

0 comments on commit ba0d7ea

Please sign in to comment.