In [None]:
'''
Population data

The population dataset by Gapminder contains information about the world's population from 1800 until 2100.
The data is compiled from different sources. There are two main sources: a dataset by Angus Maddison and the CLIO Infra Project, and the World Population Prospects (WPP) provided by the UN.
The dataset by Angus Maddison provides the data for the years 1800 - 1950. Population data after 1950 was taken from the WPP dataset.
Additional sources were used to fill missing values for years or regions. A detailed list of sources can be found in the documentation (https://www.gapminder.org/data/documentation/gd003/).
The primary data from the sources originates from censuses, informal censuses, indirect estimates and arbitrary guesses.

Modifications and Estimations:
- Summations of parts
- Larger area minus non-included parts
- Geographical interpolation
- Geographical extrapolation
- Temporal interpolation
- Temporal extrapolation
- Adjustments for under-enumeration
- Recalculated to fit present borders

Since the data for the year 1950 of the two main datasets did not match for every country, small adjustments and smoothing were applied.

Preprocessing:
For our purposes, the subdatasets "data-for-world-by-year", "data-for-regions-by-year" and "data-for-countries-etc-by-year" are relevant.
We consider the period from 1900 until 2022 and remove redundant rows and columns.
There are no missing values.

Subdatasets:
    population-global.csv
        Columns: time, population

    population-region.csv
        Columns: region_code, region_name, year, population

    population-country.csv
        Columns: country_code, country_name, year, population
'''

In [82]:
import pandas as pd

# Read Excel

In [83]:
# Load the three Gapminder sheets in a single pass over the workbook.
# Passing a list as sheet_name makes read_excel return a dict that maps
# each sheet name to its DataFrame.
population_dict = pd.read_excel(
    'data/raw/population/gapminder-population-v7.xlsx',
    sheet_name=[
        'data-for-world-by-year',
        'data-for-regions-by-year',
        'data-for-countries-etc-by-year',
    ],
)

# Gapminder's own per-region sheet, kept available under its own name.
regions_population_df = population_dict.get('data-for-regions-by-year')

## Preprocess world data

In [None]:
# World-level sheet, loaded into population_dict by the Excel-reading cell.
world_population_df = population_dict.get('data-for-world-by-year')

print('raw data for world population')
print(world_population_df.head(3))

# Keep only the year and population columns and lower-case the population
# column name. rename() returns a new frame, so no frame is mutated in place.
# 'time' deliberately stays a regular column (not the index): the year filter
# below selects it by name, and to_csv(index=False) would drop an index.
world_population_df = world_population_df[['time', 'Population']].rename(
    columns={'Population': 'population'}
)

print('extract population data from 1900 until now')
# between() is inclusive on both ends, i.e. 1900 <= time <= 2022.
world_population_df = world_population_df[world_population_df['time'].between(1900, 2022)]
print(world_population_df)

# NOTE: the year column is exported under its source name 'time'
# (the country and region exports rename it to 'year').
world_population_df.to_csv('data/processed/population/population-global.csv', sep=';', index=False, header=True)

# Prepare dataset for countries and regions

In [85]:
# Country-level sheet, loaded into population_dict by the Excel-reading cell.
countries_population_df = population_dict.get('data-for-countries-etc-by-year')

# UN country-code table; only the region columns and the ISO alpha-3 join key are needed.
un_country_codes = pd.read_csv("data/raw/country-codes/un-country-codes.csv", sep=";")
un_country_codes = un_country_codes[['Region Code', 'Region Name', 'ISO-alpha3 Code']]

# Restrict to 1900..2022 (between() is inclusive on both ends) and upper-case
# the join key so it can be matched against 'ISO-alpha3 Code'.
# assign() builds a new frame instead of writing a column onto the filtered
# slice, which avoids pandas' SettingWithCopyWarning.
countries_population_df = (
    countries_population_df[countries_population_df['time'].between(1900, 2022)]
    .assign(geo=lambda frame: frame['geo'].str.upper())
)

# The inner join below silently drops every country whose code is missing from
# the UN table, so report those codes instead of losing them unnoticed.
unmatched_codes = sorted(
    set(countries_population_df['geo'].dropna())
    - set(un_country_codes['ISO-alpha3 Code'].dropna())
)
if unmatched_codes:
    print(f'{len(unmatched_codes)} country code(s) without a UN region are dropped by the merge: {unmatched_codes}')

# merge country-population and un-country-codes, then switch to snake_case names
countries_with_regions = pd.merge(
    countries_population_df,
    un_country_codes,
    how='inner',
    left_on='geo',
    right_on='ISO-alpha3 Code',
).rename(columns={
    'Region Code': 'region_code',
    'Region Name': 'region_name',
    'time': 'year',
    'Population': 'population',
    'geo': 'country_code',
    'name': 'country_name',
})

## Get population regions dataset

In [86]:
columns_regions = ['region_code', 'region_name', 'country_code', 'country_name', 'year', 'population']

# Narrow the merged frame to the listed columns, then total the population
# of all countries that share a region within each year.
population_regions = (
    countries_with_regions[columns_regions]
    .groupby(['region_code', 'region_name', 'year'], as_index=False)['population']
    .sum()
)

# Write the per-region totals as a semicolon-separated file (header row, no index).
population_regions.to_csv(
    'data/processed/population/population-region.csv',
    sep=';',
    index=False,
    header=True,
)

## Get population countries dataset

In [87]:
columns_country = ['country_code', 'country_name', 'year', 'population']

# Per-country export: keep only the identifying columns, the year and the population.
population_countries = countries_with_regions.loc[:, columns_country]

# Write the per-country rows as a semicolon-separated file (header row, no index).
population_countries.to_csv(
    'data/processed/population/population-country.csv',
    sep=';',
    index=False,
    header=True,
)