In [198]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr

% matplotlib inline

### UN Population Data Analysis

**Data & Scope:** downloaded on 12/12 3.40 pm
- 1950-2100
    - historical/current: 1950-2015 are estimates
    - projections (probabilistic): 2016 and beyond (9 different variants)
- Population data only.
- Update status: 
    - last update - 20 Aug 2013 
    - next update - summer 2019

**Source:** United Nations (http://data.un.org/Data.aspx?d=PopDiv&f=variableID%3a12)

**Analysis Goals**
- Compare UN population data with PWT, WB, and IMF data
- Outline UN's methodology for population data, note any limitations/assumptions.

*Currently Out of Scope*
- Also evaluate whether UN dataset seems reliable for future projection X 

**Conclusion**

**Questions**


Table of contents

1. Open and munge data
    - filter/reformat
2.  Find missing years/countries


#### Open UN data

In [199]:
# Load the UN WPP 2017 "total population by sex" table into `df`.
# The data directory is resolved from the current user's home folder rather
# than a hard-coded '/Users/mlimb/...' prefix, so the cell also runs for other
# users who keep the same Dropbox layout.
# Alternative input file in the same folder (age/sex detail, medium variant):
#   WPP2017_PopulationByAgeSex_Medium.csv
un_data_dir = os.path.join(
    os.path.expanduser('~'), 'Dropbox', 'covariate_project', 'zerg', 'data', 'UN'
)
df = pd.read_csv(os.path.join(un_data_dir, 'WPP2017_TotalPopulationBySex.csv'))

#### Open Other Data Sources (WB, PWT)

In [200]:
# Load the comparison datasets: World Bank total population (`df_wb`) and
# Penn World Table 9.0 (`df_pwt`).
# Paths are built from the current user's home folder instead of a hard-coded
# '/Users/mlimb/...' prefix so the cell is portable across machines.
data_root = os.path.join(
    os.path.expanduser('~'), 'Dropbox', 'covariate_project', 'zerg', 'data'
)
# skiprows=3 skips the first three lines of the World Bank export
# (presumably metadata preceding the header row -- confirm against the file).
df_wb = pd.read_csv(
    os.path.join(data_root, 'WB', 'population', 'API_SP.POP.TOTL_DS2_en_csv_v2.csv'),
    skiprows=3,
)
df_pwt = pd.read_stata(os.path.join(data_root, 'PWT', 'pwt90.dta'))

#### Look at UN data

In [201]:
# Show the column labels of the raw UN table.
df.columns

Index(['LocID', 'Location', 'VarID', 'Variant', 'Time', 'MidPeriod', 'PopMale',
       'PopFemale', 'PopTotal'],
      dtype='object')

In [202]:
# (rows, columns) of the raw UN table, all variants and years included.
df.shape

(371007, 9)

In [203]:
# Print the number of distinct values in each key column, in this order:
#   Time     -> years
#   Location -> countries / areas
#   Variant  -> projection variants
for key_column in ('Time', 'Location', 'Variant'):
    print(df[key_column].nunique())

151
273
9


In [204]:
# Year range covered by the raw table, displayed as (first_year, last_year).
# min()/max() are taken directly on the column: the earlier bare
# `df['Time'].unique()` was evaluated and discarded (only a cell's last
# expression is displayed), and de-duplicating before min/max is unnecessary.
df['Time'].min(), df['Time'].max()

(1950, 2100)

In [205]:
# List the distinct projection variants present in the raw table.
df['Variant'].unique()

array(['Medium', 'High', 'Low', 'Constant fertility',
       'Instant replacement', 'Zero migration', 'Constant mortality',
       'No change', 'Momentum'], dtype=object)

### Filter data ('medium' variant)  - only relevant to some datasets from UN that have multiple variants
Note: for historical data, all variants have the same population value. For future projections, medium is the most commonly used variant (this does not imply the highest accuracy), as population projections are inherently very uncertain and depend on many variables.

In [206]:
# Keep only the rows that belong to the 'Medium' projection variant.
is_medium = df['Variant'].eq('Medium')
df_med = df.loc[is_medium]

In [207]:
# (rows, columns) of the medium-variant subset.
# The former leading `df_med.columns` line was dropped: its value was never
# displayed because only the last expression of a cell is shown, and filtering
# rows does not change the columns.
df_med.shape

(41223, 9)

### Filter historical estimates only

In [208]:
# List every distinct year present in the medium-variant subset, to see where
# the historical range ends before filtering on it.
df_med['Time'].unique()

array([1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960,
       1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971,
       1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982,
       1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993,
       1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
       2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
       2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025, 2026,
       2027, 2028, 2029, 2030, 2031, 2032, 2033, 2034, 2035, 2036, 2037,
       2038, 2039, 2040, 2041, 2042, 2043, 2044, 2045, 2046, 2047, 2048,
       2049, 2050, 2051, 2052, 2053, 2054, 2055, 2056, 2057, 2058, 2059,
       2060, 2061, 2062, 2063, 2064, 2065, 2066, 2067, 2068, 2069, 2070,
       2071, 2072, 2073, 2074, 2075, 2076, 2077, 2078, 2079, 2080, 2081,
       2082, 2083, 2084, 2085, 2086, 2087, 2088, 2089, 2090, 2091, 2092,
       2093, 2094, 2095, 2096, 2097, 2098, 2099, 21

In [209]:
# Keep years up to and including 2015 (the scope notes above describe
# 1950-2015 as estimates and 2016 onward as projections).
is_historical = df_med['Time'].le(2015)
df_hist = df_med.loc[is_historical]

# Confirm the remaining year range: expected to run from 1950 to 2015.
hist_years = df_hist['Time'].unique()
hist_years.min(), hist_years.max()

(1950, 2015)

In [214]:
# (rows, columns) of the historical subset; there are fewer rows than in
# df_med because the years after 2015 were dropped.
df_hist.shape # (18018, 9) for this download

(18018, 9)

In [216]:
# Preview the first three rows of the historical subset; the rendered table
# already shows the column names, so the former bare `df_hist.columns` line
# (whose value was never displayed) is dropped.
df_hist.head(3)

Unnamed: 0,LocID,Location,VarID,Variant,Time,MidPeriod,PopMale,PopFemale,PopTotal
0,4,Afghanistan,2,Medium,1950,1950.5,4099.243,3652.874,7752.118
1,4,Afghanistan,2,Medium,1951,1951.5,4134.477,3705.031,7839.51
2,4,Afghanistan,2,Medium,1952,1952.5,4173.993,3760.979,7934.98


### Select only relevant columns - Location (ie. country), Time (ie. year), PopTotal

In [218]:
# Keep only the three fields used downstream:
# country/area name, year, and total population.
relevant_columns = ['Location', 'Time', 'PopTotal']
df_sub = df_hist.loc[:, relevant_columns]
df_sub.shape

(18018, 3)

In [219]:
# Preview the last three rows of the trimmed table.
df_sub.tail(3)

Unnamed: 0,Location,Time,PopTotal
41135,Zimbabwe,2013,15054.506
41136,Zimbabwe,2014,15411.675
41137,Zimbabwe,2015,15777.451


### Pandas pivoting!!

In [222]:
# Reshape from long to wide: one row per Location, one column per year,
# with PopTotal as the cell values.
# The pivot reads `df_sub` (the three-column frame prepared for this step)
# rather than `df_hist`; the result is identical because only Location, Time
# and PopTotal are used. pivot() raises ValueError if any (Location, Time)
# pair occurs more than once.
pivoted = df_sub.pivot(index='Location', columns='Time', values='PopTotal')
pivoted.shape

(273, 66)

In [225]:
# Sanity check: number of years from 1950 through 2015 inclusive; this should
# match the number of columns in `pivoted`.
2015-1950+1 # years

66

In [226]:
# Preview the first three rows of the wide (Location x year) table.
pivoted.head(3)

Time,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,7752.118,7839.51,7934.98,8038.596,8150.447,8270.581,8399.03,8535.807,8680.946,8834.445,...,25893.45,26616.792,27294.031,28004.331,28803.167,29708.599,30696.958,31731.688,32758.02,33736.494
Africa,228670.019,233277.049,238113.121,243177.791,248471.497,253995.025,259750.195,265739.281,271965.261,278431.545,...,948156.166,972265.961,997144.67,1022858.654,1049446.344,1076933.813,1105285.268,1134398.192,1164129.789,1194369.908
Albania,1263.174,1287.497,1316.089,1348.099,1382.878,1419.97,1459.089,1500.149,1543.222,1588.484,...,3054.331,3023.907,2991.651,2962.635,2940.525,2926.659,2920.039,2918.978,2920.775,2923.352


### Reindexing to make Location (or country name) a separate column

This is done to be consistent with the WB datasets. The leftover `Time` label shown above the index is just the name of the columns axis; it is irrelevant here, so it is not renamed.

Note that `pivoted` (unlike `reindexed`) has the nice advantage of allowing row access by country name.

In [242]:
# `pivoted` is indexed by Location, so a row can be looked up by name
# (e.g. pivoted.loc['Africa']). The former bare `pivoted.loc['Africa']` line
# was dropped because its value was never displayed.
# to_dict() turns the selected row into a {year: population} mapping.
pivoted.loc['China'].to_dict()

{1950: 554419.27500000002,
 1951: 569611.07499999995,
 1952: 582029.30099999998,
 1953: 592567.86100000003,
 1954: 601971.21799999999,
 1955: 610834.39600000007,
 1956: 619597.54700000002,
 1957: 628551.40500000003,
 1958: 637853.60899999994,
 1959: 647555.96299999999,
 1960: 657686.14299999992,
 1961: 668334.897,
 1962: 679732.31499999994,
 1963: 692280.48999999999,
 1964: 706460.95299999998,
 1965: 722562.18299999996,
 1966: 740745.64199999999,
 1967: 760771.06299999997,
 1968: 782008.72900000005,
 1969: 803577.2209999999,
 1970: 824788.45700000005,
 1971: 845481.57799999998,
 1972: 865686.53000000003,
 1973: 885145.93299999996,
 1974: 903613.53799999994,
 1975: 920945.08299999998,
 1976: 937018.25199999998,
 1977: 951927.44400000002,
 1978: 966039.82200000004,
 1979: 979880.61900000006,
 1980: 993877.31000000006,
 1981: 1008000.152,
 1982: 1022253.3959999999,
 1983: 1037123.834,
 1984: 1053210.7350000001,
 1985: 1070863.389,
 1986: 1090348.0559999999,
 1987: 1111341.7250000001,
 198

In [234]:
# Move Location out of the index into an ordinary column; reset_index()
# returns a new frame by default, so `pivoted` itself is left unchanged.
reindexed = pivoted.reset_index()
reindexed.head(2)

Time,Location,1950,1951,1952,1953,1954,1955,1956,1957,1958,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,Afghanistan,7752.118,7839.51,7934.98,8038.596,8150.447,8270.581,8399.03,8535.807,8680.946,...,25893.45,26616.792,27294.031,28004.331,28803.167,29708.599,30696.958,31731.688,32758.02,33736.494
1,Africa,228670.019,233277.049,238113.121,243177.791,248471.497,253995.025,259750.195,265739.281,271965.261,...,948156.166,972265.961,997144.67,1022858.654,1049446.344,1076933.813,1105285.268,1134398.192,1164129.789,1194369.908
