In [2]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr

% matplotlib inline

In [3]:
# Show which files are present in the UN data directory.
os.listdir(os.path.join('.', 'UN'))

['.DS_Store', 'UNdata_Export_20171201_022351573.csv']

### UN Population Data Analysis

**<span style="color:red; background:yellow;">Problem: Missing countries after Ecuador (alphabetical)</span>**

**Data & Scope:** downloaded on 11/30 5.28 pm
- 1950-2100
- for areas/countries with:
    - more than 90,000 inhabitants in 2017: key demographic indicators provided
    - fewer than 90,000 inhabitants: ONLY figures related to population size and growth provided
    
- data exists from historical/current estimates to probabilistic future projections  
- last update - 20 Aug 2013
- next update - summer 2019
**Source:** United Nations (http://data.un.org/Data.aspx?d=PopDiv&f=variableID%3a12)

**Assumptions/Expectations**
- Has both historical/current estimates and future projection data.
- Has population data - probabilistic population projections.
- Not for income data

**Analysis Goals**
- Task 3 Deliverable: ADM0 population & real income estimates from `1950-2017`
- Evaluate which years data exist for (and that `1950-2017` are present). Note any missing years (DONE - no missing years)
- Evaluate whether any countries are missing in any years, and if so, why.
- Quantify any data availability differences between countries (years per country, etc.)
- Based on the methodology, evaluate whether the UN dataset seems reliable for historical population data

*Currently Out of Scope*
- Also evaluate whether UN dataset seems reliable for future projection X 

*Advanced*
- Plot population growth over different variants (8 - `df['Variant'].unique()`
    High, Low, 
    Constant fertility, 
    Instant replacement, 
    Zero migration, 
    Constant mortality, 
    No change, 
    momentum)
    

**Methodology**
- Look up methodology 
- Read about 8 different variants (what they are)
- Read about how historical projection is different across the variants
- Read about how projections are different across variants


**Conclusion**
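1. Different years may have unequal country #s.
- Year 2012 has 658 countries. (1 more country than 2011)

- Year 2013-2100 has 666 countries. (8 more countries than 2012)
- Year 1950-2011 has 657 countries.
- Note: these figures are row counts per year, not distinct countries. The file contains only 74 unique `Country or Area` values, so each area presumably appears once per variant (to be confirmed).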

**Questions**

In [17]:
# List the contents of the UN data directory (path given with a trailing separator).
os.listdir(os.path.join(os.curdir, 'UN', ''))

['.DS_Store', 'UNdata_Export_20171201_022351573.csv']

In [9]:
# Load the UN population export (downloaded from data.un.org) into a DataFrame.
un_csv_path = './UN/UNdata_Export_20171201_022351573.csv'
df = pd.read_csv(un_csv_path)

# Summary statistics for the numeric columns. In the recorded run these were
# 'Country or Area Code', 'Year(s)' and 'Value'.
# NOTE(review): the recorded count is exactly 100,000 rows, which looks like an
# export row cap -- that would explain why the country list stops partway
# through the alphabet. TODO confirm against the UNdata download limits.
df.describe()

Unnamed: 0,Country or Area Code,Year(s),Value
count,100000.0,100000.0,100000.0
mean,331.31342,2025.24935,151960.9
std,671.264658,43.564441,647318.7
min,4.0,1950.0,5.118
25%,64.0,1988.0,523.0552
50%,152.0,2025.0,7353.869
75%,344.0,2063.0,30547.56
max,5500.0,2100.0,16269260.0


In [169]:
# Bucket the rows by the 'Country or Area' column.
g = df.groupby(by='Country or Area')

# The group labels are the distinct area names present in the file.
g.groups.keys()


dict_keys(['Afghanistan', 'Africa', 'Albania', 'Algeria', 'American Samoa', 'Andorra', 'Angola', 'Anguilla', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Aruba', 'Asia', 'Australia', 'Australia/New Zealand', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'British Virgin Islands', 'Brunei Darussalam', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Caribbean', 'Caribbean Netherlands', 'Cayman Islands', 'Central African Republic', 'Central America', 'Central Asia', 'Chad', 'Channel Islands', 'Chile', 'China', 'China Hong Kong SAR', 'China Macao SAR', 'China Taiwan Province of China', 'Colombia', 'Comoros', 'Congo', 'Cook Islands', 'Costa Rica', 'Croatia', 'Cuba', 'Curaçao', 'Cyprus', 'Czechia', "Cô´te d'Ivoire", "Dem. People's Republic of Korea", 'Democratic Republic of the Co

# Warning: there is something wrong with the data source -- it only goes from Afghanistan to Ecuador; likely not all data was downloaded (`df.describe()` reports exactly 100,000 rows, which suggests the export was truncated)

In [158]:
# Number of distinct 'Country or Area' labels in the data.
len(df['Country or Area'].unique())

74

In [159]:
# Count how many rows each year contributes to the dataset.
# NOTE(review): these are row counts, not numbers of distinct countries -- the
# file holds only 74 unique 'Country or Area' values, so each area presumably
# appears once per variant; TODO confirm.
year_counts = df['Year(s)'].value_counts()  # Series: year -> number of rows

# The recorded run showed three distinct per-year row counts:
#   rows per year:    666   658   657
#   number of years:   88     1    62
year_with_freq_666 = year_counts[year_counts == 666]
year_with_freq_658 = year_counts[year_counts == 658]  # a single year: 2012
year_with_freq_657 = year_counts[year_counts == 657]

# Open questions:
#   - Why do some years appear more often than others?
#   - Does this reflect changes in the countries represented / data collected
#     over time? Which ones?

# Sort by year so the first/last index entries are the endpoints of each range.
dd = year_with_freq_666.sort_index()       # recorded run: 2013 .. 2100
dd_hist = year_with_freq_657.sort_index()  # recorded run: 1950 .. 2011

# Last year of the 657-row (historical) group.
dd_hist.index[-1]

2011

## Q. Which countries are unaccounted for in years with 658/657 countries?

### Q. Which countries are missing in the 658-freq. year vs. the 666-freq. years? (Should be 8 countries)

### Get differences between all index (in this case years with 666 freq in dataset)


In [151]:
# Gaps between consecutive years within each frequency group; a group covers a
# contiguous run of years exactly when every gap equals 1.
xdiff = np.diff(dd.index.values)            # years with 666 rows
xdiff_hist = np.diff(dd_hist.index.values)  # years with 657 rows

# The 658-row group was a single year (2012) in the recorded run, so it is
# trivially contiguous and only the two multi-year groups are tested here.
# `.all()` on an empty gap array is True, matching the "no gap != 1" check.
print("Are the years in 3 groups contiguous?")
print(bool((xdiff == 1).all()))
print(bool((xdiff_hist == 1).all()))

Are the years in 3 groups contiguous?
True
True
