In [190]:
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr


% matplotlib inline

## Ideas for comparing population data

1) Pick select countries- China. US. Brazil, Indonesia, Russian Federation. 

2) Most populous countries

1. China	1,379,302,771	  6. Pakistan	204,924,861
2. India	1,281,935,911	  7. Nigeria	190,632,261
3. United States	326,625,791	  8. Bangladesh	157,826,578
4. Indonesia	260,580,739	  9. Russia	142,257,519
5. Brazil	207,353,391	  10. Japan	126,451,398


ISO:
1. China CHN
2. USA
3. Indonesia IDN
4. Brazil BRA
5. Russian Federation RUS
Bangladesh BGD
Nigeria NGA
Japan JPN
Pakistan PAK

3) Least populated
Least populated:
San Marino SMR
Palau PLW

United Arab Emirates
Singapore?
Norway NOR
Luxembourg LUX

In [191]:
iso_map['India']
iso_map['Pakistan']

'PAK'

In [192]:
highest_populated = ['CHN', 'IND', 'USA', 'BRA', 'PAL', 'RUS', ]
top3_populated = ['CHN', 'IND', 'USA']

## Convenience function

In [193]:
def add_iso_column(_df, _country_name_col):
    # uses iso_map (global dict)
    _df['iso'] = _df[_country_name_col].apply(lambda name: iso_map[name] if iso_map.get(name) else name)
    return _df

def set_iso_as_index(_df, _iso_col_name):
    return _df.set_index(_iso_col_name)

## Open data

In [194]:
dir_path = os.environ['ZERG']
pwt = dir_path + 'data/PWT/pwt9_pivoted.csv'
un = dir_path + 'data/UN/un_population_iso_pivoted.csv'
imf = dir_path + 'data/IMF/imf_population_1214.xls'
wb = dir_path + 'data/WB/wb_population_1215.csv'

In [195]:
pop_pwt = pd.read_csv(pwt)
pop_un = pd.read_csv(un)
pop_wb = pd.read_csv(wb)
pop_imf = pd.read_excel(imf, sheetname='countries_only')

In [196]:
rhg_regions = dir_path + 'data/iso_based_on_rhg_regions.csv'

WARNING: THIS IS SLOW (heavy excel with macros, etc.) - don't re-run
#### or have a separate file just for iso

In [197]:
# open ISO data (RHG_regions)
# this is SLOW
iso = pd.read_csv(rhg_regions)

In [198]:
iso.columns

Index(['country_name_imf', 'ENGLISH SHORT NAME', 'ALPHA-2', 'ALPHA-3',
       'NUMERIC'],
      dtype='object')

In [199]:
iso_map_pre = iso[['country_name_imf', 'ALPHA-3']].set_index('country_name_imf')

In [200]:
iso_map = iso_map_pre.to_dict()['ALPHA-3']
iso_map_rev = {y:x for x,y in iso_map.items()}

In [201]:
#iso_map_rev # map from iso to country nape
#iso_map # map from country name to ISO

In [202]:
pwt_countries=pop_pwt['countrycode']
un_countries=pop_un['ISO3']
wb_countries=pop_wb['Country Code']

In [203]:
iso_map.get('moon')
iso_map.get('United States of America')

### MAP country name to country code

In [204]:
pop_imf['country_code'] = pop_imf['country_name'].apply(lambda name: iso_map[name] if iso_map.get(name) else name)

In [205]:
# imf data doesn't provide country code so needs manual mapping
imf_countries = pop_imf['country_code']

In [206]:
[print(country) for country in pop_imf['country_code'].tolist() if len(country) > 3]

Kosovo
Syria


[None, None]

In [207]:
pwt_set = set(pwt_countries)
un_set = set(un_countries)
wb_set = set(wb_countries)
imf_set = set(imf_countries)

In [208]:
ls = {
    'pwt': pwt_set,
    'un': un_set,
    'wb': wb_set,
    'imf': imf_set
}

for k,v in ls.items():
    print ('{} has {} countries'.format(k,len(v)))

# pwt compare

pwt has 182 countries
un has 233 countries
wb has 217 countries
imf has 192 countries


In [209]:
countries_in_un_not_wb = ls['un'] - ls['wb']
sorted([iso_map_rev[c] for c in countries_in_un_not_wb])

['Anguilla',
 'Bonaire, Sint Eustatius and Saba',
 'Cook Islands',
 'Falkland Islands (Malvinas)',
 'French Guiana',
 'Guadeloupe',
 'Holy See',
 'Martinique',
 'Mayotte',
 'Montserrat',
 'Niue',
 'Réunion',
 'Saint Helena, Ascension and Tristan da Cunha',
 'Saint Pierre and Miquelon',
 'Taiwan Province of China',
 'Tokelau',
 'Wallis and Futuna',
 'Western Sahara']

# countries in imf but not in pwt

In [210]:
countries_in_imf_not_pwt = ls['imf'] - ls['pwt']
     
sorted([iso_map_rev.get(c) for c in countries_in_imf_not_pwt if iso_map_rev.get(c)])

['Afghanistan',
 'Eritrea',
 'Guyana',
 'Kiribati',
 'Libya',
 'Marshall Islands',
 'Micronesia, Fed. States of',
 'Nauru',
 'Palau',
 'Papua New Guinea',
 'Puerto Rico',
 'Samoa',
 'San Marino',
 'Solomon Islands',
 'South Sudan, Republic of',
 'Timor-Leste',
 'Tonga',
 'Tuvalu',
 'Vanuatu']

In [211]:
pop_pwt = pd.read_csv(pwt)
pop_un = pd.read_csv(un)
pop_wb = pd.read_csv(wb)
pop_imf = pd.read_excel(imf, sheetname='countries_only')

## Convenience functions

### Function to retrieve missing # years per country for a given list of countries
```
Given a list of countries

Retrieve # of missing years for each country

Optional: *specify a dataset
Optional: replace Country Name with Country code to be more reusable
```

In [212]:
def get_missing_years(_list_of_countries):
    '''
    Parmeters
    ----
    [Python List] _list_of_countries
        list of countries (each country is a Country name as a String)
        
    Returns
    ----
    [Python List]
        a list of tuple pairs containing (country_name, #_missing_years)
    where country_name (String) and # missing_years (int)
    sorted by decreasing # of missing years
    '''
    missing_years = []
    
    return []

### population data

In [213]:
# replace with NaN
replaced = pop_imf.replace('no data', np.nan)
#print(replaced.isnull().sum())

In [214]:
# histogram code
# df['gdppccountry'].hist(bins=50)
# #df['gdppcstate'].hist(bins=50)
# #df['gdppc_adm0_PWT'].hist(bins=50)
# df['gdppcstate_rescaled'].nunique()

In [215]:
#pop_pwt.head()

### Mungdata data

1) rename indexes to be consistent across data set

2) set iso column as the index

#### Rename cols

In [247]:
# rename column names
#del pop_wb['Unnamed: 0']

# IMF
pop_imf = add_iso_column(pop_imf, 'country_name')

# PWT
pop_pwt.rename(index=str, columns={'countrycode': 'iso', 'country': 'country_name'}, inplace=True)

# WB
pop_wb.rename(index=str, columns={"Country Code": "iso", "Country Name": "country_name"}, inplace=True)

# UN
# del pop_un['Unnamed: 0']
pop_un.rename(index=str, columns={'ISO3': 'iso', 'Location': 'country_name'}, inplace=True)

#### Set ISO as Index

In [253]:
df_imf = set_iso_as_index(pop_imf, 'iso')
df_pwt = set_iso_as_index(pop_pwt, 'iso')
df_wb = set_iso_as_index(pop_wb, 'iso')
df_un = set_iso_as_index(pop_un, 'iso')

In [268]:
df_imf

Unnamed: 0_level_0,country_name,1980,1981,1982,1983,1984,1985,1986,1987,1988,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
iso,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AFG,Afghanistan,no data,no data,no data,no data,no data,no data,no data,no data,no data,...,30.55,31.279,32.007,33.4,36.8,36.867,36.933,37,37.514,38.028
ALB,Albania,2.762,2.817,2.877,2.935,2.996,3.059,3.12,3.182,3.246,...,2.895,2.889,2.881,2.876,2.876,2.874,2.87,2.865,2.861,2.856
DZA,Algeria,18.666,19.246,19.864,20.516,21.175,22.2,22.8,23.4,24.1,...,38.297,39.114,39.963,40.762,41.537,42.326,43.088,43.863,44.609,45.323
AGO,Angola,8.872,9.111,9.352,9.597,9.851,11.101,11.418,11.71,12.027,...,25.038,25.789,26.563,27.36,28.18,29.026,29.897,30.793,31.717,32.669
ATG,Antigua and Barbuda,0.068,0.068,0.067,0.066,0.065,0.064,0.063,0.062,0.061,...,0.087,0.088,0.089,0.09,0.091,0.092,0.093,0.094,0.095,0.096
ARG,Argentina,27.95,28.45,28.93,29.34,29.84,30.35,30.74,31.09,31.47,...,42.203,42.67,43.132,43.6,44.082,44.57,45.062,45.561,46.065,46.574
ARM,Armenia,no data,no data,no data,no data,no data,no data,no data,no data,no data,...,2.98,2.985,2.99,2.991,2.991,2.991,2.991,2.992,2.992,2.992
AUS,Australia,14.802,15.039,15.289,15.484,15.677,15.901,16.139,16.395,16.687,...,23.322,23.673,24.013,24.386,24.764,25.149,25.539,25.936,26.338,26.747
AUT,Austria,7.54,7.556,7.565,7.543,7.544,7.549,7.557,7.567,7.576,...,8.477,8.544,8.63,8.74,8.815,8.885,8.95,9.012,9.069,9.122
AZE,Azerbaijan,no data,no data,no data,no data,no data,no data,no data,no data,no data,...,9.268,9.342,9.417,9.492,9.568,9.645,9.722,9.8,9.878,9.957


In [267]:
print(df_imf.loc['CHN'][1:5])
print(df_pwt.loc['CHN'][1:5])

1980     987.05
1981    1000.72
1982    1016.54
1983    1030.08
Name: CHN, dtype: object
1950        NaN
1951        NaN
1952    566.208
1953    575.723
Name: CHN, dtype: object


### Get China/CHN data

In [259]:
df_imf.loc['CHN']
df_pwt.loc['CHN']
df_wb.loc['CHN']
# df_un.loc['CHN']

country_name          China
1960             6.6707e+08
1961             6.6033e+08
1962             6.6577e+08
1963            6.82335e+08
1964            6.98355e+08
1965            7.15185e+08
1966              7.354e+08
1967             7.5455e+08
1968             7.7451e+08
1969            7.96025e+08
1970            8.18315e+08
1971            8.41105e+08
1972             8.6203e+08
1973             8.8194e+08
1974             9.0035e+08
1975            9.16395e+08
1976            9.30685e+08
1977            9.43455e+08
1978            9.56165e+08
1979            9.69005e+08
1980            9.81235e+08
1981            9.93885e+08
1982            1.00863e+09
1983            1.02331e+09
1984            1.03682e+09
1985            1.05104e+09
1986            1.06679e+09
1987            1.08404e+09
1988            1.10163e+09
1989            1.11865e+09
1990            1.13518e+09
1991            1.15078e+09
1992            1.16497e+09
1993            1.17844e+09
1994            1.19

In [221]:
china_df pd.Dataframe()

SyntaxError: invalid syntax (<ipython-input-221-1fd2fa117c38>, line 1)

In [None]:
pwt9.set_index(['countrycode', 'year']).unstack('year')['rgdpe_pc'].loc[['IND', 'IDN', 'BRA', 'CHN']].T.plot(style='*')

In [None]:
countries_with_complete_data = replaced[~replaced[1950].isnull()]['country']

# Syria 1980-2010, missing data 2011 - 2022
# South Sudan, 2011-2022, missing data 1980-2010
# Afghanistan 2001-2022, missing 1980-2000
# Armenia 1990-2022, missing 1980-1989
# Bahamas, The, 1992-2022, missing 1980-1991