In [3]:
import os

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr


% matplotlib inline

## PWT Data Analysis

**Data & Scope**

* pwt_income_adm1.dta' - from U Chicago - Jingyuan
* pwt90.xlsx  (excel)
* pwt90.dta - direct download from PWT link

**Assumptions/Expectations**
* Has both historical/current estimates to future projection data.
* Has several types of Income (GDP) data: national accounts based, output-based using PPP (current and constant), expenditure-based using PPP (current and constant).
* All GDP values in 2011 millions of USD
* Population data in millions

**Analysis Goals**

**Currently Out of Scope**
Future projection X

**Conclusion**


**Questions**
1. What is a national accounts data sources?
2. Why does U Chicago team use `gdppcna`? (national-accounts based) Confirm with Jingyuan this is true

In [4]:
#os.listdir()
#  'pwt_income_adm1.dta' - from U Chicago - Jingyuan
#   pwt90.xlsx 
#   pwt90.dta - direct download from PWT link
# where wtf99.dta file from?

## Open PWT (raw) Data

In [5]:
pwt9 = pd.read_excel('pwt90.xlsx', sheetname='Data')

### Get variable names

In [6]:
pwt9['year'].unique().min(), pwt9['year'].unique().max()

(1950, 2014)

In [7]:
pwt9.columns

Index(['countrycode', 'country', 'currency_unit', 'year', 'rgdpe', 'rgdpo',
       'pop', 'emp', 'avh', 'hc', 'ccon', 'cda', 'cgdpe', 'cgdpo', 'ck',
       'ctfp', 'cwtfp', 'rgdpna', 'rconna', 'rdana', 'rkna', 'rtfpna',
       'rwtfpna', 'labsh', 'delta', 'xr', 'pl_con', 'pl_da', 'pl_gdpo',
       'i_cig', 'i_xm', 'i_xr', 'i_outlier', 'cor_exp', 'statcap', 'csh_c',
       'csh_i', 'csh_g', 'csh_x', 'csh_m', 'csh_r', 'pl_c', 'pl_i', 'pl_g',
       'pl_x', 'pl_m', 'pl_k'],
      dtype='object')

### GDP analysis

In [8]:
#pwt9.columns
sub_vars = ['countrycode', 'country', 'currency_unit', 'year', 'pop']
gdps = ['rgdpe', 'rgdpo', 'rgdpna', 'cgdpe', 'cgdpo']
gdps_cp = ['rgdpe_pc', 'rgdpo_pc', 'rgdpna_pc', 'cdgec_pc_na', 'rgdpo_pc_pc']
sub_vars.extend(gdps)
print (sub_vars)
pwt9[sub_vars]

for gdp in gdps:
    pc_gdp = gdp + '_pc'
    pwt9[pc_gdp] = pwt9[gdp]/pwt9['pop']
    
#print (pwt9)

['countrycode', 'country', 'currency_unit', 'year', 'pop', 'rgdpe', 'rgdpo', 'rgdpna', 'cgdpe', 'cgdpo']


In [9]:
arr = sub_vars.extend(gdps)
sub_vars

['countrycode',
 'country',
 'currency_unit',
 'year',
 'pop',
 'rgdpe',
 'rgdpo',
 'rgdpna',
 'cgdpe',
 'cgdpo',
 'rgdpe',
 'rgdpo',
 'rgdpna',
 'cgdpe',
 'cgdpo']

In [10]:
pwt9.columns

Index(['countrycode', 'country', 'currency_unit', 'year', 'rgdpe', 'rgdpo',
       'pop', 'emp', 'avh', 'hc', 'ccon', 'cda', 'cgdpe', 'cgdpo', 'ck',
       'ctfp', 'cwtfp', 'rgdpna', 'rconna', 'rdana', 'rkna', 'rtfpna',
       'rwtfpna', 'labsh', 'delta', 'xr', 'pl_con', 'pl_da', 'pl_gdpo',
       'i_cig', 'i_xm', 'i_xr', 'i_outlier', 'cor_exp', 'statcap', 'csh_c',
       'csh_i', 'csh_g', 'csh_x', 'csh_m', 'csh_r', 'pl_c', 'pl_i', 'pl_g',
       'pl_x', 'pl_m', 'pl_k', 'rgdpe_pc', 'rgdpo_pc', 'rgdpna_pc', 'cgdpe_pc',
       'cgdpo_pc'],
      dtype='object')

### Find out if all years occur with same frequency - YES (65 years, each with 182 occurences)

This is expected as any missing year for a country listed is extrapolated or interpolated.

In [11]:
years = pwt9['year']
years.value_counts() 
years.sort_values().value_counts().unique() # 182 only
# all years appear 182 times
years.max() # 2014
years.min() # 1950
#2014- 1950 #~ 64 years span or 65 years total years

# get length of years array
# 11830
years.size == 65*182 # 65 years * freq_of_year 

True

In [12]:
#pwt9['currency_unit'].nunique() # 135 currency
# Note: we are only interested in USD equivalent that is translated into USD via PPP

In [13]:
# Get variables
pwt9.columns

Index(['countrycode', 'country', 'currency_unit', 'year', 'rgdpe', 'rgdpo',
       'pop', 'emp', 'avh', 'hc', 'ccon', 'cda', 'cgdpe', 'cgdpo', 'ck',
       'ctfp', 'cwtfp', 'rgdpna', 'rconna', 'rdana', 'rkna', 'rtfpna',
       'rwtfpna', 'labsh', 'delta', 'xr', 'pl_con', 'pl_da', 'pl_gdpo',
       'i_cig', 'i_xm', 'i_xr', 'i_outlier', 'cor_exp', 'statcap', 'csh_c',
       'csh_i', 'csh_g', 'csh_x', 'csh_m', 'csh_r', 'pl_c', 'pl_i', 'pl_g',
       'pl_x', 'pl_m', 'pl_k', 'rgdpe_pc', 'rgdpo_pc', 'rgdpna_pc', 'cgdpe_pc',
       'cgdpo_pc'],
      dtype='object')

In [14]:
# Manually add per capita (_pc) data for eah GW]
pwt9['rgdpna_pc'] = pwt9['rgdpna'] / pwt9['pop']

pwt9['cdgpo_pc'] = pwt9['cgdpo'] / pwt9['pop']
pwt9['cgdpe_pc'] = pwt9['cgdpe'] / pwt9['pop']

pwt9['rgdpo_pc'] = pwt9['rgdpo'] / pwt9['pop']
pwt9['rgdpe-pc'] = pwt9['rgdpe'] / pwt9['pop']

test = pwt9[['country','year', 'rgdpna_pc']]

### Pivot table (same as WB data shape - years as columns)

In [15]:
df_gdp = pwt9[['country', 'countrycode', 'year', 'pop', 'rgdpe']]
print(df_gdp['pop'].isnull().value_counts())

print(df_gdp['rgdpe'].isnull().value_counts())

False    9439
True     2391
Name: pop, dtype: int64
False    9439
True     2391
Name: rgdpe, dtype: int64


In [16]:
## 12/14 evauation continued...

df_pop = pwt9[['country', 'countrycode', 'year', 'pop']]

pivoted=df_pop.pivot_table(index='country', columns='year', values='pop')
pivoted.to_csv('pwt9_pivoted.csv', sep=',', encoding='utf-8')
reindexed_pop = pivoted.reset_index()

### a) get countries with complete data

In [17]:
countries_with_complete_data = reindexed_pop[~reindexed_pop[1950].isnull()]['country']
countries_with_complete_data_list = countries_with_complete_data.tolist()
len(countries_with_complete_data)

55

In [18]:
print(countries_with_complete_data_list)

['Argentina', 'Australia', 'Austria', 'Belgium', 'Bolivia (Plurinational State of)', 'Brazil', 'Canada', 'Colombia', 'Costa Rica', 'Cyprus', 'D.R. of the Congo', 'Denmark', 'Ecuador', 'Egypt', 'El Salvador', 'Ethiopia', 'Finland', 'France', 'Germany', 'Guatemala', 'Honduras', 'Iceland', 'India', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kenya', 'Luxembourg', 'Mauritius', 'Mexico', 'Morocco', 'Netherlands', 'New Zealand', 'Nicaragua', 'Nigeria', 'Norway', 'Pakistan', 'Panama', 'Peru', 'Philippines', 'Portugal', 'South Africa', 'Spain', 'Sri Lanka', 'Sweden', 'Switzerland', 'Thailand', 'Trinidad and Tobago', 'Turkey', 'Uganda', 'United Kingdom', 'United States', 'Uruguay', 'Venezuela (Bolivarian Republic of)']


### Get countries missing data up to 1989

In [19]:
# get countries missing data on 1989
missing_before_1990=reindexed_pop[reindexed_pop[1989].isnull()]

In [59]:
missing_before_1990_list = missing_before_1990[['country', 1989,1990]]['country'].tolist()

print(len(missing_before_1990_list))

25


In [21]:
missing_before_1970=reindexed_pop[reindexed_pop[1969].isnull()]

In [22]:
len(missing_before_1970)-23-2

43

In [23]:
print(sorted(list(set(missing_before_1970['country']) - set(missing_before_1990['country']))))

['Albania', 'Angola', 'Anguilla', 'Antigua and Barbuda', 'Aruba', 'Bahamas', 'Bahrain', 'Belize', 'Bermuda', 'Bhutan', 'British Virgin Islands', 'Brunei Darussalam', 'Bulgaria', 'Cambodia', 'Cayman Islands', 'China, Macao SAR', 'Djibouti', 'Dominica', 'Grenada', 'Hungary', 'Iraq', 'Kuwait', "Lao People's DR", 'Lebanon', 'Maldives', 'Mongolia', 'Montserrat', 'Oman', 'Poland', 'Qatar', 'Saint Kitts and Nevis', 'Saint Lucia', 'Sao Tome and Principe', 'Saudi Arabia', 'St. Vincent and the Grenadines', 'State of Palestine', 'Sudan (Former)', 'Suriname', 'Swaziland', 'Turks and Caicos Islands', 'United Arab Emirates', 'Viet Nam', 'Yemen']


In [24]:
missing_before_1970_list = missing_before_1970[['country', 1969,1970]]['country'].tolist()
print(missing_before_1970_list)

['Albania', 'Angola', 'Anguilla', 'Antigua and Barbuda', 'Armenia', 'Aruba', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Belarus', 'Belize', 'Bermuda', 'Bhutan', 'Bosnia and Herzegovina', 'British Virgin Islands', 'Brunei Darussalam', 'Bulgaria', 'Cambodia', 'Cayman Islands', 'China, Macao SAR', 'Croatia', 'Curaçao', 'Czech Republic', 'Djibouti', 'Dominica', 'Estonia', 'Georgia', 'Grenada', 'Hungary', 'Iraq', 'Kazakhstan', 'Kuwait', 'Kyrgyzstan', "Lao People's DR", 'Latvia', 'Lebanon', 'Lithuania', 'Maldives', 'Mongolia', 'Montenegro', 'Montserrat', 'Oman', 'Poland', 'Qatar', 'Republic of Moldova', 'Russian Federation', 'Saint Kitts and Nevis', 'Saint Lucia', 'Sao Tome and Principe', 'Saudi Arabia', 'Serbia', 'Sint Maarten (Dutch part)', 'Slovakia', 'Slovenia', 'St. Vincent and the Grenadines', 'State of Palestine', 'Sudan (Former)', 'Suriname', 'Swaziland', 'TFYR of Macedonia', 'Tajikistan', 'Turkmenistan', 'Turks and Caicos Islands', 'Ukraine', 'United Arab Emirates', 'Uzbekistan', 'Viet Na

In [25]:
missing_before_1960=reindexed_pop[reindexed_pop[1959].isnull()]

In [26]:
not_missing_1959=reindexed_pop[~reindexed_pop[1959].isnull()]

In [27]:
len(not_missing_1959)

75

In [28]:
print(sorted(set(not_missing_1959['country']) - set(countries_with_complete_data)))

['Bangladesh', 'Benin', 'Burkina Faso', 'Chile', 'China', 'Dominican Republic', 'Ghana', 'Greece', 'Guinea', 'Iran (Islamic Republic of)', 'Jamaica', 'Jordan', 'Malawi', 'Malaysia', 'Malta', 'Paraguay', 'Republic of Korea', 'Taiwan', 'Zambia', 'Zimbabwe']


In [29]:
len(missing_before_1960) - 43-23-2

print(sorted(list(set(missing_before_1960['country']) - set(missing_before_1970['country']))))

['Algeria', 'Barbados', 'Botswana', 'Burundi', 'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad', 'China, Hong Kong SAR', 'Comoros', 'Congo', "Côte d'Ivoire", 'Equatorial Guinea', 'Fiji', 'Gabon', 'Gambia', 'Guinea-Bissau', 'Haiti', 'Indonesia', 'Lesotho', 'Liberia', 'Madagascar', 'Mali', 'Mauritania', 'Mozambique', 'Myanmar', 'Namibia', 'Nepal', 'Niger', 'Romania', 'Rwanda', 'Senegal', 'Seychelles', 'Sierra Leone', 'Singapore', 'Syrian Arab Republic', 'Togo', 'Tunisia', 'U.R. of Tanzania: Mainland']


In [30]:
def how_many_years_missing(_df, _country):
    return _df.loc[_country].isnull().value_counts()

how_many_years_missing(pivoted, 'Greece')

False    64
True      1
Name: Greece, dtype: int64

In [31]:
#pivoted.loc['Republic of Korea']

### GET GDP missing country-year


In [32]:
pwt9.columns

Index(['countrycode', 'country', 'currency_unit', 'year', 'rgdpe', 'rgdpo',
       'pop', 'emp', 'avh', 'hc', 'ccon', 'cda', 'cgdpe', 'cgdpo', 'ck',
       'ctfp', 'cwtfp', 'rgdpna', 'rconna', 'rdana', 'rkna', 'rtfpna',
       'rwtfpna', 'labsh', 'delta', 'xr', 'pl_con', 'pl_da', 'pl_gdpo',
       'i_cig', 'i_xm', 'i_xr', 'i_outlier', 'cor_exp', 'statcap', 'csh_c',
       'csh_i', 'csh_g', 'csh_x', 'csh_m', 'csh_r', 'pl_c', 'pl_i', 'pl_g',
       'pl_x', 'pl_m', 'pl_k', 'rgdpe_pc', 'rgdpo_pc', 'rgdpna_pc', 'cgdpe_pc',
       'cgdpo_pc', 'cdgpo_pc', 'rgdpe-pc'],
      dtype='object')

In [45]:
list_of_cols = ['countrycode', 'country', 'year', 'rgdpe', 'rgdpna', 'rgdpo']
pwt_gdp = pwt9[list_of_cols]
pwt_gdp.head(3)

Unnamed: 0,countrycode,country,year,rgdpe,rgdpna,rgdpo
0,ABW,Aruba,1950,,,
1,ABW,Aruba,1951,,,
2,ABW,Aruba,1952,,,


In [47]:
print(pwt_gdp['rgdpe'].isnull().value_counts())  # same number as pop

False    9439
True     2391
Name: rgdpe, dtype: int64


In [48]:
pivoted_rgdpe=pwt_gdp.pivot_table(index='country', columns='year', values='rgdpe')
pivoted_rgdpe.to_csv('pwt9_rgdpe_pivoted.csv', sep=',', encoding='utf-8')
reindexed_rgdpe = pivoted_rgdpe.reset_index()

In [52]:
countries_with_complete_data_rgdpe = reindexed_rgdpe[~reindexed_rgdpe[1950].isnull()]['country']
countries_with_complete_data_rgdpe.tolist()
len(countries_with_complete_data_rgpde)

55

In [56]:
missing_before_1990_rgdpe=reindexed_rgdpe[reindexed_rgdpe[1989].isnull()]

In [58]:
missing_before_1990_list2 = missing_before_1990_rgdpe[['country', 1989,1990]]['country'].tolist()

print(len(missing_before_1990_list2))

25


## Open UChicago Data

In [33]:
chicago_data = pd.read_stata('pwt_income_adm1.dta')

In [41]:

chicago_data.shape
chicago_data.columns

Index(['countryname', 'countrycode', 'region', 'year', 'gdppccountry',
       'gdppcstate', 'gdppc_adm0_PWT', 'gdppcstate_rescaled'],
      dtype='object')

In [34]:
chicago_data.columns
chicago_data.describe()
chicago_data.size
# chicago_data[10000:10030]

706864

In [35]:
# chicago_data[['countryname', 'year', 'gdppc_adm0_PWT']]
