# Core Data
- This program downloads data from the Google BigQuery public World Development Indicators (WDI) dataset.
- The program downloads the main data for GDP, consumption, etc. and a separate dataset for population
- The program then loads each table into a local database

In [35]:
import os
from google.cloud import bigquery
import sqlalchemy
# SQLAlchemy engine for the local Postgres database that receives the tables.
# The connection URL can be supplied through the WDI_DATABASE_URL environment
# variable so credentials do not have to be edited into the notebook.
# NOTE(review): the fallback below still embeds a password in source; it is
# kept only so existing local setups keep working -- prefer the env variable.
engine = sqlalchemy.create_engine(
    os.environ.get(
        "WDI_DATABASE_URL",
        'postgresql://mitch:password@localhost:5432/wdi',
    )
)

# Use this Google Cloud project unless GCLOUD_PROJECT is already set in the
# environment (setdefault never overwrites an existing value).
os.environ.setdefault("GCLOUD_PROJECT", "enduring-rush-410922")
client = bigquery.Client()

## Main Data

In [38]:
# BigQuery SQL for the main table: one row per country / indicator / year with
# the raw value and its percent change versus the previous row of the same
# country and indicator (ordered by year). NULLIF turns a zero previous value
# into NULL so the growth rate is NULL instead of a division-by-zero error.
QUERY = """
WITH country_data AS ( -- CTE for country descriptive data
  SELECT country_code, 
  short_name country,
  region, 
  income_group 
  FROM bigquery-public-data.world_bank_wdi.country_summary
)
SELECT data.country_code, country,
CASE -- rename indicators to readable
    WHEN indicator_code = "NY.GDP.MKTP.KD" THEN "GDP"
    WHEN indicator_code = "NE.CON.TOTL.KD" THEN "Consumption"
    WHEN indicator_code = "NE.GDI.FTOT.KD" THEN "Investment"
    WHEN indicator_code = "NE.EXP.GNFS.KD" THEN "Exports"
    WHEN indicator_code = "NE.IMP.GNFS.KD" THEN "Imports"
    WHEN indicator_code = "SP.POP.TOTL" THEN "Population"
    WHEN indicator_code = "SP.DYN.CBRT.IN" THEN "Birth_Rate"
END AS indicator,
year,
value, 
ROUND(
  (100*(value - LAG(value, 1) OVER(PARTITION BY data.country_code, indicator_code ORDER BY year ASC)))
  / (
    NULLIF(LAG(value, 1) OVER(PARTITION BY data.country_code, indicator_code ORDER BY year ASC), 0)
    )
, 2) AS growth_rate,
region, 
income_group
FROM bigquery-public-data.world_bank_wdi.indicators_data data 
LEFT JOIN country_data
ON data.country_code = country_data.country_code
WHERE indicator_code IN ( -- restrict data to GDP, consumption, investment, exports, ...
    "NY.GDP.MKTP.KD", --GDP
    "NE.CON.TOTL.KD", --Consumption
    "NE.GDI.FTOT.KD", --Investment
    "NE.EXP.GNFS.KD", --Exports
    "NE.IMP.GNFS.KD",  --Imports
    "SP.POP.TOTL", --Population
    "SP.DYN.CBRT.IN" --Birth Rate
  ) 
AND country_data.region IS NOT NULL  --IMPORTANT: removes 'aggregations' such as 'latin america', 'world', etc.
AND country_data.income_group IS NOT NULL 
ORDER BY country_code, indicator, year
"""

In [39]:
# Send the main-indicators SQL to BigQuery (this is the API request),
# then pull the finished job's result set into a pandas DataFrame.
query_job = client.query(query=QUERY)
df = query_job.to_dataframe()

In [None]:
# Output for Tableau: export the main data with human-readable column headers.
def rename(x):
    """Convert a snake_case column name to a Title Case label ('growth_rate' -> 'Growth Rate')."""
    return x.replace('_', ' ').title()


# Rename on a copy rather than assigning to df.columns: df is written to the
# SQL database in a later cell, and mutating it here would change that table's
# column names depending on whether this cell had been run first.
df_tableau = df.rename(columns=rename)
df_tableau.to_csv('../data/main.csv', index=False)
df_tableau

In [42]:
# Write the main frame to the local database as table 'main',
# replacing any existing table of that name and leaving out the index.
db = df.to_sql(
    name='main',
    con=engine,
    if_exists='replace',
    index=False,
)

## Population
- This query creates a table for population and population growth

In [58]:
# BigQuery SQL for the population table: one row per country and year for the
# SP.POP.TOTL indicator, with the value cast to an integer and
# population_growth computed as the percent change versus the previous row of
# the same country (ordered by year), rounded to 2 decimals. NULLIF makes the
# growth NULL when the previous value is 0 instead of dividing by zero.
# Rows whose country has no region or income group are filtered out.
QUERY_POP = """
WITH country_data AS ( -- collect country_data to remove rows without region or income group (e.g. not countries)
  SELECT country_code, 
  region, 
  income_group 
  FROM bigquery-public-data.world_bank_wdi.country_summary
)
SELECT data.country_code, 
year,
CAST(value AS INTEGER) population, 
ROUND(
  (100*(value - LAG(value, 1) OVER(PARTITION BY data.country_code ORDER BY year ASC)))
  / (
    NULLIF(LAG(value, 1) OVER(PARTITION BY data.country_code ORDER BY year ASC), 0)
    )
, 2) AS population_growth, 
region, 
income_group
FROM bigquery-public-data.world_bank_wdi.indicators_data data 
LEFT JOIN country_data
ON data.country_code = country_data.country_code
WHERE indicator_code IN (
    "SP.POP.TOTL" --Population
  ) 
AND country_data.region IS NOT NULL
AND country_data.income_group IS NOT NULL
ORDER BY country_code, year
"""

In [59]:
# Submit the population SQL to BigQuery (API request); the result is fetched in the next cell.
query_job = client.query(query=QUERY_POP)

In [60]:
# Convert the population query job's result into a DataFrame and display it.
df_pop = query_job.to_dataframe()
df_pop

Unnamed: 0,country_code,year,population,population_growth,region,income_group
0,ABW,1960,54208,,Latin America & Caribbean,High income
1,ABW,1961,55434,2.26,Latin America & Caribbean,High income
2,ABW,1962,56234,1.44,Latin America & Caribbean,High income
3,ABW,1963,56699,0.83,Latin America & Caribbean,High income
4,ABW,1964,57029,0.58,Latin America & Caribbean,High income
...,...,...,...,...,...,...
13129,ZWE,2016,14030338,1.56,Sub-Saharan Africa,Lower middle income
13130,ZWE,2017,14236599,1.47,Sub-Saharan Africa,Lower middle income
13131,ZWE,2018,14438812,1.42,Sub-Saharan Africa,Lower middle income
13132,ZWE,2019,14645473,1.43,Sub-Saharan Africa,Lower middle income


In [62]:
# Write the population frame to the local database as table 'pop',
# replacing any existing table of that name and leaving out the index.
db_pop = df_pop.to_sql(
    name='pop',
    con=engine,
    if_exists='replace',
    index=False,
)