In [1]:
import pandas as pd
import numpy as np
import xlrd
import os
import re
from ddf_utils.str import to_concept_id
from ddf_utils.index import create_index_file



In [2]:
# Limit DataFrame reprs to 20 rows. The fully-qualified option name is used
# because abbreviated keys are resolved by regex and 'max_row' is ambiguous on
# recent pandas (it also matches 'styler.render.max_rows'), which raises
# OptionError.
pd.set_option('display.max_rows', 20)

* Gapminder Documentation 001 – GDP per Capita by Purchasing Power Parities
* Gapminder Documentation 002 – Infant Mortality Rate
* Gapminder Documentation 003 – Total Population
* Gapminder Documentation 004 – Life Expectancy at Birth
* Gapminder Documentation 005 – Under-five mortality rate
* Gapminder Documentation 007 – GDP per capita by PPP for sub-national units
* Gapminder Documentation 008 – Children per woman (total fertility)

In [4]:
# List the sheets of the source workbook. xlrd >= 2.0 only reads legacy .xls
# files, so the workbook is opened through pandas, which selects an engine
# that understands .xlsx. The context manager closes the file handle again.
with pd.ExcelFile('source/gapdata004 v7.xlsx') as workbook:
    sheet_names = workbook.sheet_names
sheet_names

['About', 'Data & meta data', 'List of sources and references', 'Instructions']

In [3]:
# Load the 'Data & meta data' sheet of the source workbook.
# The keyword is `sheet_name`; the old spelling `sheetname` was deprecated and
# then removed from pandas.read_excel, where it raises a TypeError.
data001 = pd.read_excel('source/gapdata004 v7.xlsx', sheet_name='Data & meta data')

In [4]:
# Preview the first five rows of the loaded sheet.
data001.head(n=5)

Unnamed: 0,Area,Year,Life expectancy at birth,"Life expectancy, with interpolations",Data quality,Events 1,Events 1 Wikipedia link,Events 2,Events 2 Wikipedia link,Source,...,Note on the timing of health transition,Other notes,Source II (where our source found their estimate),Alternative data,For disasters: CDR or excess CDR,For disasters: duration,CDR coefficient,For disasters: Source of (excess) CDR,For disasters: Notes on excess CDR,Further research needed
0,Afghanistan,1800,28.211,28.211,"4. Trend, guesstimate",,,,,Gapminder model - extrapolation from earliest ...,...,,,,,,,,,,
1,Afghanistan,1801,,28.200753,"4. Trend, guesstimate",,,,,Interpolation (automatic),...,,,,,,,,,,
2,Afghanistan,1802,,28.190507,"4. Trend, guesstimate",,,,,Interpolation (automatic),...,,,,,,,,,,
3,Afghanistan,1803,,28.18026,"4. Trend, guesstimate",,,,,Interpolation (automatic),...,,,,,,,,,,
4,Afghanistan,1804,,28.170013,"4. Trend, guesstimate",,,,,Interpolation (automatic),...,,,,,,,,,,


In [7]:
# Distinct area names, in order of first appearance in the sheet.
area = pd.unique(data001['Area'])

In [8]:
# One concept id per area name, kept in the same order as `area`.
area_id = [to_concept_id(area_name) for area_name in area]

In [9]:
# Empty entity table with the two columns that are filled in afterwards.
entity_columns = ['area', 'name']
ent = pd.DataFrame(data=[], columns=entity_columns)

In [10]:
# Fill the entity table: concept ids in 'area', original labels in 'name'.
for entity_column, entity_values in (('area', area_id), ('name', area)):
    ent[entity_column] = entity_values

In [11]:
# Write the area entities file one directory up, without the row index.
ent.to_csv(path_or_buf='../ddf--entities--area.csv', index=False)

In [13]:
# Two independent datapoint tables sharing the Area/Year key columns:
# one for the raw indicator, one for the interpolated series.
data001_dp_1 = data001.loc[:, ['Area', 'Year', 'Life expectancy at birth']].copy()
data001_dp_2 = data001.loc[:, ['Area', 'Year', 'Life expectancy, with interpolations']].copy()

In [16]:
# Check which concept id the helper derives from the indicator's display name;
# the result is used as the measure column name in the datapoint tables.
to_concept_id('Life expectancy at birth')

'life_expectancy_at_birth'

In [17]:
# Rename the columns of both datapoint tables to concept ids: the shared
# 'area'/'year' key followed by the table's own measure.
for datapoint_frame, measure_id in (
    (data001_dp_1, 'life_expectancy_at_birth'),
    (data001_dp_2, 'life_expectancy_with_interpolations'),
):
    datapoint_frame.columns = ['area', 'year', measure_id]

In [18]:
# Replace area names by their concept ids in both datapoint tables so they
# line up with the 'area' column of the entities file.
for datapoint_frame in (data001_dp_1, data001_dp_2):
    datapoint_frame['area'] = datapoint_frame['area'].map(to_concept_id)

In [21]:
# Drop rows with any missing value, order by key, and write the datapoints
# file for the raw life expectancy measure.
lex_rows = data001_dp_1.dropna()
lex_rows = lex_rows.sort_values(by=['area', 'year'])
lex_rows.to_csv('../ddf--datapoints--life_expectancy_at_birth--by--area--year.csv', index=False)

In [22]:
# Drop rows with any missing value, order by key, and write the datapoints
# file for the interpolated life expectancy measure.
lex_interp_rows = data001_dp_2.dropna()
lex_interp_rows = lex_interp_rows.sort_values(by=['area', 'year'])
lex_interp_rows.to_csv('../ddf--datapoints--life_expectancy_with_interpolations--by--area--year.csv', index=False)

In [23]:
# Concept ids of the dataset, in the order they are written to the concepts file.
conc = [
    'life_expectancy_at_birth',             # measure
    'life_expectancy_with_interpolations',  # measure
    'area',                                 # entity domain
    'year',                                 # time
    'name',                                 # string property
]


In [24]:
# Empty concepts table; its three columns are filled in afterwards.
concept_columns = ['concept', 'name', 'concept_type']
cdf = pd.DataFrame(data=[], columns=concept_columns)

In [25]:
# Fill the concepts table column by column; the three lists are parallel,
# i.e. position i describes concept conc[i].
concept_names = ['Life expectancy at birth', 'Life expectancy, with interpolations', 'Area', 'Year', 'Name']
concept_types = ['measure', 'measure', 'entity_domain', 'time', 'string']
for concept_column, concept_values in (
    ('concept', conc),
    ('name', concept_names),
    ('concept_type', concept_types),
):
    cdf[concept_column] = concept_values

In [26]:
# Write the concepts file one directory up, without the row index.
cdf.to_csv(path_or_buf='../ddf--concepts.csv', index=False)

In [27]:
# Build the index for the dataset in the parent directory, where the CSV files
# were written. The returned table (key / value / file) is displayed below.
# NOTE(review): whether this also writes an index file to disk is decided
# inside ddf_utils -- confirm there.
create_index_file('../')

Unnamed: 0,key,value,file
0,concept,name,ddf--concepts.csv
1,concept,concept_type,ddf--concepts.csv
0,"area,year",life_expectancy_at_birth,ddf--datapoints--life_expectancy_at_birth--by-...
0,"area,year",life_expectancy_with_interpolations,ddf--datapoints--life_expectancy_with_interpol...
0,area,name,ddf--entities--area.csv


In [6]:
# Distinct non-missing labels used in the 'Data quality' column.
quality_labels = data001['Data quality'].dropna()
quality_labels.unique()

array(['4. Trend, guesstimate', '3. Yearly, guesstimate', '2. Trend, data',
       '1. Yearly, data'], dtype=object)

In [7]:
import ddf_utils.ddf_reader as ddf

# Directory assigned to the reader's SEARCH_PATH setting.
# NOTE(review): presumably the folder in which dataset names passed to
# ddf.ddf_datapoint are looked up -- confirm against ddf_utils.
# The path can be overridden with the DDF_SEARCH_PATH environment variable so
# the notebook also runs on other machines; the original absolute path stays
# as the default.
ddf.SEARCH_PATH = os.environ.get('DDF_SEARCH_PATH', '/Users/semio/src/work/Gapminder/')

In [13]:
# Data quality datapoints from the gapminder_world dataset, restricted to
# the rows whose geo is 'afg'.
world_quality = ddf.ddf_datapoint('ddf--gapminder--gapminder_world', 'data_quality_life_expectancy')
a0 = world_quality.query("geo == 'afg'")

In [15]:
# Display the 'afg' rows selected from the gapminder_world data quality datapoints.
a0

Unnamed: 0,geo,data_quality_life_expectancy,time
0,afg,4,1800
1,afg,4,1801
2,afg,4,1802
3,afg,4,1803
4,afg,4,1804
5,afg,4,1805
6,afg,4,1806
7,afg,4,1807
8,afg,4,1808
9,afg,4,1809


In [14]:
# Data quality datapoints from the ddf--g-hist_lex dataset, restricted to
# area 'afghanistan' and years before 2015.
hist_quality = ddf.ddf_datapoint('ddf--g-hist_lex', 'data_quality')
a = hist_quality.query("area == 'afghanistan' & year < 2015")

In [18]:
# Check that both datasets carry the same data quality value for every year.
# Comparing the two columns directly with `==` requires identical row labels,
# which only holds when the leftover index labels of the two filtered frames
# happen to coincide (otherwise pandas raises ValueError). Keying both series
# on the year makes the comparison independent of row labels and row order.
hist_by_year = a.set_index('year')['data_quality'].sort_index()
world_by_year = a0.set_index('time')['data_quality_life_expectancy'].sort_index()
(
    # Same set of years on both sides ...
    hist_by_year.index.equals(world_by_year.index)
    # ... and the same value for each of them.
    and bool(np.all(hist_by_year.values == world_by_year.values))
)

True