In [1]:
%matplotlib widget
import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pint
import pint_pandas
import pathlib
import tqdm
u = pint.UnitRegistry()

pandas DataFrame in traditional tidy format:

* the index is meaningless (row numbers)
* each column contains one variable
* each observation forms a row
* each type of observational unit forms a DataFrame
* columns ordered from fixed variables (left) to measured variables; within each group, alphabetical order
* rows ordered using the leftmost variable, breaking ties with variables further right

Even given these constraints, there are still some variables in the data representation:

* duplicate information (like countrycode and full countryname) in a single DataFrame vs. multiple DataFrames (only countrycode in the DataFrame, second DataFrame maps countrycodes to countrynames)
* units as composite types (e.g. pint) vs. units as a variable

Because the data structure relies on multiple, possibly joinable DataFrames, there is a need for a container, which is not yet defined.

# representation
* load all example data sets into the data structure at once
* measure memory use
* export the data sets to CSV
* reimport from CSV
* export the data sets to some kind of native storage format
* reimport from the native storage format

## Read in the data

In [2]:
!ls ../../primap-requirements/example_data_sets/

Emissions_Agriculture_Agriculture_total_E_All_Data.csv
lak-download.csv
mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv
PRIMAPDB.mat
PRIMAP-hist_v2.1_09-Nov-2019.csv


In [3]:
p = pathlib.Path('../../primap-requirements/example_data_sets/')

### read in FAO emissions agriculture total fao_emi_total

In [4]:
years = list(range(1961, 2017)) + [2030, 2050]
dtypes = {}
for year in years:
    dtypes[f'Y{year}'] = np.float

dtypes['Area'] = pd.CategoricalDtype(ordered=True)
dtypes['Item'] = pd.CategoricalDtype(ordered=True)
dtypes['Element'] = 'str'

fao_emi_total = pd.read_csv(
    p / 'Emissions_Agriculture_Agriculture_total_E_All_Data.csv',
    encoding='latin_1',
    dtype=dtypes,
    usecols=dtypes.keys(),
    index_col=[0, 1, 2])

fao_emi_total = fao_emi_total.stack()

index = fao_emi_total.index.set_names('Date', level=3).to_frame()

index['Date'] = pd.to_datetime(index['Date'], format='Y%Y')
index.loc[index['Element'] == 'Emissions (CH4)', 'Element'] = 'CH4'
index.loc[index['Element'] == 'Emissions (CO2eq)', 'Element'] = 'CO2eq'
index.loc[index['Element'] == 'Emissions (CO2eq) from CH4', 'Element'] = 'CH4 (CO2eq)'
index.loc[index['Element'] == 'Emissions (CO2eq) from N2O', 'Element'] = 'N2O (CO2eq)'
index.loc[index['Element'] == 'Emissions (N2O)', 'Element'] = 'N2O'
index['Element'] = index['Element'].astype(pd.CategoricalDtype(ordered=True))

fao_emi_total.index = pd.MultiIndex.from_frame(index)

fao_emi_total = fao_emi_total.reorder_levels(['Area', 'Date', 'Element', 'Item'])
fao_emi_total.sort_index(inplace=True)

fao_emi_total.name = 'Emissions'

fao_emi_total = fao_emi_total.astype('pint[Gg]')

  result = self._run_cell(


In [5]:
fao_emi_total.index.levels[2]

CategoricalIndex(['CH4', 'CH4 (CO2eq)', 'CO2eq', 'N2O', 'N2O (CO2eq)'], categories=['CH4', 'CH4 (CO2eq)', 'CO2eq', 'N2O', 'N2O (CO2eq)'], ordered=True, name='Element', dtype='category')

### read in emission data for German bundesländer lak_emi_energy

In [6]:
dtypes = {}
for i in ['Insgesamt', 'Steinkohle', 'Braunkohle', 'Mineralöle und Mineralöl produkte', 'Gase', 'Abfälle (nicht biogen)', 'Andere Energieträger']:
    dtypes[i] = np.float
dtypes['Land'] = pd.CategoricalDtype(ordered=True)
dtypes['Jahr'] = str

lak_emi_energy = pd.read_csv(
    p / 'lak-download.csv',
    encoding='latin_1',
    dtype=dtypes,
    usecols=dtypes.keys(),
    index_col=[0, 1],
    sep=';',
    skiprows=4,
    nrows=464)

lak_emi_energy = lak_emi_energy.stack()

lak_emi_energy.index = (lak_emi_energy.index
                        .set_levels(pd.to_datetime(lak_emi_energy.index.levels[1], format='%Y'), level=1)
                        .set_levels(lak_emi_energy.index.levels[2].astype(pd.CategoricalDtype()), level=2)
                        .set_names(['Area', 'Date', 'Category'])
                       )
lak_emi_energy = lak_emi_energy.reorder_levels(['Area', 'Category', 'Date', ])
lak_emi_energy.sort_index(inplace=True)
lak_emi_energy.name = 'Emissions'
lak_emi_energy = lak_emi_energy.astype('pint[kilometric_ton]')

In [7]:
lak_emi_energy

Area               Category                Date      
Baden-Württemberg  Abfälle (nicht biogen)  1990-01-01     595.6759999999999
                                           1991-01-01               533.609
                                           1992-01-01              582.9358
                                           1993-01-01              582.8823
                                           1994-01-01                513.03
                                                                ...        
Thüringen          Steinkohle              2013-01-01    60.221564691110004
                                           2014-01-01      60.3047505431081
                                           2015-01-01     55.85147852397861
                                           2016-01-01    51.744334669214496
                                           2017-01-01    55.785371808505296
Name: Emissions, Length: 2713, dtype: pint[kilometric_ton]

### Read in CO2 mole fraction time series cmip_conc_co2

In [8]:
!tail -n4 $p/mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv

735857,2014,9,15,15-Sep-2014 00:00:00,395.026810069061,394.318660838738,395.734959299383
735887,2014,10,15,15-Oct-2014 00:00:00,396.668759675911,397.456652804706,395.880866547116
735918,2014,11,15,15-Nov-2014 00:00:00,398.189074185698,400.321219754153,396.056928617242
735948,2014,12,15,15-Dec-2014 00:00:00,399.179691756507,402.195087926915,396.164295586098


In [9]:
dtypes = {
    'data_mean_global': np.float,
    'data_mean_nh': np.float,
    'data_mean_sh': np.float,
}
cmip_conc_co2 = pd.read_csv(
    p / 'mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv',
    usecols=dtypes.keys(),
    #index_col=[0, 1],
)
# cannot use datetime here, because the range of possible values reaches back only into the 17th century
cmip_conc_co2.index = pd.period_range(start=pd.Period(year=0, month=1, freq='M'), end=pd.Period(year=2014, month=12, freq='M'), name='Date')

cmip_conc_co2 = cmip_conc_co2.stack()

cmip_conc_co2.index = (cmip_conc_co2.index
                       .set_levels(cmip_conc_co2.index.levels[1].str[len('data_mean_'):].astype(pd.CategoricalDtype(ordered=True)), level=1)
                       .set_names('Area', level=1)
                      )

cmip_conc_co2 = cmip_conc_co2.reorder_levels(['Area', 'Date', ])
cmip_conc_co2.sort_index(inplace=True)

cmip_conc_co2.name = 'CO2 concentration'

cmip_conc_co2 = cmip_conc_co2.astype('pint[dimensionless]')
cmip_conc_co2 *= 1e-6

In [10]:
cmip_conc_co2

Area    Date   
global  0-01       0.00027836643883702005
        0-02         0.000278713058010423
        0-03         0.000279013835279965
        0-04       0.00027920961334856496
        0-05         0.000279106444207592
                            ...          
sh      2014-08    0.00039577037859628094
        2014-09      0.000395734959299383
        2014-10      0.000395880866547116
        2014-11    0.00039605692861724196
        2014-12    0.00039616429558609803
Name: CO2 concentration, Length: 72540, dtype: pint[dimensionless]

### Read in PRIMAPDB

In [11]:
import read_primapdb as rp

In [12]:
mfh = rp.MatFileHandle.from_matfile(p / 'PRIMAPDB.mat')

In [13]:
def extract_df(sheet):
    a, df = rp.primap_sheet_to_pandas(sheet)

    df = df.stack()
    
    index = df.index.to_frame()
    index.columns = ['Date', 'Area']
    index['Area'] = index['Area'].astype(pd.CategoricalDtype(ordered=True))
    
    try:
        index['Date'] = pd.to_datetime(index['Date'], format='%Y')
    except ValueError:  # outside the range of datetime64
        index['Date'] = [pd.Period(year=x, freq='Y') for x in index['Date']]
    
    df.index = pd.MultiIndex.from_frame(index)
    
    df = df.reorder_levels(['Area', 'Date', ])
    df.sort_index(inplace=True)
    
    df.name = a.pop('entity')

    unit = a.pop('unit')    
    if unit == 'ThousandPers':
        df = df.astype('pint[dimensionless]')
        df *= 1000
    elif unit == 'Gg':
        df = df.astype('pint[gigagram]')
    elif unit == 't':
        df = df.astype('pint[metric_ton]')
    elif unit == 'kt':
        df = df.astype('pint[kilometric_ton]')
    elif unit == 'GgCO2eq':
        df = df.astype('pint[Gg]')
        df.name = f'{df.name} (CO2eq)'
    elif unit == 'ktC':
        df = df.astype('pint[kilometric_ton]')
        df.name = f'{df.name} (C)'
    elif unit in ('2011GKD/Pers', '2011GKD'):  # give up
        df = df.astype('pint[dimensionless]')
        df.name = f'{df.name} ({unit})'
    elif unit == 'MtCO2eq':
        df = df.astype('pint[megametric_ton]')
        df.name = f'{df.name} (CO2eq)'
    else:
        raise ValueError(f'unit={unit}')
        
    return a, df

In [14]:
primap_tables = {}
pmd = mfh["PRIMAPDB"]["mastertable"]["datatables"]
for key in tqdm.tqdm_notebook(pmd.group.keys()):
    primap_tables[key] = extract_df(pmd[key])

HBox(children=(IntProgress(value=0, max=4118), HTML(value='')))




### Read in PRIMAP-hist (published)

In [15]:
!head -n3 $p/PRIMAP-hist_v2.1_09-Nov-2019.csv

scenario,country,category,entity,unit,1850,1851,1852,1853,1854,1855,1856,1857,1858,1859,1860,1861,1862,1863,1864,1865,1866,1867,1868,1869,1870,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880,1881,1882,1883,1884,1885,1886,1887,1888,1889,1890,1891,1892,1893,1894,1895,1896,1897,1898,1899,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920,1921,1922,1923,1924,1925,1926,1927,1928,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
HISTCR,ABW,IPC1A,CH4,Gg,3.66e-04,3.78e-04,3.90e-04,4.02e-04,4.15e-04,4.28e-04,4.41e-04,4.54e-04,4.68e-04,4.82e-04,4.96e-04

In [16]:
dtypes = {
    'scenario': str,
    'country': str,
    'category': str,
    'entity': str,
    'unit': str,
}
for year in range(1850, 2018):
    dtypes[str(year)] = np.float64
    
prm_emi = pd.read_csv(
    p / 'PRIMAP-hist_v2.1_09-Nov-2019.csv',
    usecols=dtypes.keys(),
    index_col=[0, 1, 2, 3, 4],
)

prm_emi = prm_emi.stack()

index = prm_emi.index.to_frame()
index.columns = ['Scenario', 'Area', 'Category', 'Entity', 'Unit', 'Date']
index['Date'] = pd.to_datetime(index['Date'], format='%Y')
index.loc[index['Unit'] == 'GgCO2eq', 'Entity'] += ' (CO2eq)'
index = index.drop('Unit', axis='columns')

prm_emi.index = pd.MultiIndex.from_frame(index)
prm_emi = prm_emi.reorder_levels(['Area', 'Category', 'Date', 'Entity', 'Scenario'])
prm_emi.sort_index(inplace=True)
prm_emi.name = 'Emissions'
prm_emi = prm_emi.astype('pint[Gg]')

In [17]:
prm_emi

Area  Category   Date        Entity               Scenario
ABW   IPC1       1850-01-01  CH4                  HISTCR      0.00038700000000000003
                                                  HISTTP      0.00038700000000000003
                             CO2                  HISTCR                        40.3
                                                  HISTTP                        40.3
                             KYOTOGHG (CO2eq)     HISTCR                        40.3
                                                                       ...          
ZWE   IPCMAGELV  2017-01-01  KYOTOGHG (CO2eq)     HISTTP                      6630.0
                             KYOTOGHGAR4 (CO2eq)  HISTCR                      6040.0
                                                  HISTTP                      6640.0
                             N2O                  HISTCR                        15.6
                                                  HISTTP                        17.6
Name: 

## Measure memory use

In [18]:
!mkdir -p mem

In [19]:
import pickle

for fpath, obj in (
    ('fao_emi_total.pck', fao_emi_total),
    ('lak_emi_energy.pck', lak_emi_energy),
    ('cmip_conc_co2.pck', cmip_conc_co2),
    ('primap_tables.pck', primap_tables),
    ('prm_emi.pck', prm_emi), 
):
    with open(f'mem/{fpath}', 'wb') as fd:
        pickle.dump(obj, fd, -1)

In [20]:
!ls -lah mem/

total 674M
drwxrwxr-x 2 pflueger pflueger 4,0K Aug 27 10:55 .
drwxrwxr-x 7 pflueger pflueger 4,0K Aug 27 18:48 ..
-rw-rw-r-- 1 pflueger pflueger 1,2M Aug 27 18:49 cmip_conc_co2.pck
-rw-rw-r-- 1 pflueger pflueger 1,2M Aug 27 10:55 cmip_conc_co2_wo.pck
-rw-rw-r-- 1 pflueger pflueger 8,8M Aug 27 18:49 fao_emi_total.pck
-rw-rw-r-- 1 pflueger pflueger 9,7M Aug 27 10:55 fao_emi_total_wo.pck
-rw-rw-r-- 1 pflueger pflueger  40K Aug 27 18:49 lak_emi_energy.pck
-rw-rw-r-- 1 pflueger pflueger  50K Aug 27 10:55 lak_emi_energy_wo.pck
-rw-rw-r-- 1 pflueger pflueger 205M Aug 27 18:49 primap_tables.pck
-rw-rw-r-- 1 pflueger pflueger 244M Aug 27 10:55 primap_tables_wo.pck
-rw-rw-r-- 1 pflueger pflueger 105M Aug 27 18:49 prm_emi.pck
-rw-rw-r-- 1 pflueger pflueger 100M Aug 27 10:55 prm_emi_wo.pck


In [21]:
!ls -lah $p

total 308M
drwxrwxr-x 2 pflueger pflueger 4,0K Aug 21 15:35 .
drwxrwxr-x 5 pflueger pflueger 4,0K Aug 21 15:35 ..
-rw-rw-r-- 1 pflueger pflueger  10M Aug 28  2019 Emissions_Agriculture_Agriculture_total_E_All_Data.csv
-rw-rw---- 1 pflueger pflueger   41 Aug 19 18:34 .gitattributes
-rw-rw-r-- 1 pflueger pflueger  52K Aug 19 11:08 lak-download.csv
-rw-rw-r-- 1 pflueger pflueger 2,1M Aug 21 09:54 mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv
-rw-rw-r-- 1 pflueger pflueger 254M Aug 12 08:44 PRIMAPDB.mat
-rw-r--r-- 1 pflueger pflueger  44M Nov  9  2019 PRIMAP-hist_v2.1_09-Nov-2019.csv


## I/O

* export the data sets to CSV
* reimport from CSV
* export the data sets to some kind of native storage format
* reimport from the native storage format

### export to CSV

* df.to_csv does not save pint unit

In [22]:
!mkdir -p csv/

for fpath, obj in (
    ('fao_emi_total.csv', fao_emi_total),
    ('lak_emi_energy.csv', lak_emi_energy),
    ('cmip_conc_co2.csv', cmip_conc_co2),
    ('prm_emi.csv', prm_emi), 
):
    obj.to_csv(f'csv/{fpath}', header=[f'{obj.name}/{obj.pint.units}'])

In [23]:
!mkdir -p csv/primap_tables/

for key, (meta, df) in tqdm.tqdm_notebook(primap_tables.items()):
    df.to_csv(f'csv/primap_tables/{key}.csv', header=[f'{df.name}/{df.pint.units}'])
    with open(f'csv/primap_tables/{key}_meta.csv', 'w') as fd:
        fd.write('key,value\n')
        for k, v in meta.items():
            fd.write(f'{k},{v}\n')

HBox(children=(IntProgress(value=0, max=4118), HTML(value='')))




In [24]:
!du -sh csv/*

2,4M	csv/cmip_conc_co2.csv
29M	csv/fao_emi_total.csv
148K	csv/lak_emi_energy.csv
402M	csv/primap_tables
218M	csv/prm_emi.csv


### reimport from CSV

In [25]:
def csv_to_series(fpath):
    with open(fpath) as fd:
        ncol = len(fd.readline().split(','))
    ser = pd.read_csv(fpath, parse_dates=['Date'], index_col=list(range(0, ncol-1)), squeeze=True)
    name, unit = ser.name.rsplit('/', 1)
    ser.name = name
    ser = ser.astype(f'pint[{unit}]')
    return ser

In [26]:
fao_emi_total_r = csv_to_series('csv/fao_emi_total.csv')

In [27]:
lak_emi_energy_r = csv_to_series('csv/lak_emi_energy.csv')

In [28]:
cmip_conc_co2_r = csv_to_series('csv/cmip_conc_co2.csv')
# Date is column of str objects, not PeriodIndex

In [29]:
prm_emi_r = csv_to_series('csv/prm_emi.csv')

In [30]:
primap_tables_r = {}
import glob
import pathlib

for meta_path in tqdm.tqdm_notebook(glob.glob('csv/primap_tables/*_meta.csv')):
    mp = pathlib.Path(meta_path)
    key = mp.name[:-9]
    ser = csv_to_series(mp.with_name(f'{key}.csv'))
    meta = {}
    with mp.open() as fd:
        fd.readline()
        for l in fd.readlines():
            k, v = l.split(',', 1)
            meta[k] = v[:-1]
    primap_tables_r[key] = (meta, ser)

HBox(children=(IntProgress(value=0, max=4118), HTML(value='')))




### export to native storage format

* with pint, all of the advanced storage formats hdf/feather/parquet do not work
* would need upstream fixes I guess
* pickle is native, works

In [31]:
!mkdir -p pickle/

for fpath, obj in (
    ('fao_emi_total', fao_emi_total),
    ('lak_emi_energy', lak_emi_energy),
    ('cmip_conc_co2', cmip_conc_co2),
    ('prm_emi', prm_emi), 
):
    obj.to_pickle(f'pickle/{fpath}.pickle.xz', compression={'method': 'xz', 'preset': 1})

In [32]:
for fpath, obj in (
    ('fao_emi_total', fao_emi_total),
    ('lak_emi_energy', lak_emi_energy),
    ('cmip_conc_co2', cmip_conc_co2),
    ('prm_emi', prm_emi), 
):
    obj.to_pickle(f'pickle/{fpath}.pickle')

In [33]:
import zstandard
for fpath, obj in (
    ('fao_emi_total', fao_emi_total),
    ('lak_emi_energy', lak_emi_energy),
    ('cmip_conc_co2', cmip_conc_co2),
    ('prm_emi', prm_emi), 
):
    with open(f'pickle/{fpath}.pickle.zstd', 'wb') as fd:
        zstc = zstandard.ZstdCompressor(level=6, write_checksum=True)
        fd.write(zstc.compress(pickle.dumps(obj)))

In [34]:
import lzma
with lzma.open('pickle/primap_tables.pickle.xz', 'wb', preset=1) as fd:
    pickle.dump(primap_tables, fd)

In [35]:
with open('pickle/primap_tables.pickle', 'wb') as fd:
    pickle.dump(primap_tables, fd)

In [36]:
with open('pickle/primap_tables.pickle.zstd', 'wb') as fd:
    zstc = zstandard.ZstdCompressor(level=6, write_checksum=True)
    fd.write(zstc.compress(pickle.dumps(primap_tables)))

In [37]:
!ls -lah pickle/

total 456M
drwxrwxr-x 2 pflueger pflueger 4,0K Aug 26 11:59 .
drwxrwxr-x 7 pflueger pflueger 4,0K Aug 27 18:51 ..
-rw-rw-r-- 1 pflueger pflueger 1,2M Aug 27 18:51 cmip_conc_co2.pickle
-rw-rw-r-- 1 pflueger pflueger 433K Aug 27 18:51 cmip_conc_co2.pickle.xz
-rw-rw-r-- 1 pflueger pflueger 577K Aug 27 18:51 cmip_conc_co2.pickle.zstd
-rw-rw-r-- 1 pflueger pflueger 8,8M Aug 27 18:51 fao_emi_total.pickle
-rw-rw-r-- 1 pflueger pflueger 1,4M Aug 27 18:51 fao_emi_total.pickle.xz
-rw-rw-r-- 1 pflueger pflueger 2,0M Aug 27 18:51 fao_emi_total.pickle.zstd
-rw-rw-r-- 1 pflueger pflueger  40K Aug 27 18:51 lak_emi_energy.pickle
-rw-rw-r-- 1 pflueger pflueger  18K Aug 27 18:51 lak_emi_energy.pickle.xz
-rw-rw-r-- 1 pflueger pflueger  19K Aug 27 18:51 lak_emi_energy.pickle.zstd
-rw-rw-r-- 1 pflueger pflueger 205M Aug 27 18:52 primap_tables.pickle
-rw-rw-r-- 1 pflueger pflueger  58M Aug 27 18:52 primap_tables.pickle.xz
-rw-rw-r-- 1 pflueger pflueger  59M Aug 27 18:52 primap_tables.pickle.zstd
-rw-rw-r-- 

### Reimport from native storage format

In [38]:
fao_emi_total_n = pd.read_pickle('pickle/fao_emi_total.pickle')
lak_emi_energy_n = pd.read_pickle('pickle/lak_emi_energy.pickle')
cmip_conc_co2_n = pd.read_pickle('pickle/cmip_conc_co2.pickle')
prm_emi_n = pd.read_pickle('pickle/prm_emi.pickle')

In [39]:
def read_zstd_pickle(fpath):
    with open(fpath, 'rb') as fd:
        zstd = zstandard.ZstdDecompressor()
        with zstd.stream_reader(fd) as fh:
            return pickle.load(fh)
        
fao_emi_total_n = read_zstd_pickle('pickle/fao_emi_total.pickle.zstd')
lak_emi_energy_n = read_zstd_pickle('pickle/lak_emi_energy.pickle.zstd')
cmip_conc_co2_n = read_zstd_pickle('pickle/cmip_conc_co2.pickle.zstd')
prm_emi_n = read_zstd_pickle('pickle/prm_emi.pickle.zstd')

In [40]:
with lzma.open('pickle/primap_tables.pickle.xz', 'rb') as fd:
    primap_tables_n = pickle.load(fd)

In [41]:
primap_tables_n = read_zstd_pickle('pickle/primap_tables.pickle.zstd')

# interactive analysis and plotting
* add / subtract / divide / multiply compatible, overlapping data sets
* timeit
* plot a historical time series, and something akin to gapminder (countries on a gdp vs emissions plot)
* select datasets which contain data on the N2O emissions of Finland
* in a selected dataset, select years in which the N2O emissions of Finland were above the 1950-2000 average
* resample non-yearly dataset to yearly dataset
* convert a dataset to tidy format

## aligned arithmetic

In [42]:
prm_emif = prm_emi.to_frame()

In [43]:
# extract two compatible, partly overlapping data sets from primap-hist
a = (prm_emif.xs('IPC1', level='Category')
             .xs(slice('1900', '1990'), level='Date')
             .xs('CO2', level='Entity')
             .xs('HISTCR', level='Scenario')
)

In [44]:
# alternative spelling
a = prm_emif.xs(('IPC1', slice('1900', '1990'), 'CO2', 'HISTCR'), level=('Category', 'Date', 'Entity', 'Scenario'))

In [45]:
# alternative spelling
a = prm_emif.query(
    "Category == 'IPC1' and "
    "'1900' < Date < '1990' and "
    "Entity == 'CO2' and "
    "Scenario == 'HISTCR'"
)

In [46]:
# alternative spelling
a = prm_emi.loc[:, 
                'IPC1',
                '1900':'1990',
                'CO2',
                'HISTCR'
               ]

In [47]:
b = prm_emi.loc[['DEU', 'FIN', 'ZWE']]

In [48]:
a

Area  Category  Date        Entity  Scenario
ABW   IPC1      1900-01-01  CO2     HISTCR        446.0
                1901-01-01  CO2     HISTCR        467.0
                1902-01-01  CO2     HISTCR        491.0
                1903-01-01  CO2     HISTCR        517.0
                1904-01-01  CO2     HISTCR        544.0
                                                 ...   
ZWE   IPC1      1986-01-01  CO2     HISTCR      11400.0
                1987-01-01  CO2     HISTCR      13000.0
                1988-01-01  CO2     HISTCR      13600.0
                1989-01-01  CO2     HISTCR      13700.0
                1990-01-01  CO2     HISTCR      13200.0
Name: Emissions, Length: 19474, dtype: pint[gigagram]

In [49]:
b

Area  Category   Date        Entity               Scenario
DEU   IPC1       1850-01-01  CH4                  HISTCR        420.0
                                                  HISTTP        445.0
                             CO2                  HISTCR      14800.0
                                                  HISTTP      15200.0
                             KYOTOGHG (CO2eq)     HISTCR      24100.0
                                                               ...   
ZWE   IPCMAGELV  2017-01-01  KYOTOGHG (CO2eq)     HISTTP       6630.0
                             KYOTOGHGAR4 (CO2eq)  HISTCR       6040.0
                                                  HISTTP       6640.0
                             N2O                  HISTCR         15.6
                                                  HISTTP         17.6
Name: Emissions, Length: 78960, dtype: pint[gigagram]

In [50]:
c = a + b

In [51]:
c.dropna()

Area  Category  Date        Entity  Scenario
DEU   IPC1      1900-01-01  CO2     HISTCR      640000.0
                1901-01-01  CO2     HISTCR      638000.0
                1902-01-01  CO2     HISTCR      626000.0
                1903-01-01  CO2     HISTCR      674000.0
                1904-01-01  CO2     HISTCR      698000.0
                                                  ...   
ZWE   IPC1      1986-01-01  CO2     HISTCR       22800.0
                1987-01-01  CO2     HISTCR       26000.0
                1988-01-01  CO2     HISTCR       27200.0
                1989-01-01  CO2     HISTCR       27400.0
                1990-01-01  CO2     HISTCR       26400.0
Name: Emissions, Length: 273, dtype: pint[gigagram]

In [52]:
bt = b.pint.to('metric_ton')
bt

Area  Category   Date        Entity               Scenario
DEU   IPC1       1850-01-01  CH4                  HISTCR        420000.0
                                                  HISTTP        445000.0
                             CO2                  HISTCR      14800000.0
                                                  HISTTP      15200000.0
                             KYOTOGHG (CO2eq)     HISTCR      24100000.0
                                                                 ...    
ZWE   IPCMAGELV  2017-01-01  KYOTOGHG (CO2eq)     HISTTP       6630000.0
                             KYOTOGHGAR4 (CO2eq)  HISTCR       6040000.0
                                                  HISTTP       6640000.0
                             N2O                  HISTCR         15600.0
                                                  HISTTP         17600.0
Name: Emissions, Length: 78960, dtype: pint[metric_ton]

In [53]:
c = a + bt

In [54]:
c.dropna()

Area  Category  Date        Entity  Scenario
DEU   IPC1      1900-01-01  CO2     HISTCR      640000.0
                1901-01-01  CO2     HISTCR      638000.0
                1902-01-01  CO2     HISTCR      626000.0
                1903-01-01  CO2     HISTCR      674000.0
                1904-01-01  CO2     HISTCR      698000.0
                                                  ...   
ZWE   IPC1      1986-01-01  CO2     HISTCR       22800.0
                1987-01-01  CO2     HISTCR       26000.0
                1988-01-01  CO2     HISTCR       27200.0
                1989-01-01  CO2     HISTCR       27400.0
                1990-01-01  CO2     HISTCR       26400.0
Name: Emissions, Length: 273, dtype: pint[gigagram]

In [55]:
(a / b).dropna()

Area  Category  Date        Entity  Scenario
DEU   IPC1      1900-01-01  CO2     HISTCR      1.0
                1901-01-01  CO2     HISTCR      1.0
                1902-01-01  CO2     HISTCR      1.0
                1903-01-01  CO2     HISTCR      1.0
                1904-01-01  CO2     HISTCR      1.0
                                               ... 
ZWE   IPC1      1986-01-01  CO2     HISTCR      1.0
                1987-01-01  CO2     HISTCR      1.0
                1988-01-01  CO2     HISTCR      1.0
                1989-01-01  CO2     HISTCR      1.0
                1990-01-01  CO2     HISTCR      1.0
Name: Emissions, Length: 273, dtype: pint[dimensionless]

In [56]:
(a * b).dropna()

Area  Category  Date        Entity  Scenario
DEU   IPC1      1900-01-01  CO2     HISTCR      102400000000.0
                1901-01-01  CO2     HISTCR      101761000000.0
                1902-01-01  CO2     HISTCR       97969000000.0
                1903-01-01  CO2     HISTCR      113569000000.0
                1904-01-01  CO2     HISTCR      121801000000.0
                                                     ...      
ZWE   IPC1      1986-01-01  CO2     HISTCR         129960000.0
                1987-01-01  CO2     HISTCR         169000000.0
                1988-01-01  CO2     HISTCR         184960000.0
                1989-01-01  CO2     HISTCR         187690000.0
                1990-01-01  CO2     HISTCR         174240000.0
Name: Emissions, Length: 273, dtype: pint[gigagram ** 2]

In [57]:
(a - b).dropna()

Area  Category  Date        Entity  Scenario
DEU   IPC1      1900-01-01  CO2     HISTCR      0.0
                1901-01-01  CO2     HISTCR      0.0
                1902-01-01  CO2     HISTCR      0.0
                1903-01-01  CO2     HISTCR      0.0
                1904-01-01  CO2     HISTCR      0.0
                                               ... 
ZWE   IPC1      1986-01-01  CO2     HISTCR      0.0
                1987-01-01  CO2     HISTCR      0.0
                1988-01-01  CO2     HISTCR      0.0
                1989-01-01  CO2     HISTCR      0.0
                1990-01-01  CO2     HISTCR      0.0
Name: Emissions, Length: 273, dtype: pint[gigagram]

## plotting

In [58]:
plt.figure()
sel = prm_emi.xs(('DEU', 'IPCM0EL', 'KYOTOGHG (CO2eq)', 'HISTCR'), level=('Area', 'Category', 'Entity', 'Scenario'))
#sel = prm_emi.loc['DEU', 'IPCM0EL', :, 'KYOTOGHG (CO2eq)', 'HISTCR']
sel.pint.magnitude.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f520e8ac0d0>

In [59]:
[key for key, (meta, df) in primap_tables.items() if df.name == 'POP']

['POP_DEMOGR_TOTAL_NET_CONSTANT_UN2017P',
 'POP_DEMOGR_TOTAL_NET_CONSTANT_UN2019P',
 'POP_DEMOGR_TOTAL_NET_CONSTMORTALITY_UN2017P',
 'POP_DEMOGR_TOTAL_NET_CONSTMORTALITY_UN2019P',
 'POP_DEMOGR_TOTAL_NET_HIGH_UN2017P',
 'POP_DEMOGR_TOTAL_NET_HIGH_UN2019P',
 'POP_DEMOGR_TOTAL_NET_HISTORY_MPD2018P',
 'POP_DEMOGR_TOTAL_NET_HISTORY_PMHSOCIOECO12',
 'POP_DEMOGR_TOTAL_NET_HISTORY_UN2017P',
 'POP_DEMOGR_TOTAL_NET_HISTORY_UN2019P',
 'POP_DEMOGR_TOTAL_NET_LOW_UN2017P',
 'POP_DEMOGR_TOTAL_NET_LOW_UN2019P',
 'POP_DEMOGR_TOTAL_NET_MEDIUM_UN2017P',
 'POP_DEMOGR_TOTAL_NET_MEDIUM_UN2019P',
 'POP_DEMOGR_TOTAL_NET_NOCHANGE_UN2017P',
 'POP_DEMOGR_TOTAL_NET_NOCHANGE_UN2019P',
 'POP_DEMOGR_TOTAL_NET_NOMIGRATION_UN2017P',
 'POP_DEMOGR_TOTAL_NET_NOMIGRATION_UN2019P',
 'POP_DEMOGR_TOTAL_NET_REPLACEMENT_UN2017P',
 'POP_DEMOGR_TOTAL_NET_REPLACEMENT_UN2019P']

In [60]:
pop = primap_tables['POP_DEMOGR_TOTAL_NET_HISTORY_UN2019P'][1]

pop_2000 = pop.loc[:, '2000-01-01']

emi_2000 = prm_emi.xs(('IPCM0EL', 'KYOTOGHG (CO2eq)', 'HISTCR', '2000-01-01'), level=('Category', 'Entity', 'Scenario', 'Date'))

In [61]:
fig, ax = plt.subplots()
ax.plot(pop_2000.pint.magnitude[emi_2000.index], emi_2000.pint.to('Tg').pint.magnitude, 'o')
ax.set_xlabel('Population in 2000')
ax.set_ylabel('Emissions in 2000 (CO2eq) / Tg')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Text(0, 0.5, 'Emissions in 2000 (CO2eq) / Tg')

## Selecting datasets and data
* select datasets which contain data on the N2O emissions of Finland
* in a selected dataset, select years in which the N2O emissions of Finland were above the 1950-2000 average

In [62]:
[key for key, (meta, df) in primap_tables.items() if 
 'FIN' in df.index.levels[0] and
 not all(df.loc['FIN'].isna()) and
 df.name == 'N2O' and
 meta['category'] == 'IPC0']

['N2O_IPC0_TOTAL_NET_HISTCR_PRIMAPHIST20',
 'N2O_IPC0_TOTAL_NET_HISTORY_CRF2019',
 'N2O_IPC0_TOTAL_NET_HISTORY_CRF2020',
 'N2O_IPC0_TOTAL_NET_HISTORY_EDGAR42COMPI',
 'N2O_IPC0_TOTAL_NET_HISTORY_EDGAR432I',
 'N2O_IPC0_TOTAL_NET_HISTORY_EDGAR50I',
 'N2O_IPC0_TOTAL_NET_HISTTP_PRIMAPHIST20']

In [63]:
df = primap_tables['N2O_IPC0_TOTAL_NET_HISTTP_PRIMAPHIST20'][1].loc['FIN']

In [64]:
avg2hXX = df.loc['1950':'2000'].pint.to('Gg').pint.magnitude.mean()
avg2hXX

22.8155464621459

In [65]:
df.loc[df.pint.to('Gg').pint.magnitude > avg2hXX]

Date
1967-01-01    22.936279858300836
1968-01-01     23.50504739128506
1969-01-01     23.72240281268353
1970-01-01     23.88628989928003
1971-01-01    24.266094377860284
1972-01-01    25.070312519579783
1973-01-01    26.521749211204348
1974-01-01    27.165969898448687
1975-01-01     26.87486301821685
1976-01-01     24.48126745278573
1977-01-01    23.781691508675376
1978-01-01    25.340659362756316
1979-01-01    26.242610125606912
1980-01-01    26.636444589094506
1981-01-01    25.215840130525102
1982-01-01    25.507799872631583
1983-01-01     26.81153158481475
1984-01-01    26.363238482483016
1985-01-01    25.435555867104213
1986-01-01    25.649390322608294
1987-01-01     25.23117604934743
1988-01-01    24.966332612143507
1989-01-01     26.25516033249774
1990-01-01        26.64164902418
1991-01-01    25.522203928520003
1992-01-01    22.925233612280003
1993-01-01        23.48831523982
1994-01-01    24.240635375940002
1995-01-01        24.60235596766
1996-01-01        24.82996873796
1997-

## Resampling
* resample non-yearly dataset to yearly dataset

In [66]:
cmip_conc_co2

Area    Date   
global  0-01       0.00027836643883702005
        0-02         0.000278713058010423
        0-03         0.000279013835279965
        0-04       0.00027920961334856496
        0-05         0.000279106444207592
                            ...          
sh      2014-08    0.00039577037859628094
        2014-09      0.000395734959299383
        2014-10      0.000395880866547116
        2014-11    0.00039605692861724196
        2014-12    0.00039616429558609803
Name: CO2 concentration, Length: 72540, dtype: pint[dimensionless]

In [67]:
# This spelling would work if Dates were in the timespan covered by datetime64...
if False:
    cmip_conc_co2.groupby(
        [pd.Grouper(level='Area'),
         pd.Grouper(level='Date', freq='1Y')]
    ).mean()

In [68]:
cmip_conc_co2.pint.magnitude.groupby([
    pd.Grouper(level='Area'),
    cmip_conc_co2.index.get_level_values('Date').year
]).mean()

Area    Date
global  0       0.000277
        1       0.000277
        2       0.000277
        3       0.000277
        4       0.000277
                  ...   
sh      2010    0.000387
        2011    0.000389
        2012    0.000391
        2013    0.000394
        2014    0.000396
Name: CO2 concentration, Length: 6045, dtype: float64

# Interpolate
* delete some data points from a data set and fill them in using linear interpolation

In [69]:
df = primap_tables['N2O_IPC0_TOTAL_NET_HISTTP_PRIMAPHIST20'][1].loc['FIN']

In [70]:
df['1990':'1994'] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['1990':'1994'] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_with(key, value)


In [71]:
plt.figure()
df.pint.magnitude.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f5214b27a30>

In [72]:
plt.figure()
df.pint.magnitude.interpolate().plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f520e76d940>

# Alternative without MultiIndex
## Convert

In [73]:
fao_emi_total_wo = fao_emi_total.reset_index()
fao_emi_total_wo['Element'] = fao_emi_total_wo['Element'].astype(pd.CategoricalDtype(ordered=True))

In [74]:
lak_emi_energy_wo = lak_emi_energy.reset_index()

In [75]:
cmip_conc_co2_wo = cmip_conc_co2.reset_index()

In [76]:
prm_emi_wo = prm_emi.reset_index()
prm_emi_wo['Area'] = prm_emi_wo['Area'].astype(pd.CategoricalDtype(ordered=True))
prm_emi_wo['Category'] = prm_emi_wo['Category'].astype(pd.CategoricalDtype(ordered=True))
prm_emi_wo['Entity'] = prm_emi_wo['Entity'].astype(pd.CategoricalDtype(ordered=True))
prm_emi_wo['Scenario'] = prm_emi_wo['Scenario'].astype(pd.CategoricalDtype())

In [77]:
primap_tables_wo = {}
for key, (meta, df) in primap_tables.items():
    primap_tables_wo[key] = (meta, df.reset_index())

## Measure memory use

In [78]:
for fpath, obj in (
    ('fao_emi_total', fao_emi_total_wo),
    ('lak_emi_energy', lak_emi_energy_wo),
    ('cmip_conc_co2', cmip_conc_co2_wo),
    ('primap_tables', primap_tables_wo),
    ('prm_emi', prm_emi_wo), 
):
    with open(f'mem/{fpath}_wo.pck', 'wb') as fd:
        pickle.dump(obj, fd, -1)

In [79]:
!ls -lah mem

total 674M
drwxrwxr-x 2 pflueger pflueger 4,0K Aug 27 10:55 .
drwxrwxr-x 7 pflueger pflueger 4,0K Aug 27 18:53 ..
-rw-rw-r-- 1 pflueger pflueger 1,2M Aug 27 18:49 cmip_conc_co2.pck
-rw-rw-r-- 1 pflueger pflueger 1,2M Aug 27 18:53 cmip_conc_co2_wo.pck
-rw-rw-r-- 1 pflueger pflueger 8,8M Aug 27 18:49 fao_emi_total.pck
-rw-rw-r-- 1 pflueger pflueger 9,7M Aug 27 18:53 fao_emi_total_wo.pck
-rw-rw-r-- 1 pflueger pflueger  40K Aug 27 18:49 lak_emi_energy.pck
-rw-rw-r-- 1 pflueger pflueger  50K Aug 27 18:53 lak_emi_energy_wo.pck
-rw-rw-r-- 1 pflueger pflueger 205M Aug 27 18:49 primap_tables.pck
-rw-rw-r-- 1 pflueger pflueger 244M Aug 27 18:53 primap_tables_wo.pck
-rw-rw-r-- 1 pflueger pflueger 105M Aug 27 18:49 prm_emi.pck
-rw-rw-r-- 1 pflueger pflueger 100M Aug 27 18:53 prm_emi_wo.pck


## Aligned Arithmetic

In [80]:
# extract two comptaible, partly overlapping data sets from primap-hist
a = prm_emi_wo.loc[(prm_emi_wo['Category'] == 'IPC1') &
                   ('1900' < prm_emi_wo['Date']) &
                   ('1990' > prm_emi_wo['Date']) &
                   (prm_emi_wo['Entity'] == 'CO2') &
                   (prm_emi_wo['Scenario'] == 'HISTCR')]

In [81]:
a = prm_emi_wo.query('Category == "IPC1" and '
                     '"1900" < Date < "1990" and '
                     'Entity == "CO2" and '
                     'Scenario == "HISTCR"')

In [82]:
a = a.reset_index(drop=True)

In [83]:
b = prm_emi_wo.loc[prm_emi_wo['Area'].isin(['DEU', 'FIN', 'ZWE'])]
b = b.reset_index(drop=True)

In [84]:
a

Unnamed: 0,Area,Category,Date,Entity,Scenario,Emissions
0,ABW,IPC1,1901-01-01,CO2,HISTCR,467.0
1,ABW,IPC1,1902-01-01,CO2,HISTCR,491.0
2,ABW,IPC1,1903-01-01,CO2,HISTCR,517.0
3,ABW,IPC1,1904-01-01,CO2,HISTCR,544.0
4,ABW,IPC1,1905-01-01,CO2,HISTCR,573.0
...,...,...,...,...,...,...
19041,ZWE,IPC1,1985-01-01,CO2,HISTCR,8500.0
19042,ZWE,IPC1,1986-01-01,CO2,HISTCR,11400.0
19043,ZWE,IPC1,1987-01-01,CO2,HISTCR,13000.0
19044,ZWE,IPC1,1988-01-01,CO2,HISTCR,13600.0


In [85]:
b

Unnamed: 0,Area,Category,Date,Entity,Scenario,Emissions
0,DEU,IPC1,1850-01-01,CH4,HISTCR,420.0
1,DEU,IPC1,1850-01-01,CH4,HISTTP,445.0
2,DEU,IPC1,1850-01-01,CO2,HISTCR,14800.0
3,DEU,IPC1,1850-01-01,CO2,HISTTP,15200.0
4,DEU,IPC1,1850-01-01,KYOTOGHG (CO2eq),HISTCR,24100.0
...,...,...,...,...,...,...
78955,ZWE,IPCMAGELV,2017-01-01,KYOTOGHG (CO2eq),HISTTP,6630.0
78956,ZWE,IPCMAGELV,2017-01-01,KYOTOGHGAR4 (CO2eq),HISTCR,6040.0
78957,ZWE,IPCMAGELV,2017-01-01,KYOTOGHGAR4 (CO2eq),HISTTP,6640.0
78958,ZWE,IPCMAGELV,2017-01-01,N2O,HISTCR,15.6


In [86]:
a.align(b, axis='columns')

(      Area Category       Date Entity Scenario Emissions
 0      ABW     IPC1 1901-01-01    CO2   HISTCR     467.0
 1      ABW     IPC1 1902-01-01    CO2   HISTCR     491.0
 2      ABW     IPC1 1903-01-01    CO2   HISTCR     517.0
 3      ABW     IPC1 1904-01-01    CO2   HISTCR     544.0
 4      ABW     IPC1 1905-01-01    CO2   HISTCR     573.0
 ...    ...      ...        ...    ...      ...       ...
 19041  ZWE     IPC1 1985-01-01    CO2   HISTCR    8500.0
 19042  ZWE     IPC1 1986-01-01    CO2   HISTCR   11400.0
 19043  ZWE     IPC1 1987-01-01    CO2   HISTCR   13000.0
 19044  ZWE     IPC1 1988-01-01    CO2   HISTCR   13600.0
 19045  ZWE     IPC1 1989-01-01    CO2   HISTCR   13700.0
 
 [19046 rows x 6 columns],
       Area   Category       Date               Entity Scenario Emissions
 0      DEU       IPC1 1850-01-01                  CH4   HISTCR     420.0
 1      DEU       IPC1 1850-01-01                  CH4   HISTTP     445.0
 2      DEU       IPC1 1850-01-01                  CO

In [87]:
am = a['Emissions']
am.index = pd.MultiIndex.from_frame(a.loc[:, ('Area', 'Category', 'Date', 'Entity', 'Scenario')])

In [88]:
bm = b['Emissions']
bm.index = pd.MultiIndex.from_frame(b.loc[:, ('Area', 'Category', 'Date', 'Entity', 'Scenario')])

In [89]:
(am + bm).dropna()

Area  Category  Date        Entity  Scenario
DEU   IPC1      1901-01-01  CO2     HISTCR      638000.0
                1902-01-01  CO2     HISTCR      626000.0
                1903-01-01  CO2     HISTCR      674000.0
                1904-01-01  CO2     HISTCR      698000.0
                1905-01-01  CO2     HISTCR      750000.0
                                                  ...   
ZWE   IPC1      1985-01-01  CO2     HISTCR       17000.0
                1986-01-01  CO2     HISTCR       22800.0
                1987-01-01  CO2     HISTCR       26000.0
                1988-01-01  CO2     HISTCR       27200.0
                1989-01-01  CO2     HISTCR       27400.0
Name: Emissions, Length: 267, dtype: pint[gigagram]

## Plotting

In [90]:
plt.figure()
sel = prm_emi_wo.query('Area == "DEU" and Category == "IPCM0EL" and Entity == "KYOTOGHG (CO2eq)" and Scenario == "HISTCR"')
plt.plot(sel['Date'], sel['Emissions'].pint.to('Tg').pint.magnitude)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

[<matplotlib.lines.Line2D at 0x7f51f558ed30>]

In [91]:
[key for key, (meta, df) in primap_tables_wo.items() if 'POP' in df.columns]

['POP_DEMOGR_TOTAL_NET_CONSTANT_UN2017P',
 'POP_DEMOGR_TOTAL_NET_CONSTANT_UN2019P',
 'POP_DEMOGR_TOTAL_NET_CONSTMORTALITY_UN2017P',
 'POP_DEMOGR_TOTAL_NET_CONSTMORTALITY_UN2019P',
 'POP_DEMOGR_TOTAL_NET_HIGH_UN2017P',
 'POP_DEMOGR_TOTAL_NET_HIGH_UN2019P',
 'POP_DEMOGR_TOTAL_NET_HISTORY_MPD2018P',
 'POP_DEMOGR_TOTAL_NET_HISTORY_PMHSOCIOECO12',
 'POP_DEMOGR_TOTAL_NET_HISTORY_UN2017P',
 'POP_DEMOGR_TOTAL_NET_HISTORY_UN2019P',
 'POP_DEMOGR_TOTAL_NET_LOW_UN2017P',
 'POP_DEMOGR_TOTAL_NET_LOW_UN2019P',
 'POP_DEMOGR_TOTAL_NET_MEDIUM_UN2017P',
 'POP_DEMOGR_TOTAL_NET_MEDIUM_UN2019P',
 'POP_DEMOGR_TOTAL_NET_NOCHANGE_UN2017P',
 'POP_DEMOGR_TOTAL_NET_NOCHANGE_UN2019P',
 'POP_DEMOGR_TOTAL_NET_NOMIGRATION_UN2017P',
 'POP_DEMOGR_TOTAL_NET_NOMIGRATION_UN2019P',
 'POP_DEMOGR_TOTAL_NET_REPLACEMENT_UN2017P',
 'POP_DEMOGR_TOTAL_NET_REPLACEMENT_UN2019P']

In [92]:
pop = primap_tables_wo['POP_DEMOGR_TOTAL_NET_HISTORY_UN2019P'][1]

pop_2000 = pop.loc[pop['Date'] == '2000-01-01']

emi_2000 = prm_emi_wo.query('Category == "IPCM0EL" and Entity == "KYOTOGHG (CO2eq)" and Scenario == "HISTCR" and Date == 20000101')

In [93]:
# plotting needs alignment via MultiIndex again, I don't see a way without

Summary w/o MultiIndex: For anything which is aligned, there is no useful way without converting to a MultiIndexed format.
Selecting is somewhat better with a plain DataFrame than with a MultiIndexed one especially for deeply linked ones.