In [1]:
%matplotlib widget
import pickle
from itertools import product

import matplotlib
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pint
import pint_xarray
import tqdm
import xarray as xr
from cftime import DatetimeGregorian

xarray DataArray:

* all fixed variables are dimensions and coordinates
* space is "allocated" for one data point of observation
* each observation gets its own DataArray

A collection of DataArrays with shared dimensions/coords, i.e. where the coords of each individual DataArray are a subset of the shared coords, can form a Dataset.

Because not all DataArrays may share dimensions/coords, there is possibly a need for a container, which is not yet defined.

There are, again, questions around the representation of units.

# representation
* load all example data sets into the data structure at once
* measure memory use
* export the data sets to CSV
* reimport from CSV
* export the data sets to some kind of native storage format
* reimport from the native storage format

## read in the data

In [59]:
# Load the pickled series stored in mem/fao_emi_total.pck.
# Only unpickle files from a trusted source: unpickling can execute arbitrary code.
with open('mem/fao_emi_total.pck', 'rb') as pickle_file:
    fao_emi_total_pd = pickle.load(pickle_file)

# Cast index levels 0, 2 and 3 to str. The replacement values are always taken
# from the index as loaded, so every level is converted independently.
fao_loaded_index = fao_emi_total_pd.index
fao_str_index = fao_loaded_index
for level_no in (0, 2, 3):
    fao_str_index = fao_str_index.set_levels(
        fao_loaded_index.levels[level_no].astype('str'), level=level_no
    )
fao_emi_total_pd.index = fao_str_index

# Hand only the magnitudes (via the `.pint` accessor) to xarray and keep the
# unit as a plain string attribute.
fao_magnitudes = fao_emi_total_pd.pint.magnitude
fao_emi_total = xr.DataArray.from_series(fao_magnitudes)
fao_emi_total.attrs['units'] = 'Gg'

In [63]:
# Load the pickled series stored in mem/lak_emi_energy.pck.
# Only unpickle files from a trusted source: unpickling can execute arbitrary code.
with open('mem/lak_emi_energy.pck', 'rb') as fd:
    lak_emi_energy_pd = pickle.load(fd)

# Cast index level 0 to str.
# NOTE(review): a second, identical level-0 cast was chained here and has been
# removed as redundant (it produced the same index). It looked like a copy-paste
# slip - confirm whether another index level was meant to be cast as well.
lak_emi_energy_pd.index = lak_emi_energy_pd.index.set_levels(
    lak_emi_energy_pd.index.levels[0].astype('str'), level=0
)

# Hand only the magnitudes (via the `.pint` accessor) to xarray and keep the
# unit as a plain string attribute.
lak_emi_energy = xr.DataArray.from_series(lak_emi_energy_pd.pint.magnitude)
lak_emi_energy.attrs['units'] = 'kilometric_ton'

In [106]:
# Load the pickled series stored in mem/cmip_conc_co2.pck.
# Only unpickle files from a trusted source: unpickling can execute arbitrary code.
with open('mem/cmip_conc_co2.pck', 'rb') as fd:
    cmip_conc_co2_pd = pickle.load(fd)

# Cast index level 0 to str, then hand only the magnitudes to xarray.
cmip_conc_co2_pd.index = cmip_conc_co2_pd.index.set_levels(
    cmip_conc_co2_pd.index.levels[0].astype('str'), level=0
)
cmip_conc_co2 = xr.DataArray.from_series(cmip_conc_co2_pd.pint.magnitude)

# Replace the "Date" coordinate with a cftime index holding the first day of
# every month for year -1 followed by years 1..2014 (the list has no year 0).
# `product` and `DatetimeGregorian` come from the notebook's import cell.
cmip_years = [-1] + list(range(1, 2015))
cmip_conc_co2["Date"] = xr.CFTimeIndex(
    [DatetimeGregorian(year, month, 1) for year, month in product(cmip_years, range(1, 13))]
)
cmip_conc_co2.attrs['units'] = 'dimensionless'

In [108]:
# Load the pickled series stored in mem/prm_emi.pck.
# Only unpickle files from a trusted source: unpickling can execute arbitrary code.
with open('mem/prm_emi.pck', 'rb') as pickle_file:
    prm_emi_pd = pickle.load(pickle_file)

# Cast index levels 0, 1, 3 and 4 to str. The replacement values are always
# taken from the index as loaded, so every level is converted independently.
prm_loaded_index = prm_emi_pd.index
prm_str_index = prm_loaded_index
for level_no in (0, 1, 3, 4):
    prm_str_index = prm_str_index.set_levels(
        prm_loaded_index.levels[level_no].astype('str'), level=level_no
    )
prm_emi_pd.index = prm_str_index

# Hand only the magnitudes (via the `.pint` accessor) to xarray and keep the
# unit as a plain string attribute.
prm_magnitudes = prm_emi_pd.pint.magnitude
prm_emi = xr.DataArray.from_series(prm_magnitudes)
prm_emi.attrs['units'] = 'Gg'

In [138]:
# Turn the "Entity" dimension of prm_emi into separate Dataset variables.
prm_emi_by_entity = prm_emi.to_dataset('Entity')

# For every variable, drop the "Category" labels whose values are all missing
# and tag the trimmed array with its unit string.
prm_emi_d = {}
for entity_name in prm_emi_by_entity:
    trimmed = prm_emi_by_entity[entity_name].dropna('Category', how='all')
    trimmed.attrs['units'] = 'Gg'
    prm_emi_d[entity_name] = trimmed

In [145]:
# Load the pickled object stored in mem/primap_tables.pck; the conversion loop
# below iterates it as a mapping of key -> (metadata, series).
# Only unpickle files from a trusted source: unpickling can execute arbitrary code.
with open('mem/primap_tables.pck', 'rb') as pickle_file:
    primap_tables_pd = pickle.load(pickle_file)

In [151]:
# Convert every (metadata, series) pair into a DataArray that carries the
# metadata plus the unit string in its attrs.
# NOTE(review): `tqdm.tqdm_notebook` is deprecated in newer tqdm releases in
# favour of `tqdm.notebook.tqdm` / `tqdm.auto.tqdm` - consider switching once
# the installed tqdm version is confirmed.
primap_arrays = {}
for table_key, (table_meta, table_series) in tqdm.tqdm_notebook(primap_tables_pd.items()):
    converted = xr.DataArray.from_series(table_series.pint.magnitude)
    converted.attrs.update(table_meta)
    converted.attrs['units'] = str(table_series.pint.units)
    primap_arrays[table_key] = converted

HBox(children=(IntProgress(value=0, max=4118), HTML(value='')))




In [154]:
# Display one converted array as a spot check of the conversion above.
primap_arrays['BC_IPC0_TOTAL_NET_HISTORY_BUR2IPCC2006I']

In [155]:
# Distinct values of the 'source' attribute across all converted arrays.
{array.attrs['source'] for array in primap_arrays.values()}

{'ANDREW2018V4I',
 'BUR1IPCC2006I',
 'BUR2IPCC2006I',
 'BUR3IPCC2006I',
 'CDIAC2017I',
 'CRF2019',
 'CRF2020',
 'EDGAR42COMPI',
 'EDGAR432I',
 'EDGAR50I',
 'MPD2018P',
 'PMHSOCIOECO12',
 'PRIMAPHIST20',
 'UN2017P',
 'UN2019P',
 'UNFCCC2019BI',
 'UNFCCC2020AI'}

In [181]:
# Step 1: bucket the PRIMAPHIST20 arrays by (entity, scenario). The labels come
# from the underscore-separated key (fields 0, 1 and 4) and are also written
# into each array's attrs.
hist_by_entity_scenario = {}
for array_key, array in primap_arrays.items():
    if array.attrs['source'] == 'PRIMAPHIST20':
        entity, category, _, _, scenario, _ = array_key.split('_')
        array.attrs.update(Entity=entity, Category=category, Scenario=scenario)
        hist_by_entity_scenario.setdefault((entity, scenario), []).append(array)

# Step 2: stack every bucket along a new "Category" dimension.
hist_stacked = {
    bucket_key: xr.concat(
        members, pd.Index([member.attrs['Category'] for member in members], name='Category')
    )
    for bucket_key, members in hist_by_entity_scenario.items()
}

# Step 3: regroup the stacked arrays by entity only.
hist_by_entity = {}
for (entity, _scenario), stacked in hist_stacked.items():
    hist_by_entity.setdefault(entity, []).append(stacked)

# Step 4: stack each entity's arrays along a new "Scenario" dimension, drop the
# categories without any value and tag the result with its unit string.
# NOTE(review): the scenario label is read from the attrs of the arrays produced
# by xr.concat in step 2, so this relies on concat carrying attrs over - confirm
# for the xarray version in use.
prm_emi_do = {}
for entity, per_scenario in hist_by_entity.items():
    scenario_index = pd.Index([item.attrs['Scenario'] for item in per_scenario], name='Scenario')
    combined = xr.concat(per_scenario, scenario_index).dropna('Category', how='all')
    combined.attrs['units'] = 'Gg'
    prm_emi_do[entity] = combined

In [195]:
# Collect the UN2017P arrays keyed by field 4 of their underscore-separated key
# (a later array with the same field replaces an earlier one), then stack them
# along a "Scenario" dimension.
un_2017_by_scenario = {}
for array_key, array in primap_arrays.items():
    if array.attrs['source'] == 'UN2017P':
        un_2017_by_scenario[array_key.split('_')[4]] = array

un_pop_2017 = xr.Dataset(un_2017_by_scenario).to_array('Scenario')

In [198]:
# Collect the UN2019P arrays keyed by field 4 of their underscore-separated key
# (a later array with the same field replaces an earlier one), then stack them
# along a "Scenario" dimension.
un_2019_by_scenario = {}
for array_key, array in primap_arrays.items():
    if array.attrs['source'] == 'UN2019P':
        un_2019_by_scenario[array_key.split('_')[4]] = array

un_pop_2019 = xr.Dataset(un_2019_by_scenario).to_array('Scenario')

In [216]:
# Bucket the UNFCCC2019BI arrays by entity. Entity and category are fields 0 and
# 1 of the underscore-separated key and are also written into each array's attrs.
unfccc_2019_by_entity = {}
for array_key, array in primap_arrays.items():
    if array.attrs['source'] == 'UNFCCC2019BI':
        entity, category, _, _, _, _ = array_key.split('_')
        array.attrs.update(Entity=entity, Category=category)
        unfccc_2019_by_entity.setdefault(entity, []).append(array)

# Stack each entity's arrays along a new "Category" dimension and tag the unit.
unfccc_2019_stacked = {}
for entity, members in unfccc_2019_by_entity.items():
    category_index = pd.Index([member.attrs['Category'] for member in members], name='Category')
    stacked = xr.concat(members, category_index)
    stacked.attrs['units'] = 'Gg'
    unfccc_2019_stacked[entity] = stacked

# One Dataset variable per entity.
unfccc_emi_2019 = xr.Dataset(unfccc_2019_stacked)

In [220]:
# Bucket the UNFCCC2020AI arrays by entity. Entity and category are fields 0 and
# 1 of the underscore-separated key and are also written into each array's attrs.
unfccc_2020_by_entity = {}
for array_key, array in primap_arrays.items():
    if array.attrs['source'] == 'UNFCCC2020AI':
        entity, category, _, _, _, _ = array_key.split('_')
        array.attrs.update(Entity=entity, Category=category)
        unfccc_2020_by_entity.setdefault(entity, []).append(array)

# Stack each entity's arrays along a new "Category" dimension and tag the unit.
unfccc_2020_stacked = {}
for entity, members in unfccc_2020_by_entity.items():
    category_index = pd.Index([member.attrs['Category'] for member in members], name='Category')
    stacked = xr.concat(members, category_index)
    stacked.attrs['units'] = 'Gg'
    unfccc_2020_stacked[entity] = stacked

# One Dataset variable per entity.
unfccc_emi_2020 = xr.Dataset(unfccc_2020_stacked)

In [238]:
# Select the ANDREW2018V4I arrays and stack them along a "Category" dimension
# labelled with field 1 of each underscore-separated key.
andrew_arrays = {
    array_key: array
    for array_key, array in primap_arrays.items()
    if array.attrs['source'] == 'ANDREW2018V4I'
}
andrew_categories = pd.Index(
    [array_key.split('_')[1] for array_key in andrew_arrays], name='Category'
)
andrew_emi_2018 = xr.concat(andrew_arrays.values(), andrew_categories)

In [245]:
# Bucket the BUR1IPCC2006I arrays by entity. Entity and category are fields 0
# and 1 of the underscore-separated key and are also written into each array's
# attrs; field 4 is unpacked as `Scenario` but not used by this cell.
bur1_by_entity = {}
for array_key, array in primap_arrays.items():
    if array.attrs['source'] == 'BUR1IPCC2006I':
        entity, category, _, _, Scenario, _ = array_key.split('_')
        array.attrs.update(Entity=entity, Category=category)
        bur1_by_entity.setdefault(entity, []).append(array)

# Stack each entity's arrays along a new "Category" dimension, drop the
# categories without any value and tag the result with its unit string.
bur_emi_2006 = {}
for entity, members in bur1_by_entity.items():
    category_index = pd.Index([member.attrs['Category'] for member in members], name='Category')
    stacked = xr.concat(members, category_index).dropna('Category', how='all')
    stacked.attrs['units'] = 'Gg'
    bur_emi_2006[entity] = stacked

In [None]:
# Bucket the BUR2IPCC2006I arrays by entity. Entity and category are fields 0
# and 1 of the underscore-separated key and are also written into each array's
# attrs.
bur2_by_entity = {}
for key, da in primap_arrays.items():
    if da.attrs['source'] != 'BUR2IPCC2006I':
        continue
    Entity, Category, _, _, _, _ = key.split('_')
    da.attrs['Entity'] = Entity
    da.attrs['Category'] = Category
    bur2_by_entity.setdefault(Entity, []).append(da)

# Stack each entity's arrays along a new "Category" dimension, drop the
# categories without any value and tag the result with its unit string.
# The result is stored under its own name: this cell used to assign to
# `bur_emi_2006`, which silently replaced the BUR1IPCC2006I result that is
# stored under that name.
bur2_emi_2006 = {}
for Entity, value in bur2_by_entity.items():
    stacked = xr.concat(
        value, pd.Index([x.attrs['Category'] for x in value], name='Category')
    ).dropna('Category', how='all')
    stacked.attrs['units'] = 'Gg'
    bur2_emi_2006[Entity] = stacked

In [248]:
def primap_to_dict_of_data_arrays(source, vals_to_dimensions):
    """Group the arrays of one source by selected fields of their key.

    Reads the notebook-level ``primap_arrays`` mapping. Every array whose
    ``attrs['source']`` equals ``source`` gets the fields of its
    underscore-separated key written into its attrs (``Entity``, ``Category``,
    ``Type``, ``Scenario``; this mutates the arrays in ``primap_arrays``) and
    is then appended to the group identified by the requested attribute values.

    Parameters
    ----------
    source : str
        Value of ``attrs['source']`` to select.
    vals_to_dimensions : iterable of str
        Names of the attrs whose values form the grouping key, in order.

    Returns
    -------
    dict
        Maps a tuple of attribute values (one per entry of
        ``vals_to_dimensions``) to the list of matching arrays, in the
        iteration order of ``primap_arrays``.

    Raises
    ------
    ValueError
        If a selected key does not consist of exactly six ``_``-separated fields.
    KeyError
        If ``vals_to_dimensions`` names an attribute an array does not have.
    """
    ds = {}
    # `.items()` (not `.item()`): iterate over the (key, array) pairs of the dict.
    for key, da in primap_arrays.items():
        if da.attrs['source'] != source:
            continue
        Entity, Category, _, Type, Scenario, _ = key.split('_')
        da.attrs['Entity'] = Entity
        da.attrs['Category'] = Category
        da.attrs['Type'] = Type
        da.attrs['Scenario'] = Scenario
        # The grouping key must be hashable to serve as a dict key, hence a
        # tuple rather than a list.
        tempkey = tuple(da.attrs[x] for x in vals_to_dimensions)
        ds.setdefault(tempkey, []).append(da)

    # TODO: investigate how to best turn a dict of DataArrays into a DataArray with dimensions -> probably combine_nested.
    return ds

In [248]:
# Keys of all converted arrays whose 'source' attribute is BUR1IPCC2006I.
[array_key for array_key, array in primap_arrays.items() if array.attrs['source'] == 'BUR1IPCC2006I']

['C2F6_IPC0_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'C2F6_IPC2C_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'C2F6_IPC2_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'C2F6_IPCM0EL_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CF4_IPC0_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CF4_IPC2C_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CF4_IPC2_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CF4_IPCM0EL_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC0_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC1A1_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC1A2_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC1A3_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC1A4_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC1A5_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC1A_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC1B1_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC1B2_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC1B_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC1_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC2B8A_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC2B8B_TOTAL_NET_HISTORY_BUR1IPCC2006I',
 'CH4_IPC2B8C_TOTAL_N

In [None]:
 'BUR1IPCC2006I',
 'BUR2IPCC2006I',
 'BUR3IPCC2006I',
 'CDIAC2017I',
 'CRF2019',
 'CRF2020',
 'EDGAR42COMPI',
 'EDGAR432I',
 'EDGAR50I',
 'MPD2018P',
 'PMHSOCIOECO12',
    