In [1]:
%matplotlib widget
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import tqdm
import pickle
import scmdata
import xarray as xr
import pint_xarray
from itertools import product
from cftime import DatetimeGregorian

<IPython.core.display.Javascript object>

pyam - INFO: Running in a notebook, setting `pyam` logging level to `logging.INFO` and adding stderr handler


In [2]:
def scmrun_from_pandas(df, region, variable, scenario, model):
    """Convert a pint-backed, MultiIndexed pandas object into a ``scmdata.ScmRun``.

    The 'Date' index level is pivoted into columns, so every remaining index
    row becomes one timeseries of the resulting run.

    Parameters
    ----------
    df
        pandas object offering the ``.pint`` accessor (``.pint.magnitude`` and
        ``.pint.units`` are read) whose index has a level named 'Date'.
    region, variable, scenario, model
        Specification of the matching ScmRun metadata column. Each one may be

        * a ``str``: the name of an index level whose values are used,
        * a callable: it is called with the pivoted (wide) frame and its
          return value is used,
        * anything else (e.g. a one-element list): used unchanged.

    Returns
    -------
    scmdata.ScmRun
        Run built from the magnitudes, with the unit column set to
        ``str(df.pint.units)``.

    Raises
    ------
    ValueError
        If the index of ``df`` has no level named 'Date'.
    """
    # 'Date' must be the innermost level because unstack() pivots the last level.
    level_order = list(df.index.names)
    level_order.append(level_order.pop(level_order.index('Date')))
    wide = df.pint.magnitude.reorder_levels(level_order).sort_index().unstack()

    def resolve(spec):
        """Translate one metadata spec (level name, callable or literal) into values."""
        if isinstance(spec, str):
            return wide.index.get_level_values(spec)
        if callable(spec):
            return spec(wide)
        return spec

    # Transposed so that the dates (wide.columns, passed as index below) run along the first axis.
    data = wide.values.T
    metadata = {
        'region': resolve(region),
        'variable': resolve(variable),
        'unit': [str(df.pint.units)],
        'scenario': resolve(scenario),
        'model': resolve(model),
    }
    return scmdata.ScmRun(data, columns=metadata, index=wide.columns)

In [3]:
# Read the pickled `fao_emi_total` pandas object from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted, locally produced files.
with open('mem/fao_emi_total.pck', 'rb') as fd:
    fao_emi_total_pd = pickle.load(fd)

def variable(dfs):
    """Return '<Element> - <Item>' labels, one per index row of ``dfs``.

    ``dfs`` must have index levels named 'Element' and 'Item'. The result is a
    pandas Series of string dtype with a fresh default integer index.

    NOTE: a later cell of this notebook defines another function called
    ``variable`` (built from 'Entity' and 'Category'); this definition is only
    in effect until that cell runs.
    """
    def level_as_strings(level):
        # Going through a Series drops the MultiIndex so the two parts align by position.
        return pd.Series(dfs.index.get_level_values(level)).astype(pd.StringDtype())

    element_part = level_as_strings('Element')
    item_part = level_as_strings('Item')
    return element_part + ' - ' + item_part

# Region comes from the 'Area' level, the variable label from the `variable`
# callable defined above; scenario and model are constant for this data set.
fao_emi_total = scmrun_from_pandas(
    fao_emi_total_pd,
    region='Area',
    variable=variable,
    scenario=['history'],
    model=['FAO'],
)

# The pandas original is not used again once the ScmRun exists.
del fao_emi_total_pd

In [4]:
# Read the pickled `lak_emi_energy` pandas object from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted, locally produced files.
with open('mem/lak_emi_energy.pck', 'rb') as fd:
    lak_emi_energy_pd = pickle.load(fd)

# Region comes from the 'Area' level and the variable from the 'Category' level;
# scenario and model are constant for this data set.
lak_emi_energy = scmrun_from_pandas(lak_emi_energy_pd,
                                    region='Area',
                                    variable='Category',
                                    scenario=['history'],
                                    model=['LAK'])

# The pandas original is not used again once the ScmRun exists.
del lak_emi_energy_pd

In [5]:
# Read the pickled `cmip_conc_co2` pandas object from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted, locally produced files.
with open('mem/cmip_conc_co2.pck', 'rb') as fd:
    cmip_conc_co2_pd = pickle.load(fd)

# Replace the labels of the 'Date' level with '<year>-<month>-01' strings:
# twelve months for year -1 followed by twelve months for each year 1..2014.
# NOTE(review): f'{-1:04}' renders as '-001' (the sign counts towards the
# width) - confirm that this is the intended label for the first year.
# NOTE(review): this presumably relies on the existing 'Date' level holding
# exactly these 2015 * 12 entries in the same order - confirm.
date_labels = [f'{year:04}-{month:02}-01'
               for year, month in product([-1] + list(range(1, 2015)), range(1, 13))]

# MultiIndex.set_levels(..., inplace=True) was deprecated in pandas 1.2 and
# removed in pandas 2.0, so assign the returned index instead.
cmip_conc_co2_pd.index = cmip_conc_co2_pd.index.set_levels(date_labels, level='Date')

# Region comes from the 'Area' level; variable, scenario and model are constants.
cmip_conc_co2 = scmrun_from_pandas(cmip_conc_co2_pd,
                                   region='Area',
                                   variable=['CO2 concentration'],
                                   scenario=['history'],
                                   model=['CMIP'])

# The pandas original is not used again once the ScmRun exists.
del cmip_conc_co2_pd

In [6]:
# Read the pickled `prm_emi` pandas object from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted, locally produced files.
with open('mem/prm_emi.pck', 'rb') as fd:
    prm_emi_pd = pickle.load(fd)

def variable(dfs):
    """Return '<Entity> - <Category>' labels, one per index row of ``dfs``.

    ``dfs`` must have index levels named 'Entity' and 'Category'. The result
    is a pandas Series of string dtype with a fresh default integer index.

    NOTE: this replaces the function of the same name defined in an earlier
    cell of this notebook (which combines 'Element' and 'Item').
    """
    def level_as_strings(level):
        # Going through a Series drops the MultiIndex so the two parts align by position.
        return pd.Series(dfs.index.get_level_values(level)).astype(pd.StringDtype())

    entity_part = level_as_strings('Entity')
    category_part = level_as_strings('Category')
    return entity_part + ' - ' + category_part
    
# Region and scenario come from the 'Area' and 'Scenario' levels, the variable
# label from the `variable` callable defined above; the model is constant.
prm_emi = scmrun_from_pandas(
    prm_emi_pd,
    region='Area',
    variable=variable,
    scenario='Scenario',
    model=['PRIMAP-hist'],
)

# The pandas original is not used again once the ScmRun exists.
del prm_emi_pd

In [7]:
# Read the pickled PRIMAP sources container from disk; it is indexed by source
# name further down (e.g. primap_xr['CRF2020']).
# NOTE: pickle.load can execute arbitrary code; only load trusted, locally produced files.
with open('xr/mem/primap_sources.pck', 'rb') as fd:
    primap_xr = pickle.load(fd)

In [8]:
def scmrun_from_xarray(da, model):
    """Convert a pint-annotated xarray object into a ``scmdata.ScmRun``.

    ``da`` is quantified through the ``.pint`` accessor, combined with
    ``to_array('Element')``, and the dimensions 'Area', 'Category', 'Class',
    'Scenario', 'Type' and 'Element' are stacked into one 'index' dimension.
    The variable label of every stacked entry is
    '<Type> - <Class> - <Element> - <Category>'.

    NOTE(review): ``da`` is presumably an ``xarray.Dataset`` (``to_array`` is
    called on it) whose only other dimension is 'Date' - confirm.

    Parameters
    ----------
    da
        xarray object carrying the coordinates named above plus 'Date'.
    model
        Value used for the ``model`` metadata column of every timeseries.

    Returns
    -------
    scmdata.ScmRun
        Run built from the magnitudes, with 'Date' values as the time index
        and the unit column set to ``str`` of the stacked array's pint units.
    """
    stacked = (
        da.pint.quantify()
        .to_array('Element')
        .stack({'index': ['Area', 'Category', 'Class', 'Scenario', 'Type', 'Element']})
    )

    # Assemble 'Type - Class - Element - Category' by element-wise concatenation.
    label = stacked['Type']
    for part in ('Class', 'Element', 'Category'):
        label = label + ' - ' + stacked[part]

    magnitudes = stacked.pint.magnitude
    metadata = {
        'region': stacked['Area'],
        'variable': label,
        'unit': [str(stacked.pint.units)],
        'scenario': stacked['Scenario'],
        'model': [model],
    }
    return scmdata.ScmRun(magnitudes, columns=metadata, index=stacked['Date'].values)

In [9]:
# There is not enough memory to convert every entry of primap_xr, so only one
# example ('CRF2020') is converted to a ScmRun.
crf2020 = scmrun_from_xarray(primap_xr['CRF2020'], 'CRF2020')


# Conversion of all entries, left disabled because of the memory limit noted above:
#primap = {}
#for key in tqdm.tqdm_notebook(primap_xr):
#    print(key)
#    da = primap_xr[key]
#    primap[key] = scmrun_from_xarray(da, key)
#
# Drop the reference to the source container now that the example is converted.
del primap_xr

## Measure memory use

In [10]:
!rm -rf sd/mem/
!mkdir -p sd/mem/

import pickle

for fpath, obj in (
    ('fao_emi_total', fao_emi_total),
    ('lak_emi_energy', lak_emi_energy),
    ('cmip_conc_co2', cmip_conc_co2),
    ('prm_emi', prm_emi), 
    ('primap_crf2020', crf2020),
):
    with open(f'sd/mem/{fpath}.pck', 'wb') as fd:
        pickle.dump(obj, fd, -1)

!ls -lah sd/mem/
!ls -lah mem/

total 116M
drwxrwxr-x 2 pflueger pflueger 4,0K Sep  4 12:25 .
drwxrwxr-x 3 pflueger pflueger 4,0K Sep  4 12:25 ..
-rw-rw-r-- 1 pflueger pflueger 2,0M Sep  4 12:25 cmip_conc_co2.pck
-rw-rw-r-- 1 pflueger pflueger  11M Sep  4 12:25 fao_emi_total.pck
-rw-rw-r-- 1 pflueger pflueger  70K Sep  4 12:25 lak_emi_energy.pck
-rw-rw-r-- 1 pflueger pflueger  22M Sep  4 12:25 primap_crf2020.pck
-rw-rw-r-- 1 pflueger pflueger  82M Sep  4 12:25 prm_emi.pck
total 674M
drwxrwxr-x 2 pflueger pflueger 4,0K Aug 27 10:55 .
drwxrwxr-x 9 pflueger pflueger 4,0K Sep  4 12:23 ..
-rw-rw-r-- 1 pflueger pflueger 1,2M Aug 27 18:49 cmip_conc_co2.pck
-rw-rw-r-- 1 pflueger pflueger 1,2M Aug 27 18:53 cmip_conc_co2_wo.pck
-rw-rw-r-- 1 pflueger pflueger 8,8M Aug 27 18:49 fao_emi_total.pck
-rw-rw-r-- 1 pflueger pflueger 9,7M Aug 27 18:53 fao_emi_total_wo.pck
-rw-rw-r-- 1 pflueger pflueger  40K Aug 27 18:49 lak_emi_energy.pck
-rw-rw-r-- 1 pflueger pflueger  50K Aug 27 18:53 lak_emi_energy_wo.pck
-rw-rw-r-- 1 pflueger pflueg

## I/O
* export the data sets to CSV
* reimport from CSV
* export the data sets to some kind of native storage format
* reimport from the native storage format

In [19]:
!mkdir -p sd/csv

fao_emi_total.to_csv('sd/csv/fao_emi_total.csv')
lak_emi_energy.to_csv('sd/csv/lak_emi_energy.csv')
#cmip_conc_co2.to_csv('sd/csv/cmip_conc_co2.csv')
prm_emi.to_csv('sd/csv/prm_emi.csv')
crf2020.to_csv('sd/csv/crf2020.csv')

# CSV does not work with crftime axis

!ls -lah sd/csv
!ls -lah csv/

total 47M
drwxrwxr-x 2 pflueger pflueger 4,0K Sep  4 15:25 .
drwxrwxr-x 4 pflueger pflueger 4,0K Sep  4 15:23 ..
-rw-rw-r-- 1 pflueger pflueger 6,4M Sep  4 15:27 crf2020.csv
-rw-rw-r-- 1 pflueger pflueger 4,8M Sep  4 15:27 fao_emi_total.csv
-rw-rw-r-- 1 pflueger pflueger  46K Sep  4 15:27 lak_emi_energy.csv
-rw-rw-r-- 1 pflueger pflueger  36M Sep  4 15:27 prm_emi.csv
total 250M
drwxrwxr-x 3 pflueger pflueger 4,0K Aug 25 18:14 .
drwxrwxr-x 9 pflueger pflueger 4,0K Sep  4 15:25 ..
-rw-rw-r-- 1 pflueger pflueger 2,4M Aug 27 18:49 cmip_conc_co2.csv
-rw-rw-r-- 1 pflueger pflueger  29M Aug 27 18:49 fao_emi_total.csv
-rw-rw-r-- 1 pflueger pflueger 145K Aug 27 18:49 lak_emi_energy.csv
drwxrwxr-x 2 pflueger pflueger 692K Aug 25 18:13 primap_tables
-rw-rw-r-- 1 pflueger pflueger 218M Aug 27 18:50 prm_emi.csv


In [22]:
# Re-import the CSV files written above by passing each path to ScmRun.
# cmip_conc_co2 is absent because it was not exported to CSV.
fao_emi_total_r = scmdata.ScmRun('sd/csv/fao_emi_total.csv')
lak_emi_energy_r = scmdata.ScmRun('sd/csv/lak_emi_energy.csv')
prm_emi_r = scmdata.ScmRun('sd/csv/prm_emi.csv')
crf2020_r = scmdata.ScmRun('sd/csv/crf2020.csv')

In [23]:
# scmdata does not provide a native storage format, so the native export / re-import steps are skipped.

# interactive analysis and plotting
* add / subtract / divide / multiply compatible, overlapping data sets
* timeit
* plot a historical time series, and something akin to gapminder (countries on a gdp vs emissions plot)
* select datasets which contain data on the N2O emissions of Finland
* in a selected dataset, select years in which the N2O emissions of Finland were above the 1950-2000 average
* resample non-yearly dataset to yearly dataset
* convert a dataset to tidy format

## aligned arithmetic

In [41]:
# There is no way to filter by time without converting to long format.
# String mangling is necessary to filter on parts of the variable name,
# and there are no convenience functions for that string mangling.
# The easiest way to do aligned arithmetic is via pandas.

## plotting

In [48]:
# Open a new figure, then draw the PRIMAP-hist run with scmdata's built-in
# lineplot, called with its default arguments.
plt.figure()
prm_emi.lineplot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f6d3c476970>

In [49]:
# what?

In [50]:
# again, easiest way is via pandas

In [52]:
# Honestly, give up. Any data which does not conform to the metadata format of scmdata (region, variable, unit, scenario, model) cannot be handled at all with scmdata.