In [59]:
%matplotlib widget
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
import tqdm
import pickle
import pyam
import pint_xarray
import pint
import xarray as xr
from itertools import product

# representation

In [42]:
def iamdf_from_pandas(df, region, variable, scenario, model):
    """Build a ``pyam.IamDataFrame`` from a pint-backed pandas object.

    ``df`` must have an index with a level named ``'Date'``; that level is
    moved to the innermost position and unstacked into columns.

    ``region``, ``variable``, ``scenario`` and ``model`` are all treated the
    same way: a one-element list names the index level that is renamed to the
    corresponding pyam column, any other value is forwarded unchanged to
    ``pyam.IamDataFrame`` as the keyword argument of the same name.

    The unit passed to pyam is ``str(df.pint.units)``.
    """
    # 'Date' goes last so that unstack() pivots it into the columns.
    level_order = list(df.index.names)
    level_order.remove('Date')
    level_order.append('Date')

    specs = {'region': region, 'variable': variable, 'scenario': scenario, 'model': model}

    def names_single_level(spec):
        # A one-element list refers to an existing index level by name.
        return isinstance(spec, list) and len(spec) == 1

    rename = {spec[0]: key for key, spec in specs.items() if names_single_level(spec)}
    kwargs = {key: spec for key, spec in specs.items() if not names_single_level(spec)}

    magnitudes = df.pint.magnitude
    wide = magnitudes.reorder_levels(level_order).sort_index().unstack()
    dfs = wide.reset_index().rename(columns=rename)

    return pyam.IamDataFrame(dfs, unit=str(df.pint.units), **kwargs)

In [43]:
# Load the pickled object stored at mem/fao_emi_total.pck.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(f'mem/fao_emi_total.pck', 'rb') as fd:
    fao_emi_total_pd = pickle.load(fd)

# Convert to pyam: the 'Area' level is renamed to the region column, the
# ['Element', 'Item'] list is forwarded as the variable specification, and
# scenario and model are fixed labels.
fao_emi_total = iamdf_from_pandas(fao_emi_total_pd,
                                  region=['Area'],
                                  variable=['Element', 'Item'],
                                  scenario='history',
                                  model='FAO')

# Drop the reference to the pandas original; it is not used again.
del fao_emi_total_pd

In [46]:
# Load the pickled object stored at mem/lak_emi_energy.pck.
with open(f'mem/lak_emi_energy.pck', 'rb') as fd:
    lak_emi_energy_pd = pickle.load(fd)

# Convert to pyam: 'Area' is renamed to the region column and 'Category' to
# the variable column (both are one-element lists); scenario and model are
# fixed labels.
lak_emi_energy = iamdf_from_pandas(lak_emi_energy_pd,
                                   region=['Area'],
                                   variable=['Category'],
                                   scenario='history',
                                   model='LAK')

# Drop the reference to the pandas original; it is not used again.
del lak_emi_energy_pd

In [51]:
if False:  # can't work with out-of-bounds dates
    # Disabled on purpose; kept to document the attempted conversion of the
    # pickled object stored at mem/cmip_conc_co2.pck.
    with open(f'mem/cmip_conc_co2.pck', 'rb') as fd:
        cmip_conc_co2_pd = pickle.load(fd)

    # Replace the values of the 'Date' level with '<year>-<month>-01' strings
    # for year -1 and years 1..2014, twelve months each.
    # MultiIndex.set_levels(inplace=True) was deprecated in pandas 1.2 and
    # removed in 2.0, so the returned index is assigned back instead.
    cmip_conc_co2_pd.index = cmip_conc_co2_pd.index.set_levels(
        [f'{year:04}-{month:02}-01' for year, month in product([-1] + list(range(1, 2015)), range(1, 13))],
        level='Date')

    # 'Area' is renamed to the region column; variable, scenario and model
    # are fixed labels.
    cmip_conc_co2 = iamdf_from_pandas(cmip_conc_co2_pd,
                                      region=['Area'],
                                      variable='CO2 concentration',
                                      scenario='history',
                                      model='CMIP')

    del cmip_conc_co2_pd

In [52]:
# Load the pickled object stored at mem/prm_emi.pck.
with open(f'mem/prm_emi.pck', 'rb') as fd:
    prm_emi_pd = pickle.load(fd)   

    
# Convert to pyam: 'Area' is renamed to the region column and 'Scenario' to
# the scenario column (one-element lists); the ['Entity', 'Category'] list is
# forwarded as the variable specification; the model is a fixed label.
prm_emi = iamdf_from_pandas(prm_emi_pd,
                            region=['Area'],
                            variable=['Entity', 'Category'],
                            scenario=['Scenario'],
                            model='PRIMAP-hist'
                            )

# Drop the reference to the pandas original; it is not used again.
del prm_emi_pd

In [87]:
# Load the pickled object stored at xr/mem/primap_sources.pck; it is indexed
# by dataset name and iterated over in the conversion cell.
with open(f'xr/mem/primap_sources.pck', 'rb') as fd:
    primap_xr = pickle.load(fd)

In [101]:
def iamdf_from_xarray(da, model):
    """Build a ``pyam.IamDataFrame`` from a pint-aware xarray object.

    ``da`` is quantified, combined into a single array along a new
    ``'Element'`` dimension and dequantified again; the resulting ``units``
    attribute is passed to pyam as the unit. The ``'Date'`` dimension is then
    spread into one column per date, ``'Area'`` and ``'Scenario'`` are renamed
    to ``region`` and ``scenario``, and ``['Type', 'Class', 'Category',
    'Element']`` is passed as the variable specification.

    ``model`` is forwarded unchanged as the pyam model name.
    """
    quantified = da.pint.quantify()
    dq = quantified.to_array('Element').pint.dequantify()

    column_names = {'Area': 'region', 'Scenario': 'scenario'}
    wide = dq.to_dataset('Date').to_dataframe()
    dfs = wide.reset_index().rename(columns=column_names)

    return pyam.IamDataFrame(
        dfs,
        unit=dq.attrs['units'],
        variable=['Type', 'Class', 'Category', 'Element'],
        model=model,
    )


In [105]:
# Convert the 'CRF2020' entry on its own; it is exported to CSV separately.
crf2020 = iamdf_from_xarray(primap_xr['CRF2020'], 'CRF2020')


# Convert every entry of primap_xr (except 'MPD2018P') and store the result
# under the same key, which is also used as the pyam model name.
# NOTE(review): 'CRF2020' is converted a second time inside this loop.
# NOTE(review): tqdm.tqdm_notebook is deprecated in recent tqdm releases in
# favour of tqdm.notebook.tqdm - confirm against the installed version.
primap = {}
for key in tqdm.tqdm_notebook(primap_xr):
    if key == 'MPD2018P':  # can't handle out-of-bounds times
        continue
    print(key)
    da = primap_xr[key]
    primap[key] = iamdf_from_xarray(da, key)

# Drop the reference to the xarray originals; they are not used again.
del primap_xr

HBox(children=(IntProgress(value=0, max=17), HTML(value='')))

CRF2020
BUR2IPCC2006I
BUR3IPCC2006I
ANDREW2018V4I
UNFCCC2019BI
CDIAC2017I
UN2017P
UN2019P
EDGAR42COMPI
EDGAR50I
CRF2019
BUR1IPCC2006I
PMHSOCIOECO12
PRIMAPHIST20
EDGAR432I
UNFCCC2020AI


## Measure memory use

In [116]:
!rm -rf pm/mem/
!mkdir -p pm/mem/

import pickle

for fpath, obj in (
    ('fao_emi_total', fao_emi_total),
    ('lak_emi_energy', lak_emi_energy),
    ('prm_emi', prm_emi), 
    ('primap', primap),
):
    with open(f'pm/mem/{fpath}.pck', 'wb') as fd:
        pickle.dump(obj, fd, -1)

!ls -lah pm/mem/
!ls -lah mem/

total 684M
drwxrwxr-x 2 pflueger pflueger 4,0K Sep 15 15:51 .
drwxrwxr-x 4 pflueger pflueger 4,0K Sep 15 15:51 ..
-rw-rw-r-- 1 pflueger pflueger  16M Sep 15 15:51 fao_emi_total.pck
-rw-rw-r-- 1 pflueger pflueger  67K Sep 15 15:51 lak_emi_energy.pck
-rw-rw-r-- 1 pflueger pflueger 513M Sep 15 15:51 primap.pck
-rw-rw-r-- 1 pflueger pflueger 155M Sep 15 15:51 prm_emi.pck
total 674M
drwxrwxr-x  2 pflueger pflueger 4,0K Aug 27 10:55 .
drwxrwxr-x 10 pflueger pflueger 4,0K Sep 15 15:49 ..
-rw-rw-r--  1 pflueger pflueger 1,2M Aug 27 18:49 cmip_conc_co2.pck
-rw-rw-r--  1 pflueger pflueger 1,2M Aug 27 18:53 cmip_conc_co2_wo.pck
-rw-rw-r--  1 pflueger pflueger 8,8M Aug 27 18:49 fao_emi_total.pck
-rw-rw-r--  1 pflueger pflueger 9,7M Aug 27 18:53 fao_emi_total_wo.pck
-rw-rw-r--  1 pflueger pflueger  40K Aug 27 18:49 lak_emi_energy.pck
-rw-rw-r--  1 pflueger pflueger  50K Aug 27 18:53 lak_emi_energy_wo.pck
-rw-rw-r--  1 pflueger pflueger 205M Aug 27 18:49 primap_tables.pck
-rw-rw-r--  1 pflueger pflu

## I/O
* export the data sets to CSV
* reimport from CSV
* export the data sets to some kind of native storage format
* reimport from the native storage format

### export to CSV

In [120]:
# Create the output directory tree (pm/csv and pm/csv/primap).
!mkdir -p pm/csv/primap

# Write one CSV file per stand-alone dataset into pm/csv/.
for fpath, obj in (
    ('fao_emi_total', fao_emi_total),
    ('lak_emi_energy', lak_emi_energy),
    ('prm_emi', prm_emi),
    ('primap_crf2020', crf2020),
):
    obj.to_csv(f'pm/csv/{fpath}.csv')

# Write one CSV file per entry of the primap dict into pm/csv/primap/.
for key in primap:
    primap[key].to_csv(f'pm/csv/primap/{key}.csv')

# Show the resulting file sizes and the total size of the primap directory.
!ls -lah pm/csv
!du -sh pm/csv/primap

total 45M
drwxrwxr-x 4 pflueger pflueger 4,0K Sep 15 16:04 .
drwxrwxr-x 4 pflueger pflueger 4,0K Sep 15 15:51 ..
-rw-rw-r-- 1 pflueger pflueger 4,7M Sep 15 16:04 fao_emi_total.csv
-rw-rw-r-- 1 pflueger pflueger  46K Sep 15 16:04 lak_emi_energy.csv
drwxrwxr-x 2 pflueger pflueger 4,0K Sep 15 16:05 primap
-rw-rw-r-- 1 pflueger pflueger 4,0M Sep 15 16:04 primap_crf2020.csv
drwxrwxr-x 2 pflueger pflueger 4,0K Sep 15 15:52 prm_emi
-rw-rw-r-- 1 pflueger pflueger  36M Sep 15 16:04 prm_emi.csv
194M	pm/csv/primap


### reimport from CSV

In [124]:
# Read the exported CSV files back by passing each path to pyam.IamDataFrame;
# the "_r" suffix marks the re-imported copies.
fao_emi_total_r = pyam.IamDataFrame('pm/csv/fao_emi_total.csv')
lak_emi_energy_r = pyam.IamDataFrame('pm/csv/lak_emi_energy.csv')
prm_emi_r = pyam.IamDataFrame('pm/csv/prm_emi.csv')
primap_crf2020_r = pyam.IamDataFrame('pm/csv/primap_crf2020.csv')

pyam.core - INFO: Reading file `pm/csv/fao_emi_total.csv`
pyam.core - INFO: Reading file `pm/csv/lak_emi_energy.csv`
pyam.core - INFO: Reading file `pm/csv/prm_emi.csv`
pyam.core - INFO: Reading file `pm/csv/primap_crf2020.csv`


In [130]:
primap_r = {}
fps = !ls pm/csv/primap/
for fp in tqdm.tqdm_notebook(fps):
    if not fp.endswith('csv'):
        continue
    key = fp[:-4]
    primap_r[key] = pyam.IamDataFrame(f'pm/csv/primap/{key}.csv')

HBox(children=(IntProgress(value=0, max=16), HTML(value='')))

pyam.core - INFO: Reading file `pm/csv/primap/ANDREW2018V4I.csv`
pyam.core - INFO: Reading file `pm/csv/primap/BUR1IPCC2006I.csv`
pyam.core - INFO: Reading file `pm/csv/primap/BUR2IPCC2006I.csv`
pyam.core - INFO: Reading file `pm/csv/primap/BUR3IPCC2006I.csv`
pyam.core - INFO: Reading file `pm/csv/primap/CDIAC2017I.csv`
pyam.core - INFO: Reading file `pm/csv/primap/CRF2019.csv`
pyam.core - INFO: Reading file `pm/csv/primap/CRF2020.csv`
pyam.core - INFO: Reading file `pm/csv/primap/EDGAR42COMPI.csv`
pyam.core - INFO: Reading file `pm/csv/primap/EDGAR432I.csv`
pyam.core - INFO: Reading file `pm/csv/primap/EDGAR50I.csv`
pyam.core - INFO: Reading file `pm/csv/primap/PMHSOCIOECO12.csv`
pyam.core - INFO: Reading file `pm/csv/primap/PRIMAPHIST20.csv`
pyam.core - INFO: Reading file `pm/csv/primap/UN2017P.csv`
pyam.core - INFO: Reading file `pm/csv/primap/UN2019P.csv`
pyam.core - INFO: Reading file `pm/csv/primap/UNFCCC2019BI.csv`
pyam.core - INFO: Reading file `pm/csv/primap/UNFCCC2020AI.csv`


### export to native storage format
* csv is native

# interactive analysis and plotting
* add / subtract / divide / multiply compatible, overlapping data sets
* timeit
* plot a historical time series, and something akin to gapminder (countries on a gdp vs emissions plot)
* select datasets which contain data on the N2O emissions of Finland
* in a selected dataset, select years in which the N2O emissions of Finland were above the 1950-2000 average
* resample non-yearly dataset to yearly dataset
* convert a dataset to tidy format

## aligned arithmetic

In [149]:
# extract two compatible, partly overlapping data sets from primap-hist

In [150]:
# First operand: years 1900-1990 of scenario 'HISTCR', variable 'CO2|IPC1'
# (no region restriction).
a = prm_emi.filter(year=range(1900, 1991), scenario='HISTCR', variable='CO2|IPC1')

In [151]:
# Second operand: everything for the regions FIN, DEU and ZWE.
b = prm_emi.filter(region=['FIN', 'DEU', 'ZWE'])

In [152]:
# Try to add the two IamDataFrames directly; the recorded output shows that
# this raises a TypeError, which is the point of this cell.
c = a + b

TypeError: unsupported operand type(s) for +: 'IamDataFrame' and 'IamDataFrame'

In [153]:
# aligned arithmetic is not supported natively on IamDataFrames; it is only possible via pandas

## plotting

In [None]:
# NOTE(review): this cell has no execution count and uses pandas-style access
# (.xs, .pint.magnitude) on prm_emi, which is bound to the result of
# iamdf_from_pandas earlier in this notebook - presumably a leftover from a
# pandas-based version of the analysis; confirm whether it still runs.
plt.figure()
sel = prm_emi.xs(('DEU', 'IPCM0EL', 'KYOTOGHG (CO2eq)', 'HISTCR'), level=('Area', 'Category', 'Entity', 'Scenario'))
#sel = prm_emi.loc['DEU', 'IPCM0EL', :, 'KYOTOGHG (CO2eq)', 'HISTCR']
sel.pint.magnitude.plot()

In [158]:
# Plot the series for region 'DEU', scenario 'HISTCR' and variable
# 'KYOTOGHG (CO2eq)|IPCM0EL' over time using pyam's line_plot.
prm_emi.filter(scenario='HISTCR', region='DEU', variable='KYOTOGHG (CO2eq)|IPCM0EL').line_plot(x='time')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f6972c281c0>

In [165]:
# Keys of the primap datasets whose first variable name ends with 'POP'.
[key for key, df in primap.items() if df.variables()[0].endswith('POP')]

['UN2017P', 'UN2019P', 'PMHSOCIOECO12']

In [166]:
# Use the 'UN2019P' dataset (one of the keys found above) as `pop`.
pop = primap['UN2019P']

# Restrict it to the year 2000.
pop_2000 = pop.filter(year=2000)

# Matching selection from prm_emi: year 2000, scenario 'HISTCR',
# variable 'KYOTOGHG (CO2eq)|IPCM0EL'.
emi_2000 = prm_emi.filter(scenario='HISTCR', year=2000, variable='KYOTOGHG (CO2eq)|IPCM0EL')

In [176]:
# gapminder-like plot needs alignment via pandas multiindex again, no native way to do it

## Selecting datasets and data
* select datasets which contain data on the N2O emissions of Finland
* in a selected dataset, select years in which the N2O emissions of Finland were above the 1950-2000 average

In [195]:
# Keys of the primap datasets that contain the region 'FIN' and at least one
# variable whose name ends with 'IPC0|N2O'.
[key for key, df in primap.items() if 
           'FIN' in df.regions().values
           and df.variables().str.endswith('IPC0|N2O').any()]

['CRF2020', 'EDGAR42COMPI', 'EDGAR50I', 'CRF2019', 'PRIMAPHIST20', 'EDGAR432I']

In [200]:
# From 'PRIMAPHIST20', select scenario 'HISTTP', region 'FIN' and variable
# 'NET|TOTAL|IPC0|N2O'.
df = primap['PRIMAPHIST20'].filter(scenario='HISTTP', region='FIN', variable='NET|TOTAL|IPC0|N2O')

In [204]:
# Mean of the 'value' column over the years 1950-2000 (range end is exclusive,
# hence 2001).
avg2hXX = df.filter(year=range(1950, 2001)).data['value'].mean()
avg2hXX

22.8155464621459

In [205]:
# Rows of the selection whose value is strictly greater than avg2hXX.
df.data.loc[df.data['value'] > avg2hXX]

Unnamed: 0,model,scenario,region,variable,unit,time,value
4018814,PRIMAPHIST20,HISTTP,FIN,NET|TOTAL|IPC0|N2O,gigagram,1967-01-01,22.93628
4018815,PRIMAPHIST20,HISTTP,FIN,NET|TOTAL|IPC0|N2O,gigagram,1968-01-01,23.505047
4018816,PRIMAPHIST20,HISTTP,FIN,NET|TOTAL|IPC0|N2O,gigagram,1969-01-01,23.722403
4018817,PRIMAPHIST20,HISTTP,FIN,NET|TOTAL|IPC0|N2O,gigagram,1970-01-01,23.88629
4018818,PRIMAPHIST20,HISTTP,FIN,NET|TOTAL|IPC0|N2O,gigagram,1971-01-01,24.266094
4018819,PRIMAPHIST20,HISTTP,FIN,NET|TOTAL|IPC0|N2O,gigagram,1972-01-01,25.070313
4018820,PRIMAPHIST20,HISTTP,FIN,NET|TOTAL|IPC0|N2O,gigagram,1973-01-01,26.521749
4018821,PRIMAPHIST20,HISTTP,FIN,NET|TOTAL|IPC0|N2O,gigagram,1974-01-01,27.16597
4018822,PRIMAPHIST20,HISTTP,FIN,NET|TOTAL|IPC0|N2O,gigagram,1975-01-01,26.874863
4018823,PRIMAPHIST20,HISTTP,FIN,NET|TOTAL|IPC0|N2O,gigagram,1976-01-01,24.481267


## Resampling
* resample non-yearly dataset to yearly dataset

In [206]:
# No non-yearly dataset is available here, because loading one was not supported.

# Interpolate
* delete some data points from a data set and fill them in using linear interpolation

In [231]:
# Select the 'FIN' / 'HISTTP' / 'NET|TOTAL|IPC0|N2O' series from
# 'PRIMAPHIST20' again as the input for the interpolation test.
df = primap['PRIMAPHIST20'].filter(scenario='HISTTP', region='FIN', variable='NET|TOTAL|IPC0|N2O')

In [232]:
# Blank out the rows with '1990' < time <= '1994' to create a gap.
# NOTE(review): this assigns NaN to every column of the matching rows, not
# only to 'value' - confirm that is intended.
df.data[('1990' < df.data.time) & ('1994' >= df.data.time)] = np.nan

In [233]:
# Plot the series over time with an 'x' marker at every data point.
df.line_plot(x='time', marker='x')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f697575bf10>

In [234]:
# NOTE(review): this import sits mid-notebook; it would normally live in the
# import cell at the top.
import datetime
# Call interpolate for 1 January of each year from 1990 to 1994.
for year in range(1990, 1995):
    df.interpolate(datetime.datetime(year=year, month=1, day=1))

In [235]:
# Plot the series again after the interpolate calls, with the same markers.
df.line_plot(x='time', marker='x')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<matplotlib.axes._subplots.AxesSubplot at 0x7f697675f220>