In [25]:
import sys
# prefer conda env over .local
sys.path.insert(1, '/home/pflueger/.local/conda/envs/scmdata/lib/python3.8/site-packages')

import matplotlib
%matplotlib widget
from matplotlib import pyplot as plt
import pandas as pd
import scmdata
import pickle
import numpy as np
import pathlib
import itertools
import tqdm
import pint_xarray

In [2]:
# Display the installed scmdata version so the results below can be tied to it.
scmdata.__version__

'0.7.3'

In [3]:
class CustomScmRun(scmdata.run.BaseScmRun):
    """BaseScmRun subclass that declares 'unit' as its only required column."""

    # Overrides the base class attribute; every other metadata column is free-form here.
    required_cols = ['unit']

In [4]:
def scmrun_from_pandas(df):
    """Build a CustomScmRun from a pint-backed pandas object indexed by 'Date'.

    Parameters
    ----------
    df
        pandas object exposing the ``.pint`` accessor (``magnitude`` and
        ``units``) whose index has a level named 'Date'
        (presumably a Series with a MultiIndex; verify against callers).

    Returns
    -------
    CustomScmRun
        One time series per combination of the non-'Date' index levels. Those
        levels become metadata columns, plus a single 'unit' column taken
        from ``df.pint.units``.

    Raises
    ------
    ValueError
        If the index has no level named 'Date'.
    """
    # Move 'Date' to the innermost position so that the default unstack()
    # pivots exactly that level into the columns.
    level_order = list(df.index.names)
    level_order.append(level_order.pop(level_order.index('Date')))

    wide = (
        df.pint.magnitude
        .reorder_levels(level_order)
        .sort_index()
        .unstack()
    )

    # Every remaining index level turns into one metadata column.
    meta = {}
    for level in wide.index.names:
        meta[level] = wide.index.get_level_values(level)
    meta['unit'] = [str(df.pint.units)]

    # Transpose the values so time runs along the first axis.
    return CustomScmRun(wide.values.T, columns=meta, index=wide.columns)

In [5]:
# Load the pickled pandas version of the FAO data set and convert it to a CustomScmRun.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('mem/fao_emi_total.pck', 'rb') as fd:
    fao_emi_total_pd = pickle.load(fd)

fao_emi_total = scmrun_from_pandas(fao_emi_total_pd)
# Release the pandas original; only the converted object is used from here on.
del fao_emi_total_pd

In [6]:
# NOTE(review): this cell repeats the previous cell verbatim (same file, same target name);
# it only reloads and reconverts the same data and can be dropped.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('mem/fao_emi_total.pck', 'rb') as fd:
    fao_emi_total_pd = pickle.load(fd)

fao_emi_total = scmrun_from_pandas(fao_emi_total_pd)
del fao_emi_total_pd

In [7]:
# Load the pickled pandas version of the lak_emi_energy data set and convert it to a CustomScmRun.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('mem/lak_emi_energy.pck', 'rb') as fd:
    lak_emi_energy_pd = pickle.load(fd)

lak_emi_energy = scmrun_from_pandas(lak_emi_energy_pd)
# Release the pandas original; only the converted object is used from here on.
del lak_emi_energy_pd

In [14]:
p = pathlib.Path('../../primap-requirements/example_data_sets/')
# Only the keys of this mapping are used below (as `usecols`). The builtin `float`
# replaces the `np.float` alias, which was deprecated in NumPy 1.20 and removed in 1.24.
dtypes = {
    'data_mean_global': float,
    'data_mean_nh': float,
    'data_mean_sh': float,
}
cmip_conc_co2 = pd.read_csv(
    p / 'mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv',
    usecols=dtypes.keys(),
    #index_col=[0, 1],
)
# Nanosecond-resolution pandas timestamps only reach back to the 17th century, so the
# time axis is built from monthly Periods (year 0 through 2014) instead of datetimes.
cmip_conc_co2.index = pd.period_range(start=pd.Period(year=0, month=1, freq='M'), end=pd.Period(year=2014, month=12, freq='M'), name='Date')

# Wide -> long: the three data_mean_* columns become a second index level.
cmip_conc_co2 = cmip_conc_co2.stack()

# Strip the 'data_mean_' prefix from that level, make it an ordered categorical, and name it 'Area'.
cmip_conc_co2.index = (cmip_conc_co2.index
                       .set_levels(cmip_conc_co2.index.levels[1].str[len('data_mean_'):].astype(pd.CategoricalDtype(ordered=True)), level=1)
                       .set_names('Area', level=1)
                      )

cmip_conc_co2 = cmip_conc_co2.reorder_levels(['Area', 'Date', ])
cmip_conc_co2 = cmip_conc_co2.sort_index()

cmip_conc_co2.name = 'CO2 concentration'

cmip_conc_co2 = cmip_conc_co2.astype('pint[dimensionless]')
# NOTE(review): presumably converts ppm to a plain mole fraction; confirm the units of the source file.
cmip_conc_co2 *= 1e-6

In [17]:
# Replace the monthly Period labels of the 'Date' level with 'YYYY-MM-01' strings.
# The first year (period year 0) is labelled -1, which formats as '-001'.
# `set_levels(..., inplace=True)` is deprecated and was removed in pandas 2.0, so the
# relabelled index is assigned back instead, as in the previous cell.
cmip_conc_co2.index = cmip_conc_co2.index.set_levels(
    [f'{year:04}-{month:02}-01' for year, month in itertools.product([-1] + list(range(1, 2015)), range(1, 13))],
    level='Date',
)

# NOTE(review): this rebinds the name from the pandas object to the converted object,
# so the cell cannot be re-run without re-running the previous cell first.
cmip_conc_co2 = scmrun_from_pandas(cmip_conc_co2)

In [18]:
# Load the pickled pandas version of the prm_emi data set and convert it to a CustomScmRun.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('mem/prm_emi.pck', 'rb') as fd:
    prm_emi_pd = pickle.load(fd)

prm_emi = scmrun_from_pandas(prm_emi_pd)

# Release the pandas original; only the converted object is used from here on.
del prm_emi_pd

In [19]:
# Load the pickled mapping of primap sources (iterated by key and indexed as primap_xr[key] below).
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('xr/mem/primap_sources.pck', 'rb') as fd:
    primap_xr = pickle.load(fd)

In [29]:
def scmrun_from_xarray(da, model):
    """Build a CustomScmRun from an xarray object with pint unit information.

    Parameters
    ----------
    da
        xarray object supporting ``.pint.quantify()`` and ``.to_array()``,
        with dimensions 'Area', 'Category', 'Class', 'Scenario', 'Type' and
        'Date' (presumably a Dataset with one variable per element; verify
        against callers).
    model
        Value stored in the 'model' metadata column of the result.

    Returns
    -------
    CustomScmRun
        One time series per stacked combination of the non-'Date' dimensions
        and the new 'Element' dimension.
    """
    stacked_dims = ['Area', 'Category', 'Class', 'Scenario', 'Type', 'Element']

    # Attach units, merge the variables along a new 'Element' dimension, then
    # collapse everything except 'Date' into one stacked 'index' dimension.
    quantified = da.pint.quantify()
    das = quantified.to_array('Element').stack({'index': stacked_dims})

    metadata = {
        'region': das['Area'],
        'Type': das['Type'],
        'Class': das['Class'],
        'element': das['Element'],
        'category': das['Category'],
        'unit': [str(das.pint.units)],
        'scenario': das['Scenario'],
        'model': [model],
    }
    time_axis = das['Date'].values

    return CustomScmRun(das.pint.magnitude, columns=metadata, index=time_axis)

In [31]:
# Convert every entry of primap_xr into a CustomScmRun, using the key as the 'model' value.
# NOTE(review): tqdm.tqdm_notebook is deprecated in recent tqdm releases in favour of
# tqdm.notebook.tqdm / tqdm.auto.tqdm — consider switching once the imports are updated.
primap = {}
for key in tqdm.tqdm_notebook(primap_xr):
    # Printed next to the progress bar so the source being converted is visible.
    print(key)
    da = primap_xr[key]
    primap[key] = scmrun_from_xarray(da, key)

# Release the xarray originals; only the converted objects are used from here on.
del primap_xr

HBox(children=(IntProgress(value=0, max=17), HTML(value='')))

CRF2020
BUR2IPCC2006I
BUR3IPCC2006I
ANDREW2018V4I
UNFCCC2019BI
CDIAC2017I
UN2017P
UN2019P
EDGAR42COMPI
EDGAR50I
CRF2019
MPD2018P
BUR1IPCC2006I
PMHSOCIOECO12
PRIMAPHIST20
EDGAR432I
UNFCCC2020AI



## Measure memory use

In [33]:
!rm -rf sd7/mem/
!mkdir -p sd7/mem/

import pickle

for fpath, obj in (
    ('fao_emi_total', fao_emi_total),
    ('lak_emi_energy', lak_emi_energy),
    ('cmip_conc_co2', cmip_conc_co2),
    ('prm_emi', prm_emi),
    ('primap', primap)
):
    with open(f'sd7/mem/{fpath}.pck', 'wb') as fd:
        pickle.dump(obj, fd, -1)

!ls -lah sd7/mem/
!ls -lah mem/

total 952M
drwxrwxr-x 2 pflueger pflueger 4,0K Sep 25 16:14 .
drwxrwxr-x 3 pflueger pflueger 4,0K Sep 25 16:14 ..
-rw-rw-r-- 1 pflueger pflueger 1,2M Sep 25 16:14 cmip_conc_co2.pck
-rw-rw-r-- 1 pflueger pflueger 4,8M Sep 25 16:14 fao_emi_total.pck
-rw-rw-r-- 1 pflueger pflueger  29K Sep 25 16:14 lak_emi_energy.pck
-rw-rw-r-- 1 pflueger pflueger 908M Sep 25 16:14 primap.pck
-rw-rw-r-- 1 pflueger pflueger  39M Sep 25 16:14 prm_emi.pck
total 674M
drwxrwxr-x  2 pflueger pflueger 4,0K Aug 27 10:55 .
drwxrwxr-x 13 pflueger pflueger 4,0K Sep 25 16:13 ..
-rw-rw-r--  1 pflueger pflueger 1,2M Aug 27 18:49 cmip_conc_co2.pck
-rw-rw-r--  1 pflueger pflueger 1,2M Aug 27 18:53 cmip_conc_co2_wo.pck
-rw-rw-r--  1 pflueger pflueger 8,8M Aug 27 18:49 fao_emi_total.pck
-rw-rw-r--  1 pflueger pflueger 9,7M Aug 27 18:53 fao_emi_total_wo.pck
-rw-rw-r--  1 pflueger pflueger  40K Aug 27 18:49 lak_emi_energy.pck
-rw-rw-r--  1 pflueger pflueger  50K Aug 27 18:53 lak_emi_energy_wo.pck
-rw-rw-r--  1 pflueger pflue

## I/O
* export the data sets to CSV
* reimport from CSV
* export the data sets to some kind of native storage format
* reimport from the native storage format

In [36]:
# Export every converted data set to CSV under sd7/csv/ (one file per primap source).
!mkdir -p sd7/csv/primap

fao_emi_total.to_csv('sd7/csv/fao_emi_total.csv')
lak_emi_energy.to_csv('sd7/csv/lak_emi_energy.csv')
# Disabled export; see the comment below the loop for the reason.
#cmip_conc_co2.to_csv('sd/csv/cmip_conc_co2.csv')
prm_emi.to_csv('sd7/csv/prm_emi.csv')

for key, da in tqdm.tqdm_notebook(primap.items()):
    da.to_csv(f'sd7/csv/primap/{key}.csv')

# CSV export does not work with a cftime time axis, which is presumably why the
# cmip_conc_co2 export above is commented out.

# List the new CSV files next to the existing ones in csv/ for a size comparison.
!ls -lah sd7/csv
!ls -lah csv/

HBox(children=(IntProgress(value=0, max=17), HTML(value='')))


total 40M
drwxrwxr-x 3 pflueger pflueger 4,0K Sep 25 16:35 .
drwxrwxr-x 4 pflueger pflueger 4,0K Sep 25 16:34 ..
-rw-rw-r-- 1 pflueger pflueger 4,7M Sep 25 16:35 fao_emi_total.csv
-rw-rw-r-- 1 pflueger pflueger  45K Sep 25 16:35 lak_emi_energy.csv
drwxrwxr-x 2 pflueger pflueger 4,0K Sep 25 16:37 primap
-rw-rw-r-- 1 pflueger pflueger  36M Sep 25 16:35 prm_emi.csv
total 250M
drwxrwxr-x  3 pflueger pflueger 4,0K Aug 25 18:14 .
drwxrwxr-x 13 pflueger pflueger 4,0K Sep 25 16:35 ..
-rw-rw-r--  1 pflueger pflueger 2,4M Aug 27 18:49 cmip_conc_co2.csv
-rw-rw-r--  1 pflueger pflueger  29M Aug 27 18:49 fao_emi_total.csv
-rw-rw-r--  1 pflueger pflueger 145K Aug 27 18:49 lak_emi_energy.csv
drwxrwxr-x  2 pflueger pflueger 692K Aug 25 18:13 primap_tables
-rw-rw-r--  1 pflueger pflueger 218M Aug 27 18:50 prm_emi.csv


In [38]:
# no more "native" storage format

# interactive analysis and plotting
* add / subtract / divide / multiply compatible, overlapping data sets
* timeit
* plot a historical time series, and something akin to gapminder (countries on a gdp vs emissions plot)
* select datasets which contain data on the N2O emissions of Finland
* in a selected dataset, select years in which the N2O emissions of Finland were above the 1950-2000 average
* resample non-yearly dataset to yearly dataset
* convert a dataset to tidy format

In [None]:
# extract two compatible, partly overlapping data sets from primap-hist
# NOTE(review): this cell was never executed (In [None]) and `prm_emif` is not defined in the
# visible part of the notebook; the `.filter(...)` cell that follows builds `a` instead.
# Presumably a leftover from a pandas-based variant of this notebook — confirm and remove.
a = (prm_emif.xs('IPC1', level='Category')
             .xs(slice('1900', '1990'), level='Date')
             .xs('CO2', level='Entity')
             .xs('HISTCR', level='Scenario')
)

In [49]:
# First operand: category IPC1, entity CO2, scenario HISTCR, years 1900 through 1990.
a = prm_emi.filter(Category='IPC1', Entity='CO2', Scenario='HISTCR', year=range(1900, 1991))

In [54]:
# Second operand: everything in prm_emi for the areas DEU, FIN and ZWE.
b = prm_emi.filter(Area=['DEU', 'FIN', 'ZWE'])

In [71]:
# Index-aligned addition on the wide pandas frames.
c = a.timeseries() + b.timeseries()
# Drop rows and columns that are entirely NaN, then wrap the rest again.
# `axis` is passed by keyword: DataFrame.dropna takes keyword-only arguments in pandas 2.0.
cc = CustomScmRun(c.dropna(axis='index', how='all').dropna(axis='columns', how='all'))
cc.timeseries()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,time,1900-01-01,1901-01-01,1902-01-01,1903-01-01,1904-01-01,1905-01-01,1906-01-01,1907-01-01,1908-01-01,1909-01-01,...,1981-01-01,1982-01-01,1983-01-01,1984-01-01,1985-01-01,1986-01-01,1987-01-01,1988-01-01,1989-01-01,1990-01-01
Area,Category,Entity,Scenario,unit,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
DEU,IPC1,CO2,HISTCR,gigagram,640000.0,638000.0,626000.0,674000.0,698000.0,750000.0,710000.0,866000.0,876000.0,874000.0,...,2020000.0,1958000.0,1950000.0,1994000.0,2020000.0,2020000.0,2000000.0,1996000.0,1958000.0,1980000.0
FIN,IPC1,CO2,HISTCR,gigagram,1242.0,872.0,826.0,1062.0,1092.0,1214.0,1274.0,1834.0,2900.0,2660.0,...,104600.0,87200.0,84000.0,85800.0,101000.0,108800.0,119400.0,106600.0,107400.0,105400.0
ZWE,IPC1,CO2,HISTCR,gigagram,92.4,96.4,95.0,194.6,244.0,402.0,434.0,478.0,678.0,702.0,...,15700.0,14840.0,17660.0,16560.0,17000.0,22800.0,26000.0,27200.0,27400.0,26400.0


In [69]:
# Same selection as b, converted to kilometric_ton to test arithmetic across differing units.
bt = b.convert_unit('kilometric_ton')

In [75]:
# doesn't work, have to convert units to same units before aligned arithmetic
# 'unit' is part of the row index, so rows of `a` and `bt` presumably never align and
# nothing is left after dropna; CustomScmRun then rejects the result.
ct = a.timeseries() + bt.timeseries()
try:
    # `axis` is passed by keyword: DataFrame.dropna takes keyword-only arguments in pandas 2.0.
    cct = CustomScmRun(ct.dropna(axis='index', how='all').dropna(axis='columns', how='all'))
except ValueError as err:
    # The failure is the point of this cell: show it without aborting "Run All".
    print(f'ValueError: {err}')
else:
    display(cct.timeseries())

ValueError: invalid column format, must contain some time (int, float or datetime) columns!

In [77]:
# 2d aligned arithmetic not really supported