In [37]:
import pathlib

import numpy as np
import pandas as pd
import pint
import pint_pandas
import tqdm
import tqdm.auto
# Stand-alone pint unit registry.
# NOTE(review): `u` is not referenced by any cell visible here — confirm whether it is still needed.
u = pint.UnitRegistry()

pandas DataFrame in traditional tidy format:

* the index is meaningless (row numbers)
* each column contains one variable
* each observation forms a row
* each type of observational unit forms a DataFrame
* columns ordered from fixed variables (left) to measured variables (right); within each group, columns are in alphabetical order
* rows ordered using the leftmost variable, breaking ties with variables further right

Even given these constraints, some design choices in the data representation remain open:

* duplicate information (like countrycode and full countryname) in a single DataFrame vs. multiple DataFrames (only countrycode in the DataFrame, second DataFrame maps countrycodes to countrynames)
* units as composite types (e.g. pint) vs. units as a variable

Because the data structure relies on multiple, possibly joinable DataFrames, there is a need for a container, which is not yet defined.

## representation
* load all example data sets into the data structure at once
* measure memory use
* export the data sets to CSV
* reimport from CSV
* export the data sets to some kind of native storage format
* reimport from the native storage format

In [2]:
!ls ../../primap-requirements/example_data_sets/

Emissions_Agriculture_Agriculture_total_E_All_Data.csv
lak-download.csv
mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv
PRIMAPDB.mat
PRIMAP-hist_v2.1_09-Nov-2019.csv


In [3]:
# Directory holding the example data sets; the loader cells resolve their input files relative to `p`.
p = pathlib.Path('../../primap-requirements/example_data_sets/')

## Read in FAO agriculture total emissions (`fao_emi_total`)

In [4]:
# Year columns in the CSV are named 'Y<year>' (parsed below with format='Y%Y').
years = list(range(1961, 2017)) + [2030, 2050]

# Builtin `float` replaces `np.float`: the alias was deprecated in NumPy 1.20
# and removed in 1.24, where accessing it raises AttributeError.
dtypes = {f'Y{year}': float for year in years}
# One independent ordered categorical dtype per descriptive column.
for column in ('Area', 'Item', 'Element'):
    dtypes[column] = pd.CategoricalDtype(ordered=True)

# Only the columns named in `dtypes` are read; the first three of them (in
# file order) become the row index, the year columns remain as data.
fao_emi_total = pd.read_csv(
    p / 'Emissions_Agriculture_Agriculture_total_E_All_Data.csv',
    encoding='latin_1',
    dtype=dtypes,
    usecols=list(dtypes),
    index_col=[0, 1, 2])

# Wide -> long: the year column labels become a fourth index level.
fao_emi_total = fao_emi_total.stack()

fao_emi_total.index = (fao_emi_total.index
                       .set_names('Date', level=3)
                       # 'Y1961' -> Timestamp('1961-01-01')
                       .set_levels(pd.to_datetime(fao_emi_total.index.levels[3], format='Y%Y'), level=3)
                       # slice off the 'Emissions (' prefix and the final character (presumably ')')
                       .set_levels(fao_emi_total.index.levels[2].str[len('Emissions ('):-1], level=2)
                      )

fao_emi_total = fao_emi_total.reorder_levels(['Area', 'Date', 'Element', 'Item'])
# Rebind instead of sorting in place, so every step of the cell is a plain assignment.
fao_emi_total = fao_emi_total.sort_index()

fao_emi_total.name = 'Emissions'

# Attach the unit through the pint-pandas extension dtype.
fao_emi_total = fao_emi_total.astype('pint[Gg]')

In [5]:
fao_emi_total

Area         Date        Element  Item                   
Afghanistan  1961-01-01  CH4      Agriculture total          285.7793
                                  Burning - Crop residues      4.0734
                                  Enteric Fermentation       240.6831
                                  Manure Management           11.6228
                                  Rice Cultivation               29.4
                                                               ...   
Zimbabwe     2050-01-01  N2O      Crop Residues                1.1978
                                  Manure Management            0.3871
                                  Manure applied to Soils      0.6348
                                  Manure left on Pasture      20.8824
                                  Synthetic Fertilizers        3.7159
Name: Emissions, Length: 507542, dtype: pint[gigagram]

## Read in emission data for German Bundesländer (`lak_emi_energy`)

In [6]:
# Value columns of the export, one per energy carrier, plus the total ('Insgesamt').
value_columns = [
    'Insgesamt',
    'Steinkohle',
    'Braunkohle',
    'Mineralöle und Mineralöl produkte',
    'Gase',
    'Abfälle (nicht biogen)',
    'Andere Energieträger',
]
# Builtin `float` replaces `np.float`: the alias was deprecated in NumPy 1.20
# and removed in 1.24, where accessing it raises AttributeError.
dtypes = {column: float for column in value_columns}
dtypes['Land'] = pd.CategoricalDtype(ordered=True)
# Years are read as strings and converted to timestamps further down.
dtypes['Jahr'] = str

# The file is ';'-separated; 4 leading lines are skipped and 464 data rows are
# read (presumably the extent of the data table in the export — confirm).
lak_emi_energy = pd.read_csv(
    p / 'lak-download.csv',
    encoding='latin_1',
    dtype=dtypes,
    usecols=list(dtypes),
    index_col=[0, 1],
    sep=';',
    skiprows=4,
    nrows=464)

# Wide -> long: the value column labels become a third index level.
lak_emi_energy = lak_emi_energy.stack()

lak_emi_energy.index = (lak_emi_energy.index
                        # '1990' -> Timestamp('1990-01-01')
                        .set_levels(pd.to_datetime(lak_emi_energy.index.levels[1], format='%Y'), level=1)
                        # the former column labels become an (unordered) categorical level
                        .set_levels(lak_emi_energy.index.levels[2].astype(pd.CategoricalDtype()), level=2)
                        .set_names(['Area', 'Date', 'Category'])
                       )
lak_emi_energy = lak_emi_energy.reorder_levels(['Area', 'Category', 'Date', ])
# Rebind instead of sorting in place, so every step of the cell is a plain assignment.
lak_emi_energy = lak_emi_energy.sort_index()
lak_emi_energy.name = 'Emissions'
# Attach the unit through the pint-pandas extension dtype.
lak_emi_energy = lak_emi_energy.astype('pint[kilometric_ton]')

In [7]:
lak_emi_energy

Area               Category                Date      
Baden-Württemberg  Abfälle (nicht biogen)  1990-01-01     595.6759999999999
                                           1991-01-01               533.609
                                           1992-01-01              582.9358
                                           1993-01-01              582.8823
                                           1994-01-01                513.03
                                                                ...        
Thüringen          Steinkohle              2013-01-01    60.221564691110004
                                           2014-01-01      60.3047505431081
                                           2015-01-01     55.85147852397861
                                           2016-01-01    51.744334669214496
                                           2017-01-01    55.785371808505296
Name: Emissions, Length: 2713, dtype: pint[kilometric_ton]

## Read in CO2 mole fraction time series (`cmip_conc_co2`)

In [8]:
!tail -n4 $p/mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv

735857,2014,9,15,15-Sep-2014 00:00:00,395.026810069061,394.318660838738,395.734959299383
735887,2014,10,15,15-Oct-2014 00:00:00,396.668759675911,397.456652804706,395.880866547116
735918,2014,11,15,15-Nov-2014 00:00:00,398.189074185698,400.321219754153,396.056928617242
735948,2014,12,15,15-Dec-2014 00:00:00,399.179691756507,402.195087926915,396.164295586098


In [9]:
# Builtin `float` replaces `np.float`: the alias was deprecated in NumPy 1.20
# and removed in 1.24, where accessing it raises AttributeError.
dtypes = {
    'data_mean_global': float,
    'data_mean_nh': float,
    'data_mean_sh': float,
}
# `dtypes` was previously only used to select columns; passing it as `dtype`
# as well makes the parser enforce the float columns it describes.
cmip_conc_co2 = pd.read_csv(
    p / 'mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv',
    dtype=dtypes,
    usecols=list(dtypes),
)
# Timestamps cannot be used here: nanosecond datetimes only reach back to the
# 17th century, while this series starts in year 0. Monthly periods cover the
# full range. The assignment requires the file to hold exactly one row per
# month from 0-01 through 2014-12.
cmip_conc_co2.index = pd.period_range(start=pd.Period(year=0, month=1, freq='M'), end=pd.Period(year=2014, month=12, freq='M'), name='Date')

# Wide -> long: the three column labels become a second index level.
cmip_conc_co2 = cmip_conc_co2.stack()

cmip_conc_co2.index = (cmip_conc_co2.index
                       # 'data_mean_global' -> 'global', 'data_mean_nh' -> 'nh', 'data_mean_sh' -> 'sh'
                       .set_levels(cmip_conc_co2.index.levels[1].str[len('data_mean_'):].astype(pd.CategoricalDtype(ordered=True)), level=1)
                       .set_names('Area', level=1)
                      )

cmip_conc_co2 = cmip_conc_co2.reorder_levels(['Area', 'Date', ])
# Rebind instead of sorting in place, so every step of the cell is a plain assignment.
cmip_conc_co2 = cmip_conc_co2.sort_index()

cmip_conc_co2.name = 'CO2 concentration'

cmip_conc_co2 = cmip_conc_co2.astype('pint[dimensionless]')
# Scale by 1e-6 (presumably the file stores ppm and a plain mole fraction is wanted — confirm).
cmip_conc_co2 *= 1e-6

In [10]:
cmip_conc_co2

Area    Date   
global  0-01       0.00027836643883702005
        0-02         0.000278713058010423
        0-03         0.000279013835279965
        0-04       0.00027920961334856496
        0-05         0.000279106444207592
                            ...          
sh      2014-08    0.00039577037859628094
        2014-09      0.000395734959299383
        2014-10      0.000395880866547116
        2014-11    0.00039605692861724196
        2014-12    0.00039616429558609803
Name: CO2 concentration, Length: 72540, dtype: pint[dimensionless]

## Read in PRIMAPDB

In [11]:
import read_primapdb as rp

In [12]:
# Open the PRIMAPDB MATLAB file through the project-local `read_primapdb` helper.
mfh = rp.MatFileHandle.from_matfile(p / 'PRIMAPDB.mat')

In [86]:
def extract_df(sheet):
    """Convert one PRIMAPDB sheet into a tidy, unit-carrying pandas Series.

    The sheet is handed to ``rp.primap_sheet_to_pandas``, which yields an
    attribute mapping and a table. The table is stacked into a Series indexed
    by ('Area', 'Date'), named after the sheet's 'entity' attribute and cast
    to a pint dtype chosen from the sheet's 'unit' attribute.

    Parameters
    ----------
    sheet
        A sheet object accepted by ``rp.primap_sheet_to_pandas``.

    Returns
    -------
    tuple
        ``(attrs, series)``: the attribute mapping with 'entity' and 'unit'
        removed, and the sorted pint-typed Series.

    Raises
    ------
    ValueError
        If the sheet's unit is not one of the known unit labels.
    """
    attrs, series = rp.primap_sheet_to_pandas(sheet)
    series = series.stack()

    # Rebuild the index from a frame so both levels can be retyped.
    idx_frame = series.index.to_frame()
    idx_frame.columns = ['Date', 'Area']
    idx_frame['Area'] = idx_frame['Area'].astype(pd.CategoricalDtype(ordered=True))

    try:
        idx_frame['Date'] = pd.to_datetime(idx_frame['Date'], format='%Y')
    except ValueError:
        # Years outside the datetime64 range are kept as yearly periods instead.
        idx_frame['Date'] = [pd.Period(year=x, freq='Y') for x in idx_frame['Date']]

    series.index = pd.MultiIndex.from_frame(idx_frame)
    series = series.reorder_levels(['Area', 'Date', ]).sort_index()

    series.name = attrs.pop('entity')
    unit = attrs.pop('unit')

    # unit label -> (pint dtype, multiplier or None, name suffix or None).
    # Units with no pint counterpart here (CO2eq, C, 2011GKD) keep that
    # information as a suffix on the Series name.
    conversions = {
        'ThousandPers': ('pint[dimensionless]', 1000, None),
        'Gg': ('pint[gigagram]', None, None),
        't': ('pint[metric_ton]', None, None),
        'kt': ('pint[kilometric_ton]', None, None),
        'GgCO2eq': ('pint[Gg]', None, 'CO2eq'),
        'ktC': ('pint[kilometric_ton]', None, 'C'),
        '2011GKD/Pers': ('pint[dimensionless]', None, '2011GKD/Pers'),
        '2011GKD': ('pint[dimensionless]', None, '2011GKD'),
        'MtCO2eq': ('pint[megametric_ton]', None, 'CO2eq'),
    }
    if unit not in conversions:
        raise ValueError(f'unit={unit}')

    pint_dtype, multiplier, suffix = conversions[unit]
    series = series.astype(pint_dtype)
    if multiplier is not None:
        series *= multiplier
    if suffix is not None:
        series.name = f'{series.name} ({suffix})'

    return attrs, series

In [87]:
# Convert every PRIMAPDB data table; each value is the (attributes, Series)
# pair returned by extract_df, stored under the table's key.
primap_tables = {}
pmd = mfh["PRIMAPDB"]["mastertable"]["datatables"]
# `tqdm.tqdm_notebook` is deprecated; `tqdm.auto.tqdm` is the supported
# progress bar that works both inside and outside a notebook.
for key in tqdm.auto.tqdm(pmd.group.keys()):
    primap_tables[key] = extract_df(pmd[key])

HBox(children=(IntProgress(value=0, max=4118), HTML(value='')))

## Read in PRIMAP-hist (published)

In [89]:
!head -n3 $p/PRIMAP-hist_v2.1_09-Nov-2019.csv

scenario,country,category,entity,unit,1850,1851,1852,1853,1854,1855,1856,1857,1858,1859,1860,1861,1862,1863,1864,1865,1866,1867,1868,1869,1870,1871,1872,1873,1874,1875,1876,1877,1878,1879,1880,1881,1882,1883,1884,1885,1886,1887,1888,1889,1890,1891,1892,1893,1894,1895,1896,1897,1898,1899,1900,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,1911,1912,1913,1914,1915,1916,1917,1918,1919,1920,1921,1922,1923,1924,1925,1926,1927,1928,1929,1930,1931,1932,1933,1934,1935,1936,1937,1938,1939,1940,1941,1942,1943,1944,1945,1946,1947,1948,1949,1950,1951,1952,1953,1954,1955,1956,1957,1958,1959,1960,1961,1962,1963,1964,1965,1966,1967,1968,1969,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
HISTCR,ABW,IPC1A,CH4,Gg,3.66e-04,3.78e-04,3.90e-04,4.02e-04,4.15e-04,4.28e-04,4.41e-04,4.54e-04,4.68e-04,4.82e-04,4.96e-04

In [182]:
# Five descriptive string columns followed by one float column per year.
dtypes = {
    'scenario': str,
    'country': str,
    'category': str,
    'entity': str,
    'unit': str,
}
for year in range(1850, 2018):
    dtypes[str(year)] = np.float64

# `dtypes` was previously only used to select columns; passing it as `dtype`
# as well makes the parser enforce the column types it describes.
prm_emi = pd.read_csv(
    p / 'PRIMAP-hist_v2.1_09-Nov-2019.csv',
    dtype=dtypes,
    usecols=list(dtypes),
    index_col=[0, 1, 2, 3, 4],
)

# Wide -> long: the year column labels become a sixth index level.
prm_emi = prm_emi.stack()

# Rebuild the index from a frame so individual levels can be edited.
index = prm_emi.index.to_frame()
index.columns = ['Scenario', 'Area', 'Category', 'Entity', 'Unit', 'Date']
index['Date'] = pd.to_datetime(index['Date'], format='%Y')
# The CO2-equivalent marker moves from the unit into the entity name, after
# which the Unit level is dropped.
index.loc[index['Unit'] == 'GgCO2eq', 'Entity'] += ' (CO2eq)'
index = index.drop('Unit', axis='columns')

prm_emi.index = pd.MultiIndex.from_frame(index)
prm_emi = prm_emi.reorder_levels(['Area', 'Category', 'Date', 'Entity', 'Scenario'])
# Rebind instead of sorting in place, so every step of the cell is a plain assignment.
prm_emi = prm_emi.sort_index()
prm_emi.name = 'Emissions'
# NOTE(review): every row is cast to Gg regardless of its original Unit value;
# this is only right if the file contains no units other than 'Gg' and
# 'GgCO2eq' — confirm against the data.
prm_emi = prm_emi.astype('pint[Gg]')

In [185]:
prm_emi

Area  Category   Date        Entity               Scenario
ABW   IPC1       1850-01-01  CH4                  HISTCR      0.00038700000000000003
                                                  HISTTP      0.00038700000000000003
                             CO2                  HISTCR                        40.3
                                                  HISTTP                        40.3
                             KYOTOGHG (CO2eq)     HISTCR                        40.3
                                                                       ...          
ZWE   IPCMAGELV  2017-01-01  KYOTOGHG (CO2eq)     HISTTP                      6630.0
                             KYOTOGHGAR4 (CO2eq)  HISTCR                      6040.0
                                                  HISTTP                      6640.0
                             N2O                  HISTCR                        15.6
                                                  HISTTP                        17.6
Name: 

# Measure memory use

In [200]:
!mkdir -p mem

In [203]:
import pickle

# File name inside mem/ -> object to serialise. The on-disk pickle size is
# used as a proxy for each data set's memory footprint.
pickle_targets = {
    'fao_emi_total.pck': fao_emi_total,
    'lak_emi_energy.pck': lak_emi_energy,
    'cmip_conc_co2.pck': cmip_conc_co2,
    'primap_tables.pck': primap_tables,
    'prm_emi.pck': prm_emi,
}

for file_name, payload in pickle_targets.items():
    # The mem/ directory must already exist when this runs.
    with open(f'mem/{file_name}', 'wb') as handle:
        # HIGHEST_PROTOCOL is what a negative protocol number selects.
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [204]:
!ls -lah mem/

total 320M
drwxrwxr-x 2 pflueger pflueger 4,0K Aug 24 20:15 .
drwxrwxr-x 5 pflueger pflueger 4,0K Aug 24 20:15 ..
-rw-rw-r-- 1 pflueger pflueger 1,2M Aug 24 20:15 cmip_conc_co2.pck
-rw-rw-r-- 1 pflueger pflueger 8,8M Aug 24 20:15 fao_emi_total.pck
-rw-rw-r-- 1 pflueger pflueger  40K Aug 24 20:15 lak_emi_energy.pck
-rw-rw-r-- 1 pflueger pflueger 205M Aug 24 20:15 primap_tables.pck
-rw-rw-r-- 1 pflueger pflueger 105M Aug 24 20:15 prm_emi.pck


In [205]:
ls -lah $p

total 308M
drwxrwxr-x 2 pflueger pflueger 4,0K Aug 21 15:35 ./
drwxrwxr-x 5 pflueger pflueger 4,0K Aug 21 15:35 ../
-rw-rw-r-- 1 pflueger pflueger  10M Aug 28  2019 Emissions_Agriculture_Agriculture_total_E_All_Data.csv
-rw-rw---- 1 pflueger pflueger   41 Aug 19 18:34 .gitattributes
-rw-rw-r-- 1 pflueger pflueger  52K Aug 19 11:08 lak-download.csv
-rw-rw-r-- 1 pflueger pflueger 2,1M Aug 21 09:54 mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv
-rw-rw-r-- 1 pflueger pflueger 254M Aug 12 08:44 PRIMAPDB.mat
-rw-r--r-- 1 pflueger pflueger  44M Nov  9  2019 PRIMAP-hist_v2.1_09-Nov-2019.csv
