In [86]:
import numpy as np
import pandas as pd
import pint
import pint_pandas
import pathlib
# Stand-alone pint unit registry.
# NOTE(review): pint-pandas resolves 'pint[...]' dtypes through its own registry
# (pint_pandas.PintType.ureg); quantities created from `u` may not be combinable
# with such columns unless the registries are shared — TODO confirm.
u = pint.UnitRegistry()

pandas DataFrame in traditional tidy format:

* the index is meaningless (row numbers)
* each column contains one variable
* each observation forms a row
* each type of observational unit forms a DataFrame
* columns ordered from fixed variables (left) to measured variables; within each group, alphabetical order
* rows ordered using the leftmost variable, breaking ties with variables further right

Even given these constraints, there are still some open design choices in the data representation:

* duplicate information (like country code and full country name) kept in a single DataFrame vs. split across multiple DataFrames (only the country code in the main DataFrame; a second DataFrame maps country codes to country names)
* units as composite types (e.g. pint) vs. units as a variable

Because the data structure relies on multiple, possibly joinable DataFrames, there is a need for a container, which is not yet defined.

## representation
* load all example data sets into the data structure at once
* measure memory use
* export the data sets to CSV
* reimport from CSV
* export the data sets to some kind of native storage format
* reimport from the native storage format

In [15]:
# List the available example data sets.
!ls ../../primap-requirements/example_data_sets/

Emissions_Agriculture_Agriculture_total_E_All_Data.csv
lak-download.csv
mole-fraction-of-carbon-dioxide-in-air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-2-0_gr1-GMNHSH_000001-201412.nc
PRIMAPDB.mat
PRIMAP-hist_v2.1_09-Nov-2019.csv


In [16]:
# Directory holding the example data sets; the file names used below are joined onto it.
p = pathlib.Path('../../primap-requirements/example_data_sets/')

### read in FAO emissions agriculture total (`fao_emi_total`)

In [21]:
# Columns to load from the FAO CSV and the dtype to parse each one with.
# Value columns are named 'Y<year>': every year from 1961 to 2016, plus 2030 and 2050.
years = list(range(1961, 2017)) + [2030, 2050]
# Use the builtin `float`: the `np.float` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where accessing it raises AttributeError.
dtypes = {f'Y{year}': float for year in years}

# The identifying columns are parsed as ordered categoricals.
for column in ('Area', 'Item', 'Element'):
    dtypes[column] = pd.CategoricalDtype(ordered=True)

fao_emi_total = pd.read_csv(
    p / 'Emissions_Agriculture_Agriculture_total_E_All_Data.csv',
    encoding='latin_1',
    dtype=dtypes,
    usecols=list(dtypes),  # load only the columns that were given a dtype
    index_col=['Area', 'Item', 'Element'])  # named explicitly instead of by position

In [22]:
# Reshape from wide to long: the 'Y<year>' columns become the innermost index level.
fao_emi_total = fao_emi_total.stack()

# Clean up the labels of the two innermost index levels:
#   level 2 ('Element'): keep only the text inside 'Emissions (...)'
#   level 3 (year):      drop the leading 'Y' and convert to int
stacked_levels = fao_emi_total.index.levels
element_labels = stacked_levels[2].str[len('Emissions ('):-1]
year_labels = stacked_levels[3].str[1:].astype(int)

tidy_index = fao_emi_total.index.set_names('Year', level=3)
tidy_index = tidy_index.set_levels(year_labels, level=3)
tidy_index = tidy_index.set_levels(element_labels, level=2)
fao_emi_total.index = tidy_index

fao_emi_total = fao_emi_total.sort_index()
fao_emi_total.name = 'Emissions'

# Attach the unit (gigagram) through the pint-pandas extension dtype.
fao_emi_total = fao_emi_total.astype('pint[Gg]')

In [23]:
# Show the resulting series (pandas abbreviates the repr of long series).
fao_emi_total

Area         Item                   Element  Year
Afghanistan  Agricultural Soils     CO2eq    1961    3097.8213
                                             1962    3063.8992
                                             1963    3116.2194
                                             1964    3181.7806
                                             1965    3257.1703
                                                       ...    
Zimbabwe     Synthetic Fertilizers  N2O      2014       1.0452
                                             2015        0.887
                                             2016       1.2097
                                             2030       2.1833
                                             2050       3.7159
Name: Emissions, Length: 507542, dtype: pint[gigagram]

### read in emission data for German Bundesländer (`lak_emi_energy`)

In [29]:
# Peek at the raw file: four preamble lines precede the ';'-separated header row,
# and the umlauts do not display correctly in this output. The reader below
# accordingly uses skiprows=4, sep=';' and encoding='latin_1'.
!head -n7 $p/lak-download.csv

"L�nderarbeitskreis Energiebilanzen"
"Quellenbilanz: CO2-Emissionen nach Energietr�gern  *) "
"Einheit: 1 000 t CO2"
" "
Land;Jahr;Anm;Insgesamt;Steinkohle;Braunkohle;"Mineral�le und Mineral�l produkte";Gase;"Abf�lle (nicht biogen)";"Andere Energietr�ger";Stand
Baden-W�rttemberg;1990;"LAK1), BW1)";74883.9160479387;17575.2873793737;527.884916186355;45963.0617038672;10222.0060485114;595.676;0;05.03.2020
Baden-W�rttemberg;1991;"LAK1), BW1)";79279.0137239207;18138.0175792253;586.054488667663;49044.1685387109;10977.1641173168;533.609;0;05.03.2020


In [101]:
# Columns to load from the LAK CSV and the dtype to parse each one with.
value_columns = [
    'Insgesamt',
    'Steinkohle',
    'Braunkohle',
    'Mineralöle und Mineralöl produkte',
    'Gase',
    'Abfälle (nicht biogen)',
    'Andere Energieträger',
]
# Use the builtin `float`: the `np.float` alias was deprecated in NumPy 1.20
# and removed in NumPy 1.24, where accessing it raises AttributeError.
dtypes = {column: float for column in value_columns}
dtypes['Land'] = pd.CategoricalDtype(ordered=True)
dtypes['Jahr'] = int

lak_emi_energy = pd.read_csv(
    p / 'lak-download.csv',
    encoding='latin_1',
    dtype=dtypes,
    usecols=list(dtypes),  # load only the columns that were given a dtype
    index_col=['Land', 'Jahr'],  # named explicitly instead of by position
    sep=';',
    skiprows=4,  # skip the preamble lines above the header row
    # NOTE(review): presumably stops before trailing non-data lines — confirm against the file.
    nrows=464)

In [102]:
# Reshape from wide to long: the value columns become the innermost index level.
lak_emi_energy = lak_emi_energy.stack()

In [103]:
# Turn the stacked column labels (level 2) into a categorical level and
# rename the three index levels.
category_level = lak_emi_energy.index.levels[2].astype(pd.CategoricalDtype())
relabelled_index = lak_emi_energy.index.set_levels(category_level, level=2)
lak_emi_energy.index = relabelled_index.set_names(['Area', 'Year', 'Category'])

lak_emi_energy.name = 'Emissions'

# Attach the unit through the pint-pandas extension dtype.
lak_emi_energy = lak_emi_energy.astype('pint[kilometric_ton]')

In [105]:
# Show the resulting series (pandas abbreviates the repr of long series).
lak_emi_energy

Area               Year  Category                         
Baden-Württemberg  1990  Insgesamt                             74883.9160479387
                         Steinkohle                            17575.2873793737
                         Braunkohle                            527.884916186355
                         Mineralöle und Mineralöl produkte     45963.0617038672
                         Gase                                  10222.0060485114
                                                                    ...        
Thüringen          2017  Braunkohle                            405.672147888414
                         Mineralöle und Mineralöl produkte    5238.425039159621
                         Gase                                  4362.57394820377
                         Abfälle (nicht biogen)                   322.472471745
                         Andere Energieträger                               0.0
Name: Emissions, Length: 2713, dtype: pint[kilometric_ton]