In [1]:
import numpy as np
import pandas as pd
import pint
import pint_pandas
import pathlib
# Create a pint unit registry under the short name `u`.
# NOTE(review): `u` is not referenced again in the visible cells; the
# 'pint[...]' dtypes further down are resolved by pint_pandas — confirm whether
# they use this registry or pint_pandas' own.
u = pint.UnitRegistry()

pandas DataFrame in traditional tidy format:

* the index is meaningless (row numbers)
* each column contains one variable
* each observation forms a row
* each type of observational unit forms a DataFrame
* columns ordered from fixed variables (left) to measured variables (right); within each group, columns are in alphabetical order
* rows ordered using the leftmost variable, breaking ties with variables further right

Even given these constraints, some choices in the data representation remain open:

* duplicate information (like countrycode and full countryname) in a single DataFrame vs. multiple DataFrames (only countrycode in the DataFrame, second DataFrame maps countrycodes to countrynames)
* units as composite types (e.g. pint) vs. units as a variable

Because the data structure relies on multiple, possibly joinable DataFrames, there is a need for a container, which is not yet defined.

## representation
* load all example data sets into the data structure at once
* measure memory use
* export the data sets to CSV
* reimport from CSV
* export the data sets to some kind of native storage format
* reimport from the native storage format

In [19]:
!ls ../../primap-requirements/example_data_sets/

Emissions_Agriculture_Agriculture_total_E_All_Data.csv
lak-download.csv
mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv
PRIMAPDB.mat
PRIMAP-hist_v2.1_09-Nov-2019.csv


In [3]:
# Directory containing the example data sets; the read_csv calls below build
# their file paths from it.
p = pathlib.Path('../../primap-requirements/example_data_sets/')

### Read in FAO agriculture total emissions (`fao_emi_total`)

In [103]:
# Load the FAO "agriculture total" emissions table and reshape it into a long
# Series named 'Emissions', indexed by Area, Date, Element and Item.

# Year columns to read: 1961-2016 plus 2030 and 2050 (named 'Y<year>' in the file).
years = list(range(1961, 2017)) + [2030, 2050]

# The builtin `float` is used instead of the `np.float` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
dtypes = {f'Y{year}': float for year in years}
for column in ('Area', 'Item', 'Element'):
    dtypes[column] = pd.CategoricalDtype(ordered=True)

fao_emi_total = pd.read_csv(
    p / 'Emissions_Agriculture_Agriculture_total_E_All_Data.csv',
    encoding='latin_1',
    dtype=dtypes,
    # Pass a real list rather than a dict_keys view.
    usecols=list(dtypes),
    # Positions count within the selected columns.
    # NOTE(review): presumably Area, Item, Element in file order — confirm
    # against the CSV header.
    index_col=[0, 1, 2])

# Move the year columns into a fourth (innermost) index level.
fao_emi_total = fao_emi_total.stack()

fao_emi_total.index = (fao_emi_total.index
                       .set_names('Date', level=3)
                       # Column labels such as 'Y1961' become timestamps.
                       .set_levels(pd.to_datetime(fao_emi_total.index.levels[3], format='Y%Y'), level=3)
                       # Drop the leading 'Emissions (' and the final character from the Element labels.
                       .set_levels(fao_emi_total.index.levels[2].str[len('Emissions ('):-1], level=2)
                      )

# Reassign instead of sorting in place so that the cell can be re-run safely.
fao_emi_total = (fao_emi_total
                 .reorder_levels(['Area', 'Date', 'Element', 'Item'])
                 .sort_index())

fao_emi_total.name = 'Emissions'

# Attach the unit (gigagram) through the pint_pandas extension dtype.
fao_emi_total = fao_emi_total.astype('pint[Gg]')

In [104]:
fao_emi_total

Area         Date        Element  Item                   
Afghanistan  1961-01-01  CH4      Agriculture total          285.7793
                                  Burning - Crop residues      4.0734
                                  Enteric Fermentation       240.6831
                                  Manure Management           11.6228
                                  Rice Cultivation               29.4
                                                               ...   
Zimbabwe     2050-01-01  N2O      Crop Residues                1.1978
                                  Manure Management            0.3871
                                  Manure applied to Soils      0.6348
                                  Manure left on Pasture      20.8824
                                  Synthetic Fertilizers        3.7159
Name: Emissions, Length: 507542, dtype: pint[gigagram]

### Read in emission data for German Bundesländer (`lak_emi_energy`)

In [100]:
# Load the emission table for the German Bundesländer and reshape it into a
# long Series named 'Emissions', indexed by Area, Category and Date.

# Value columns of the CSV; each becomes one entry of the 'Category' level.
value_columns = ['Insgesamt', 'Steinkohle', 'Braunkohle', 'Mineralöle und Mineralöl produkte', 'Gase', 'Abfälle (nicht biogen)', 'Andere Energieträger']

# The builtin `float` is used instead of the `np.float` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
dtypes = {column: float for column in value_columns}
dtypes['Land'] = pd.CategoricalDtype(ordered=True)
# Read the year as text; it is parsed into timestamps further down.
dtypes['Jahr'] = str

lak_emi_energy = pd.read_csv(
    p / 'lak-download.csv',
    encoding='latin_1',
    dtype=dtypes,
    # Pass a real list rather than a dict_keys view.
    usecols=list(dtypes),
    # NOTE(review): presumably 'Land' and 'Jahr' are the first two selected
    # columns in file order — confirm against the CSV header.
    index_col=[0, 1],
    sep=';',
    # NOTE(review): skiprows/nrows presumably cut off a preamble and trailing
    # non-data lines of the download — confirm against the file.
    skiprows=4,
    nrows=464)

# Move the value columns into a third (innermost) index level.
lak_emi_energy = lak_emi_energy.stack()

lak_emi_energy.index = (lak_emi_energy.index
                        # Year strings become timestamps.
                        .set_levels(pd.to_datetime(lak_emi_energy.index.levels[1], format='%Y'), level=1)
                        # The former column names become a categorical level.
                        .set_levels(lak_emi_energy.index.levels[2].astype(pd.CategoricalDtype()), level=2)
                        .set_names(['Area', 'Date', 'Category'])
                       )

# Reassign instead of sorting in place so that the cell can be re-run safely.
lak_emi_energy = (lak_emi_energy
                  .reorder_levels(['Area', 'Category', 'Date'])
                  .sort_index())
lak_emi_energy.name = 'Emissions'

# Attach the unit (kilo metric tons) through the pint_pandas extension dtype.
lak_emi_energy = lak_emi_energy.astype('pint[kilometric_ton]')

In [101]:
lak_emi_energy

Area               Category                Date      
Baden-Württemberg  Abfälle (nicht biogen)  1990-01-01     595.6759999999999
                                           1991-01-01               533.609
                                           1992-01-01              582.9358
                                           1993-01-01              582.8823
                                           1994-01-01                513.03
                                                                ...        
Thüringen          Steinkohle              2013-01-01    60.221564691110004
                                           2014-01-01      60.3047505431081
                                           2015-01-01     55.85147852397861
                                           2016-01-01    51.744334669214496
                                           2017-01-01    55.785371808505296
Name: Emissions, Length: 2713, dtype: pint[kilometric_ton]

### Read in CO2 mole fraction time series (`cmip_conc_co2`)

In [58]:
!tail -n4 $p/mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv

735857,2014,9,15,15-Sep-2014 00:00:00,395.026810069061,394.318660838738,395.734959299383
735887,2014,10,15,15-Oct-2014 00:00:00,396.668759675911,397.456652804706,395.880866547116
735918,2014,11,15,15-Nov-2014 00:00:00,398.189074185698,400.321219754153,396.056928617242
735948,2014,12,15,15-Dec-2014 00:00:00,399.179691756507,402.195087926915,396.164295586098


In [108]:
# Load the CMIP CO2 mole-fraction time series and reshape it into a long
# Series named 'CO2 concentration', indexed by Area and Date.

# The builtin `float` is used instead of the `np.float` alias, which was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
dtypes = {
    'data_mean_global': float,
    'data_mean_nh': float,
    'data_mean_sh': float,
}
cmip_conc_co2 = pd.read_csv(
    p / 'mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv',
    # Apply the declared dtypes, as the other loading cells do.
    dtype=dtypes,
    # Pass a real list rather than a dict_keys view.
    usecols=list(dtypes),
)

# Timestamps cannot be used for the index: pandas' nanosecond-resolution
# datetimes only reach back to the late 17th century, while this series starts
# in year 0. A monthly PeriodIndex covers the full range instead.
# NOTE(review): the index is assigned by position, which assumes the file holds
# exactly one row per month from 0-01 to 2014-12 in order — confirm.
cmip_conc_co2.index = pd.period_range(
    start=pd.Period(year=0, month=1, freq='M'),
    end=pd.Period(year=2014, month=12, freq='M'),
    name='Date')

# Move the three data columns into a second (innermost) index level.
cmip_conc_co2 = cmip_conc_co2.stack()

cmip_conc_co2.index = (cmip_conc_co2.index
                       # Strip the 'data_mean_' prefix, leaving 'global', 'nh' and 'sh'.
                       .set_levels(cmip_conc_co2.index.levels[1].str[len('data_mean_'):].astype(pd.CategoricalDtype(ordered=True)), level=1)
                       .set_names('Area', level=1)
                      )

# Reassign instead of sorting in place so that the cell can be re-run safely.
cmip_conc_co2 = (cmip_conc_co2
                 .reorder_levels(['Area', 'Date'])
                 .sort_index())

cmip_conc_co2.name = 'CO2 concentration'

cmip_conc_co2 = cmip_conc_co2.astype('pint[dimensionless]')
# Scale the file values (around 395 in the last rows) by 1e-6 so that they are
# stored as a plain fraction.
# NOTE(review): this presumes the file is in ppm — confirm.
cmip_conc_co2 = cmip_conc_co2 * 1e-6

In [112]:
cmip_conc_co2

Area    Date   
global  0-01       0.00027836643883702005
        0-02         0.000278713058010423
        0-03         0.000279013835279965
        0-04       0.00027920961334856496
        0-05         0.000279106444207592
                            ...          
sh      2014-08    0.00039577037859628094
        2014-09      0.000395734959299383
        2014-10      0.000395880866547116
        2014-11    0.00039605692861724196
        2014-12    0.00039616429558609803
Name: CO2 concentration, Length: 72540, dtype: pint[dimensionless]