In [5]:
import matplotlib
%matplotlib widget
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import frictionless as fl

frictionless workflow, with tidy tabular data packages

# representation
* load all example data sets into the data structure at once
* measure memory use
* export the data sets to CSV
* reimport from CSV
* export the data sets to some kind of native storage format
* reimport from the native storage format

## Read in the data

In [3]:
# List the files in the example data set directory (shell command via IPython `!`).
!ls ../../primap-requirements/example_data_sets/

Emissions_Agriculture_Agriculture_total_E_All_Data.csv
lak-download.csv
mole_fraction_of_carbon_dioxide_in_air_input4MIPs_GHGConcentrations_CMIP_UoM-CMIP-1-1-0_gr3-GMNHSH_000001-201412.csv
PRIMAPDB.mat
PRIMAP-hist_v2.1_09-Nov-2019.csv


In [6]:
# Directory holding the example data sets, relative to the notebook's working directory.
p = pathlib.Path('../../primap-requirements/example_data_sets/')

### Read in the FAO agriculture total emissions data set (`fao_emi_total`)

In [21]:
# Create the source directory for the fao_emi_total data set (no error if it
# already exists, thanks to -p) and copy the raw FAO CSV into it.
# `$p` is expanded by IPython to the value of the Python variable `p`.
!mkdir -p fl/fao_emi_total/source
!cp $p/Emissions_Agriculture_Agriculture_total_E_All_Data.csv fl/fao_emi_total/source/

In [28]:
import os
import contextlib
from pathlib import Path

@contextlib.contextmanager
def cd(path):
    """Temporarily switch the process working directory to ``path``.

    Intended for use in a ``with`` statement. When the block is left --
    either normally or because an exception propagated out of it -- the
    working directory that was active on entry is restored.
    """
    # Remember where we started before switching, so it can be restored.
    origin = Path.cwd()
    os.chdir(path)
    try:
        yield
    finally:
        # Executed on normal exit and on exceptions raised in the with-body.
        os.chdir(origin)


In [33]:
# Describe the FAO CSV from inside its source directory, referring to it by
# bare file name, then name the resulting resource and save its descriptor
# as YAML next to the data file.
with cd('fl/fao_emi_total/source/'):
    # The encoding is passed explicitly as latin_1.
    resource = fl.describe('Emissions_Agriculture_Agriculture_total_E_All_Data.csv', encoding='latin_1')
    resource['name'] = 'fao_emi_total'
    resource.to_yaml('fao_emi_total.resource.yaml')

In [35]:
# Validate the fao_emi_total resource descriptor YAML and display only the
# 'valid' entry of the returned report.
fl.validate('fl/fao_emi_total/source/fao_emi_total.resource.yaml')['valid']

True

In [40]:
# Build a cleaned copy of the FAO agriculture-emissions table with a
# frictionless transform pipeline: load the latin_1-encoded source CSV,
# delete the fields selected by the pattern below, and dump the result as a
# data package into fl/fao_emi_total/cleaned/.
fl.transform(
    {
        'type': 'package',
        'steps': [
            {
                'type': 'load',
                'spec': {
                    'loadSource': 'fl/fao_emi_total/source/Emissions_Agriculture_Agriculture_total_E_All_Data.csv',
                    'encoding': 'latin_1',
                },
            },
            # Raw string: "\d" is not a valid escape in a normal Python string
            # literal and triggers an invalid-escape warning. The runtime value
            # of the pattern is unchanged.
            # NOTE(review): presumably delete_fields treats each entry as a
            # regular expression and this matches the per-year flag/note
            # columns (e.g. Y1961F) -- confirm against the CSV header.
            {'type': 'delete_fields', 'spec': {'fields': [r'Y\d{4}[FN]']}},
            {'type': 'dump_to_path', 'spec': {'outPath': 'fl/fao_emi_total/cleaned/'}},
        ],
    }
)

In [41]:
# List the contents of the cleaned/ output directory (IPython automagic `ls`).
ls fl/fao_emi_total/cleaned/

datapackage.json  Emissions_Agriculture_Agriculture_total_E_All_Data.csv


**All of this feels incredibly awkward and is poorly documented; according to the docs, it is also still in development.**

**Let's instead look at using data packages for I/O.**

In [43]:
from tableschema_pandas import Storage

In [44]:
# Create a tableschema_pandas Storage instance with default arguments.
s = Storage()

In [46]:
# Show the signature and docstring of Storage.create using IPython's `?` help.
s.create?

Signature: s.create(bucket, descriptor, force=False)
Docstring:
https://github.com/frictionlessdata/tableschema-pandas-py#storage
        
File:      ~/.local/lib/python3.8/site-packages/tableschema_pandas/storage.py
Type:      method


In [47]:
# Okay, I give up on this use case as well. The documentation is byzantine.