In [14]:
# Directory where the output dataset is created by `Dataset.create_empty` further down.
# NOTE(review): hard-coded under /tmp; `DATA_DIR` is imported below but never used —
# confirm whether the dataset should live under the project data directory instead.
dest_dir = "/tmp/faostat_qcl"

In [21]:
import zipfile
import tempfile
import shutil
from pathlib import Path
import os

import pandas as pd

from owid.catalog import Dataset, Table
from owid.walden import Catalog
from etl.paths import DATA_DIR
from etl.steps.data.converters import convert_walden_metadata

In [16]:
# Look up a single Walden catalog entry using the three identifiers
# "faostat" / "2021-03-18" / "faostat_QCL".
# presumably these are (namespace, version, short_name) — TODO confirm against owid.walden.
walden_ds = Catalog().find_one("faostat", "2021-03-18", "faostat_QCL")

In [17]:
# Show the local filesystem path of the archive backing this Walden entry.
walden_ds.local_path

'/Users/lucasrodes/.owid/walden/faostat/2021-03-18/faostat_QCL.zip'

In [18]:
# Display the Walden entry itself to inspect its metadata fields.
walden_ds

Dataset(namespace='faostat', short_name='faostat_QCL', name='Production: Crops and livestock products - FAO (2021)', description='Crop statistics are recorded for 173 products, covering the following categories: Crops Primary, Fibre Crops Primary, Cereals, Coarse Grain, Citrus Fruit, Fruit, Jute Jute-like Fibres, Oilcakes Equivalent, Oil crops Primary, Pulses, Roots and Tubers, Treenuts and Vegetables and Melons. Data are expressed in terms of area harvested, production quantity and yield. The objective is to comprehensively cover production of all primary crops for all countries and regions in the world.Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. Area data relate to harvested area. Some countries report sown or cultivated area only', source_name='Food and Agriculture Organization of the United Nations', url='http://www.fa

In [None]:
# NOTE(review): this repeats the earlier `walden_ds.local_path` inspection and has no
# execution count; candidate for removal.
walden_ds.local_path

In [19]:
# Scratch directory that receives the extracted archive contents.
# `mkdtemp` never deletes the directory on its own; it is removed explicitly by the
# final `shutil.rmtree(tmp_dir)` cell.
tmp_dir = tempfile.mkdtemp()

In [27]:
# Unpack the Walden archive into the scratch directory. The context manager
# guarantees the zip handle is closed even if extraction fails (the previous
# version left the file handle open for the lifetime of the kernel).
with zipfile.ZipFile(walden_ds.local_path) as z:
    z.extractall(tmp_dir)

# List the directory once and reuse the result for both the printout and the lookup.
extracted_files = os.listdir(tmp_dir)
print(extracted_files)

# Exactly one extracted file is expected to carry the "(Normalized)" marker; the
# single-target unpacking raises ValueError if there are zero or several matches.
(filename,) = [name for name in extracted_files if "(Normalized)" in name]
filename

['Production_Crops_Livestock_E_All_Data_(Normalized).csv', 'Production_Crops_Livestock_E_Flags.csv']


'Production_Crops_Livestock_E_All_Data_(Normalized).csv'

In [38]:
# Load the extracted CSV; the file is decoded as latin-1.
df = pd.read_csv(Path(tmp_dir) / filename, encoding="latin-1")

In [39]:
# Preview the first rows of the raw table as loaded from the CSV.
df.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1975,1975,ha,0.0,F
1,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1976,1976,ha,5900.0,F
2,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1977,1977,ha,6000.0,F
3,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1978,1978,ha,6000.0,F
4,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1979,1979,ha,6000.0,F


In [40]:
# Drop the `Area Code` and `Year Code` columns (in the preview above they sit next to
# the `Area` and `Year` columns, which are kept).
# Reassigning instead of mutating in place, together with errors="ignore", makes this
# cell safe to re-run: the in-place version raised KeyError on a second execution
# because the columns were already gone.
df = df.drop(columns=["Area Code", "Year Code"], errors="ignore")

In [41]:
# Preview again to confirm the two code columns are gone.
df.head()

Unnamed: 0,Area,Item Code,Item,Element Code,Element,Year,Unit,Value,Flag
0,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1975,ha,0.0,F
1,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1976,ha,5900.0,F
2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1977,ha,6000.0,F
3,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1978,ha,6000.0,F
4,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1979,ha,6000.0,F


In [43]:
# Report, per column, whether it contains at least one missing value.
df.isnull().any()

Area            False
Item Code       False
Item            False
Element Code    False
Element         False
Year            False
Unit            False
Value            True
Flag             True
dtype: bool

In [None]:
# Inspect the column dtypes inferred by `read_csv`.
df.dtypes

In [45]:
# Label-like columns are stored as pandas categoricals.
column_cols = ["Area", "Item", "Element", "Unit", "Flag"]
df = df.astype({col: "category" for col in column_cols})

In [50]:
def check_unique_pairs(df, name_1, name_2):
    """Validate that two columns of `df` are in one-to-one correspondence.

    Every distinct value of `name_1` must be paired with at most one distinct
    value of `name_2`, and vice versa. Missing values are not counted as a
    distinct value (pandas `nunique` default).

    The previous implementation tested the truthiness of `(... == 1).sum()`,
    which only failed when *no* group at all was unique, so a single violating
    pair went undetected as long as any other pair was fine. It also raised on
    an empty frame, which trivially satisfies the constraint.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both columns.
    name_1, name_2 : str
        Names of the two columns to compare.

    Raises
    ------
    ValueError
        If any value in one column maps to more than one value in the other.
    """
    for key_col, value_col in ((name_1, name_2), (name_2, name_1)):
        # observed=True keeps unused categories of a categorical key out of the result.
        distinct_per_key = df.groupby(key_col, observed=True)[value_col].nunique()
        if (distinct_per_key > 1).any():
            raise ValueError(
                f"Some `{name_1}` may have multiple `{name_2}` values (or opposite)."
            )

In [51]:
# Each label column is validated against its companion "<label> Code" column.
fields = ["Item", "Element"]
for field in fields:
    check_unique_pairs(df, name_1=field, name_2=f"{field} Code")

In [57]:
# Preview the table once more before the index is set.
df.head()

Unnamed: 0,Area,Item Code,Item,Element Code,Element,Year,Unit,Value,Flag
0,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1975,ha,0.0,F
1,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1976,ha,5900.0,F
2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1977,ha,6000.0,F
3,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1978,ha,6000.0,F
4,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1979,ha,6000.0,F


In [58]:
# Move the five dimension columns into the index, leaving the remaining columns as data.
df = df.set_index(["Area", "Item", "Element", "Year", "Unit"])

In [61]:
# Create an empty dataset at `dest_dir` and attach metadata converted from the Walden entry.
ds = Dataset.create_empty(dest_dir)
ds.metadata = convert_walden_metadata(walden_ds)
# presumably persists the dataset metadata under `dest_dir` — TODO confirm against owid.catalog
ds.save()

In [62]:
# Display the dataset to check its path and the converted metadata.
ds

Dataset(path='/tmp/faostat_qcl', metadata=DatasetMeta(namespace='faostat', short_name='faostat_QCL', title='Production: Crops and livestock products - FAO (2021)', description='Crop statistics are recorded for 173 products, covering the following categories: Crops Primary, Fibre Crops Primary, Cereals, Coarse Grain, Citrus Fruit, Fruit, Jute Jute-like Fibres, Oilcakes Equivalent, Oil crops Primary, Pulses, Roots and Tubers, Treenuts and Vegetables and Melons. Data are expressed in terms of area harvested, production quantity and yield. The objective is to comprehensively cover production of all primary crops for all countries and regions in the world.Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. Area data relate to harvested area. Some countries report sown or cultivated area only', sources=[Source(name='Food and Agriculture

In [64]:
# Wrap the indexed frame in a catalog Table and add it to the dataset.
t = Table(df)
# NOTE(review): the table short name is lower-case "faostat_qcl", while the dataset
# short name displayed above is "faostat_QCL" — confirm the difference is intended.
t.metadata.short_name = "faostat_qcl"
ds.add(t)

In [None]:
# Remove the scratch directory created with `tempfile.mkdtemp()` and everything extracted into it.
# NOTE(review): only runs if this last cell is reached; an earlier failure leaves the directory behind.
shutil.rmtree(tmp_dir)