# FAOstat: Crops and livestock products
[_Source data_](https://www.fao.org/faostat/en/#data)

## Parameters

In [20]:
# Output directory in which the dataset is created (it is passed to
# `Dataset.create_empty` in the "Make a dataset" section).
# NOTE(review): presumably overridden by whatever executes this notebook — confirm.
dest_dir = "/tmp/faostat_qcl"

## Imports

In [95]:
import zipfile
import tempfile
import shutil
from pathlib import Path
import os

import requests
import pandas as pd

from owid.catalog import Dataset, Table, frames
from owid.walden import Catalog
from etl.paths import DATA_DIR
from etl.steps.data.converters import convert_walden_metadata

## Fetch walden dataset

In [22]:
# Look up the single Walden catalog entry matching these three keys.
walden_catalog = Catalog()
walden_ds = walden_catalog.find_one("faostat", "2021-03-18", "faostat_QCL")

In [23]:
# Show the catalog entry so its metadata can be inspected.
walden_ds

Dataset(namespace='faostat', short_name='faostat_QCL', name='Production: Crops and livestock products - FAO (2021)', description='Crop statistics are recorded for 173 products, covering the following categories: Crops Primary, Fibre Crops Primary, Cereals, Coarse Grain, Citrus Fruit, Fruit, Jute Jute-like Fibres, Oilcakes Equivalent, Oil crops Primary, Pulses, Roots and Tubers, Treenuts and Vegetables and Melons. Data are expressed in terms of area harvested, production quantity and yield. The objective is to comprehensively cover production of all primary crops for all countries and regions in the world.Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. Area data relate to harvested area. Some countries report sown or cultivated area only', source_name='Food and Agriculture Organization of the United Nations', url='http://www.fa

## Make a dataset

In [77]:
# Create an empty dataset at `dest_dir`.
ds = Dataset.create_empty(dest_dir)

# Carry the Walden entry's metadata over to the new dataset, then save it.
converted_metadata = convert_walden_metadata(walden_ds)
ds.metadata = converted_metadata
ds.save()

In [78]:
# Show the dataset to check that the converted metadata was attached.
ds

Dataset(path='/tmp/faostat_qcl', metadata=DatasetMeta(namespace='faostat', short_name='faostat_QCL', title='Production: Crops and livestock products - FAO (2021)', description='Crop statistics are recorded for 173 products, covering the following categories: Crops Primary, Fibre Crops Primary, Cereals, Coarse Grain, Citrus Fruit, Fruit, Jute Jute-like Fibres, Oilcakes Equivalent, Oil crops Primary, Pulses, Roots and Tubers, Treenuts and Vegetables and Melons. Data are expressed in terms of area harvested, production quantity and yield. The objective is to comprehensively cover production of all primary crops for all countries and regions in the world.Cereals: Area and production data on cereals relate to crops harvested for dry grain only. Cereal crops harvested for hay or harvested green for food, feed or silage or used for grazing are therefore excluded. Area data relate to harvested area. Some countries report sown or cultivated area only', sources=[Source(name='Food and Agriculture

## Unzip to temp directory

In [24]:
# Scratch directory that receives the extracted archive. mkdtemp() does not
# clean up after itself; the directory is deleted explicitly in the Cleanup
# section at the end of the notebook.
tmp_dir = tempfile.mkdtemp()

In [25]:
# Extract the Walden archive into the scratch directory. The context manager
# closes the zip file handle as soon as extraction finishes (the handle was
# previously left open for the lifetime of the kernel).
with zipfile.ZipFile(walden_ds.local_path) as z:
    z.extractall(tmp_dir)

# Select the extracted file whose name contains "(Normalized)". The one-element
# unpacking raises ValueError unless exactly one file matches.
(filename,) = [name for name in os.listdir(tmp_dir) if "(Normalized)" in name]
filename

'Production_Crops_Livestock_E_All_Data_(Normalized).csv'

## Metadata
_TODO: move this metadata download into Walden._

In [119]:
# Download the QCL item-group definitions from the FAOSTAT API.
# The timeout keeps the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() surfaces HTTP errors here instead of as a
# confusing JSON-decoding or KeyError failure in a later cell.
response = requests.get(
    "https://fenixservices.fao.org/faostat/api/v1/en/definitions/domain/QCL/itemsgroup?output_type=objects",
    timeout=60,
)
response.raise_for_status()
metadata = response.json()

In [80]:
# Turn the "data" payload of the API response into a DataFrame.
item_group_records = metadata["data"]
df_meta = pd.DataFrame.from_dict(item_group_records)

In [81]:
# Index the item-group table by (group code, item code). verify_integrity makes
# pandas raise if that pair is not unique.
item_group_key = ["Item Group Code", "Item Code"]
df_meta.set_index(item_group_key, inplace=True, verify_integrity=True)

In [82]:
# Wrap the item-group frame in a Table named "item_groups" and add it to the
# dataset.
item_groups_table = Table(df_meta)
item_groups_table.metadata.short_name = "item_groups"
ds.add(item_groups_table)

## Load dataset
### Data file

In [97]:
# Read the extracted "(Normalized)" CSV, decoding it as latin-1.
csv_path = Path(tmp_dir) / filename
df = pd.read_csv(csv_path, encoding="latin-1")

In [98]:
# Preview the first rows of the raw data.
df.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1975,1975,ha,0.0,F
1,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1976,1976,ha,5900.0,F
2,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1977,1977,ha,6000.0,F
3,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1978,1978,ha,6000.0,F
4,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1979,1979,ha,6000.0,F


## Clean dataset

In [99]:
# Drop the "Year Code" column, which is not needed downstream
# (in the preview above it repeats the value of "Year").
df.drop("Year Code", axis="columns", inplace=True)

In [100]:
# Preview again to confirm that "Year Code" is gone.
df.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Year,Unit,Value,Flag
0,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1975,ha,0.0,F
1,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1976,ha,5900.0,F
2,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1977,ha,6000.0,F
3,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1978,ha,6000.0,F
4,2,Afghanistan,221,"Almonds, with shell",5312,Area harvested,1979,ha,6000.0,F


In [101]:
# Report, per column, whether it contains any missing value.
df.isna().any()

Area Code       False
Area            False
Item Code       False
Item            False
Element Code    False
Element         False
Year            False
Unit            False
Value            True
Flag             True
dtype: bool

### Sanity checks

In [107]:
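# NOTE: disabled sanity check, kept for reference only.
# NOTE(review): the condition looks inverted — as written it raises when the two
# `nunique() != 1` tests are NOT both true; confirm the intent before re-enabling.
# def check_unique_pairs(df, name_1, name_2):
#     if not (
#         (df.groupby(name_1)[name_2].nunique() != 1).any()
#         and (df.groupby(name_2)[name_1].nunique() != 1).any()
#     ):
#         raise ValueError(
#             f"Some `{name_1}` may have multiple `{name_2}` values (or opposite)."
#         )


# fields = ["Item", "Element"]
# for field in fields:
#     check_unique_pairs(df, field, f"{field} Code")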

In [108]:
# Each (Element, Unit) pair must map to exactly one Element Code.
element_codes_per_pair = df.groupby(["Element", "Unit"])["Element Code"].nunique()
has_ambiguous_pair = (element_codes_per_pair > 1).any()
if has_ambiguous_pair:
    raise ValueError(f"Element-Unit not unique!")

In [109]:
# Index the observations by their identifying columns. verify_integrity makes
# pandas raise if any combination of these keys occurs more than once.
# Note: the null check above reports missing values in "Flag", so some index
# entries carry a missing Flag label.
observation_key = ["Area Code", "Item Code", "Element Code", "Year", "Flag"]
df.set_index(observation_key, inplace=True, verify_integrity=True)

## Add tables

In [110]:
# Wrap the indexed observations in a Table named "bulk" and add it to the
# dataset.
bulk_table = Table(df)
bulk_table.metadata.short_name = "bulk"
ds.add(bulk_table)

## Cleanup

In [111]:
# Delete the scratch directory created with tempfile.mkdtemp() together with the
# extracted files. This only runs if every cell above succeeded; after a failed
# run the directory has to be removed manually.
shutil.rmtree(tmp_dir)