# FAOSTAT: Crops and livestock products
[_Source data_](https://www.fao.org/faostat/en/#data)

## Parameters

In [None]:
# Notebook parameter: directory in which the garden dataset built by this notebook is created.
dest_dir = "/tmp/faostat_qcl"

## Read data and reference tables

In [None]:
import json
import pandas as pd
from pathlib import Path

from owid import catalog
from etl.paths import BASE_DIR, DATA_DIR

In [None]:
# Path to the JSON country-standardisation file of this garden step.
garden_step_dir = BASE_DIR / "etl/steps/data/garden/faostat/2021-03-18"
COUNTRY_MAPPING = garden_step_dir / "faostat_qcl.country_std.json"

## Load meadow dataset

In [None]:
# Open the meadow-level FAOSTAT QCL dataset from the local data directory.
qcl_meadow_path = DATA_DIR / "meadow/faostat/2021-03-18/faostat_qcl"
qcl_meadow = catalog.Dataset(qcl_meadow_path)

In [None]:
# Open the meadow-level FAOSTAT metadata dataset (holds the meta_qcl_* reference tables).
# NOTE(review): this is version 2022-02-10 while the QCL data is read from 2021-03-18 —
# confirm the version mismatch is intended.
metadata_path = DATA_DIR / "meadow/faostat/2022-02-10/faostat_metadata"
metadata = catalog.Dataset(metadata_path)

In [None]:
# Bulk data table of the meadow dataset.
# The item/element/area reference tables are read separately from `metadata`.
qcl_bulk = qcl_meadow["bulk"]

In [None]:
# Preview the first rows of the raw bulk table.
qcl_bulk.head()

## Clean dataset

### `Area`
Filtering and mapping

In [None]:
# Prepare for Country Tool
# Optional helper, kept commented out: exports the distinct country names to a CSV file.
# NOTE(review): `qcl_area` is only assigned in a later cell, and the rest of the notebook
# reads its column as lowercase `country` — adjust `.Country` before uncommenting.
# ds = qcl_area.Country.drop_duplicates()
# ds.to_csv("ign.countries.csv", index=False)

In [None]:
# Area reference table: indexed by area code, with the source country name in `country`
# (as used by the cells that follow).
qcl_area = metadata["meta_qcl_area"]

In [None]:
# Load the country standardisation mapping (source country name -> standardised name).
# JSON files are UTF-8 by specification; read them as such instead of relying on the
# platform's locale encoding, which can mangle country names containing accents.
with open(COUNTRY_MAPPING, encoding="utf-8") as f:
    country_mapping = json.load(f)

In [None]:
# Check which countries will be discarded based on our country standardisation file
# (those without a mapped standardised name).
# `msk` is True for rows whose country name is a key of the mapping.
msk = qcl_area["country"].isin(country_mapping)
# Boolean masks are negated with `~`; unary minus is not the documented operator for this.
print(qcl_area.loc[~msk, "country"].tolist())

Finally, we discard the area codes we do not need and build the `Area Code ---> Country` mapping dictionary.

In [None]:
# Area codes that are dropped on purpose. Abort if any of them shows up among the
# level-0 values of the bulk index, since that would mean they may be needed after all.
area_codes_discard = [140, 259, 260]
bulk_area_codes = set(qcl_bulk.index.levels[0])
if bulk_area_codes & set(area_codes_discard):
    raise ValueError("There are some changes in the bulk data! Codes that are being discarded might probably be needed")
# Keep only the area rows whose code is not in the discard list
rows_to_keep = ~qcl_area.index.isin(area_codes_discard)
qcl_area = qcl_area.loc[rows_to_keep]

In [None]:
# Build the {area code: standardised country name} dictionary.
# The mask is computed here, on the current `qcl_area`, rather than taken from a global
# computed in an earlier cell: that global predates the row filtering and its name is
# rebound elsewhere in the notebook, so re-running cells out of order would break this step.
has_std_name = qcl_area["country"].isin(country_mapping)
map_area = qcl_area.loc[has_std_name, "country"].replace(country_mapping).sort_index().to_dict()

### `Item`

In [None]:
# Item reference table (item groups and items) from the metadata dataset.
qcl_item = metadata["meta_qcl_item"]

In [None]:
# Find Item Groups with more than one Code (legacy?)
# `x` (the flattened item table) is reused by the sanity-check cell that follows.
x = qcl_item.reset_index()
# For every item group: number of distinct group codes and the list of those codes.
# Descriptive names are used instead of `_` / `__`, which IPython reserves for its
# output history (assigning to them stops IPython from updating them).
codes_per_group = x.groupby(["item_group"]).agg(
    {"item_group_code": [lambda s: s.nunique(), lambda s: s.unique().tolist()]}
)
# pandas labels the first lambda's result column "<lambda_0>"
n_codes = codes_per_group["item_group_code"]["<lambda_0>"]
codes_per_group[n_codes > 1]

In [None]:
# Sanity check before deletion: list the item codes present in the bulk data whose *only*
# associated group code is the one that is to be deleted.
codes_present = qcl_bulk.index.get_level_values("item_code").unique().astype(str).tolist()
# Local mask name on purpose: do not rebind `msk`, which holds the country mask
in_bulk = x["item_code"].astype(str).isin(codes_present)
# Set of group codes attached to every item code found in the bulk data
groups_per_item = x[in_bulk].groupby("item_code")["item_group_code"].agg(set)
only_in_group = groups_per_item[groups_per_item == {"QC"}].index.tolist()  # Change to see other groups with unique children
x[x["item_code"].isin(only_in_group)].head()

In [None]:
# Keep only the group-name and item-name columns.
# NOTE(review): the codes presumably live in the index, since a later cell recovers
# `item_group_code` / `item_code` via reset_index() — confirm.
qcl_item = qcl_item[["item_group", "item"]]

### `Element`

In [None]:
# Element and unit reference tables from the metadata dataset.
qcl_element = metadata["meta_qcl_element"]
qcl_unit = metadata["meta_qcl_unit"]

In [None]:
# Attach the unit details to every element: the element's `unit` column is matched
# against the index of the unit table, whose `description` column is renamed first.
unit_details = qcl_unit.rename(columns={"description": "unit_description"})
qcl_element_unit = qcl_element.merge(unit_details, left_on="unit", right_index=True)
# The merge must neither drop nor duplicate element rows
assert len(qcl_element_unit) == len(qcl_element)

### Bulk

In [None]:
# Filter countries + Area Code -> Country
# Select the rows whose level-0 index value (area code) has a standardised country name,
# then replace the code by that name. The keys are passed to `.loc` as a list because
# pandas no longer accepts a dict as an indexer (deprecated in 1.5, an error in 2.0).
qcl_bulk = qcl_bulk.loc[list(map_area)].rename(index=map_area, level=0)
# The first index level now holds country names, so rename it accordingly
name_map = {"area_code": "country"}
qcl_bulk.index.names = [name_map.get(n, n) for n in qcl_bulk.index.names]

In [None]:
# Drop Unit: the `unit` column is removed from the bulk table (the unit is embedded in the
# variable name built in the cells that follow).
qcl_bulk = qcl_bulk.drop(columns=["unit"])

#### Variable name

In [None]:
# Preview the bulk table after country filtering and dropping the unit column.
qcl_bulk.head()

In [None]:
# Preview the item reference table.
qcl_item.head()

In [None]:
# Get Item names: build a single {code: name} lookup covering both item groups and items.
x = qcl_item.reset_index()
# Distinct (group code, group name) pairs
a = (
    x[["item_group_code", "item_group"]]
    .drop_duplicates()
    .rename(columns={"item_group_code": "code", "item_group": "name"})
)
# Distinct (item code, item name) pairs
b = x[["item_code", "item"]].drop_duplicates().rename(columns={"item_code": "code", "item": "name"})
c = pd.concat([a, b])
# Bracket access instead of `c.code` / `c.name`: attribute-style column access silently
# resolves to an object attribute of the same name if one exists (e.g. on a DataFrame subclass).
map_items = dict(zip(c["code"], c["name"]))

In [None]:
# Names that were removed from the API are added to the item map by hand.
missing = {
    1067: "Eggs, hen, in shell (number)",
    1092: "Eggs, other bird, in shell (number)",
    1731: "Oilcrops",
}
for code, name in missing.items():
    # Each manual entry must be absent from the map; an existing key is an error
    assert code not in map_items
    map_items[code] = name

In [None]:
# Item name for every bulk row, looked up from the code stored in index level 1
bulk_item_codes = qcl_bulk.index.get_level_values(1)
item_names = [map_items[code] for code in bulk_item_codes]

In [None]:
# Get Element + Unit names: label every element code as "<element> (<unit>)"
elements = qcl_element_unit.reset_index()
element_labels = [
    f"{element} ({unit})"
    for element, unit in zip(elements["element"].astype(str), elements["unit"].astype(str))
]
map_elems = dict(zip(elements["element_code"], element_labels))
# Element label for every bulk row, looked up from the code stored in index level 2
bulk_element_codes = qcl_bulk.index.get_level_values(2)
elem_names = [map_elems[code] for code in bulk_element_codes]

In [None]:
# Construct variable name: "<item name> - <element label>" for every bulk row
variable_names = list(map("{} - {}".format, item_names, elem_names))

In [None]:
# Add variable name to index: store it as a column, then rebuild the index so that it
# sits between the element code and the year.
bulk_index_columns = ["country", "item_code", "element_code", "variable_name", "year", "flag"]
qcl_bulk["variable_name"] = variable_names
qcl_bulk = qcl_bulk.reset_index().set_index(bulk_index_columns)

## Create Garden dataset

In [None]:
# Create a new, empty dataset at `dest_dir`; the tables are added in the cells that follow.
qcl_garden = catalog.Dataset.create_empty(dest_dir)

In [None]:
# Propagate metadata
# The garden dataset reuses the meadow dataset's metadata object as-is (assigned by
# reference, not copied), and the dataset is then saved.
# NOTE(review): confirm that no metadata field needs to change for the garden level.
qcl_garden.metadata = qcl_meadow.metadata
qcl_garden.save()

In [None]:
# The merged element/unit table takes over the metadata of the original element table
qcl_element_unit.metadata = qcl_element.metadata
# Add the tables to the garden dataset: bulk data first, then items, then elements
for garden_table in (qcl_bulk, qcl_item, qcl_element_unit):
    qcl_garden.add(garden_table)

In [None]:
# Save the garden dataset again now that all tables have been added.
qcl_garden.save()