# FAOstat: Crops and livestock products
[_Source data_](https://www.fao.org/faostat/en/#data)

## Parameters

In [1]:
dest_dir = "/tmp/faostat_qcl"

## Read data and reference tables

In [2]:
import json
import pandas as pd
from pathlib import Path

from owid import catalog
from etl.paths import BASE_DIR, DATA_DIR

In [3]:
COUNTRY_MAPPING = (
    BASE_DIR / "etl/steps/data/garden/faostat/2021-03-18/faostat_qcl.country_std.json"
)

## Load meadow dataset

In [4]:
qcl_meadow = catalog.Dataset(DATA_DIR / "meadow/faostat/2021-03-18/faostat_qcl")

In [5]:
metadata = catalog.Dataset(DATA_DIR / "meadow/faostat/2022-02-10/faostat_metadata")

In [6]:
# Bulk data and items metadata
qcl_bulk = qcl_meadow["bulk"]

In [7]:
qcl_bulk.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,unit,value
area_code,item_code,element_code,year,flag,Unnamed: 5_level_1,Unnamed: 6_level_1
2,221,5312,1975,F,ha,0
2,221,5312,1976,F,ha,5900
2,221,5312,1977,F,ha,6000
2,221,5312,1978,F,ha,6000
2,221,5312,1979,F,ha,6000


## Clean dataset

### `Area`
Filtering and mapping

In [8]:
# Prepare for Country Tool
# ds = qcl_area.Country.drop_duplicates()
# ds.to_csv("ign.countries.csv", index=False)

In [9]:
qcl_area = metadata["meta_qcl_area"]

In [10]:
# Load
with open(COUNTRY_MAPPING) as f:
    country_mapping = json.load(f)

In [12]:
# Check which countries will be discarded based on our country standardisation file (those without a mapped standardised name)
msk = qcl_area.country.isin(country_mapping)
print(qcl_area.loc[-msk, "country"].tolist())

['Belgium-Luxembourg', 'Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa', 'Americas', 'Northern America', 'Central America', 'Caribbean', 'South America', 'Central Asia', 'Eastern Asia', 'Southern Asia', 'South-eastern Asia', 'Western Asia', 'Eastern Europe', 'Northern Europe', 'Southern Europe', 'Western Europe', 'Australia and New Zealand', 'Micronesia', 'European Union (27)', 'Least Developed Countries', 'Land Locked Developing Countries', 'Small Island Developing States', 'Low Income Food Deficit Countries', 'Net Food Importing Developing Countries']


Finally, we build the `Area Code ---> Country` mapping dictionary.

In [13]:
area_codes_discard = [140, 259, 260]
if set(qcl_bulk.index.levels[0]).intersection(area_codes_discard):
    raise ValueError(
        "There are some changes in the bulk data! Codes that are being discarded might probably be needed"
    )
# Discard
qcl_area = qcl_area.loc[~qcl_area.index.isin(area_codes_discard)]

In [14]:
map_area = qcl_area.loc[msk, "country"].replace(country_mapping).sort_index().to_dict()

### `Item`

In [15]:
qcl_item = metadata["meta_qcl_item"]

In [16]:
# Find Item Groups with more than one Code (legacy?)
x = qcl_item.reset_index()
_ = x.groupby(["item_group"]).agg(
    {"item_group_code": [lambda x: x.nunique(), lambda x: x.unique().tolist()]}
)
__ = _["item_group_code"]["<lambda_0>"]
_[__ > 1]

Unnamed: 0_level_0,item_group_code,item_group_code
Unnamed: 0_level_1,<lambda_0>,<lambda_1>
item_group,Unnamed: 1_level_2,Unnamed: 2_level_2
Crops Primary,2,"[QC, 1714]"
Live Animals,2,"[QA, 1756]"


In [17]:
# Check if there are codes in bulk that *only* have a group code associated that is to be delete (sanity check before deletion)
codes_present = (
    qcl_bulk.index.get_level_values("item_code").unique().astype(str).tolist()
)
msk = x["item_code"].astype(str).isin(codes_present)
y = x[msk]
yy = y.groupby("item_code")["item_group_code"].agg(set)
l = yy[yy == {"QC"}].index.tolist()  # Change to see other groups with unique childs
x[x["item_code"].isin(l)].head()

Unnamed: 0,item_group_code,item_code,cpc_code,factor,hs_code,hs07_code,hs12_code,item,item_group
1,QC,1753,F1753,1.0,,,,Fibre Crops Primary,Crops Primary
101,QC,1717,F1717,1.0,,,,"Cereals, Total",Crops Primary
118,QC,1804,F1804,1.0,,,,"Citrus Fruit, Total",Crops Primary
127,QC,813,01929.08,1.0,,530500.0,530500.0,Coir,Crops Primary
148,QC,1738,F1738,1.0,,,,Fruit Primary,Crops Primary


In [18]:
qcl_item = qcl_item[["item_group", "item"]]

### `Element`

In [19]:
qcl_element = metadata["meta_qcl_element"]
qcl_unit = metadata["meta_qcl_unit"]

In [20]:
qcl_element_unit = qcl_element.merge(
    qcl_unit.rename(columns={"description": "unit_description"}),
    left_on="unit",
    right_index=True,
)
assert qcl_element_unit.shape[0] == qcl_element.shape[0]

### Bulk

In [21]:
# Filter countries + Area Code -> Country
qcl_bulk = qcl_bulk.loc[map_area].rename(index=map_area, level=0)
name_map = {"area_code": "country"}
qcl_bulk.index.names = [name_map.get(n, n) for n in qcl_bulk.index.names]

  qcl_bulk = qcl_bulk.loc[map_area].rename(index=map_area, level=0)


In [22]:
# Drop Unit
qcl_bulk = qcl_bulk.drop(columns=["unit"])

#### Variable name

In [23]:
qcl_bulk.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,value
country,item_code,element_code,year,flag,Unnamed: 5_level_1
Armenia,221,5312,1992,M,
Armenia,221,5312,1993,M,
Armenia,221,5312,1994,M,
Armenia,221,5312,1995,M,
Armenia,221,5312,1996,M,


In [24]:
qcl_item.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_group,item
item_group_code,item_code,Unnamed: 2_level_1,Unnamed: 3_level_1
QC,1714,Crops Primary,Crops Primary
QC,1753,Crops Primary,Fibre Crops Primary
QC,1730,Crops Primary,Oilcrops Primary
QA,1756,Live Animals,Live Animals
QL,1777,Livestock primary,"Hides and skins, primary"


In [25]:
# Get Item names
x = qcl_item.reset_index()
a = (
    x[["item_group_code", "item_group"]]
    .drop_duplicates()
    .rename(columns={"item_group_code": "code", "item_group": "name"})
)
b = (
    x[["item_code", "item"]]
    .drop_duplicates()
    .rename(columns={"item_code": "code", "item": "name"})
)
c = pd.concat([a, b])
map_items = dict(zip(c.code, c.name))

In [26]:
# manually add some missing names to the map that were removed from the API

missing = {
    1067: "Eggs, hen, in shell (number)",
    1092: "Eggs, other bird, in shell (number)",
    1731: "Oilcrops",
}

for k in missing:
    assert k not in map_items
    map_items[k] = missing[k]

In [27]:
item_names = [map_items[it] for it in qcl_bulk.index.get_level_values(1)]

In [28]:
# Get Element + Unit names
x = qcl_element_unit.reset_index()
y = list(x["element"].astype(str) + " (" + x["unit"].astype(str) + ")")
map_elems = dict(zip(x["element_code"], y))
elem_names = [map_elems[el] for el in qcl_bulk.index.get_level_values(2)]

In [29]:
# Construct variable name
variable_names = [f"{i} - {e}" for i, e in zip(item_names, elem_names)]

In [30]:
# Add variable name to index
qcl_bulk["variable_name"] = variable_names
qcl_bulk = qcl_bulk.reset_index()
qcl_bulk = qcl_bulk.set_index(
    ["country", "item_code", "element_code", "variable_name", "year", "flag"]
)

## Create Garden dataset

In [31]:
qcl_garden = catalog.Dataset.create_empty(dest_dir)

In [32]:
# Propagate metadata
qcl_garden.metadata = qcl_meadow.metadata
qcl_garden.save()

In [33]:
# Add bulk table
qcl_garden.add(qcl_bulk)
# Add table items
qcl_garden.add(qcl_item)
# Add table elements
qcl_element_unit.metadata = qcl_element.metadata
qcl_garden.add(qcl_element_unit)

In [None]:
qcl_garden.save()