# FAOSTAT: Crops and livestock products
[_Source data_](https://www.fao.org/faostat/en/#data)

## Parameters

In [1]:
# Directory in which the garden dataset is created (passed to `catalog.Dataset.create_empty` further down)
dest_dir = "/tmp/faostat_qcl"

## Read data and reference tables

In [2]:
from owid import catalog
import pandas as pd
from pathlib import Path

In [3]:
from etl.paths import BASE_DIR as base_path

## Load meadow dataset

In [6]:
# Open the meadow FAOSTAT QCL dataset (version folder 2021-03-18) located under the ETL base directory
qcl_meadow = catalog.Dataset(
    (base_path / "data/meadow/faostat/2021-03-18/faostat_qcl").as_posix()
)

In [7]:
# Bulk data table; the metadata tables (area, item, element, unit) are loaded in later cells
qcl_bulk = qcl_meadow["bulk"]

In [8]:
# Preview the first rows of the bulk table
qcl_bulk.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unit,Value
Area Code,Item Code,Element Code,Year,Flag,Unnamed: 5_level_1,Unnamed: 6_level_1
2,221,5312,1975,F,ha,0
2,221,5312,1976,F,ha,5900
2,221,5312,1977,F,ha,6000
2,221,5312,1978,F,ha,6000
2,221,5312,1979,F,ha,6000


## Clean dataset

### `Area`
Filtering and mapping

In [9]:
# Prepare input for the Country Tool (export kept commented out).
# NOTE: `qcl_area` is only defined in a later cell, so that cell has to run
# before these lines can be uncommented and executed.
# ds = qcl_area.Country.drop_duplicates()
# ds.to_csv("ign.countries.csv", index=False)

In [10]:
# Load the country standardisation table and build the lookup
# "Country" -> "Our World In Data Name".
country_std_path = (
    base_path
    / "etl"
    / "steps"
    / "data"
    / "garden"
    / "faostat"
    / "2021-03-18"
    / "faostat_qcl.country_std.csv"
)
# Read by path with an explicit encoding so that decoding does not depend on the
# platform's locale (a bare `open()` uses the locale encoding, which can mangle
# country names containing non-ASCII characters).
# NOTE(review): assumes the CSV is stored as UTF-8 — confirm.
df_country_std = pd.read_csv(country_std_path, encoding="utf-8")
# Rows with a missing value in any column are discarded before building the lookup
df_country_std = df_country_std.dropna()
mapping = dict(zip(df_country_std["Country"], df_country_std["Our World In Data Name"]))

In [11]:
# Create mapping Area Code -> Country: the index of the area table mapped to the
# standardised country name, restricted to areas whose name is in `mapping`
qcl_area = qcl_meadow["meta_area"]
area_names = qcl_area["Country"]
has_std_name = area_names.isin(mapping)
map_area = area_names.replace(mapping).loc[has_std_name].sort_index().to_dict()

### `Item`

In [12]:
# Item metadata table
qcl_item = qcl_meadow["meta_item"]

In [13]:
# Find Item Groups that appear under more than one Item Group Code (legacy?)
# NOTE: `x` keeps its name because the sanity-check cell that follows reads it.
x = qcl_item.reset_index()
# Intermediate results get real names instead of `_` / `__`: those are IPython's
# output-history variables, and assigning to them stops IPython from updating them.
group_code_stats = x.groupby(["Item Group"]).agg(
    {
        "Item Group Code": [
            lambda codes: codes.nunique(),
            lambda codes: codes.unique().tolist(),
        ]
    }
)
# pandas labels the two anonymous aggregations "<lambda_0>" (number of distinct
# codes) and "<lambda_1>" (list of those codes)
n_codes_per_group = group_code_stats["Item Group Code"]["<lambda_0>"]
group_code_stats[n_codes_per_group > 1]

Unnamed: 0_level_0,Item Group Code,Item Group Code
Unnamed: 0_level_1,<lambda_0>,<lambda_1>
Item Group,Unnamed: 1_level_2,Unnamed: 2_level_2
Crops Primary,2,"[1714, QC]"
Live Animals,2,"[1756, QA]"


In [14]:
# Sanity check before deletion: list items present in bulk whose *only* associated
# Item Group Code is the one that is to be deleted
bulk_item_codes = (
    qcl_bulk.index.get_level_values("Item Code").unique().astype(str).tolist()
)
is_in_bulk = x["Item Code"].astype(str).isin(bulk_item_codes)
group_codes_per_item = (
    x[is_in_bulk].groupby("Item Code")["Item Group Code"].agg(set)
)
# Change {"QC"} to inspect other group codes that have exclusive children
exclusive_items = group_codes_per_item[group_codes_per_item == {"QC"}].index.tolist()
x[x["Item Code"].isin(exclusive_items)].head()

Unnamed: 0,Item Group Code,Item Code,Item Group,Item,Factor,HS Code,HS07 Code,HS12 Code,CPC Code
96,QC,1717,Crops Primary,"Cereals, Total",1.0,,,,
113,QC,1804,Crops Primary,"Citrus Fruit, Total",1.0,,,,
122,QC,813,Crops Primary,Coir,1.0,,530500.0,530500.0,1929.08
138,QC,1753,Crops Primary,Fibre Crops Primary,1.0,,,,
145,QC,1738,Crops Primary,Fruit Primary,1.0,,,,


In [15]:
# Keep only the name columns; "Item Group Code" and "Item Code" remain available
# through `reset_index()`, which the variable-name cell further down relies on
qcl_item = qcl_item[["Item Group", "Item"]]

### `Element`

In [16]:
# Element and unit metadata tables (joined on "Unit" in the following cell)
qcl_element = qcl_meadow["meta_element"]
qcl_unit = qcl_meadow["meta_unit"]

In [17]:
# Attach the unit description to every element: the element's "Unit" column is
# matched against the index of the unit table
unit_descriptions = qcl_unit.rename(columns={"Description": "Unit Description"})
qcl_element_unit = qcl_element.merge(
    unit_descriptions, left_on="Unit", right_index=True
)
# The join must neither drop nor duplicate elements
assert qcl_element_unit.shape[0] == qcl_element.shape[0]

### Bulk

In [18]:
# Filter countries + Area Code -> Country
# Keep only the areas that have a standardised country name, then relabel the
# first index level with that name.
# `.loc` is given a list of labels: pandas deprecated dict/set indexers in 1.5 and
# raises a TypeError for them from 2.0, so the mapping's keys are passed explicitly.
qcl_bulk = qcl_bulk.loc[list(map_area)].rename(index=map_area, level=0)
# The first level now holds country names, so rename the level itself as well
name_map = {"Area Code": "Country"}
qcl_bulk.index.names = [name_map.get(n, n) for n in qcl_bulk.index.names]

In [19]:
# Drop Unit
# NOTE(review): presumably redundant because the unit is written into the
# variable name built in the cells below — confirm
qcl_bulk = qcl_bulk.drop(columns=["Unit"])

#### Variable name

In [20]:
# Get Item names
# Build one Code -> Name lookup covering both item groups and items, then
# resolve the second index level of the bulk table through it.
item_table = qcl_item.reset_index()
group_pairs = item_table[["Item Group Code", "Item Group"]].drop_duplicates()
group_pairs = group_pairs.rename(
    columns={"Item Group Code": "Code", "Item Group": "Name"}
)
item_pairs = item_table[["Item Code", "Item"]].drop_duplicates()
item_pairs = item_pairs.rename(columns={"Item Code": "Code", "Item": "Name"})
# Items come after groups, so an item name wins if a code occurs in both
all_pairs = pd.concat([group_pairs, item_pairs])
map_items = dict(zip(all_pairs["Code"], all_pairs["Name"]))
item_names = [map_items[code] for code in qcl_bulk.index.get_level_values(1)]

In [21]:
# Get Element + Unit names
# Each element is labelled "<Element> (<Unit>)" and looked up by Element Code
# for the third index level of the bulk table.
element_table = qcl_element_unit.reset_index()
element_labels = (
    element_table["Element"].astype(str)
    + " ("
    + element_table["Unit"].astype(str)
    + ")"
).tolist()
map_elems = dict(zip(element_table["Element Code"], element_labels))
elem_names = [map_elems[code] for code in qcl_bulk.index.get_level_values(2)]

In [22]:
# Construct variable name: "<item name> - <element name>", one per bulk row
variable_names = [
    f"{item_name} - {elem_name}"
    for item_name, elem_name in zip(item_names, elem_names)
]

In [23]:
# Add variable name to index
# The name is first attached as a column, then the index is rebuilt with
# "Variable Name" placed right after the codes.
qcl_bulk["Variable Name"] = variable_names
bulk_index_cols = [
    "Country",
    "Item Code",
    "Element Code",
    "Variable Name",
    "Year",
    "Flag",
]
qcl_bulk = qcl_bulk.reset_index().set_index(bulk_index_cols)

## Create Garden dataset

In [25]:
# Create a new, empty dataset at `dest_dir`
qcl_garden = catalog.Dataset.create_empty(dest_dir)

In [26]:
# Propagate metadata: the garden dataset takes over the meadow dataset's metadata
# NOTE(review): this is a plain assignment, so unless `metadata` copies on set both
# datasets refer to the same object — confirm before mutating it
qcl_garden.metadata = qcl_meadow.metadata
# Save the dataset once the metadata has been set
qcl_garden.save()

In [38]:
# Add bulk table
qcl_garden.add(qcl_bulk)
# Add table items
qcl_garden.add(qcl_item)
# Add table elements
# The merged element/unit table is given the metadata of the original element table
# (presumably because the merge result does not carry it over — TODO confirm)
qcl_element_unit.metadata = qcl_element.metadata
qcl_garden.add(qcl_element_unit)

In [39]:
# Save the dataset again now that the tables have been added
qcl_garden.save()