# FAOstat: Food Balances (-2013, old methodology and population)
[_Source data_](https://www.fao.org/faostat/en/#data)

## Parameters

In [1]:
# Destination directory for the generated dataset (passed to `Dataset.create_empty` below).
dest_dir = "/tmp/faostat_fbsh"

## Imports

In [2]:
import zipfile
import tempfile
import shutil
from pathlib import Path
import os

import requests
import pandas as pd

from owid.catalog import Dataset, Table, frames
from owid.walden import Catalog
from etl.paths import DATA_DIR
from etl.steps.data.converters import convert_walden_metadata

## Fetch walden dataset

In [3]:
# Look up the single Walden catalog entry for the FBSH dataset.
# NOTE(review): positional arguments presumed to be (namespace, version, short_name) — confirm against owid.walden.
walden_ds = Catalog().find_one("faostat", "2017-12-11", "faostat_FBSH")

In [4]:
# Display the catalog entry to confirm which dataset was found.
walden_ds

Dataset(namespace='faostat', short_name='faostat_FBSH', name='Food Balance: Food Balances (-2013, old methodology and population) - FAO (2017)', description="Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per

## Make a dataset

In [10]:
# Create an empty dataset at `dest_dir`, attach the metadata converted from the
# Walden entry, and save it.
# NOTE(review): `ds.save()` presumably persists the metadata to disk — confirm in owid.catalog.
ds = Dataset.create_empty(dest_dir)
ds.metadata = convert_walden_metadata(walden_ds)
ds.save()

In [11]:
# Display the dataset (its path and the metadata assigned above).
ds

Dataset(path='/tmp/faostat_fbsh', metadata=DatasetMeta(namespace='faostat', short_name='faostat_FBSH', title='Food Balance: Food Balances (-2013, old methodology and population) - FAO (2017)', description="Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food s

## Metadata
_To be moved into Walden_

In [12]:
# One entry per FAOSTAT definitions table of the FBSH domain: the URL to fetch,
# the column(s) used as index, and the short name given to the resulting table.
data_extra = [
    {
        "url": f"https://fenixservices.fao.org/faostat/api/v1/en/definitions/domain/FBSH/{endpoint}?output_type=objects",
        "index": index_columns,
        "short_name": short_name,
    }
    for endpoint, index_columns, short_name in (
        ("itemgroup", ["Item Group Code", "Item Code"], "meta_item"),
        ("area", ["Country Code"], "meta_area"),
        ("element", ["Element Code"], "meta_element"),
        ("unit", ["Unit Name"], "meta_unit"),
    )
]

In [13]:
# Download each FAOSTAT definitions table and add it to the dataset.
for data_ in data_extra:
    # The timeout (in seconds) keeps a stalled connection from hanging the
    # notebook, and `raise_for_status` surfaces HTTP errors directly instead of
    # as a confusing JSON-decoding or KeyError failure on the lines below.
    response = requests.get(data_["url"], timeout=60)
    response.raise_for_status()
    metadata = response.json()
    df_ = pd.DataFrame.from_dict(metadata["data"])
    # `verify_integrity=True` raises if the index columns contain duplicates.
    df_ = df_.set_index(data_["index"], verify_integrity=True)
    t = Table(df_)
    t.metadata.short_name = data_["short_name"]
    ds.add(t)

meta_item
meta_area
meta_element
meta_unit


## Unzip to temp directory

In [14]:
# Scratch directory for the extracted archive; removed by `shutil.rmtree(tmp_dir)` in the Cleanup section.
tmp_dir = tempfile.mkdtemp()

In [15]:
# Extract the Walden archive into the temporary directory. The context manager
# closes the zip file handle as soon as extraction is done.
with zipfile.ZipFile(walden_ds.local_path) as z:
    z.extractall(tmp_dir)
# Pick the extracted file whose name contains "(Normalized)". The tuple
# unpacking raises ValueError unless exactly one file matches.
(filename,) = [name for name in os.listdir(tmp_dir) if "(Normalized)" in name]
filename

'FoodBalanceSheetsHistoric_E_All_Data_(Normalized).csv'

## Load dataset
### Data file

In [16]:
# Read the extracted CSV from the temporary directory, decoding it as latin-1.
df = pd.read_csv(Path(tmp_dir) / filename, encoding="latin-1")

In [17]:
# Preview the first rows of the data as loaded from the CSV.
df.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,Afghanistan,2501,Population,511,Total Population - Both sexes,1961,1961,1000 persons,8954.0,
1,2,Afghanistan,2501,Population,511,Total Population - Both sexes,1962,1962,1000 persons,9142.0,
2,2,Afghanistan,2501,Population,511,Total Population - Both sexes,1963,1963,1000 persons,9340.0,
3,2,Afghanistan,2501,Population,511,Total Population - Both sexes,1964,1964,1000 persons,9547.0,
4,2,Afghanistan,2501,Population,511,Total Population - Both sexes,1965,1965,1000 persons,9765.0,


## Clean dataset

### Sanity checks

In [18]:
# Report, for each column, whether it contains at least one missing value.
df.isna().any()

Area Code       False
Area            False
Item Code       False
Item            False
Element Code    False
Element         False
Year Code       False
Year            False
Unit            False
Value           False
Flag             True
dtype: bool

In [19]:
# NOTE(review): this check is disabled (the whole cell is commented out) and is
# kept for reference only; the reason it was disabled is not recorded here.
# Two problems in the original text are corrected below so that it behaves as
# its error message describes if it is ever re-enabled:
#   - the condition was `not (... and ...)`, which raised unless BOTH directions
#     had a non-unique mapping; it is now `... or ...`.
#   - the `fields` list was missing the opening quote of "Element".
#
# def check_unique_pairs(df, name_1, name_2):
#     if (
#         (df.groupby(name_1)[name_2].nunique() != 1).any()
#         or (df.groupby(name_2)[name_1].nunique() != 1).any()
#     ):
#         raise ValueError(
#             f"Some `{name_1}` may have multiple `{name_2}` values (or opposite)."
#         )


# fields = ["Item", "Element"]
# for field in fields:
#     check_unique_pairs(df, field, f"{field} Code")

In [20]:
# Every (Element, Unit) pair must correspond to a single Element Code;
# abort if any pair maps to more than one.
codes_per_element_unit = df.groupby(["Element", "Unit"])["Element Code"].nunique()
if codes_per_element_unit.gt(1).any():
    raise ValueError("Element-Unit not unique!")

### Drop columns and set Index

In [21]:
# Drop the label columns (their "... Code" counterparts are kept) and
# "Year Code" ("Year" is kept).
df = df.drop(columns=["Area", "Item", "Element", "Year Code"])

In [22]:
# Index the rows by their identifying columns; `verify_integrity=True` makes
# pandas raise if any combination of index values is duplicated.
df = df.set_index(
    ["Area Code", "Item Code", "Element Code", "Year", "Flag"],
    verify_integrity=True,
)

In [23]:
# Preview the indexed frame; only "Unit" and "Value" remain as columns.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unit,Value
Area Code,Item Code,Element Code,Year,Flag,Unnamed: 5_level_1,Unnamed: 6_level_1
2,2501,511,1961,,1000 persons,8954.0
2,2501,511,1962,,1000 persons,9142.0
2,2501,511,1963,,1000 persons,9340.0
2,2501,511,1964,,1000 persons,9547.0
2,2501,511,1965,,1000 persons,9765.0


## Add tables

In [24]:
# Wrap the indexed data in a Table with short name "bulk" and add it to the dataset.
bulk_table = Table(df)
bulk_table.metadata.short_name = "bulk"
ds.add(bulk_table)

## Cleanup

In [25]:
# Remove the temporary extraction directory created with `tempfile.mkdtemp()`.
shutil.rmtree(tmp_dir)