# FAOstat: Food Balances (2014-)
[_Source data_](https://www.fao.org/faostat/en/#data)

## Parameters

In [2]:
# Directory in which the output dataset is created (it is passed to
# Dataset.create_empty further down).
# NOTE(review): this looks like a default for a parameterized run — confirm
# that the caller overrides it.
dest_dir = "/tmp/faostat_fbs"

## Imports

In [3]:
import zipfile
import tempfile
import shutil
from pathlib import Path
import os

import pandas as pd

from owid.catalog import Dataset, Table
from owid.walden import Catalog
from etl.paths import DATA_DIR
from etl.steps.data.converters import convert_walden_metadata

## Fetch walden dataset

In [20]:
# Look up the FAOstat Food Balances snapshot in the walden catalog. The
# positional arguments are "faostat", "2021-04-09" and "faostat_FBS"
# (presumably namespace, version and short name — TODO confirm against
# Catalog.find_one).
walden_ds = Catalog().find_one("faostat", "2021-04-09", "faostat_FBS")

In [21]:
# Display the dataset record returned by the catalog lookup.
walden_ds

Dataset(namespace='faostat', short_name='faostat_FBS', name='Food Balance: Food Balances (2014-) - FAO (2021)', description="Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consumption. The per caput supply of each such food i

## Unzip to temp directory

In [22]:
# Create a fresh temporary directory to receive the extracted archive; it is
# removed explicitly in the Cleanup section at the end of the notebook.
tmp_dir = tempfile.mkdtemp()

In [23]:
# Extract the whole archive into the temporary directory. The context manager
# closes the underlying file handle as soon as extraction is done.
with zipfile.ZipFile(walden_ds.local_path) as z:
    z.extractall(tmp_dir)

# Exactly one extracted file is expected to carry the "(Normalized)" marker;
# the single-element unpacking raises ValueError if there are none or several.
(filename,) = [name for name in os.listdir(tmp_dir) if "(Normalized)" in name]
filename

'FoodBalanceSheets_E_All_Data_(Normalized).csv'

## Load dataset

In [24]:
# Load the normalized CSV from the temporary directory, decoding it as latin-1.
df = pd.read_csv(os.path.join(tmp_dir, filename), encoding="latin-1")

In [25]:
# Preview the first rows of the freshly loaded table.
df.head()

Unnamed: 0,Area Code,Area,Item Code,Item,Element Code,Element,Year Code,Year,Unit,Value,Flag
0,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2014,2014,1000 persons,33371.0,*
1,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2015,2015,1000 persons,34414.0,*
2,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2016,2016,1000 persons,35383.0,*
3,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2017,2017,1000 persons,36296.0,*
4,2,Afghanistan,2501,Population,511,Total Population - Both sexes,2018,2018,1000 persons,37172.0,*


## Clean dataset

In [26]:
# Remove the area-code and year-code columns, which are not kept in the output.
df = df.drop(columns=["Area Code", "Year Code"])

In [27]:
# Preview again to confirm the two code columns are gone.
df.head()

Unnamed: 0,Area,Item Code,Item,Element Code,Element,Year,Unit,Value,Flag
0,Afghanistan,2501,Population,511,Total Population - Both sexes,2014,1000 persons,33371.0,*
1,Afghanistan,2501,Population,511,Total Population - Both sexes,2015,1000 persons,34414.0,*
2,Afghanistan,2501,Population,511,Total Population - Both sexes,2016,1000 persons,35383.0,*
3,Afghanistan,2501,Population,511,Total Population - Both sexes,2017,1000 persons,36296.0,*
4,Afghanistan,2501,Population,511,Total Population - Both sexes,2018,1000 persons,37172.0,*


In [28]:
# Per column: does it contain at least one missing value?
df.isna().any()

Area            False
Item Code       False
Item            False
Element Code    False
Element         False
Year            False
Unit            False
Value           False
Flag            False
dtype: bool

### Reduce df memory usage

In [29]:
# Memory footprint of the dataframe, index included, in MB (bytes / 10^6).
# NOTE: `deep` is left at its default, so object-dtype columns are not
# introspected for the size of the values they reference.
total_bytes = df.memory_usage(index=True).sum()
total_bytes / 1_000_000

103.63628

In [30]:
# Inspect the per-column dtypes before converting them below.
df.dtypes

Area             object
Item Code         int64
Item             object
Element Code      int64
Element          object
Year              int64
Unit             object
Value           float64
Flag             object
dtype: object

In [31]:
# Convert the label and code columns to categoricals and the year to a 16-bit
# unsigned integer, in a single astype call driven by a column -> dtype mapping.
column_cols = ["Area", "Item", "Element", "Unit", "Flag", "Item Code", "Element Code"]
df = df.astype({**dict.fromkeys(column_cols, "category"), "Year": "uint16"})

In [32]:
# Memory footprint again, index included, in MB (bytes / 10^6), to compare
# against the figure measured before the dtype conversion.
total_bytes = df.memory_usage(index=True).sum()
total_bytes / 1_000_000

25.931254

In [33]:
# Preview after the dtype conversion; the displayed values should be unchanged.
df.head()

Unnamed: 0,Area,Item Code,Item,Element Code,Element,Year,Unit,Value,Flag
0,Afghanistan,2501,Population,511,Total Population - Both sexes,2014,1000 persons,33371.0,*
1,Afghanistan,2501,Population,511,Total Population - Both sexes,2015,1000 persons,34414.0,*
2,Afghanistan,2501,Population,511,Total Population - Both sexes,2016,1000 persons,35383.0,*
3,Afghanistan,2501,Population,511,Total Population - Both sexes,2017,1000 persons,36296.0,*
4,Afghanistan,2501,Population,511,Total Population - Both sexes,2018,1000 persons,37172.0,*


### Sanity checks

In [35]:
def check_unique_pairs(df: pd.DataFrame, name_1: str, name_2: str) -> None:
    """Ensure two columns of `df` are in one-to-one correspondence.

    Every distinct value of `name_1` must be paired with exactly one distinct
    value of `name_2`, and every distinct value of `name_2` with exactly one
    distinct value of `name_1`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing both columns.
    name_1 : str
        Name of the first column.
    name_2 : str
        Name of the second column.

    Raises
    ------
    ValueError
        If any value of either column is paired with a number of distinct
        values in the other column that is different from one.
    """
    # observed=True restricts the groups to values actually present, so unused
    # categories of a categorical column are not reported as violations.
    partners_of_1 = df.groupby(name_1, observed=True)[name_2].nunique()
    partners_of_2 = df.groupby(name_2, observed=True)[name_1].nunique()

    # The condition must hold for *every* group, hence `.all()`. Counting the
    # matching groups instead would accept the data as soon as a single group
    # happened to have exactly one partner.
    if not ((partners_of_1 == 1).all() and (partners_of_2 == 1).all()):
        raise ValueError(
            f"Some `{name_1}` may have multiple `{name_2}` values (or opposite)."
        )

In [36]:
# Run the pairing check between each label column and its "<label> Code" column.
fields = ["Item", "Element"]
for field in fields:
    code_column = f"{field} Code"
    check_unique_pairs(df, field, code_column)

In [37]:
# Move the identifying columns into the index, leaving the remaining columns as data.
df = df.set_index(["Area", "Item", "Element", "Year", "Unit"])

## Make a dataset

In [38]:
# Create an empty dataset at dest_dir, attach the metadata converted from the
# walden dataset, and save it.
ds = Dataset.create_empty(dest_dir)
ds.metadata = convert_walden_metadata(walden_ds)
ds.save()

In [39]:
# Display the dataset object (its path and metadata).
ds

Dataset(path='/tmp/faostat_fbs', metadata=DatasetMeta(namespace='faostat', short_name='faostat_FBS', title='Food Balance: Food Balances (2014-) - FAO (2021)', description="Food Balance Sheet presents a comprehensive picture of the pattern of a country's food supply during a specified reference period. The food balance sheet shows for each food item - i.e. each primary commodity and a number of processed commodities potentially available for human consumption - the sources of supply and its utilization. The total quantity of foodstuffs produced in a country added to the total quantity imported and adjusted to any change in stocks that may have occurred since the beginning of the reference period gives the supply available during that period. On the utilization side a distinction is made between the quantities exported, fed to livestock, used for seed, put to manufacture for food use and non-food uses, losses during storage and transportation, and food supplies available for human consum

## Add tables

In [40]:
# Wrap the indexed dataframe in a Table, give it a short name, and add it to
# the dataset.
t = Table(df)
t.metadata.short_name = "faostat_fbs"
ds.add(t)

## Cleanup

In [41]:
# Remove the temporary extraction directory and everything in it.
shutil.rmtree(tmp_dir)