# Parameters

In [1]:
# Directory where the garden dataset built by this notebook is written
# (it is passed to `catalog.Dataset.create_empty` further down).
dest_dir = "/tmp/faostat_qcl"

# Read data and reference tables

In [2]:
from owid import catalog
import pandas as pd
from pathlib import Path

In [3]:
from etl.paths import BASE_DIR as base_path

## Load reference

In [4]:
# Open the reference dataset stored under <base_path>/data/reference.
reference_dir = base_path / "data/reference"
reference_dataset = catalog.Dataset(reference_dir.as_posix())

In [5]:
# Countries/regions table from the reference dataset.
# NOTE(review): `countries` is not used again in the cells visible here — confirm it is still needed.
countries = reference_dataset["countries_regions"]

## Load meadow dataset

In [6]:
# Open the meadow version of FAOSTAT QCL (2021-03-18) that this notebook cleans.
meadow_dir = base_path / "data/meadow/faostat/2021-03-18/faostat_QCL"
qcl_meadow = catalog.Dataset(meadow_dir.as_posix())

In [7]:
# Bulk data and items metadata
# `bulk` is the table that gets cleaned below; `item_groups` is later added to
# the garden dataset as-is.
qcl_bulk = qcl_meadow["bulk"]
qcl_items = qcl_meadow["item_groups"]

In [8]:
# Size of the bulk table as (rows, columns).
qcl_bulk.shape

(3811461, 5)

In [9]:
# Preview the first rows of the bulk table.
qcl_bulk.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Area,Item,Element,Unit,Value
Area Code,Item Code,Element Code,Year,Flag,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,221,5312,1975,F,Afghanistan,"Almonds, with shell",Area harvested,ha,0
2,221,5312,1976,F,Afghanistan,"Almonds, with shell",Area harvested,ha,5900
2,221,5312,1977,F,Afghanistan,"Almonds, with shell",Area harvested,ha,6000
2,221,5312,1978,F,Afghanistan,"Almonds, with shell",Area harvested,ha,6000
2,221,5312,1979,F,Afghanistan,"Almonds, with shell",Area harvested,ha,6000


## Clean dataset

### `Area`
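Keep only the rows whose `Area` appears in the country standardisation file, then map those `Area` names to their Our World in Data names.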

In [93]:
# Prepare for Country Tool
# (Kept commented out. When enabled, these lines write the distinct `Area`
# values of the bulk table to countries.csv under a "Country" header.)
# ds = qcl_bulk.Area.drop_duplicates()
# ds.name = "Country"
# ds.to_csv("countries.csv", index=False)

In [10]:
# Load the country standardisation file that sits next to this step. It maps
# the `Country` names found in the data to "Our World In Data Name".
country_std_path = (
    base_path
    / "etl"
    / "steps"
    / "data"
    / "garden"
    / "faostat"
    / "2021-03-18"
    / "faostat_qcl.country_std.csv"
)
# Decode as UTF-8 explicitly. A bare `open()` falls back to the machine's
# locale encoding, which can garble non-ASCII country names and make them fail
# to match `Area` values later on.
df_country_std = pd.read_csv(country_std_path, encoding="utf-8")
# Drop every row that has a missing value in any column, so only complete
# mappings are kept.
df_country_std = df_country_std.dropna()

In [11]:
# Table to DataFrame
# The cleaning below is done on a pandas DataFrame; the result is wrapped back
# into a `catalog.Table` when the garden dataset is built.
df = pd.DataFrame(qcl_bulk)

In [13]:
# Keep only the rows whose `Area` is listed in the `Country` column of the
# standardisation file.
msk = df["Area"].isin(df_country_std["Country"])
df = df.loc[msk]

In [15]:
# Replace `Area` values
# Lookup from the name used in the data to the Our World in Data name.
mapping = dict(zip(df_country_std["Country"], df_country_std["Our World In Data Name"]))
# `df` is the result of a boolean filter, so writing a column into it in place
# raises pandas' SettingWithCopyWarning. Build a new frame with `assign`
# instead; the column order is unchanged because `Area` already exists.
df = df.assign(Area=df["Area"].replace(mapping))

In [21]:
# Preview the frame after filtering and renaming `Area`.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Area,Item,Element,Unit,Value
Area Code,Item Code,Element Code,Year,Flag,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2,221,5312,1975,F,Afghanistan,"Almonds, with shell",Area harvested,ha,0
2,221,5312,1976,F,Afghanistan,"Almonds, with shell",Area harvested,ha,5900
2,221,5312,1977,F,Afghanistan,"Almonds, with shell",Area harvested,ha,6000
2,221,5312,1978,F,Afghanistan,"Almonds, with shell",Area harvested,ha,6000
2,221,5312,1979,F,Afghanistan,"Almonds, with shell",Area harvested,ha,6000


---

## Create Garden dataset

In [31]:
# Create an empty dataset at `dest_dir`; the garden tables are added to it below.
qcl_garden = catalog.Dataset.create_empty(dest_dir)

In [32]:
# Propagate metadata
# NOTE(review): this assigns the meadow dataset's metadata object itself, not a
# copy — confirm that sharing it (and any meadow-specific fields it carries) is intended.
qcl_garden.metadata = qcl_meadow.metadata
qcl_garden.save()

In [33]:
# Wrap the cleaned frame as a catalog table and give it the metadata of the
# meadow `bulk` table.
garden_bulk = catalog.Table(df)
garden_bulk.metadata = qcl_meadow["bulk"].metadata
# Add the cleaned bulk table first, then the items table unchanged.
for garden_table in (garden_bulk, qcl_items):
    qcl_garden.add(garden_table)

In [34]:
# Save the dataset again now that the tables have been added.
qcl_garden.save()