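# Hyde 3.2 baseline

Builds a catalog dataset at `dest_dir` from the Walden copy of the HYDE 3.2 baseline archive; it currently contains a single `population` table indexed by country and year.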

## Parameters

In [1]:
# Output directory for the dataset built by this notebook (passed to Dataset.create_empty below).
dest_dir = "/tmp/hyde_3_2_baseline"

## Imports

In [2]:
import zipfile
import tempfile
import shutil
from pathlib import Path

import pandas as pd

from owid.catalog import Dataset, Table
from owid.walden import Catalog
from etl.paths import DATA_DIR
from etl.steps.data.converters import convert_walden_metadata

## Fetch walden dataset

In [3]:
# Locate the single Walden catalog entry matching ("hyde", "2017", "baseline").
walden_catalog = Catalog()
walden_ds = walden_catalog.find_one("hyde", "2017", "baseline")

In [4]:
# Display the Walden dataset record that was found.
walden_ds

Dataset(namespace='hyde', short_name='baseline', name='Hyde 3.2 (baseline estimates)', description='HYDE is an internally consistent combination of updated historical population (gridded) estimates and land use for the past 12,000 years. Categories include cropland, with a new distinction into irrigated and rain fed crops (other than rice) and irrigated and rain fed rice. Also grazing lands are provided, divided into more intensively used pasture, converted rangeland and non-converted natural (less intensively used) rangeland. Population is represented by maps of total, urban, rural population and population density as well as built-up area.', source_name='PBL Netherlands Environmental Assessment Agency', url='https://www.pbl.nl/en/image/links/hyde', date_accessed='2021-10-01', file_extension='zip', license_url='https://dataportaal.pbl.nl/downloads/HYDE/HYDE3.2/readme_release_HYDE3.2.1.txt', source_data_url='https://dataportaal.pbl.nl/downloads/HYDE/HYDE3.2/baseline.zip', md5='acdbbd39

## Load country codes

In [5]:
# The country-code lookup table lives in the meadow "general_files" dataset.
general_files_dir = DATA_DIR / "meadow" / "hyde" / "2017" / "general_files"
gf_path = general_files_dir.as_posix()
general_files = Dataset(gf_path)
codes = general_files["country_codes"]
codes

Unnamed: 0_level_0,country
country_code,Unnamed: 1_level_1
4,Afghanistan
8,Albania
12,Algeria
16,American Samoa
20,Andorra
...,...
887,Yemen
891,Serbia and Montenegro
894,Zambia
499,Montenegro


## Unzip to temp directory

In [6]:
# Scratch directory that receives the extracted archive; it is deleted in the Cleanup section.
temp_dir = tempfile.mkdtemp()

In [7]:
import zipfile
import tempfile
import shutil
from pathlib import Path

import pandas as pd

from owid.catalog import Dataset, Table
from owid.walden import Catalog
from etl.paths import DATA_DIR
from etl.steps.data.converters import convert_walden_metadata

In [8]:
# NOTE(review): looks like a leftover scratch cell — this binds `z` to the
# `ZipFile.extractall` function object itself (it is not called), and the next
# cell rebinds `z` to the opened archive. Consider deleting this cell.
z = zipfile.ZipFile.extractall

In [9]:
# Open the zip file at the Walden dataset's local path (read mode is the default).
archive_path = walden_ds.local_path
z = zipfile.ZipFile(archive_path, mode="r")

In [10]:
# Keep only archive members whose path contains neither "/png/" nor "/zip/".
excluded_parts = ("/png/", "/zip/")
data_files = [
    member
    for member in z.namelist()
    if not any(part in member for part in excluded_parts)
]
data_files

['baseline/',
 'baseline/txt/',
 'baseline/txt/conv_rangeland_c.txt',
 'baseline/txt/conv_rangeland_r.txt',
 'baseline/txt/cropland_c.txt',
 'baseline/txt/cropland_r.txt',
 'baseline/txt/grazing_c.txt',
 'baseline/txt/grazing_r.txt',
 'baseline/txt/ir_norice_c.txt',
 'baseline/txt/ir_norice_r.txt',
 'baseline/txt/ir_rice_c.txt',
 'baseline/txt/ir_rice_r.txt',
 'baseline/txt/pasture_c.txt',
 'baseline/txt/pasture_r.txt',
 'baseline/txt/popc_c.txt',
 'baseline/txt/popc_r.txt',
 'baseline/txt/popd_c.txt',
 'baseline/txt/popd_r.txt',
 'baseline/txt/rangeland_c.txt',
 'baseline/txt/rangeland_r.txt',
 'baseline/txt/rf_norice_c.txt',
 'baseline/txt/rf_norice_r.txt',
 'baseline/txt/rf_rice_c.txt',
 'baseline/txt/rf_rice_r.txt',
 'baseline/txt/rurc_c.txt',
 'baseline/txt/rurc_r.txt',
 'baseline/txt/tot_irri_c.txt',
 'baseline/txt/tot_irri_r.txt',
 'baseline/txt/tot_rainfed_c.txt',
 'baseline/txt/tot_rainfed_r.txt',
 'baseline/txt/tot_rice_c.txt',
 'baseline/txt/tot_rice_r.txt',
 'baseline/txt/u

In [11]:
# Extract just the selected members into the scratch directory.
z.extractall(path=temp_dir, members=data_files)

In [12]:
# List the contents of the extracted txt directory.
!ls {temp_dir}/baseline/txt

conv_rangeland_c.txt popc_c.txt           tot_irri_c.txt
conv_rangeland_r.txt popc_r.txt           tot_irri_r.txt
cropland_c.txt       popd_c.txt           tot_rainfed_c.txt
cropland_r.txt       popd_r.txt           tot_rainfed_r.txt
grazing_c.txt        rangeland_c.txt      tot_rice_c.txt
grazing_r.txt        rangeland_r.txt      tot_rice_r.txt
ir_norice_c.txt      rf_norice_c.txt      uopp_c.txt
ir_norice_r.txt      rf_norice_r.txt      uopp_r.txt
ir_rice_c.txt        rf_rice_c.txt        urbc_c.txt
ir_rice_r.txt        rf_rice_r.txt        urbc_r.txt
pasture_c.txt        rurc_c.txt
pasture_r.txt        rurc_r.txt


## Make a dataset

In [13]:
# Create an empty dataset at dest_dir, attach the metadata converted from the
# Walden record, and write it out.
walden_metadata = convert_walden_metadata(walden_ds)
ds = Dataset.create_empty(dest_dir)
ds.metadata = walden_metadata
ds.save()

## Add tables

### Population

In [15]:
# Build the "population" table from the extracted popc_c.txt file and add it to the dataset.
country_path = Path(temp_dir) / "baseline" / "txt" / "popc_c.txt"

# Read the space-separated file and reshape it to long form: every column other
# than the region code becomes a (year, population) row.
population_long = (
    pd.read_csv(country_path.as_posix(), sep=" ")
    .rename(columns={"region": "country_code"})
    .melt(id_vars="country_code", var_name="year", value_name="population")
)

# Drop the "Total" row(s). `~` is the boolean-inversion operator; unary `-` on a
# boolean mask only works because pandas special-cases it.
is_total = population_long["country_code"].isin(["Total"])

# Cast via astype on the filtered frame (returns a new frame) rather than
# assigning columns onto a slice, which triggers SettingWithCopyWarning.
population = population_long.loc[~is_total].astype({"year": int, "country_code": int})

# Attach country names; validate that each code in `codes` is unique, then
# replace the numeric code by a (country, year) index without mutating in place.
population_norm = (
    pd.merge(codes, population, on="country_code", how="inner", validate="one_to_many")
    .drop(columns="country_code")
    .set_index(["country", "year"])
)

t = Table(population_norm)
t.metadata.short_name = "population"
ds.add(t)

## Cleanup

In [16]:
# Release the archive's file handle (it was opened earlier and never closed),
# then remove the scratch directory holding the extracted files.
z.close()
shutil.rmtree(temp_dir)