# HYDE 3.2 baseline

## Parameters

In [1]:
# Directory in which the output dataset is created (see the "Make a dataset" cell).
dest_dir = "/tmp/hyde_3_2_baseline"

## Imports

In [2]:
import zipfile
import tempfile
import shutil
from pathlib import Path

import pandas as pd

from owid.catalog import Dataset, Table
from etl.snapshot import Snapshot
from etl.paths import DATA_DIR, SNAPSHOTS_DIR
from etl.steps.data.converters import convert_snapshot_metadata

## Get snapshot

In [3]:
# Handle to the HYDE baseline zip snapshot; its `path` and `metadata` are used in later cells.
snap = Snapshot(SNAPSHOTS_DIR / "hyde/2017/baseline.zip")

## Load country codes

In [4]:
# Load the `country_codes` table from the meadow `general_files` dataset; it is
# merged with the population data further down to attach country names.
gf_path = DATA_DIR.joinpath("meadow", "hyde", "2017", "general_files").as_posix()
general_files = Dataset(gf_path)
codes = general_files["country_codes"]
codes

Unnamed: 0_level_0,country
country_code,Unnamed: 1_level_1
4,Afghanistan
8,Albania
12,Algeria
16,American Samoa
20,Andorra
...,...
887,Yemen
891,Serbia and Montenegro
894,Zambia
499,Montenegro


## Unzip to temp directory

In [5]:
# Scratch directory for the extracted archive; it is removed in the "Cleanup" cell.
temp_dir = tempfile.mkdtemp()

In [6]:
# Open the snapshot's zip archive; later cells list and extract its members through `z`.
z = zipfile.ZipFile(snap.path)

In [7]:
# Select the archive members to extract: everything whose path does not contain
# a "/png/" or "/zip/" directory component.
excluded_dirs = ("/png/", "/zip/")
data_files = [
    member
    for member in z.namelist()
    if not any(marker in member for marker in excluded_dirs)
]
data_files

['baseline/',
 'baseline/txt/',
 'baseline/txt/conv_rangeland_c.txt',
 'baseline/txt/conv_rangeland_r.txt',
 'baseline/txt/cropland_c.txt',
 'baseline/txt/cropland_r.txt',
 'baseline/txt/grazing_c.txt',
 'baseline/txt/grazing_r.txt',
 'baseline/txt/ir_norice_c.txt',
 'baseline/txt/ir_norice_r.txt',
 'baseline/txt/ir_rice_c.txt',
 'baseline/txt/ir_rice_r.txt',
 'baseline/txt/pasture_c.txt',
 'baseline/txt/pasture_r.txt',
 'baseline/txt/popc_c.txt',
 'baseline/txt/popc_r.txt',
 'baseline/txt/popd_c.txt',
 'baseline/txt/popd_r.txt',
 'baseline/txt/rangeland_c.txt',
 'baseline/txt/rangeland_r.txt',
 'baseline/txt/rf_norice_c.txt',
 'baseline/txt/rf_norice_r.txt',
 'baseline/txt/rf_rice_c.txt',
 'baseline/txt/rf_rice_r.txt',
 'baseline/txt/rurc_c.txt',
 'baseline/txt/rurc_r.txt',
 'baseline/txt/tot_irri_c.txt',
 'baseline/txt/tot_irri_r.txt',
 'baseline/txt/tot_rainfed_c.txt',
 'baseline/txt/tot_rainfed_r.txt',
 'baseline/txt/tot_rice_c.txt',
 'baseline/txt/tot_rice_r.txt',
 'baseline/txt/u… (output truncated)

In [None]:
# Extract only the selected members into the scratch directory.
z.extractall(temp_dir, members=data_files)

In [None]:
# List the extracted files with pathlib rather than the IPython-only `!ls` shell
# escape, so the cell also runs outside IPython and is unaffected by shell quoting
# of the interpolated path.
sorted(p.name for p in (Path(temp_dir) / "baseline" / "txt").iterdir())

## Make a dataset

In [None]:
# Create the output dataset at `dest_dir`, give it metadata converted from the
# snapshot's metadata, and save it before any tables are added.
ds = Dataset.create_empty(dest_dir)
ds.metadata = convert_snapshot_metadata(snap.metadata)
ds.save()

## Add tables

### Population

In [None]:
# Build the country-level population table from the extracted `popc_c.txt` file.
country_path = Path(temp_dir) / "baseline" / "txt" / "popc_c.txt"

# Read the space-separated file and reshape it to long format: the `region` column
# becomes `country_code`, and every other column is treated as a year.
population = (
    pd.read_csv(country_path.as_posix(), sep=" ")
    .rename({"region": "country_code"}, axis=1)
    .melt(id_vars="country_code", var_name="year", value_name="population")
)

# Drop the "Total" rows, then cast year and country code to integers.
# `~` is the documented operator for negating a boolean mask, and `.assign()` on the
# filtered result avoids writing into a slice of the original frame (which triggers
# pandas' SettingWithCopyWarning).
is_total = population["country_code"].isin(["Total"])
population = population[~is_total].assign(
    year=lambda df: df["year"].astype(int),
    country_code=lambda df: df["country_code"].astype(int),
)

# Attach country names via the numeric code; `validate` makes the merge fail if a
# code appears more than once in `codes`. The numeric code is dropped afterwards.
population_norm = (
    pd.merge(codes, population, on="country_code", how="inner", validate="one_to_many")
    .drop(columns="country_code")
    .set_index(["country", "year"])
)

# Wrap the frame as a catalog table named "population" and add it to the dataset.
t = Table(population_norm)
t.metadata.short_name = "population"
ds.add(t)

## Cleanup

In [None]:
# Close the zip archive handle opened earlier (it was otherwise never released),
# then delete the scratch directory holding the extracted files.
z.close()
shutil.rmtree(temp_dir)