# Hyde 3.2 baseline

## Parameters

In [None]:
# Directory the output dataset is created in (passed to Dataset.create_empty
# in the "Make a dataset" cell).
dest_dir = "/tmp/hyde_3_2_baseline"

## Imports

In [None]:
import zipfile
import tempfile
import shutil
from pathlib import Path

import pandas as pd

from owid.catalog import Dataset, Table
from owid.walden import Catalog
from etl.paths import DATA_DIR
from etl.steps.data.converters import convert_walden_metadata

## Fetch walden dataset

In [None]:
# Look up the single Walden catalog entry for the HYDE baseline archive.
# NOTE(review): the three positional arguments are presumably namespace,
# version and short name — confirm against owid.walden.Catalog.find_one.
walden_ds = Catalog().find_one("hyde", "2017", "baseline")

In [None]:
# Display the catalog entry for inspection.
walden_ds

## Load country codes

In [None]:
# Open the HYDE 2017 "general_files" dataset under the meadow channel of
# DATA_DIR and take its country-code table, which the population cell merges
# against.
gf_path = DATA_DIR.joinpath("meadow", "hyde", "2017", "general_files").as_posix()
codes = Dataset(gf_path)["country_codes"]
codes

## Unzip to temp directory

In [None]:
# Scratch directory the archive is extracted into; it is removed again by the
# shutil.rmtree call in the "Cleanup" cell.
temp_dir = tempfile.mkdtemp()

In [None]:
import zipfile
import tempfile
import shutil
from pathlib import Path

import pandas as pd

from owid.catalog import Dataset, Table
from owid.walden import Catalog
from etl.paths import DATA_DIR
from etl.steps.data.converters import convert_walden_metadata
from etl import files

In [None]:
# Open the downloaded archive. A BadZipFile here most often means a truncated
# download, so the file's checksum is compared with the one recorded in the
# catalog entry to give an actionable message in that case.
try:
    z = zipfile.ZipFile(walden_ds.local_path)
except zipfile.BadZipFile as e:
    if files.checksum_file(walden_ds.local_path) != walden_ds.md5:
        raise zipfile.BadZipFile(
            f"Hyde baseline dataset has about 5GB, it is possible that the file wasn't downloaded completely. Please remove file "
            f"{walden_ds.local_path} manually and try again."
        ) from e
    # The checksum matches, so the file is complete but still not a valid zip.
    # Re-raise instead of swallowing the error: otherwise `z` stays undefined
    # and the next cell fails with an unrelated NameError.
    raise

In [None]:
# Select the archive members to extract: everything whose path does not
# contain a "/png/" or "/zip/" component.
data_files = [
    member
    for member in z.namelist()
    if not ("/png/" in member or "/zip/" in member)
]
data_files

In [None]:
# Extract only the members selected in data_files into the scratch directory.
z.extractall(temp_dir, members=data_files)

In [None]:
# IPython shell escape: list the contents of baseline/txt inside the scratch
# directory to check what was extracted.
!ls {temp_dir}/baseline/txt

## Make a dataset

In [None]:
# Create an empty dataset at dest_dir, attach the metadata converted from the
# Walden catalog entry, and save it. The order matters: the dataset must exist
# before its metadata is assigned and saved.
ds = Dataset.create_empty(dest_dir)
ds.metadata = convert_walden_metadata(walden_ds)
ds.save()

## Add tables

### Population

In [None]:
# Read the space-separated popc_c.txt file from the extracted archive and
# reshape it to long form: the "region" column becomes "country_code" and
# every other column is melted into one (year, population) row.
country_path = Path(temp_dir) / "baseline" / "txt" / "popc_c.txt"
population = (
    pd.read_csv(country_path.as_posix(), sep=" ")
    .rename(columns={"region": "country_code"})
    .melt(id_vars="country_code", var_name="year", value_name="population")
)

# Drop the "Total" rows so the remaining codes can be cast to int.
# `~` is the boolean-negation operator for masks; unary `-` only works because
# pandas special-cases boolean data. `.copy()` makes the filtered frame
# independent, so the column assignments below do not raise
# SettingWithCopyWarning.
population = population[~population.country_code.isin(["Total"])].copy()
population["year"] = population.year.astype(int)
population["country_code"] = population.country_code.astype(int)

# Attach the columns of the country-code table and drop the numeric code.
# validate="one_to_many" makes the merge fail if a code appears more than once
# in `codes`.
population_norm = pd.merge(codes, population, on="country_code", how="inner", validate="one_to_many").drop(
    columns="country_code"
)
population_norm.set_index(["country", "year"], inplace=True)

# Wrap the result as a catalog table named "population" and add it to the dataset.
t = Table(population_norm)
t.metadata.short_name = "population"
ds.add(t)

## Cleanup

In [None]:
# Delete the scratch directory and everything extracted into it.
shutil.rmtree(temp_dir)