## Parameters

In [1]:
# Output location: the dataset container is created in this directory
# (it is the argument to catalog.Dataset.create_empty later in the notebook).
dest_dir = "/tmp/lpd_20210910"

## Imports

In [2]:
import os
import tempfile
import zipfile

import pandas as pd

from owid import walden, catalog
from owid.catalog import utils
from etl.steps.data import converters

In [3]:
# Look up the walden catalog entry for the "living_planet" query; the
# resulting object carries the metadata and local file path used below.
raw_dataset = walden.Catalog().find_one("living_planet")

In [4]:
# Show the catalog entry's metadata (name, source, license, dates, checksum).
raw_dataset

Dataset(namespace='living_planet', short_name='lpd', name='Living Planet Database - (2020-09)', description='The Living Planet Database contains tens of thousands of vertebrate population time-series from around the world. It is the largest collection of its kind, and is publicly available, making it an invaluable tool for both research and conservation. This dataset contains time-series of population abundance data for vertebrate species spanning years between 1970 and 2016. These data were used in the Living Planet Report 2020. Confidential records that cannot be shared have been removed from this data set.', source_name='The Zoological Society of London', url='http://stats.livingplanetindex.org/', date_accessed='2021-11-22', file_extension='zip', license_url='https://livingplanetindex.org/documents/data_agreement.pdf', source_data_url=None, md5='9c16afc205cd1aff073fad6726ebbdc3', publication_year=2020, publication_date='2020-09-10', owid_data_url='http://nyc3.digitaloceanspaces.com/

In [5]:
# Path of the raw zip archive in the local walden directory.
raw_dataset.local_path

'/Users/mojmir/.owid/walden/living_planet/2020-09-10/lpd.zip'

In [6]:
# Fresh scratch directory (name starts with "etl-") that the archive is
# extracted into below.
# NOTE(review): nothing in the visible cells removes this directory afterwards.
tmp_dir = tempfile.mkdtemp(prefix="etl-")

In [7]:
## Unzip

#### Note: run '.venv/bin/etl data://meadow/living_planet/2021-09-10/lpd' before this step; otherwise the file will not be present in your local walden.

In [8]:
# Unpack the raw archive into the scratch directory. Opening the archive in a
# `with` block guarantees the underlying file handle is closed as soon as the
# extraction finishes (or fails), instead of lingering until garbage collection.
with zipfile.ZipFile(raw_dataset.local_path) as archive:
    archive.extractall(tmp_dir)

In [9]:
# Location of the public CSV inside the extracted archive.
csv_path = os.path.join(tmp_dir, "Public data set/LPR2020data_public.csv")

# The file is read as latin-1 rather than pandas' default utf-8.
df = pd.read_csv(csv_path, encoding="latin-1")

In [10]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,ID,Binomial,Reference,Citation,Class,Order,Family,Genus,Species,Subspecies,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,4,Copsychus_sechellarum,"{Komdeur, 1996 #990}","Komdeur, J. (1996). Breeding of the Seychelles...",Aves,Passeriformes,Muscicapidae,Copsychus,sechellarum,,...,,,,,,,,,,
1,5,Falco_punctatus,"{Groombridge, 2001 #987}","Groombridge, J. J., Bruford, M.W., Jones, C.G,...",Aves,Falconiformes,Falconidae,Falco,punctatus,,...,,,,,,,,,,
2,6,Pternistis_ochropectus,"{WPA/Birdlife/ SSC Partridge, 2000 #1450}","WPA/Birdlife/ SSC Partridge, Q. a. F. S. G. (2...",Aves,Galliformes,Phasianidae,Pternistis,ochropectus,,...,,,,,,,,,,
3,7,Gyps_coprotheres,"{WWF-SA, 2000 #1184}","WWF-SA (2000). Cape griffon.""""",Aves,Accipitriformes,Accipitridae,Gyps,coprotheres,,...,,,,,,,,,,
4,8,Gyps_coprotheres,"{WWF-SA, 2000 #1184}","WWF-SA (2000). Cape griffon.""""",Aves,Accipitriformes,Accipitridae,Gyps,coprotheres,,...,,,,,,,,,,


In [11]:
# List the column labels: descriptive fields followed by one column per year.
df.columns

Index(['ID', 'Binomial', 'Reference', 'Citation', 'Class', 'Order', 'Family',
       'Genus', 'Species', 'Subspecies', 'Common_name', 'Location', 'Country',
       'All_countries', 'Region', 'Latitude', 'Longitude', 'Specific_location',
       'temperate_or_tropical', 'System', 'T_realm', 'T_biome', 'FW_realm',
       'FW_biome', 'M_realm', 'M_ocean', 'M_biome', 'Units', 'Method', '1950',
       '1951', '1952', '1953', '1954', '1955', '1956', '1957', '1958', '1959',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018'],
      d

## Cleaning data

In [13]:
# Build a trimmed copy of the raw frame without the columns listed here;
# `df` itself is left unchanged because drop() returns a new frame.
df_clean = df.drop(
    columns=[
        "Reference",
        "All_countries",
        "Region",
        "temperate_or_tropical",
        "System",
        "T_realm",
        "T_biome",
        "FW_realm",
        "FW_biome",
        "M_realm",
        "M_ocean",
        "M_biome",
        "Method",
    ]
)

In [14]:
# Reshape from wide (one column per year) to long format: each of the year
# columns "1950".."2018" becomes a row, with the column label stored in
# "variable" and the cell content in "value". The identifier columns are
# repeated on every row.
df_long = df_clean.melt(
    id_vars=[
        "ID",
        "Binomial",
        "Citation",
        "Class",
        "Order",
        "Family",
        "Genus",
        "Species",
        "Subspecies",
        "Common_name",
        "Location",
        "Country",
        "Latitude",
        "Longitude",
        "Specific_location",
        "Units",
    ],
    # Year columns are labelled with strings, hence the str conversion.
    value_vars=list(map(str, range(1950, 2019))),
)

In [15]:
# Give the melted "variable" column (the former year column labels) the
# name "Year".
df_long = df_long.rename({"variable": "Year"}, axis="columns")

## Making a dataset container

In [16]:
# Create an empty dataset container in dest_dir.
ds = catalog.Dataset.create_empty(dest_dir)
# Dataset-level metadata is derived from the walden catalog entry.
ds.metadata = converters.convert_walden_metadata(raw_dataset)
# Save the dataset after the metadata has been attached
# (presumably this writes the metadata into dest_dir — confirm).
ds.save()

## Adding LPD table to container

In [22]:
# Wrap the long-format frame in a catalog Table and label it.
# (`utils` comes from the import cell at the top of the notebook, so this
# cell no longer carries its own import.)
t = catalog.Table(df_long)
t.metadata.short_name = "living_planet_database"
t.metadata.title = "Living Planet Database - animal population trends 1950-2018"

# Add the table to the dataset container after passing it through
# underscore_table (presumably normalises names to snake_case — confirm).
ds.add(utils.underscore_table(t))