# DwC Occurrences. Keister Zooplankton Hood Canal 2012-13 data

University of Washington Pelagic Hypoxia Hood Canal project, Zooplankton dataset.

5/8,4, 4/5-4, 3/28,7, 2023. 2022-3-14, 1-12

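This notebook builds the Darwin Core (DwC) Occurrence table for the dataset. It reads the pre-processed source data, merges in the resolved taxonomy, adds `occurrenceID`, `basisOfRecord` and `occurrenceStatus`, maps `life_history_stage` into `sex` and `lifeStage` (plus their `dwciri` terms), and exports the result to csv.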

In [1]:
from datetime import datetime, timezone
import json
from pathlib import Path
import uuid

import numpy as np
import pandas as pd

from data_preprocess import read_and_parse_sourcedata

In [2]:
# Base directory against which the JSON mappings, the taxonomy csv and the
# exported csv files in this notebook are resolved.
data_pth: Path = Path(".")

## Settings

Set to `True` when debugging. `csv` files will not be exported when `debug_no_csvexport = True`.

In [3]:
# When True, the csv export cells further down are skipped.
debug_no_csvexport: bool = False

## Process JSON file containing common mappings and strings

In [4]:
# Load the shared mappings and strings used by the cells below.
# The encoding is given explicitly so that reading the JSON file does not
# depend on the platform's default locale encoding.
with open(data_pth / 'common_mappings.json', encoding='utf-8') as f:
    common_mappings = json.load(f)

In [5]:
DatasetCode = common_mappings['datasetcode']
sex_dwciri_terms = common_mappings['sex_dwciri_terms']
life_stage_mappings = common_mappings['life_stage_mappings']

# Split `life_stage_mappings` into two lookups that share its keys:
# element 0 of each value feeds `lifeStage`, element 1 feeds `dwciri:lifeStage`.
lifeStage_mapping = {}
lifeStage_dwciri_terms = {}
for stage_key, stage_terms in life_stage_mappings.items():
    lifeStage_mapping[stage_key] = stage_terms[0]
    lifeStage_dwciri_terms[stage_key] = stage_terms[1]

## Read the data

### Read the pre-processed csv file

`usecols` defines the columns that will be kept and the order in which they'll be organized

In [6]:
# Columns kept from the parsed source data, listed in the order they should appear.
usecols = [
    'sample_code',
    'species',
    'life_history_stage',
    'lhs_0',
    'lhs_1',
]
occursource_df = read_and_parse_sourcedata().loc[:, usecols]

In [7]:
# Number of rows in the parsed source data.
len(occursource_df.index)

6884

In [8]:
# Preview the first five rows of the parsed source data.
occursource_df.head(n=5)

Unnamed: 0,sample_code,species,life_history_stage,lhs_0,lhs_1
0,20131003DBDm2_200,ACARTIA,3;_CIII,3,CIII
1,20130906DBiDm1_200,ACARTIA,5;_CV,5,CV
2,20131003DBDm1_200,ACARTIA,Female;_Adult,Female,Adult
3,20131003DBDm1_200,ACARTIA,Male;_Adult,Male,Adult
4,20120614DBDm3_200,ACARTIA_CLAUSI,Female;_Adult,Female,Adult


## Merge resolved taxonomy from taxonomy csv

In [9]:
# Read the intermediate taxonomy csv that is merged into the source data below.
taxonomy_csv_path = data_pth / "intermediate_DwC_taxonomy.csv"
taxonomy_df = pd.read_csv(taxonomy_csv_path)

In [10]:
# Attach the taxonomy columns to each source row by matching the source
# `species` label against the taxonomy table's `verbatimIdentification`.
# The inner join keeps only rows whose label has a match in the taxonomy table.
occursource_df = pd.merge(
    occursource_df,
    taxonomy_df,
    left_on='species',
    right_on='verbatimIdentification',
    how='inner',
)

In [11]:
# Number of rows left after the taxonomy merge.
len(occursource_df.index)

6871

In [12]:
# Preview the first five merged rows, now including the taxonomy columns.
occursource_df.head(n=5)

Unnamed: 0,sample_code,species,life_history_stage,lhs_0,lhs_1,scientificName,scientificNameID,taxonRank,kingdom,phylum,class,order,family,genus,scientificNameAuthorship,verbatimIdentification
0,20131003DBDm2_200,ACARTIA,3;_CIII,3,CIII,Acartia,urn:lsid:marinespecies.org:taxname:104108,Genus,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Dana, 1846",ACARTIA
1,20130906DBiDm1_200,ACARTIA,5;_CV,5,CV,Acartia,urn:lsid:marinespecies.org:taxname:104108,Genus,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Dana, 1846",ACARTIA
2,20131003DBDm1_200,ACARTIA,Female;_Adult,Female,Adult,Acartia,urn:lsid:marinespecies.org:taxname:104108,Genus,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Dana, 1846",ACARTIA
3,20131003DBDm1_200,ACARTIA,Male;_Adult,Male,Adult,Acartia,urn:lsid:marinespecies.org:taxname:104108,Genus,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Dana, 1846",ACARTIA
4,20130905DUNm1_200,ACARTIA,5;_CV,5,CV,Acartia,urn:lsid:marinespecies.org:taxname:104108,Genus,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Dana, 1846",ACARTIA


## Create and populate `occurrence_df` dataframe

### Add `occurrenceID`, `basisOfRecord`, `occurrenceStatus`

In [13]:
# Work on an independent copy so the merged source table stays untouched.
occurrence_df = occursource_df.copy(deep=True)

In [14]:
# Columns inserted right after the first column, in this order. Each row gets
# its own freshly generated random UUID; the other two columns are constants.
new_columns = (
    ('occurrenceID', [uuid.uuid4() for _ in range(len(occurrence_df))]),
    ('basisOfRecord', 'MaterialSample'),
    ('occurrenceStatus', 'present'),
)
for column_position, (column_name, column_value) in enumerate(new_columns, start=1):
    occurrence_df.insert(column_position, column_name, column_value)

# The source `sample_code` column is exposed under the name `eventID`.
occurrence_df.rename(columns={'sample_code': 'eventID'}, inplace=True)

In [15]:
# Number of rows in the occurrence table after adding the new columns.
len(occurrence_df.index)

6871

In [16]:
# Preview the first ten rows with the newly inserted columns.
occurrence_df.head(n=10)

Unnamed: 0,eventID,occurrenceID,basisOfRecord,occurrenceStatus,species,life_history_stage,lhs_0,lhs_1,scientificName,scientificNameID,taxonRank,kingdom,phylum,class,order,family,genus,scientificNameAuthorship,verbatimIdentification
0,20131003DBDm2_200,4a0bb9e9-51dd-4544-95f6-6d74bed08acb,MaterialSample,present,ACARTIA,3;_CIII,3,CIII,Acartia,urn:lsid:marinespecies.org:taxname:104108,Genus,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Dana, 1846",ACARTIA
1,20130906DBiDm1_200,1e3118ce-a285-47de-88ae-2eb2ca425118,MaterialSample,present,ACARTIA,5;_CV,5,CV,Acartia,urn:lsid:marinespecies.org:taxname:104108,Genus,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Dana, 1846",ACARTIA
2,20131003DBDm1_200,0aa579b0-94f6-47c0-8fda-01f225ae0e52,MaterialSample,present,ACARTIA,Female;_Adult,Female,Adult,Acartia,urn:lsid:marinespecies.org:taxname:104108,Genus,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Dana, 1846",ACARTIA
3,20131003DBDm1_200,890f26e9-c9cd-467c-b4e2-07282b20cfcd,MaterialSample,present,ACARTIA,Male;_Adult,Male,Adult,Acartia,urn:lsid:marinespecies.org:taxname:104108,Genus,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Dana, 1846",ACARTIA
4,20130905DUNm1_200,d4deb48e-b5de-42cd-a61f-2241db0ab2da,MaterialSample,present,ACARTIA,5;_CV,5,CV,Acartia,urn:lsid:marinespecies.org:taxname:104108,Genus,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Dana, 1846",ACARTIA
5,20120614DBDm3_200,19e71d22-be86-4c0e-b8a9-934ba97be144,MaterialSample,present,ACARTIA_CLAUSI,Female;_Adult,Female,Adult,Acartia clausi,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI
6,20120614DBDm3_200,41b8fd6f-8d2f-448d-a522-b5b08f23243a,MaterialSample,present,ACARTIA_CLAUSI,Male;_Adult,Male,Adult,Acartia clausi,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI
7,20120614DBDm4_200,c859878e-0fa4-4e5b-badc-afc9da179841,MaterialSample,present,ACARTIA_CLAUSI,Female;_Adult,Female,Adult,Acartia clausi,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI
8,20120712DBDm1_200,102bd25a-6dd0-4d56-91e8-2f1d92a63554,MaterialSample,present,ACARTIA_CLAUSI,5;_CV,5,CV,Acartia clausi,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI
9,20121004DBDm4_200,c1e13cd2-1137-432a-b8cf-e458adef501f,MaterialSample,present,ACARTIA_CLAUSI,Female;_Adult,Female,Adult,Acartia clausi,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI


### Map `life_history_stage` into `sex` and `lifeStage`

In [17]:
# dwc standard columns
occurrence_df.insert(
    4, 'sex', 
    occurrence_df['lhs_0'].apply(
        lambda s: s.lower() if s in ['Female', 'Male'] else 'indeterminate'
    )
)
occurrence_df.insert(
    5, 'lifeStage', 
    occurrence_df['lhs_0'].apply(lambda s: lifeStage_mapping[s])
)

# dwciri columns
nercvocabs_url = "http://vocab.nerc.ac.uk/collection/"
occurrence_df.insert(
    6, 'dwciri:sex', 
    occurrence_df['sex'].apply(lambda s: nercvocabs_url + sex_dwciri_terms[s])
)
occurrence_df.insert(
    7, 'dwciri:lifeStage', 
    occurrence_df['lhs_0'].apply(
        lambda s: None if lifeStage_dwciri_terms[s] is None 
        else nercvocabs_url + lifeStage_dwciri_terms[s]
    )
)

In [18]:
# Order the rows by event, then scientific name, life stage and sex,
# and renumber the index from 0 so it follows the new order.
sort_keys = ['eventID', 'scientificName', 'lifeStage', 'sex']
occurrence_df = occurrence_df.sort_values(by=sort_keys)
occurrence_df = occurrence_df.reset_index(drop=True)

In [19]:
# Preview the first five rows of the sorted occurrence table.
occurrence_df.head(n=5)

Unnamed: 0,eventID,occurrenceID,basisOfRecord,occurrenceStatus,sex,lifeStage,dwciri:sex,dwciri:lifeStage,species,life_history_stage,...,scientificNameID,taxonRank,kingdom,phylum,class,order,family,genus,scientificNameAuthorship,verbatimIdentification
0,20120611UNDm1_200,5a6e5651-1814-4cdf-b4ca-a2b29982030b,MaterialSample,present,female,adult,http://vocab.nerc.ac.uk/collection/S10/current...,http://vocab.nerc.ac.uk/collection/S11/current...,ACARTIA_CLAUSI,Female;_Adult,...,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI
1,20120611UNDm1_200,9e463465-6836-4928-9a09-d1465f81bc9b,MaterialSample,present,male,adult,http://vocab.nerc.ac.uk/collection/S10/current...,http://vocab.nerc.ac.uk/collection/S11/current...,ACARTIA_CLAUSI,Male;_Adult,...,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI
2,20120611UNDm1_200,621edd83-ce74-4c23-8e31-b5afda51b56f,MaterialSample,present,indeterminate,copepodites C5,http://vocab.nerc.ac.uk/collection/S10/current...,http://vocab.nerc.ac.uk/collection/S11/current...,ACARTIA_CLAUSI,5;_CV,...,urn:lsid:marinespecies.org:taxname:104251,Species,Animalia,Arthropoda,Copepoda,Calanoida,Acartiidae,Acartia,"Giesbrecht, 1889",ACARTIA_CLAUSI
3,20120611UNDm1_200,265618fe-4aed-4a6b-b061-6b2ad5e490cc,MaterialSample,present,indeterminate,medusae,http://vocab.nerc.ac.uk/collection/S10/current...,http://vocab.nerc.ac.uk/collection/S11/current...,AGLANTHA,Medusa,...,urn:lsid:marinespecies.org:taxname:117212,Genus,Animalia,Cnidaria,Hydrozoa,Trachymedusae,Rhopalonematidae,Aglantha,"Haeckel, 1879",AGLANTHA
4,20120611UNDm1_200,3c0eb533-378c-4738-9c42-f1eb4155e9b4,MaterialSample,present,indeterminate,veliger,http://vocab.nerc.ac.uk/collection/S10/current...,http://vocab.nerc.ac.uk/collection/S11/current...,BIVALVIA,Veliger,...,urn:lsid:marinespecies.org:taxname:105,Class,Animalia,Mollusca,Bivalvia,,,,"Linnaeus, 1758",BIVALVIA


## Export intermediate table for `life_history_stage` matching to csv

In [20]:
# Write the `occurrenceID` / `life_history_stage` pairs to an intermediate csv,
# unless csv export is disabled for debugging.
if not debug_no_csvexport:
    lhs_csv_path = data_pth / 'intermediate_DwC_occurrence_life_history_stage.csv'
    lhs_lookup_df = occurrence_df.loc[:, ['occurrenceID', 'life_history_stage']]
    lhs_lookup_df.to_csv(lhs_csv_path, index=False)

## Export `occurrence_df` to csv

### Cleanup

In [21]:
# Drop the working columns so they are not written to the exported occurrence csv.
occurrence_df.drop(columns=['species', 'life_history_stage', 'lhs_0', 'lhs_1'], inplace=True)

if not debug_no_csvexport:
    # Make sure the output folder exists first: `to_csv` does not create
    # missing directories and would fail on a fresh checkout.
    aligned_csvs_dir = data_pth / 'aligned_csvs'
    aligned_csvs_dir.mkdir(parents=True, exist_ok=True)
    occurrence_df.to_csv(aligned_csvs_dir / 'DwC_occurrence.csv', index=False)

## Package versions

In [22]:
# Record when the notebook was run (UTC) and with which pandas version.
# `datetime.utcnow()` is deprecated since Python 3.12, so take an aware UTC
# time and strip its tzinfo; the printed "<timestamp> +00:00" layout is unchanged.
run_time_utc = datetime.now(timezone.utc).replace(tzinfo=None)
print(
    f"{run_time_utc} +00:00\n"
    f"pandas: {pd.__version__}"
)

2023-05-09 07:28:26.729078 +00:00
pandas: 1.5.3
