## This Notebook integrates all the wild data sources.
### This includes our data from a variety of sources, as well as data from Salmon Coast Field Station, Cedar Coast Field Station, and the Hakai Institute.

In [193]:
import pandas as pd
from pathlib import Path

## Configuration variables for this notebook

In [194]:

# -- INPUT --

# directory holding every wild-data CSV this notebook reads and writes
wild_data_dir = Path('.')

# our own sample events and per-fish lice counts
events_filepath = wild_data_dir.joinpath('wild_sample_events.csv')
fish_lice_filepath = wild_data_dir.joinpath('wild_fish_lice.csv')

# Salmon Coast Field Station
scfs_events_filepath = wild_data_dir.joinpath('salmon_coast_wild_sample_events.csv')
scfs_fish_lice_filepath = wild_data_dir.joinpath('salmon_coast_wild_fish_lice.csv')

# Cedar Coast Field Station
ccfs_events_filepath = wild_data_dir.joinpath('cedar_coast_wild_sample_events.csv')
ccfs_fish_lice_filepath = wild_data_dir.joinpath('cedar_coast_wild_fish_lice.csv')

# Hakai Institute
hakai_events_filepath = wild_data_dir.joinpath('hakai_wild_sample_events.csv')
hakai_fish_lice_filepath = wild_data_dir.joinpath('hakai_wild_fish_lice.csv')

# -- OUTPUT --

# destinations for the combined (all sources) tables
all_events_filepath = wild_data_dir.joinpath('all_wild_sample_events.csv')
all_fish_lice_filepath = wild_data_dir.joinpath('all_wild_fish_lice.csv')

## Join all the event data

In [195]:
# read the four sample-event tables, parsing 'sampledate' as a datetime in each
events_df, scfs_events_df, ccfs_events_df, hakai_events_df = (
    pd.read_csv(csv_path, parse_dates=['sampledate'])
    for csv_path in (
        events_filepath,
        scfs_events_filepath,
        ccfs_events_filepath,
        hakai_events_filepath,
    )
)

In [196]:
# confirm that read_csv parsed the Hakai sample dates into a datetime dtype
hakai_events_df['sampledate'].dtype

dtype('<M8[ns]')

In [197]:
# stack the event tables from every source into one frame with a fresh 0..n-1 index
all_events_df = pd.concat(
    [events_df, scfs_events_df, ccfs_events_df, hakai_events_df],
    ignore_index=True,
    sort=False,
)
# list the distinct source names present after the concat
all_events_df['source'].unique()

array(['Fisheries and Oceans Canada', 'Mainstream Biological Consulting',
       'Broughton Archipelago Monitoring Plan',
       'Marine Environmental Research Program', 'Marty Krkosek',
       'Pacificus Biological Services', 'Kitasoo First Nation',
       'Salmon Coast Field Station', 'Cedar Coast Field Station',
       'Hakai Institute'], dtype=object)

In [198]:
# short codes for each source name - redundant with 'source' but quicker to type
source_code_mapping = {
    'Fisheries and Oceans Canada': 'DFO',
    'Mainstream Biological Consulting': 'MBC',
    'Broughton Archipelago Monitoring Plan': 'BAMP',
    'Marine Environmental Research Program': 'MERP',
    'Marty Krkosek': 'MK',
    'Pacificus Biological Services': 'Pacif',
    'Kitasoo First Nation': 'Kit',
    'Salmon Coast Field Station': 'SCS',
    'Cedar Coast Field Station': 'CC',
    'Hakai Institute': 'Hak',
}
# dict.get yields None for any source name that has no entry in the mapping
all_events_df['source_code'] = all_events_df['source'].apply(source_code_mapping.get)
all_events_df['source_code'].unique()

array(['DFO', 'MBC', 'BAMP', 'MERP', 'MK', 'Pacif', 'Kit', 'SCS', 'CC',
       'Hak'], dtype=object)

In [199]:
# save the combined events table as CSV, leaving out the row index
all_events_df.to_csv(path_or_buf=all_events_filepath, index=False)

## Join all the fish/lice data

In [200]:
# read the four per-fish lice tables with default parsing
fish_lice_df, scfs_fish_lice_df, ccfs_fish_lice_df, hakai_fish_lice_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        fish_lice_filepath,
        scfs_fish_lice_filepath,
        ccfs_fish_lice_filepath,
        hakai_fish_lice_filepath,
    )
)

In [201]:
# list the distinct species names in the Salmon Coast fish data
scfs_fish_lice_df['fish_species'].unique()

array(['Pink Salmon', 'Chum Salmon', 'Sockeye Salmon'], dtype=object)

In [202]:
# stack the fish/lice tables from every source into one frame with a fresh 0..n-1 index
all_fish_lice_df = pd.concat(
    [fish_lice_df, scfs_fish_lice_df, ccfs_fish_lice_df, hakai_fish_lice_df],
    ignore_index=True,
    sort=False,
)
# preview the first rows of the combined table
all_fish_lice_df.head()

Unnamed: 0,event_id,fish_id,length,weight,height,fish_species,lep_cop,lep_chal,lep_motile,lep_unknown,cal_cop,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown,lice_protocol
0,5666,1,44.0,0.96,,Chum Salmon,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,5666,2,39.0,0.61,,Pink Salmon,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,5666,3,45.0,0.94,,Pink Salmon,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,5666,4,43.0,0.84,,Chum Salmon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,5666,5,38.0,0.53,,Pink Salmon,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [203]:
# for our processing we are interested in lethal vs non-lethal lice protocols
def set_lice_protocol(source_code, protocol_value, events_df=None, fish_lice_df=None):
    """
    Sets the lice protocol value on every fish row belonging to an event from the given source.

    Events are selected by their ``source_code``; fish rows are matched to those events by
    ``event_id`` and their ``lice_protocol`` field is overwritten in place.

    :param source_code: Code for the source of the fish data (matches one of source_code in the events table)
    :type source_code: str
    :param protocol_value: The value to put in the lice_protocol field
    :type protocol_value: str
    :param events_df: Events table with ``source_code`` and ``event_id`` columns;
        defaults to the notebook-level ``all_events_df``
    :type events_df: pandas.DataFrame or None
    :param fish_lice_df: Fish table with ``event_id`` and ``lice_protocol`` columns, modified in place;
        defaults to the notebook-level ``all_fish_lice_df``
    :type fish_lice_df: pandas.DataFrame or None
    :raises ValueError: if no event carries the given source code
    :return: None
    """
    # fall back to the notebook's combined tables when none are passed in
    if events_df is None:
        events_df = all_events_df
    if fish_lice_df is None:
        fish_lice_df = all_fish_lice_df

    source_event_ids = events_df.loc[events_df['source_code'] == source_code, 'event_id'].unique()
    # an explicit raise (rather than assert) so a mistyped code is still caught under `python -O`
    if len(source_event_ids) == 0:
        raise ValueError(f"no events found for source code {source_code!r}")

    fish_lice_df.loc[fish_lice_df['event_id'].isin(source_event_ids), 'lice_protocol'] = protocol_value

In [204]:
# assign a lice_protocol value per source; applied in the order listed
for _source_code, _protocol in (
    ('SCS', 'Non-lethal'),
    ('MK', 'Non-lethal'),
    ('CC', 'Non-lethal'),
    ('Kit', 'Lethal'),
    ('DFO', 'Lethal'),
    ('BAMP', 'Lethal'),
    ('MERP', 'Lethal'),
    ('MBC', 'Lethal'),
    ('Pacif', 'Lethal'),
    ('Hak', 'Mixed'),
):
    set_lice_protocol(_source_code, _protocol)

# list the distinct protocol values now present
all_fish_lice_df['lice_protocol'].unique()

array(['Lethal', 'Non-lethal', nan, 'Mixed'], dtype=object)

In [205]:
# spot-check: show the protocol values on fish rows tied to Cedar Coast ('CC') events
source_event_ids = list(all_events_df.loc[all_events_df['source_code'] == 'CC', 'event_id'].unique())
assert source_event_ids
all_fish_lice_df.loc[all_fish_lice_df['event_id'].isin(source_event_ids), 'lice_protocol'].unique()
# to list the rows that still have no protocol, run instead:
# all_fish_lice_df[pd.isnull(all_fish_lice_df['lice_protocol'])]

array(['Non-lethal'], dtype=object)

In [206]:
# save the combined fish/lice table as CSV, leaving out the row index
all_fish_lice_df.to_csv(path_or_buf=all_fish_lice_filepath, index=False)

In [207]:
# left join: keep every event row and attach the fish rows that share its event_id
wild_df = all_events_df.merge(all_fish_lice_df, on='event_id', how='left')

In [208]:
# check the dtype of the sample date column after the merge
wild_df['sampledate'].dtype

dtype('<M8[ns]')