## This Notebook formats the Hakai Institute wild sampling data to fit the format of our wild sampling data.
### To integrate it with other wild data, run 'Integrate_Wild_Data.ipynb' after generating the formatted data.

The data was downloaded on 12 March 2025 from [GitHub](https://github.com/HakaiInstitute/jsp-data) linked from the [Hakai site](https://catalogue.hakai.org/dataset/ca-cioos_6c449900-c726-4e9a-b241-707711e253a7)

In [789]:
import pandas as pd
from pathlib import Path

### Path configuration variables

In [790]:
raw_data_dir = Path('.') / 'Hakai' / 'supplemental_materials' / 'raw_data'

sites_filepath = raw_data_dir / 'sites.csv'
seine_filepath = raw_data_dir / 'seine_data.csv'
survey_filepath = raw_data_dir / 'survey_data.csv'

fish_field_filepath = raw_data_dir / 'fish_field_data.csv'
fish_lab_filepath = raw_data_dir / 'fish_lab_data.csv'

sealice_dir = raw_data_dir / 'sample_results' / 'sealice'
sealice_lab_fs_filepath = sealice_dir / 'sealice_lab_fs.csv'
sealice_lab_mot_filepath = sealice_dir / 'sealice_lab_mot.csv'
sealice_field_filepath = sealice_dir / 'sealice_field.csv'

# output paths for writing the formatted Hakai Institute data
wild_data_dir = Path('.')
hakai_formatted_events_filepath = wild_data_dir / 'hakai_wild_sample_events.csv'
hakai_formatted_lice_filepath = wild_data_dir / 'hakai_wild_fish_lice.csv'

### Load the dataframes

In [791]:
sites_df = pd.read_csv(sites_filepath)
seine_df = pd.read_csv(seine_filepath)
survey_df = pd.read_csv(survey_filepath, parse_dates=['survey_date'])

fish_field_df = pd.read_csv(fish_field_filepath)
fish_lab_df = pd.read_csv(fish_lab_filepath)

sealice_field_df = pd.read_csv(sealice_field_filepath)
sealice_lab_fs_df = pd.read_csv(sealice_lab_fs_filepath, parse_dates=['date_liced'])
sealice_lab_mot_df = pd.read_csv(sealice_lab_mot_filepath)

In [792]:
# survey_date is a full datetime with timezone - make timezone naive as it makes combining with other data easier
survey_df['survey_date'] = survey_df.survey_date.dt.tz_localize(None)
survey_df.survey_date

0     2020-05-15
1     2020-05-19
2     2020-05-22
3     2020-05-26
4     2020-05-27
         ...    
777   2023-06-28
778   2023-06-29
779   2023-07-03
780   2023-07-04
781   2023-07-05
Name: survey_date, Length: 782, dtype: datetime64[ns]

# Make the events file
The sample_events that we use are most similar to seine_data in the Hakai data.
We use survey_data to get the date. We use sites to get a string name of the site, and the region.

In [793]:
seine_survey_df = pd.merge(seine_df, survey_df, on='survey_id', how='left')
seine_survey_site_df = pd.merge(seine_survey_df, sites_df, on='site_id', how='left')
seine_survey_site_df.columns

Index(['seine_id', 'survey_id', 'set_number', 'set_type', 'lat', 'long',
       'set_time', 'time_searching', 'set_sliders', 'set_poppers',
       'set_dimpling', 'fish_retained', 'so_taken', 'so_total', 'pi_taken',
       'pi_total', 'cu_taken', 'cu_total', 'co_taken', 'co_total', 'he_taken',
       'he_total', 'ck_taken', 'ck_total', 'collection_protocol',
       'seine_comments', 'seine_quality_log', 'survey_date', 'site_id', 'crew',
       'precip', 'cloud_cover', 'sea_state', 'wind_speed', 'wind_direction',
       'tide_state', 'survey_time_start', 'survey_time_end', 'net_sets',
       'secchi', 'ysi_bout', 'ctd_bout', 'zoop_bout', 'survey_type',
       'survey_comments', 'site_name', 'region', 'zone', 'site_priority',
       'pfma', 'site_notes', 'survey_start_lat', 'survey_start_lon',
       'survey_end_lat', 'survey_end_lon', 'ocgy_std_lat', 'ocgy_std_lon'],
      dtype='object')

In [794]:
seine_survey_site_df.region.unique()

array(['DI', 'JS'], dtype=object)

In [795]:
# use the seine_id as the event_id and prepend 'hakai_' to make unique among other wild data
seine_survey_site_df['event_id'] = seine_survey_df.seine_id.apply(lambda eid: 'hakai_' + str(eid))

In [796]:
# rename some columns to fit the format
seine_survey_site_df.rename(columns={'survey_date': 'sampledate',
                                'lat': 'latitude',
                                'long': 'longitude',
                                'site_name': 'sample_site'},
                       inplace=True)

In [797]:
# mapping from Hakai regions to the regions we use and the DFO zones
region_map = {'DI': ('Discovery Islands', '3_2'),
              'JS': ('Broughton Archipelago', '3_3')}
# add/rename the column values. Have to do dfozone first.
seine_survey_site_df['dfozone'] = seine_survey_site_df.region.apply(
    lambda x: region_map[x][1] if x in region_map else None)
seine_survey_site_df['region'] = seine_survey_site_df.region.apply(
    lambda x: region_map[x][0] if x in region_map else None)

seine_survey_site_df.region.unique()

# Region mapping Hakai regions to DFO zones is not perfect. Some of the JS (Johnstone Straight) sites are on
# the border of Discovery and Broughton.
# We set all sites with longitude between -126.5 and -125.9 to None so they are not used in any region grouping
seine_survey_site_df['region'] = seine_survey_site_df.apply(
    lambda row: None if pd.isnull(row.longitude) or -126.5 < row.longitude < -125.9 else row.region,
    axis=1
)
seine_survey_site_df['dfozone'] = seine_survey_site_df.apply(
    lambda row: None if pd.isnull(row.longitude) or -126.5 < row.longitude < -125.9 else row.dfozone,
    axis=1
)

# seine_survey_site_df.head()

In [798]:
seine_survey_site_df.region.unique()

array([None, 'Discovery Islands', 'Broughton Archipelago'], dtype=object)

In [799]:
# outputs some lat/long files for visual debugging in mapping software, e.g. QGIS
ba_df = seine_survey_site_df[seine_survey_site_df['region'] == 'Broughton Archipelago'][['latitude', 'longitude']]
ba_df.to_csv('BA.csv', index=False)
di_df = seine_survey_site_df[seine_survey_site_df['region'] == 'Discovery Islands'][['latitude', 'longitude']]
di_df.to_csv('DI.csv', index=False)
na_df = seine_survey_site_df[pd.isnull(seine_survey_site_df['region'])][['latitude', 'longitude']]
na_df.to_csv('NA.csv', index=False)

In [800]:
seine_survey_site_df['source'] = 'Hakai Institute'

In [801]:
# restrict to just the required columns
hakai_events_df = seine_survey_site_df.reindex(columns=['event_id', 'sampledate', 'region', 'dfozone', 'sample_site',
                                                        'latitude', 'longitude', 'source'])
hakai_events_df.head()

Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source
0,hakai_DE100N2,2015-05-12,,,Hurtado Point,,,Hakai Institute
1,hakai_DE100N1,2015-05-12,,,Hurtado Point,,,Hakai Institute
2,hakai_DE100N3,2015-05-12,Discovery Islands,3_2,Hurtado Point,49.9613,-124.747,Hakai Institute
3,hakai_DE101N1,2015-05-13,Discovery Islands,3_2,Kinghorn Island,50.07902,-124.8509,Hakai Institute
4,hakai_DE102N1,2015-05-14,Discovery Islands,3_2,Francisco Point,49.99932,-125.1395,Hakai Institute


In [802]:
hakai_events_df.to_csv(hakai_formatted_events_filepath, index=False)

# Make the Fish Lice file
Hakai data has two each fish data files - labelled Lab and Field. There are three lice files: field, lab_fs, and mot.
Our interpretation:
Fish are collected in the field in a seine event. All the fish are documented in the fish_field_data, some of them have lice counts
documented in sealice_field. Some of the fish are also sent to the lab and some of those have lice counts documented in sealice_lab_fs.
Some fish (distinct from lab_fs) underwent a different lab analysis just counting motiles - they are in lab_mot

We use lab counts of sea lice if they are available, and default to field counts if they are not.

In [803]:
fish_field_df.head()

Unnamed: 0,ufn,semsp_id,seine_id,species,package_id,fish_time_out,fish_time_dewar,fork_length_field,height_field,weight_field,lice_id_protocol_field,lice_presence_absence,analysis_planned,fish_field_comments,quality_log
0,U5531,2015-05-12-D01-CO-1,DE100N3,CO,,,,,,,,,SEMSP,,
1,U5521,2015-05-12-D01-CU-1,DE100N3,CU,,,,,,,,,SEMSP,,
2,U5530,2015-05-12-D01-CU-10,DE100N3,CU,,,,,,,,,SEMSP,,
3,U5522,2015-05-12-D01-CU-2,DE100N3,CU,,,,,,,,,SEMSP,,
4,U5523,2015-05-12-D01-CU-3,DE100N3,CU,,,,,,,,,SEMSP,,


In [804]:
fish_lab_df.head()

Unnamed: 0,ufn,date_processed,cwt,adipose,weight,standard_length,fork_length,comments_protocol,comments_fish_health_lab,dissector,dissection_protocol,lice_collection_protocol,lice_id_protocol_lab
0,U01,2015-09-10T00:00:00Z,,Present,,,118.0,From thawed JS fish. One otolith collected,,CK,nocold,lab_fine_2015,
1,U02,2015-09-10T00:00:00Z,,Present,22.2,,128.0,From thawed JS fish,Small spleen,CK,nocold,lab_fine_2015,
2,U11,2015-09-10T00:00:00Z,,Present,,,132.0,From thawed JS fish,,KI,nocold,lab_fine_2015,
3,U12,2015-09-10T00:00:00Z,,Present,,,143.0,From thawed JS fish. One broken otolith,One broken otolith,KI,nocold,lab_fine_2015,
4,U03,2015-09-11T00:00:00Z,,Present,24.0,,134.0,From thawed JS fish. Sample weighed w/ some ic...,liver was pale,DS,nocold,lab_fine_2015,


In [805]:
# collect all info on the fish.
# lab fish is a subset of field fish so merge on field ufn
fish_df = pd.merge(fish_field_df, fish_lab_df, on='ufn', how='left')

In [806]:
sealice_field_df.head()

Unnamed: 0,ufn,cal_cope_field,lep_cope_field,chal_a_field,chal_b_field,cal_mot_field,cgf_field,lpam_field,lpaf_field,lam_field,...,cs,ms,ps,hem,egp,ggp,mg,pb,comments_fish_health_field,quality_log
0,U11324,0.0,0.0,0.0,0.0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,H:ANF,
1,U11325,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,
2,U11326,0.0,0.0,0.0,0.0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,Dorsal surface lesion,
3,U11327,0.0,0.0,0.0,0.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,
4,U11328,0.0,0.0,9.0,3.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,,


In [807]:
sealice_lab_fs_df.head()

Unnamed: 0,ufn,sample_id,date_liced,lab_count_fine_no_id,lab_count_fine_total,lep_cop,lep_cunifer_cop,lep_chal_a,lep_chal_b,lep_pa_m_1,...,cal_pa_f,cal_a_m,cal_a_f,cal_grav_f,cal_mot_unid,unid_louse,lab_staff,comments,quality_level,quality_log
0,U929,S929SL1,2016-11-15,11.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,LP,Cal C4 UNID missing tail,Raw,
1,U997,S997SL1,2016-11-15,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,LP,,Raw,
2,U273,S273SL1,2016-11-17,4.0,4.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,LP,,Raw,
3,U357,S357SL1,2016-11-17,16.0,15.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,LP,,Raw,
4,U394,S394SL1,2016-11-17,5.0,6.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,LP,,Raw,


In [808]:
sealice_lab_mot_df.head()

Unnamed: 0,ufn,cm_lab,cpaf_lab,caf_lab,cgf_lab,ucal_lab,lpaf_lab,lpam_lab,lam_lab,laf_lab,lgf_lab,ulep_lab,lab_count_motiles,comments,quality_level,quality_log
0,U11001,0,0,0,0,0,0,0,0,0,0,0,0,,Raw,
1,U11002,0,0,0,0,0,0,0,0,0,0,0,0,,Raw,
2,U11003,0,0,0,0,0,1,0,0,0,0,0,1,,Raw,
3,U11004,0,0,0,0,0,0,0,0,0,0,0,0,,Raw,
4,U11005,0,0,0,0,0,0,0,0,0,0,0,0,,Raw,


In [809]:
# join the three sealice dataframes together
# the sets have a small overlap but are mostly distinct, so merge keeping everything
lice_df = pd.merge(sealice_field_df, sealice_lab_fs_df, on='ufn', how='outer')
lice_df = pd.merge(lice_df, sealice_lab_mot_df, on='ufn', how='outer')
lice_df.head()

Unnamed: 0,ufn,cal_cope_field,lep_cope_field,chal_a_field,chal_b_field,cal_mot_field,cgf_field,lpam_field,lpaf_field,lam_field,...,lpaf_lab,lpam_lab,lam_lab,laf_lab,lgf_lab,ulep_lab,lab_count_motiles,comments_y,quality_level_y,quality_log
0,U11324,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Raw,
1,U11325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Raw,
2,U11326,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Raw,
3,U11327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Raw,
4,U11328,0.0,0.0,9.0,3.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,Raw,


In [810]:
print("field: {}\nfs: {}\nmot: {}\ntotal: {}".format(len(sealice_field_df), len(sealice_lab_fs_df),
                                                     len(sealice_lab_mot_df), len(lice_df)))

field: 2411
fs: 1469
mot: 4580
total: 7509


In [811]:
set.intersection(set(fish_df.columns), set(lice_df.columns))

{'quality_log', 'ufn'}

In [812]:
# this duplicate column makes things complicated later and we don't use it, so get rid of it here
fish_df.drop('quality_log', axis=1, inplace=True)
lice_df.drop('quality_log', axis=1, inplace=True)

In [813]:
# merge fish and lice together
fish_lice_df = pd.merge(fish_df, lice_df, on='ufn', how='left')

In [814]:
fish_lice_df.species.unique()

array(['CO', 'CU', 'HE', 'SO', 'PI', 'CK'], dtype=object)

In [815]:
# convert columns to our names and format
fish_lice_df['event_id'] = fish_lice_df.seine_id.apply(lambda sid: 'hakai_' + sid)
fish_lice_df['fish_id'] = fish_lice_df.ufn.apply(lambda ufn: 'hakai_' + ufn)
fish_lice_df.replace({'species': {'SO': 'Sockeye Salmon',
                                  'PI': 'Pink Salmon',
                                  'CU': 'Chum Salmon',
                                  'CO': 'Coho Salmon',
                                  'CK': 'Chinook Salmon',
                                  'HE': 'Pacific Herring'}},
                     inplace=True)
fish_lice_df.rename(columns={'species': 'fish_species'}, inplace=True)

In [816]:
sealice_field_df.columns

Index(['ufn', 'cal_cope_field', 'lep_cope_field', 'chal_a_field',
       'chal_b_field', 'cal_mot_field', 'cgf_field', 'lpam_field',
       'lpaf_field', 'lam_field', 'laf_field', 'lgf_field', 'unid_cope_field',
       'unid_chal_field', 'cs', 'ms', 'ps', 'hem', 'egp', 'ggp', 'mg', 'pb',
       'comments_fish_health_field', 'quality_log'],
      dtype='object')

In [817]:
sealice_lab_fs_df.columns

Index(['ufn', 'sample_id', 'date_liced', 'lab_count_fine_no_id',
       'lab_count_fine_total', 'lep_cop', 'lep_cunifer_cop', 'lep_chal_a',
       'lep_chal_b', 'lep_pa_m_1', 'lep_pa_m_2', 'lep_pa_f_1', 'lep_pa_f_2',
       'lep_pa_unid', 'lep_a_m', 'lep_a_f', 'lep_grav_f', 'cal_cop',
       'cal_chal_a_1', 'cal_chal_a_2', 'cal_chal_b_3', 'cal_chal_b_4_f',
       'cal_chal_b_4_m', 'cal_chal_4_unid', 'cal_chal_a_unid',
       'cal_chal_b_unid', 'cal_pa_m', 'cal_pa_f', 'cal_a_m', 'cal_a_f',
       'cal_grav_f', 'cal_mot_unid', 'unid_louse', 'lab_staff', 'comments',
       'quality_level', 'quality_log'],
      dtype='object')

In [818]:
sealice_lab_mot_df.columns

Index(['ufn', 'cm_lab', 'cpaf_lab', 'caf_lab', 'cgf_lab', 'ucal_lab',
       'lpaf_lab', 'lpam_lab', 'lam_lab', 'laf_lab', 'lgf_lab', 'ulep_lab',
       'lab_count_motiles', 'comments', 'quality_level', 'quality_log'],
      dtype='object')

In [819]:
# convert NaN lice counts to 0
col_names = [
    # field counts
    'cal_cope_field', 'lep_cope_field', 'chal_a_field', 'chal_b_field', 'cal_mot_field', 'cgf_field',
    'lpam_field', 'lpaf_field', 'lam_field', 'laf_field', 'lgf_field', 'unid_cope_field', 'unid_chal_field',
    # lab finescale counts
    'lep_cop', 'lep_cunifer_cop', 'lep_chal_a',
    'lep_chal_b', 'lep_pa_m_1', 'lep_pa_m_2', 'lep_pa_f_1', 'lep_pa_f_2',
    'lep_pa_unid', 'lep_a_m', 'lep_a_f', 'lep_grav_f', 'cal_cop',
    'cal_chal_a_1', 'cal_chal_a_2', 'cal_chal_b_3', 'cal_chal_b_4_f',
    'cal_chal_b_4_m', 'cal_chal_4_unid', 'cal_chal_a_unid',
    'cal_chal_b_unid', 'cal_pa_m', 'cal_pa_f', 'cal_a_m', 'cal_a_f',
    'cal_grav_f', 'cal_mot_unid', 'unid_louse',
    # lab mot counts
    'cm_lab', 'cpaf_lab', 'caf_lab', 'cgf_lab', 'ucal_lab',
    'lpaf_lab', 'lpam_lab', 'lam_lab', 'laf_lab', 'lgf_lab', 'ulep_lab',
    'lab_count_motiles'
]

# first, make sure our expression checking for NaN works
assert fish_lice_df[col_names].isnull().values.any()

# fill all the NaNs
fill_dict = {col: 0 for col in col_names}
fish_lice_df.fillna(value=fill_dict, inplace=True)

# confirm all nulls have been filled with values
assert not fish_lice_df[col_names].isnull().values.any()

In [820]:
# fill in the fish fields we use, with preference for lab counts otherwise use field values
# (lice_id_protocol_lab is not null if the fish has gone to the lab)
fish_lice_df['length'] = fish_lice_df.apply(
    lambda row: row['fork_length'] if not pd.isnull(row.lice_id_protocol_lab) else row['fork_length_field'],
    axis=1
)
fish_lice_df['weight'] = fish_lice_df.apply(
    lambda row: row['weight'] if not pd.isnull(row.lice_id_protocol_lab) else row['weight_field'],
    axis=1
)
# no lab measurement for height
fish_lice_df['height'] = fish_lice_df['height_field']

In [821]:
# split fish_lice_df into lab and field-only for lice counts
lab_fish_lice_df = fish_lice_df.loc[~pd.isnull(fish_lice_df.lice_id_protocol_lab)]
field_fish_lice_df = fish_lice_df.loc[pd.isnull(fish_lice_df.lice_id_protocol_lab)]

In [822]:
# Get the lice counts for lab fish

# lab_fish_lice_df['lep_cop'] = lab_fish_lice_df.lep_cop
lab_fish_lice_df['lep_chal'] = lab_fish_lice_df.lep_chal_a + lab_fish_lice_df.lep_chal_b
lep_motile_cols = ['lep_pa_m_1', 'lep_pa_m_2', 'lep_pa_f_1', 'lep_pa_f_2', 'lep_pa_unid', 'lep_a_m',
                   'lep_a_f', 'lep_grav_f', 'lpaf_lab', 'lpam_lab', 'lam_lab', 'laf_lab', 'lgf_lab']
lab_fish_lice_df['lep_motile'] = lab_fish_lice_df[lep_motile_cols].sum(axis=1)
lab_fish_lice_df['lep_unknown'] = lab_fish_lice_df.ulep_lab

# lab_fish_lice_df['cal_cop'] = lab_fish_lice_df.cal_cop
# already a cal_cop field with the correct count
cal_chal_cols = ['cal_chal_a_1', 'cal_chal_a_2', 'cal_chal_b_3', 'cal_chal_b_4_f',
                 'cal_chal_b_4_m', 'cal_chal_4_unid', 'cal_chal_a_unid', 'cal_chal_b_unid']
lab_fish_lice_df['cal_chal'] = lab_fish_lice_df[cal_chal_cols].sum(axis=1)
cal_motile_cols = ['cal_pa_m', 'cal_pa_f', 'cal_a_m', 'cal_a_f', 'cal_grav_f', 'cal_mot_unid',
                   'cm_lab', 'cpaf_lab', 'caf_lab', 'cgf_lab']
lab_fish_lice_df['cal_motile'] = lab_fish_lice_df[cal_motile_cols].sum(axis=1)
lab_fish_lice_df['cal_unknown'] = lab_fish_lice_df.ucal_lab
lab_fish_lice_df['unknown_cop'] = 0
lab_fish_lice_df['unknown_chal'] = 0
lab_fish_lice_df['unknown_unknown'] = lab_fish_lice_df['unid_louse']

# keep the lice protocol
lab_fish_lice_df['lice_protocol'] = lab_fish_lice_df.lice_id_protocol_lab

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lab_fish_lice_df['lep_chal'] = lab_fish_lice_df.lep_chal_a + lab_fish_lice_df.lep_chal_b
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lab_fish_lice_df['lep_motile'] = lab_fish_lice_df[lep_motile_cols].sum(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lab_fish_lice_df['lep_unknown'] = lab

In [823]:
lab_fish_lice_df.head()

Unnamed: 0,ufn,semsp_id,seine_id,fish_species,package_id,fish_time_out,fish_time_dewar,fork_length_field,height_field,weight_field,...,lep_chal,lep_motile,lep_unknown,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_unknown,lice_protocol
13,U344,2015-05-12-D01-SO-1,DE100N3,Sockeye Salmon,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0,lportner_fine
14,U338,2015-05-12-D01-SO-10,DE100N3,Sockeye Salmon,,,,,,,...,0.0,0.0,0.0,6.0,0.0,0.0,0,0,0.0,lportner_fine
24,U340,2015-05-12-D01-SO-2,DE100N3,Sockeye Salmon,,,,,,,...,0.0,0.0,0.0,2.0,0.0,0.0,0,0,0.0,lportner_fine
35,U352,2015-05-12-D01-SO-3,DE100N3,Sockeye Salmon,,,,,,,...,0.0,0.0,0.0,4.0,0.0,0.0,0,0,0.0,lportner_fine
37,U337,2015-05-12-D01-SO-4,DE100N3,Sockeye Salmon,,,,,,,...,0.0,0.0,0.0,2.0,0.0,0.0,0,0,0.0,lportner_fine


In [824]:
# Get the lice counts for field-only fish

field_fish_lice_df['lep_cop'] = field_fish_lice_df['lep_cope_field']
field_fish_lice_df['lep_chal'] = 0  # Hakai doesn't differentiate species of chalimus in the field
field_lep_motile_cols = ['lpam_field', 'lpaf_field', 'lam_field', 'laf_field', 'lgf_field']
field_fish_lice_df['lep_motile'] = field_fish_lice_df[field_lep_motile_cols].sum(axis=1)
field_fish_lice_df['lep_unknown'] = 0
field_fish_lice_df['cal_cop'] = field_fish_lice_df.cal_cope_field
field_fish_lice_df['cal_chal'] = field_fish_lice_df.chal_a_field + field_fish_lice_df.chal_b_field
field_fish_lice_df['cal_motile'] = field_fish_lice_df.cal_mot_field + field_fish_lice_df.cgf_field
field_fish_lice_df['cal_unknown'] = 0
field_fish_lice_df['unknown_cop'] = field_fish_lice_df.unid_cope_field
field_unknown_chal_cols = ['chal_a_field', 'chal_b_field', 'unid_chal_field']
field_fish_lice_df['unknown_chal'] = field_fish_lice_df[field_unknown_chal_cols].sum(axis=1)
field_fish_lice_df['unknown_motile'] = 0
field_fish_lice_df['unknown_unknown'] = 0

# keep the lice protocol
field_fish_lice_df['lice_protocol'] = field_fish_lice_df.lice_id_protocol_field

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_fish_lice_df['lep_cop'] = field_fish_lice_df['lep_cope_field']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_fish_lice_df['lep_chal'] = 0  # Hakai doesn't differentiate species of chalimus in the field
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_fish_lice_df['lep_motile'] = fi

In [825]:
field_fish_lice_df.reindex()

Unnamed: 0,ufn,semsp_id,seine_id,fish_species,package_id,fish_time_out,fish_time_dewar,fork_length_field,height_field,weight_field,...,lep_motile,lep_unknown,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown,lice_protocol
0,U5531,2015-05-12-D01-CO-1,DE100N3,Coho Salmon,,,,,,,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0,
1,U5521,2015-05-12-D01-CU-1,DE100N3,Chum Salmon,,,,,,,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0,
2,U5530,2015-05-12-D01-CU-10,DE100N3,Chum Salmon,,,,,,,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0,
3,U5522,2015-05-12-D01-CU-2,DE100N3,Chum Salmon,,,,,,,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0,
4,U5523,2015-05-12-D01-CU-3,DE100N3,Chum Salmon,,,,,,,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17240,U25207,,DE1028N1,Pink Salmon,,,,104.0,19.0,9.9,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0,modified_salmoncaost_allstages
17241,U25208,,DE1028N1,Pink Salmon,,,,106.0,17.0,9.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0,modified_salmoncaost_allstages
17242,U25209,,DE1028N1,Pink Salmon,,,,83.0,13.0,5.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0,modified_salmoncaost_allstages
17243,U25210,,DE1028N1,Pink Salmon,,,,85.0,13.0,5.9,...,0.0,0,0.0,0.0,0,0.0,0.0,0,0,modified_salmoncaost_allstages


In [826]:
field_fish_lice_df = field_fish_lice_df.reindex(
        columns=['event_id', 'fish_id', 'length','weight', 'height', 'fish_species',
             'lep_cop', 'lep_chal', 'lep_motile', 'lep_unknown',
             'cal_cop', 'cal_chal', 'cal_motile', 'cal_unknown',
             'unknown_cop', 'unknown_chal', 'unknown_motile', 'unknown_unknown',
             'lice_protocol']
)

In [827]:
# recombine the field and lab dataframes

fish_lice_df = pd.concat([lab_fish_lice_df, field_fish_lice_df], ignore_index=True, sort=False)
fish_lice_df.head()

Unnamed: 0,ufn,semsp_id,seine_id,fish_species,package_id,fish_time_out,fish_time_dewar,fork_length_field,height_field,weight_field,...,lep_motile,lep_unknown,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_unknown,lice_protocol,unknown_motile
0,U344,2015-05-12-D01-SO-1,DE100N3,Sockeye Salmon,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,lportner_fine,
1,U338,2015-05-12-D01-SO-10,DE100N3,Sockeye Salmon,,,,,,,...,0.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,lportner_fine,
2,U340,2015-05-12-D01-SO-2,DE100N3,Sockeye Salmon,,,,,,,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,lportner_fine,
3,U352,2015-05-12-D01-SO-3,DE100N3,Sockeye Salmon,,,,,,,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,lportner_fine,
4,U337,2015-05-12-D01-SO-4,DE100N3,Sockeye Salmon,,,,,,,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,lportner_fine,


In [828]:
# restrict to the columns we want
hakai_fish_lice_df = fish_lice_df.reindex(
    columns=['event_id', 'fish_id', 'length','weight', 'height', 'fish_species',
             'lep_cop', 'lep_chal', 'lep_motile', 'lep_unknown',
             'cal_cop', 'cal_chal', 'cal_motile', 'cal_unknown',
             'unknown_cop', 'unknown_chal', 'unknown_motile', 'unknown_unknown',
             'lice_protocol'])
hakai_fish_lice_df.head()

Unnamed: 0,event_id,fish_id,length,weight,height,fish_species,lep_cop,lep_chal,lep_motile,lep_unknown,cal_cop,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown,lice_protocol
0,hakai_DE100N3,hakai_U344,95.0,10.3,,Sockeye Salmon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,lportner_fine
1,hakai_DE100N3,hakai_U338,105.0,14.6,,Sockeye Salmon,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,,0.0,lportner_fine
2,hakai_DE100N3,hakai_U340,95.0,11.7,,Sockeye Salmon,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,,0.0,lportner_fine
3,hakai_DE100N3,hakai_U352,86.0,8.7,,Sockeye Salmon,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,,0.0,lportner_fine
4,hakai_DE100N3,hakai_U337,95.0,10.6,,Sockeye Salmon,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,,0.0,lportner_fine


In [829]:
hakai_fish_lice_df.lice_protocol.unique()

array(['lportner_fine', 'lab_motiles', nan, 'salmoncoast_allstages',
       'salmoncoast_motiles', 'presence/absence',
       'modified_salmoncaost_allstages'], dtype=object)

In [830]:
hakai_fish_lice_df.to_csv(hakai_formatted_lice_filepath, index=False)

In [831]:
hakai_events_df.columns

Index(['event_id', 'sampledate', 'region', 'dfozone', 'sample_site',
       'latitude', 'longitude', 'source'],
      dtype='object')

In [832]:
hakai_events_df.sampledate.dtype

dtype('<M8[ns]')