## This Notebook formats the Cedar Coast Field Station wild sampling data to fit the format of our wild sampling data.
### To integrate it with other wild data, run 'Integrate_Wild_Data.ipynb' after generating the formatted data.

The data was downloaded on 13 March 2025 from [GitHub](https://github.com/CedarCoastFieldStation/Sea-lice-database). See also the [Cedar Coast site](https://cedarcoastsociety.org).

In [555]:
import pandas as pd
from pathlib import Path

### Configuration variables for this notebook
Adjust for your local setup

In [556]:
# locations of the raw Cedar Coast Field Station (CCFS) source files
ccfs_wild_data_dir = Path('CedarCoast', 'Data')
ccfs_events_filepath = ccfs_wild_data_dir.joinpath('ClayoquotSeaLice_Site_Data.csv')
ccfs_fish_lice_filepath = ccfs_wild_data_dir.joinpath('ClayoquotSeaLice_fishData.csv')

# where the reformatted Cedar Coast Field Station files are written
wild_data_dir = Path('.')
ccfs_formatted_events_filepath = wild_data_dir.joinpath('cedar_coast_wild_sample_events.csv')
ccfs_formatted_lice_filepath = wild_data_dir.joinpath('cedar_coast_wild_fish_lice.csv')

# Format the event file

In [557]:
# read in the raw event file
ccfs_events_df = pd.read_csv(ccfs_events_filepath)

# Build 'sampledate' from the separate year, month and day columns.
# pd.to_datetime is used instead of read_csv(parse_dates={...}) because combining
# date columns inside read_csv is deprecated in recent pandas releases.
# Unlike the read_csv form, this raises if a row cannot be turned into a date
# rather than silently leaving the column as strings.
date_part_columns = ['year', 'month', 'day']
sampledate = pd.to_datetime(ccfs_events_df[date_part_columns])

# match the previous layout: the parts are dropped and 'sampledate' is the first column
ccfs_events_df = ccfs_events_df.drop(columns=date_part_columns)
ccfs_events_df.insert(0, 'sampledate', sampledate)
ccfs_events_df.head()

Unnamed: 0,sampledate,Site ID,time,location,salt_surf,salt_1m,temp_surf,temp_1m,salmon_captured,salmon_examined,...,chum_examined,sockeye_examined,morts_recovery,morts_other,P_ratio,C_ratio,latitude,longitude,bycatch,notes
0,2018-04-26,1,,Buckle Bay,,,,,47.0,47.0,...,47.0,,,,,,,,,
1,2018-05-16,2,,Buckle Bay,28.04,28.11,11.88,11.72,35.0,16.0,...,16.0,,,,,,49°10.921,--125°57.883,1 oolichan,
2,2018-05-16,3,,Elbow Bank,26.73,26.65,12.12,12.14,53.0,53.0,...,48.0,,,,,,49°12.290,--125°57.134,"tube snout, 4 lingcod, 4 flatfish, 2 sculpin, ...",
3,2018-05-16,4,,Cypre River,25.1,25.18,12.28,12.12,8.0,8.0,...,,,,,,,49°16.192,--125°54.358,tube snout,
4,2018-05-10,5,,Elbow Bank,24.8,24.81,11.56,11.47,26.0,26.0,...,26.0,,,,,,49°11.798,-125°56.885,"1 sculpin, 25-30 flatfish, juvenile cod",


In [558]:
# distinct sampling dates present in the event file
ccfs_events_df['sampledate'].unique()

array(['2018-04-26T00:00:00.000000000', '2018-05-16T00:00:00.000000000',
       '2018-05-10T00:00:00.000000000', '2018-05-23T00:00:00.000000000',
       '2018-05-27T00:00:00.000000000', '2018-06-09T00:00:00.000000000',
       '2018-06-23T00:00:00.000000000', '2019-03-01T00:00:00.000000000',
       '2019-03-05T00:00:00.000000000', '2019-03-14T00:00:00.000000000',
       '2019-03-29T00:00:00.000000000', '2019-04-10T00:00:00.000000000',
       '2019-04-17T00:00:00.000000000', '2019-04-18T00:00:00.000000000',
       '2019-04-25T00:00:00.000000000', '2019-04-30T00:00:00.000000000',
       '2019-05-01T00:00:00.000000000', '2019-05-02T00:00:00.000000000',
       '2019-05-04T00:00:00.000000000', '2019-05-08T00:00:00.000000000',
       '2019-05-10T00:00:00.000000000', '2019-05-14T00:00:00.000000000',
       '2019-05-15T00:00:00.000000000', '2019-05-22T00:00:00.000000000',
       '2019-05-23T00:00:00.000000000', '2019-05-30T00:00:00.000000000',
       '2019-05-31T00:00:00.000000000', '2019-06-01

In [559]:
# True only when every row has a distinct 'Site ID' (NaN counted as a value)
ccfs_events_df['Site ID'].nunique(dropna=False) == len(ccfs_events_df)

False

In [560]:
# 'Site ID' on its own repeats (there is one duplicate), so it can't be the event ID.
# Date plus site ID is unique, so combine them and add the Cedar Coast prefix ('ccfs_')
# so the ID is globally unique
event_date_part = ccfs_events_df['sampledate'].astype(str).str.strip()
event_site_part = ccfs_events_df['Site ID'].astype(str).str.strip()
ccfs_events_df['event_id'] = 'ccfs_' + event_date_part + '_' + event_site_part
ccfs_events_df.head()

Unnamed: 0,sampledate,Site ID,time,location,salt_surf,salt_1m,temp_surf,temp_1m,salmon_captured,salmon_examined,...,sockeye_examined,morts_recovery,morts_other,P_ratio,C_ratio,latitude,longitude,bycatch,notes,event_id
0,2018-04-26,1,,Buckle Bay,,,,,47.0,47.0,...,,,,,,,,,,ccfs_2018-04-26_1
1,2018-05-16,2,,Buckle Bay,28.04,28.11,11.88,11.72,35.0,16.0,...,,,,,,49°10.921,--125°57.883,1 oolichan,,ccfs_2018-05-16_2
2,2018-05-16,3,,Elbow Bank,26.73,26.65,12.12,12.14,53.0,53.0,...,,,,,,49°12.290,--125°57.134,"tube snout, 4 lingcod, 4 flatfish, 2 sculpin, ...",,ccfs_2018-05-16_3
3,2018-05-16,4,,Cypre River,25.1,25.18,12.28,12.12,8.0,8.0,...,,,,,,49°16.192,--125°54.358,tube snout,,ccfs_2018-05-16_4
4,2018-05-10,5,,Elbow Bank,24.8,24.81,11.56,11.47,26.0,26.0,...,,,,,,49°11.798,-125°56.885,"1 sculpin, 25-30 flatfish, juvenile cod",,ccfs_2018-05-10_5


In [561]:
# True only when every row has a distinct event_id
ccfs_events_df['event_id'].nunique(dropna=False) == len(ccfs_events_df)

True

In [562]:
# debugging: keep the distinct event IDs from the event file for later comparison
# against the event IDs referenced by the fish data
event_event_ids = ccfs_events_df['event_id'].unique().tolist()
len(event_event_ids)

184

In [563]:
# re-check: number of distinct event IDs equals the number of event rows
ccfs_events_df['event_id'].unique().size == ccfs_events_df.shape[0]

True

In [564]:
# Columns required by later processing. They were not needed while the data
# covered a single region and a single source.
ccfs_events_df = ccfs_events_df.assign(
    region='Clayoquot Sound',
    dfozone='2_3',
    # site name is the raw location with surrounding whitespace removed
    sample_site=ccfs_events_df['location'].str.strip(),
    source='Cedar Coast Field Station',
)

In [565]:
# there are a couple of inconsistencies in the location names
site_name_fixes = {
    'Bedwell estuary': 'Bedwell Estuary',
    'Cypre': 'Cypre River',
    'Meares North': 'North Meares',
    'Cancer': 'Cancer (Herbert)',
}

# Assign the result back to the column. Calling replace(inplace=True) on
# ccfs_events_df['sample_site'] is a chained in-place operation: with pandas
# Copy-on-Write it modifies a temporary Series and leaves the DataFrame unchanged.
# Only whole-value matches are replaced, exactly as before.
ccfs_events_df['sample_site'] = ccfs_events_df['sample_site'].replace(site_name_fixes)

In [566]:
def convert_deg_min_to_dec_deg(deg_min):
    """
    Converts a lat or long expressed as a string with degrees and decimal minutes, to a float of decimal degrees.

    The sign is taken from the leading '-' character(s) of the string rather than from the
    numeric degree value, so a coordinate with zero whole degrees keeps the right sign
    ("0°30.0" -> 0.5, "-0°30.0" -> -0.5).

    :param deg_min: Degree and decimal minutes, e.g. "--125°57.883". Null values are allowed.
    :type deg_min: str
    :return: Decimal degree equivalent; a null input is returned unchanged
    :rtype: float
    :raises AttributeError: if deg_min is a non-null value that has no ``split`` method
    :raises ValueError: if the string does not split into exactly a degree and a minute part,
        or either part is not numeric
    """
    # sometimes it's null - just return it
    if pd.isnull(deg_min):
        return deg_min

    # split the string at the degree symbol. strip to remove any extra spaces
    try:
        degree_str, minute_str = [part.strip() for part in deg_min.split('°')]
    except AttributeError:
        # show the offending value and its type before propagating the error
        print("{} ({})".format(deg_min, type(deg_min)))
        raise

    # the source data marks negative values with one or (sometimes) two leading dashes,
    # so record the sign and work with magnitudes
    is_negative = degree_str.startswith('-')
    degree = float(degree_str.lstrip('-'))
    minute = float(minute_str)

    dec_degree = degree + minute / 60
    return -dec_degree if is_negative else dec_degree

In [567]:
# turn the degree / decimal-minute strings in both coordinate columns into decimal degrees
for coordinate_column in ('latitude', 'longitude'):
    ccfs_events_df[coordinate_column] = ccfs_events_df[coordinate_column].apply(convert_deg_min_to_dec_deg)

ccfs_events_df.head()

Unnamed: 0,sampledate,Site ID,time,location,salt_surf,salt_1m,temp_surf,temp_1m,salmon_captured,salmon_examined,...,C_ratio,latitude,longitude,bycatch,notes,event_id,region,dfozone,sample_site,source
0,2018-04-26,1,,Buckle Bay,,,,,47.0,47.0,...,,,,,,ccfs_2018-04-26_1,Clayoquot Sound,2_3,Buckle Bay,Cedar Coast Field Station
1,2018-05-16,2,,Buckle Bay,28.04,28.11,11.88,11.72,35.0,16.0,...,,49.182017,-125.964717,1 oolichan,,ccfs_2018-05-16_2,Clayoquot Sound,2_3,Buckle Bay,Cedar Coast Field Station
2,2018-05-16,3,,Elbow Bank,26.73,26.65,12.12,12.14,53.0,53.0,...,,49.204833,-125.952233,"tube snout, 4 lingcod, 4 flatfish, 2 sculpin, ...",,ccfs_2018-05-16_3,Clayoquot Sound,2_3,Elbow Bank,Cedar Coast Field Station
3,2018-05-16,4,,Cypre River,25.1,25.18,12.28,12.12,8.0,8.0,...,,49.269867,-125.905967,tube snout,,ccfs_2018-05-16_4,Clayoquot Sound,2_3,Cypre River,Cedar Coast Field Station
4,2018-05-10,5,,Elbow Bank,24.8,24.81,11.56,11.47,26.0,26.0,...,,49.196633,-125.948083,"1 sculpin, 25-30 flatfish, juvenile cod",,ccfs_2018-05-10_5,Clayoquot Sound,2_3,Elbow Bank,Cedar Coast Field Station


In [568]:
# keep only the columns shared with the main events file, in this order
common_event_columns = [
    'event_id', 'sampledate', 'region', 'dfozone',
    'sample_site', 'latitude', 'longitude', 'source',
]
ccfs_events_df = ccfs_events_df.reindex(columns=common_event_columns)

In [569]:
# confirm sampledate is still a datetime column after reindexing
ccfs_events_df['sampledate'].dtype

dtype('<M8[ns]')

In [570]:
# save the formatted events as CSV, leaving out the DataFrame index
ccfs_events_df.to_csv(path_or_buf=ccfs_formatted_events_filepath, index=False)

In [571]:
# writing the CSV should not have changed the in-memory dtype
ccfs_events_df.dtypes['sampledate']

dtype('<M8[ns]')

# Format the fish and lice data file

In [572]:
# load the raw fish and lice data file
ccfs_fish_lice_df = pd.read_csv(ccfs_fish_lice_filepath)

# Build 'sampledate' from the separate year, month and day columns.
# pd.to_datetime is used instead of read_csv(parse_dates={...}) because combining
# date columns inside read_csv is deprecated in recent pandas releases.
# Unlike the read_csv form, this raises if a row cannot be turned into a date
# rather than silently leaving the column as strings.
date_part_columns = ['year', 'month', 'day']
sampledate = pd.to_datetime(ccfs_fish_lice_df[date_part_columns])

# match the previous layout: the parts are dropped and 'sampledate' is the first column
ccfs_fish_lice_df = ccfs_fish_lice_df.drop(columns=date_part_columns)
ccfs_fish_lice_df.insert(0, 'sampledate', sampledate)
ccfs_fish_lice_df.head()

Unnamed: 0,sampledate,fish_id,site_id,location,fish_num,species,length,height,Lep_cope,chalA,...,white_eye,Grazed_gill_plate,Lironca,comments,sum_all_lice,Prevalence,Motile Lep,Motile Caligus,chal,attached
0,2018-04-26,1,1,Buckle Bay,1,chum,44.0,5.0,1.0,2.0,...,,,,,3.0,1.0,0.0,0.0,2.0,3.0
1,2018-04-26,2,1,Buckle Bay,2,chum,46.0,7.0,,1.0,...,,,,,2.0,1.0,0.0,0.0,2.0,2.0
2,2018-04-26,3,1,Buckle Bay,3,chum,49.0,6.0,,2.0,...,,,,,4.0,1.0,0.0,0.0,4.0,4.0
3,2018-04-26,4,1,Buckle Bay,4,chum,48.0,9.0,,3.0,...,,,,,3.0,1.0,0.0,0.0,3.0,3.0
4,2018-04-26,5,1,Buckle Bay,5,chum,50.0,10.0,,1.0,...,,,,,2.0,1.0,0.0,0.0,2.0,2.0


In [573]:
# distinct sampling dates present in the fish and lice file
ccfs_fish_lice_df['sampledate'].unique()

array(['2018-04-26T00:00:00.000000000', '2018-05-16T00:00:00.000000000',
       '2018-05-10T00:00:00.000000000', '2018-05-23T00:00:00.000000000',
       '2018-06-09T00:00:00.000000000', '2018-06-23T00:00:00.000000000',
       '2019-03-01T00:00:00.000000000', '2019-03-05T00:00:00.000000000',
       '2019-03-14T00:00:00.000000000', '2019-03-29T00:00:00.000000000',
       '2019-04-10T00:00:00.000000000', '2019-04-17T00:00:00.000000000',
       '2019-04-18T00:00:00.000000000', '2019-04-25T00:00:00.000000000',
       '2019-04-30T00:00:00.000000000', '2019-05-01T00:00:00.000000000',
       '2019-05-02T00:00:00.000000000', '2019-05-04T00:00:00.000000000',
       '2019-05-08T00:00:00.000000000', '2019-05-10T00:00:00.000000000',
       '2019-05-14T00:00:00.000000000', '2019-05-15T00:00:00.000000000',
       '2019-05-22T00:00:00.000000000', '2019-05-23T00:00:00.000000000',
       '2019-05-30T00:00:00.000000000', '2019-05-31T00:00:00.000000000',
       '2019-06-01T00:00:00.000000000', '2019-06-08

In [574]:
# Cedar Coast column name -> the name we use for the equivalent data
column_renames = {
    'species': 'fish_species',
    'Lep_cope': 'lep_cop',
    'Caligus_cope': 'cal_cop',
    'unid_cope': 'unknown_cop',
}
ccfs_fish_lice_df = ccfs_fish_lice_df.rename(columns=column_renames)

### Convert event_id and fish_id

In [575]:
# build event_id in the fish data the same way as in the events ('ccfs_' + date + '_' + site id)
# so that the two match
fish_date_part = ccfs_fish_lice_df['sampledate'].astype(str).str.strip()
fish_site_part = ccfs_fish_lice_df['site_id'].astype(str).str.strip()
ccfs_fish_lice_df['event_id'] = 'ccfs_' + fish_date_part + '_' + fish_site_part

In [576]:
# debugging: which event IDs referenced by fish rows are absent from the event file?
fish_event_ids = ccfs_fish_lice_df['event_id'].unique().tolist()
known_event_ids = set(event_event_ids)
fish_not_event_ids = [eid for eid in fish_event_ids if eid not in known_event_ids]
len(fish_not_event_ids)

40

In [577]:
# pull out the fish rows whose event_id has no matching event and save them for inspection
has_no_event = ccfs_fish_lice_df['event_id'].isin(fish_not_event_ids)
fish_without_events_df = ccfs_fish_lice_df[has_no_event]
fish_without_events_df.to_csv('CC_fish_without_events.csv', index=False)
len(fish_without_events_df)

1177

In [578]:
# there are 1177 fish without an event in the events file

In [579]:
# give Cedar Coast fish globally unique IDs using the same 'ccfs_' prefix as the event IDs
# (re-running this cell on its own would add the prefix a second time)
raw_fish_ids = ccfs_fish_lice_df['fish_id'].astype(str).str.strip()
ccfs_fish_lice_df['fish_id'] = 'ccfs_' + raw_fish_ids

In [580]:
# preview the first rows with the new fish_id and event_id values
ccfs_fish_lice_df.head(n=5)

Unnamed: 0,sampledate,fish_id,site_id,location,fish_num,fish_species,length,height,lep_cop,chalA,...,Grazed_gill_plate,Lironca,comments,sum_all_lice,Prevalence,Motile Lep,Motile Caligus,chal,attached,event_id
0,2018-04-26,ccfs_1,1,Buckle Bay,1,chum,44.0,5.0,1.0,2.0,...,,,,3.0,1.0,0.0,0.0,2.0,3.0,ccfs_2018-04-26_1
1,2018-04-26,ccfs_2,1,Buckle Bay,2,chum,46.0,7.0,,1.0,...,,,,2.0,1.0,0.0,0.0,2.0,2.0,ccfs_2018-04-26_1
2,2018-04-26,ccfs_3,1,Buckle Bay,3,chum,49.0,6.0,,2.0,...,,,,4.0,1.0,0.0,0.0,4.0,4.0,ccfs_2018-04-26_1
3,2018-04-26,ccfs_4,1,Buckle Bay,4,chum,48.0,9.0,,3.0,...,,,,3.0,1.0,0.0,0.0,3.0,3.0,ccfs_2018-04-26_1
4,2018-04-26,ccfs_5,1,Buckle Bay,5,chum,50.0,10.0,,1.0,...,,,,2.0,1.0,0.0,0.0,2.0,2.0,ccfs_2018-04-26_1


In [581]:
# number of fish rows before the columns are reformatted
ccfs_fish_lice_df.shape[0]

4953

## Convert Cedar Coast fish species to match the names we use
Our names: ['Chum Salmon', 'Pink Salmon', 'Coho Salmon',
       'Three-Spined Stickleback', 'Chinook Salmon', 'Other Species',
       'Pacific Herring', 'Sockeye Salmon']

In [582]:
# list the distinct fish species values in the Cedar Coast dataset
ccfs_fish_lice_df['fish_species'].unique().tolist()

['chum',
 'coho',
 'pink',
 'chinook',
 'chum ',
 'coho ',
 'sockeye',
 'herring',
 'chinook ']

In [583]:
# Cedar Coast species name -> the name we use
species_names = {
    'chum': 'Chum Salmon',
    'coho': 'Coho Salmon',
    'pink': 'Pink Salmon',
    'chinook': 'Chinook Salmon',
    'sockeye': 'Sockeye Salmon',
    'herring': 'Pacific Herring',
}

# Some raw values carry trailing whitespace ('chum ', 'coho ', 'chinook '). Strip every
# value before mapping so any padded spelling is handled, not just the ones seen so far.
# Values without an entry in species_names are kept (stripped) as they are.
# NOTE(review): assumes the column holds only strings / NaN - .str.strip() turns other
# value types into NaN.
ccfs_fish_lice_df['fish_species'] = (
    ccfs_fish_lice_df['fish_species']
    .str.strip()
    .replace(species_names)
)

## Convert Cedar Coast lice counts to have the same groupings that we use

In [584]:
# Before replacing NaN lice counts with 0, prove that the NaN check itself works:
# the raw lice count columns ('lep_cop' through 'unid_adult') must contain at least one NaN
raw_lice_counts = ccfs_fish_lice_df.loc[:, 'lep_cop':'unid_adult']
assert raw_lice_counts.isna().to_numpy().any()

In [585]:
# a missing lice count means none were recorded: set NaN to 0 in every column from
# 'lep_cop' through 'unid_adult'
col_names = ccfs_fish_lice_df.loc[:, 'lep_cop':'unid_adult'].columns
fill_dict = dict.fromkeys(col_names, 0)
ccfs_fish_lice_df.fillna(value=fill_dict, inplace=True)

# none of those columns may contain NaN any more
nan_still_present = ccfs_fish_lice_df.loc[:, 'lep_cop':'unid_adult'].isna().to_numpy().any()
assert not nan_still_present

In [586]:
# generate lice count fields to match our data from the Cedar Coast lice counts
def _sum_columns(frame, column_names):
    """Add the named columns of frame together, left to right, using the + operator."""
    total = frame[column_names[0]]
    for column_name in column_names[1:]:
        total = total + frame[column_name]
    return total


# chalimus stages are not identified to species, so they all count as unknown_chal
ccfs_fish_lice_df['unknown_chal'] = _sum_columns(
    ccfs_fish_lice_df, ['chalA', 'chalB', 'chal_unid'])

ccfs_fish_lice_df['lep_motile'] = _sum_columns(
    ccfs_fish_lice_df,
    ['Lep_PAmale', 'Lep_PAfemale', 'Lep_male', 'Lep_nongravid', 'Lep_gravid'])

ccfs_fish_lice_df['cal_motile'] = _sum_columns(
    ccfs_fish_lice_df, ['Caligus_mot', 'Caligus_gravid'])

ccfs_fish_lice_df['unknown_motile'] = _sum_columns(
    ccfs_fish_lice_df, ['unid_PA', 'unid_adult'])

In [587]:
# Drop the working columns by reindexing to the column list of our own data file.
# Loading that file just for its columns is the easiest (not the most efficient) way.
our_fish_lice_data_df = pd.read_csv(wild_data_dir.joinpath("wild_fish_lice.csv"))
target_columns = our_fish_lice_data_df.columns
ccfs_fish_lice_df = ccfs_fish_lice_df.reindex(columns=target_columns)
ccfs_fish_lice_df.head()

Unnamed: 0,event_id,fish_id,length,weight,height,fish_species,lep_cop,lep_chal,lep_motile,lep_unknown,cal_cop,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown
0,ccfs_2018-04-26_1,ccfs_1,44.0,,5.0,Chum Salmon,1,,0.0,,0.0,,0.0,,0.0,2.0,0.0,
1,ccfs_2018-04-26_1,ccfs_2,46.0,,7.0,Chum Salmon,0,,0.0,,0.0,,0.0,,0.0,2.0,0.0,
2,ccfs_2018-04-26_1,ccfs_3,49.0,,6.0,Chum Salmon,0,,0.0,,0.0,,0.0,,0.0,4.0,0.0,
3,ccfs_2018-04-26_1,ccfs_4,48.0,,9.0,Chum Salmon,0,,0.0,,0.0,,0.0,,0.0,3.0,0.0,
4,ccfs_2018-04-26_1,ccfs_5,50.0,,10.0,Chum Salmon,0,,0.0,,0.0,,0.0,,0.0,2.0,0.0,


In [588]:
# the row count must be unchanged by the column reformatting
len(ccfs_fish_lice_df.index)

4953

In [589]:
# Reindexing introduced lice count columns that hold only NaN, so once more treat NaN
# as 0 in every lice count column ('lep_cop' through 'unknown_unknown')
col_names = ccfs_fish_lice_df.loc[:, 'lep_cop':'unknown_unknown'].columns
fill_dict = dict.fromkeys(col_names, 0)
ccfs_fish_lice_df.fillna(value=fill_dict, inplace=True)

# no NaN may be left in the lice counts
nan_still_present = ccfs_fish_lice_df.loc[:, 'lep_cop':'unknown_unknown'].isna().to_numpy().any()
assert not nan_still_present

In [590]:
# save the formatted fish and lice data as CSV, leaving out the DataFrame index
ccfs_fish_lice_df.to_csv(path_or_buf=ccfs_formatted_lice_filepath, index=False)

## Some data checks

In [591]:
# sampledate in the events frame should still be a datetime column
ccfs_events_df['sampledate'].dtype

dtype('<M8[ns]')

In [592]:
# number of distinct events referenced by the formatted fish data
fish_event_ids = ccfs_fish_lice_df['event_id'].unique().tolist()
len(fish_event_ids)

151

In [593]:
# how many of those events are missing from the event file?
fish_event_ids = ccfs_fish_lice_df['event_id'].unique().tolist()
known_event_ids = set(event_event_ids)
fish_not_event_ids = [eid for eid in fish_event_ids if eid not in known_event_ids]
len(fish_not_event_ids)

40

In [594]:
# distinct site IDs (the part after the last '_') among the unmatched event IDs,
# kept in first-seen order as the keys of a dict
unique_sid = {eid.rsplit('_', 1)[-1]: 1 for eid in fish_not_event_ids}
unique_sid.keys()

dict_keys(['51', '72', '74', '76', '77', '79', '78', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '101', '100', '102', '103', '105', '106', '107', '112', '113', '125'])

In [595]:
# show the events recorded for 2019-05-14
is_2019_05_14 = ccfs_events_df['event_id'].str.startswith('ccfs_2019-05-14')
ccfs_events_df.loc[is_2019_05_14]

Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source
51,ccfs_2019-05-14_52,2019-05-14,Clayoquot Sound,2_3,Bedwell River,49.360867,-125.775533,Cedar Coast Field Station
52,ccfs_2019-05-14_53,2019-05-14,Clayoquot Sound,2_3,Bedwell River,49.360867,-125.775533,Cedar Coast Field Station
53,ccfs_2019-05-14_54,2019-05-14,Clayoquot Sound,2_3,Bedwell River,49.360867,-125.775533,Cedar Coast Field Station


In [596]:
# NOTE(review): disabled left join of events to fish rows. It referred to `cc_fish_lice_df`,
# which no visible cell defines; the fish frame here is `ccfs_fish_lice_df`.
# wild_df = pd.merge(ccfs_events_df, ccfs_fish_lice_df, on='event_id', how='left')

In [597]:
# wild_df.sampledate.dtype