## This Notebook formats the Cedar Coast Field Station wild sampling data to fit the format of our wild sampling data.
### To integrate it with other wild data, run 'Integrate_Wild_Data.ipynb' after generating the formatted data.

In [65]:
import pandas as pd
from pathlib import Path

### Configuration variables for this notebook
Adjust for your local setup

In [66]:
# Locations of the raw Cedar Coast Field Station input files
ccfs_wild_data_dir = Path('CedarCoast', 'Data')
ccfs_events_filepath = ccfs_wild_data_dir.joinpath('ClayoquotSeaLice_Site_Data.csv')
ccfs_fish_lice_filepath = ccfs_wild_data_dir.joinpath('ClayoquotSeaLice_fishData.csv')

# Destinations for the reformatted Cedar Coast Field Station files
wild_data_dir = Path('.')
ccfs_formatted_events_filepath = wild_data_dir.joinpath('cedar_coast_wild_sample_events.csv')
ccfs_formatted_lice_filepath = wild_data_dir.joinpath('cedar_coast_wild_fish_lice.csv')

# Format the event file

In [67]:
# Load the sampling-event table; the separate year, month and day columns are
# combined into a single parsed 'sampledate' column while reading
ccfs_events_df = pd.read_csv(
    ccfs_events_filepath,
    parse_dates={'sampledate': ['year', 'month', 'day']},
)
ccfs_events_df.head()

Unnamed: 0,sampledate,Site ID,time,location,salt_surf,salt_1m,temp_surf,temp_1m,salmon_captured,salmon_examined,...,chum_examined,sockeye_examined,morts_recovery,morts_other,P_ratio,C_ratio,latitude,longitude,bycatch,notes
0,2018-04-26,1,,Buckle Bay,,,,,47.0,47.0,...,47.0,,,,,,,,,
1,2018-05-16,2,,Buckle Bay,28.04,28.11,11.88,11.72,35.0,16.0,...,16.0,,,,,,49°10.921,--125°57.883,1 oolichan,
2,2018-05-16,3,,Elbow Bank,26.73,26.65,12.12,12.14,53.0,53.0,...,48.0,,,,,,49°12.290,--125°57.134,"tube snout, 4 lingcod, 4 flatfish, 2 sculpin, ...",
3,2018-05-16,4,,Cypre River,25.1,25.18,12.28,12.12,8.0,8.0,...,,,,,,,49°16.192,--125°54.358,tube snout,
4,2018-05-10,5,,Elbow Bank,24.8,24.81,11.56,11.47,26.0,26.0,...,26.0,,,,,,49°11.798,-125°56.885,"1 sculpin, 25-30 flatfish, juvenile cod",


In [68]:
# 'Site ID' on its own is not unique (one value repeats), so it cannot be the event ID.
# Date plus site ID is unique, so join the two behind a Cedar Coast prefix to get
# an ID that is also unique across data sources.
ccfs_events_df['event_id'] = (
    'ccfs_'
    + ccfs_events_df['sampledate'].astype(str)
    + ccfs_events_df['Site ID'].astype(str)
)
ccfs_events_df.head()

Unnamed: 0,sampledate,Site ID,time,location,salt_surf,salt_1m,temp_surf,temp_1m,salmon_captured,salmon_examined,...,sockeye_examined,morts_recovery,morts_other,P_ratio,C_ratio,latitude,longitude,bycatch,notes,event_id
0,2018-04-26,1,,Buckle Bay,,,,,47.0,47.0,...,,,,,,,,,,ccfs_2018-04-261
1,2018-05-16,2,,Buckle Bay,28.04,28.11,11.88,11.72,35.0,16.0,...,,,,,,49°10.921,--125°57.883,1 oolichan,,ccfs_2018-05-162
2,2018-05-16,3,,Elbow Bank,26.73,26.65,12.12,12.14,53.0,53.0,...,,,,,,49°12.290,--125°57.134,"tube snout, 4 lingcod, 4 flatfish, 2 sculpin, ...",,ccfs_2018-05-163
3,2018-05-16,4,,Cypre River,25.1,25.18,12.28,12.12,8.0,8.0,...,,,,,,49°16.192,--125°54.358,tube snout,,ccfs_2018-05-164
4,2018-05-10,5,,Elbow Bank,24.8,24.81,11.56,11.47,26.0,26.0,...,,,,,,49°11.798,-125°56.885,"1 sculpin, 25-30 flatfish, juvenile cod",,ccfs_2018-05-105


In [69]:
# True when no event_id value occurs more than once
ccfs_events_df['event_id'].is_unique

True

In [70]:
# Columns the downstream processing expects. They were not present in the source
# because it covers a single region from a single data provider.
ccfs_events_df = ccfs_events_df.assign(
    region='Clayoquot Sound',
    dfozone='2_3',
    sample_site=ccfs_events_df['location'],
    source='Cedar Coast Field Station',
)

In [71]:
def convert_deg_min_to_dec_deg(deg_min):
    """
    Converts a lat or long expressed as a string with degrees and decimal minutes, to a float of decimal degrees.

    The sign of the result is taken from the text of the degree part, so values with
    zero whole degrees (e.g. "0°30.0" or "-0°30.0") keep the correct hemisphere.

    :param deg_min: Degree and decimal minutes, e.g. "--125°57.883"
    :type deg_min: str
    :return: Decimal degree equivalent; null input is returned unchanged
    :rtype: float
    :raises AttributeError: if deg_min is non-null but not a string (the value is printed first)
    :raises ValueError: if the string does not split into exactly a degree and a minute part,
        or either part is not numeric
    """
    # sometimes it's null - just return it
    if pd.isnull(deg_min):
        return deg_min

    # split the string at the degree symbol. strip to remove any extra spaces
    try:
        degree_str, minute_str = [x.strip() for x in deg_min.split('°')]
    except AttributeError:
        # show the offending value to make bad rows easy to find, then re-raise
        print("{} ({})".format(deg_min, type(deg_min)))
        raise

    # sometimes the degree has '--' at the front
    if degree_str.startswith('--'):
        degree_str = degree_str[1:]

    # Decide the sign from the text rather than from the numeric degree value:
    # a degree of 0 carries no usable sign once converted to a number.
    is_negative = degree_str.startswith('-')

    magnitude = abs(float(degree_str)) + float(minute_str) / 60

    return -magnitude if is_negative else magnitude

In [72]:
# Replace the degree/decimal-minute strings in both coordinate columns with decimal degrees
for coord_column in ('latitude', 'longitude'):
    ccfs_events_df[coord_column] = ccfs_events_df[coord_column].apply(convert_deg_min_to_dec_deg)

ccfs_events_df.head()

Unnamed: 0,sampledate,Site ID,time,location,salt_surf,salt_1m,temp_surf,temp_1m,salmon_captured,salmon_examined,...,C_ratio,latitude,longitude,bycatch,notes,event_id,region,dfozone,sample_site,source
0,2018-04-26,1,,Buckle Bay,,,,,47.0,47.0,...,,,,,,ccfs_2018-04-261,Clayoquot Sound,2_3,Buckle Bay,Cedar Coast Field Station
1,2018-05-16,2,,Buckle Bay,28.04,28.11,11.88,11.72,35.0,16.0,...,,49.182017,-125.964717,1 oolichan,,ccfs_2018-05-162,Clayoquot Sound,2_3,Buckle Bay,Cedar Coast Field Station
2,2018-05-16,3,,Elbow Bank,26.73,26.65,12.12,12.14,53.0,53.0,...,,49.204833,-125.952233,"tube snout, 4 lingcod, 4 flatfish, 2 sculpin, ...",,ccfs_2018-05-163,Clayoquot Sound,2_3,Elbow Bank,Cedar Coast Field Station
3,2018-05-16,4,,Cypre River,25.1,25.18,12.28,12.12,8.0,8.0,...,,49.269867,-125.905967,tube snout,,ccfs_2018-05-164,Clayoquot Sound,2_3,Cypre River,Cedar Coast Field Station
4,2018-05-10,5,,Elbow Bank,24.8,24.81,11.56,11.47,26.0,26.0,...,,49.196633,-125.948083,"1 sculpin, 25-30 flatfish, juvenile cod",,ccfs_2018-05-105,Clayoquot Sound,2_3,Elbow Bank,Cedar Coast Field Station


In [73]:
# Keep only the columns shared with the main events file, in this order
common_event_columns = [
    'event_id',
    'sampledate',
    'region',
    'dfozone',
    'sample_site',
    'latitude',
    'longitude',
    'source',
]
ccfs_events_df = ccfs_events_df.reindex(columns=common_event_columns)

In [74]:
# Preview the first rows of the reduced events table
ccfs_events_df.head(n=5)

Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source
0,ccfs_2018-04-261,2018-04-26,Clayoquot Sound,2_3,Buckle Bay,,,Cedar Coast Field Station
1,ccfs_2018-05-162,2018-05-16,Clayoquot Sound,2_3,Buckle Bay,49.182017,-125.964717,Cedar Coast Field Station
2,ccfs_2018-05-163,2018-05-16,Clayoquot Sound,2_3,Elbow Bank,49.204833,-125.952233,Cedar Coast Field Station
3,ccfs_2018-05-164,2018-05-16,Clayoquot Sound,2_3,Cypre River,49.269867,-125.905967,Cedar Coast Field Station
4,ccfs_2018-05-105,2018-05-10,Clayoquot Sound,2_3,Elbow Bank,49.196633,-125.948083,Cedar Coast Field Station


In [75]:
# Save the formatted events table as CSV, leaving out the row index
ccfs_events_df.to_csv(path_or_buf=ccfs_formatted_events_filepath, index=False)

# Format the fish and lice data file

In [76]:
# Load the per-fish lice counts; year, month and day are combined into a single
# parsed 'sampledate' column while reading
cc_fish_lice_df = pd.read_csv(
    ccfs_fish_lice_filepath,
    parse_dates={'sampledate': ['year', 'month', 'day']},
)
cc_fish_lice_df.head()

Unnamed: 0,sampledate,fish_id,site_id,location,fish_num,species,length,height,Lep_cope,chalA,...,white_eye,Grazed_gill_plate,Lironca,comments,sum_all_lice,Prevalence,Motile Lep,Motile Caligus,chal,attached
0,2018-04-26,1,1,Buckle Bay,1,chum,44.0,5.0,1.0,2.0,...,,,,,3.0,1.0,0.0,0.0,2.0,3.0
1,2018-04-26,2,1,Buckle Bay,2,chum,46.0,7.0,,1.0,...,,,,,2.0,1.0,0.0,0.0,2.0,2.0
2,2018-04-26,3,1,Buckle Bay,3,chum,49.0,6.0,,2.0,...,,,,,4.0,1.0,0.0,0.0,4.0,4.0
3,2018-04-26,4,1,Buckle Bay,4,chum,48.0,9.0,,3.0,...,,,,,3.0,1.0,0.0,0.0,3.0,3.0
4,2018-04-26,5,1,Buckle Bay,5,chum,50.0,10.0,,1.0,...,,,,,2.0,1.0,0.0,0.0,2.0,2.0


In [77]:
# Cedar Coast columns that hold the same data as ours, keyed by their Cedar Coast name
equivalent_column_names = {
    'species': 'fish_species',
    'Lep_cope': 'lep_cop',
    'Caligus_cope': 'cal_cop',
    'unid_cope': 'unknown_cop',
}
cc_fish_lice_df = cc_fish_lice_df.rename(columns=equivalent_column_names)

### Convert event_id and fish_id

In [78]:
# Build event_id the same way as in the events table (prefix + date + site ID)
# so the fish rows match their sampling event
cc_fish_lice_df['event_id'] = (
    'ccfs_'
    + cc_fish_lice_df['sampledate'].astype(str)
    + cc_fish_lice_df['site_id'].astype(str)
)

In [79]:
# Give Cedar Coast fish globally unique IDs by applying the same 'ccfs_' prefix
# used for the event IDs
cc_fish_lice_df['fish_id'] = 'ccfs_' + cc_fish_lice_df['fish_id'].astype(str)

In [80]:
# Preview the fish table with the new IDs
cc_fish_lice_df.head(n=5)

Unnamed: 0,sampledate,fish_id,site_id,location,fish_num,fish_species,length,height,lep_cop,chalA,...,Grazed_gill_plate,Lironca,comments,sum_all_lice,Prevalence,Motile Lep,Motile Caligus,chal,attached,event_id
0,2018-04-26,ccfs_1,1,Buckle Bay,1,chum,44.0,5.0,1.0,2.0,...,,,,3.0,1.0,0.0,0.0,2.0,3.0,ccfs_2018-04-261
1,2018-04-26,ccfs_2,1,Buckle Bay,2,chum,46.0,7.0,,1.0,...,,,,2.0,1.0,0.0,0.0,2.0,2.0,ccfs_2018-04-261
2,2018-04-26,ccfs_3,1,Buckle Bay,3,chum,49.0,6.0,,2.0,...,,,,4.0,1.0,0.0,0.0,4.0,4.0,ccfs_2018-04-261
3,2018-04-26,ccfs_4,1,Buckle Bay,4,chum,48.0,9.0,,3.0,...,,,,3.0,1.0,0.0,0.0,3.0,3.0,ccfs_2018-04-261
4,2018-04-26,ccfs_5,1,Buckle Bay,5,chum,50.0,10.0,,1.0,...,,,,2.0,1.0,0.0,0.0,2.0,2.0,ccfs_2018-04-261


## Convert Cedar Coast fish species to match the names we use
Our names: ['Chum Salmon', 'Pink Salmon', 'Coho Salmon',
       'Three-Spined Stickleback', 'Chinook Salmon', 'Other Species',
       'Pacific Herring', 'Sockeye Salmon']

In [81]:
# List the distinct species labels in the Cedar Coast dataset as plain strings
# (formatting them makes stray whitespace visible in the quoted output)
list(map('{}'.format, cc_fish_lice_df['fish_species'].unique()))

['chum',
 'coho',
 'pink',
 'chinook',
 'chum ',
 'coho ',
 'sockeye',
 'herring',
 'chinook ']

In [82]:
# Map the Cedar Coast species labels onto the names used in our data.
# Some raw labels carry stray whitespace (e.g. 'chum '), so strip it before mapping
# instead of listing each padded variant by hand; labels without an entry below
# are left as their stripped text.
species_names = {
    'chum': 'Chum Salmon',
    'coho': 'Coho Salmon',
    'pink': 'Pink Salmon',
    'chinook': 'Chinook Salmon',
    'sockeye': 'Sockeye Salmon',
    'herring': 'Pacific Herring',
}
cc_fish_lice_df['fish_species'] = (
    cc_fish_lice_df['fish_species'].str.strip().replace(species_names)
)

## Convert Cedar Coast lice counts to have the same groupings that we use

In [83]:
# Before converting NaN lice counts to 0, confirm that the NaN check used below
# actually detects something in the raw count columns
assert cc_fish_lice_df.loc[:, 'lep_cop':'unid_adult'].isna().to_numpy().any()

In [84]:
# Treat a missing lice count as 0 for every column from 'lep_cop' through 'unid_adult'
col_names = cc_fish_lice_df.loc[:, 'lep_cop':'unid_adult'].columns
fill_dict = dict.fromkeys(col_names, 0)
cc_fish_lice_df = cc_fish_lice_df.fillna(value=fill_dict)

# none of those columns may contain NaN any more
assert not cc_fish_lice_df.loc[:, 'lep_cop':'unid_adult'].isna().to_numpy().any()

In [85]:
# Build the lice count fields used in our data by summing the corresponding
# Cedar Coast count columns
cc_fish_lice_df['unknown_chal'] = (
    cc_fish_lice_df['chalA']
    + cc_fish_lice_df['chalB']
    + cc_fish_lice_df['chal_unid']
)

cc_fish_lice_df['lep_motile'] = (
    cc_fish_lice_df['Lep_PAmale']
    + cc_fish_lice_df['Lep_PAfemale']
    + cc_fish_lice_df['Lep_male']
    + cc_fish_lice_df['Lep_nongravid']
    + cc_fish_lice_df['Lep_gravid']
)

cc_fish_lice_df['cal_motile'] = (
    cc_fish_lice_df['Caligus_mot']
    + cc_fish_lice_df['Caligus_gravid']
)

cc_fish_lice_df['unknown_motile'] = (
    cc_fish_lice_df['unid_PA']
    + cc_fish_lice_df['unid_adult']
)

In [86]:
# Drop the working columns by reindexing to the column list of our own fish/lice file.
# Loading that whole file just for its header is the easiest (not the most efficient) route.
our_fish_data_df = pd.read_csv(wild_data_dir.joinpath("wild_fish_lice.csv"))
target_fish_columns = our_fish_data_df.columns
scfs_fish_df = cc_fish_lice_df.reindex(columns=target_fish_columns)
scfs_fish_df.head()

Unnamed: 0,event_id,fish_id,length,weight,height,fish_species,lep_cop,lep_chal,lep_motile,lep_unknown,cal_cop,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown
0,ccfs_2018-04-261,ccfs_1,44.0,,5.0,Chum Salmon,1,,0.0,,0.0,,0.0,,0.0,2.0,0.0,
1,ccfs_2018-04-261,ccfs_2,46.0,,7.0,Chum Salmon,0,,0.0,,0.0,,0.0,,0.0,2.0,0.0,
2,ccfs_2018-04-261,ccfs_3,49.0,,6.0,Chum Salmon,0,,0.0,,0.0,,0.0,,0.0,4.0,0.0,
3,ccfs_2018-04-261,ccfs_4,48.0,,9.0,Chum Salmon,0,,0.0,,0.0,,0.0,,0.0,3.0,0.0,
4,ccfs_2018-04-261,ccfs_5,50.0,,10.0,Chum Salmon,0,,0.0,,0.0,,0.0,,0.0,2.0,0.0,


In [87]:
# Reindexing introduced lice count columns that Cedar Coast does not have, so once
# more make NaN mean 0 across every lice count column
col_names = scfs_fish_df.loc[:, 'lep_cop':'unknown_unknown'].columns
fill_dict = dict.fromkeys(col_names, 0)
scfs_fish_df = scfs_fish_df.fillna(value=fill_dict)

# no NaN may remain in the lice counts
assert not scfs_fish_df.loc[:, 'lep_cop':'unknown_unknown'].isna().to_numpy().any()

In [88]:

# Write the formatted fish/lice table as CSV, leaving out the row index.
# This must be scfs_fish_df (reduced to our column set, with NaN lice counts set to 0),
# not the raw working frame cc_fish_lice_df, which still holds all the Cedar Coast
# working columns.
scfs_fish_df.to_csv(ccfs_formatted_lice_filepath, index=False)