## This Notebook integrates the Salmon Coast Research Station wild sampling data into our wild sampling data.

In [93]:
import pandas as pd
from pathlib import Path

### Configuration variables for this notebook

In [94]:
# paths to files for our data
wild_data_dir = Path('.')
events_filepath = wild_data_dir / 'wild_sample_events.csv'
fish_lice_filepath = wild_data_dir / 'wild_fish_lice.csv'

# paths to the SRS data files
srs_wild_data_dir = Path('SalmonCoast') / 'Data'
srs_events_filepath = srs_wild_data_dir / 'BroughtonSeaLice_siteData.csv'
srs_fish_lice_filepath = srs_wild_data_dir / 'BroughtonSeaLice_fishData.csv'

# output paths for writing the combined data
all_events_filepath = wild_data_dir / 'all_wild_sample_events.csv'
all_fish_lice_filepath = wild_data_dir / 'all_wild_fish_lice.csv'

# Combine the event files

In [95]:
# load the events data files
events_df = pd.read_csv(events_filepath, parse_dates=['sampledate'])
srs_events_df = pd.read_csv(srs_events_filepath)

In [96]:
events_df.head()

Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source
0,1,2003-05-13,Broughton Archipelago,3_3,Adeane Point,50.71978,-125.6795,Fisheries and Oceans Canada
1,2,2003-05-21,Broughton Archipelago,3_3,Adeane Point,50.71978,-125.6795,Fisheries and Oceans Canada
2,3,2003-05-26,Broughton Archipelago,3_3,Adeane Point,50.7197,-125.6795,Fisheries and Oceans Canada
3,4,2003-06-02,Broughton Archipelago,3_3,Adeane Point,50.7197,-125.6795,Fisheries and Oceans Canada
4,5,2003-05-13,Broughton Archipelago,3_3,Adeane Point,50.7384,-125.67985,Fisheries and Oceans Canada


In [97]:
srs_events_df.head()

Unnamed: 0,site_id,year,month,day,location,salt,temp,salmon_captured,salmon_examined,pink_examined,chum_examined,sockeye_examined,morts_recovery,morts_other,P_ratio,C_ratio,latitude,longitude,bycatch,notes
0,1.0,2001,6,12,Burdwood,,,,18.0,18.0,0.0,0.0,,,,,,,,
1,2.0,2001,6,13,Wicklow,,,,10.0,10.0,0.0,0.0,,,,,,,,
2,3.0,2001,6,16,Glacier,,,,77.0,77.0,0.0,0.0,,,,,,,,
3,4.0,2001,6,23,Burdwood,,,,11.0,11.0,0.0,0.0,,,,,,,,
4,5.0,2001,6,24,Burdwood,,,,12.0,12.0,0.0,0.0,,,,,,,,


In [98]:
# shift the SRS site_id to maintain uniqueness when we put the DFs together (add a bit extra so it's easy to tell the different sources)
max_old_event_id = max(events_df.event_id) + 50000

# put the new event IDs in a new column
srs_events_df['event_id'] = srs_events_df['site_id'] + max_old_event_id

In [99]:
max_old_event_id

65092

In [100]:
# add in columns needed for future processing
# These didn't make sense in a single region, single source setting
srs_events_df['region'] = 'Broughton Archipelago'
srs_events_df['dfozone'] = '3_3'
srs_events_df['sample_site'] = srs_events_df['location']
srs_events_df['source'] = 'Salmon Coast Field Station / Raincoast Research'

In [101]:
# create a date field from SRS year, month, day fields
srs_events_df['sampledate'] = srs_events_df.apply(lambda x: pd.Timestamp(x.year,
                                                                         x.month,
                                                                         x.day),
                                                  axis=1)

In [102]:
# limit to the common columns with the main events file
srs_events_df = srs_events_df.reindex(columns=['event_id', 'sampledate', 'region',
                                               'dfozone', 'sample_site', 'latitude',
                                               'longitude', 'source'])

In [103]:
srs_events_df.head()

Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source
0,65093.0,2001-06-12,Broughton Archipelago,3_3,Burdwood,,,Salmon Coast Field Station / Raincoast Research
1,65094.0,2001-06-13,Broughton Archipelago,3_3,Wicklow,,,Salmon Coast Field Station / Raincoast Research
2,65095.0,2001-06-16,Broughton Archipelago,3_3,Glacier,,,Salmon Coast Field Station / Raincoast Research
3,65096.0,2001-06-23,Broughton Archipelago,3_3,Burdwood,,,Salmon Coast Field Station / Raincoast Research
4,65097.0,2001-06-24,Broughton Archipelago,3_3,Burdwood,,,Salmon Coast Field Station / Raincoast Research


In [104]:
# append to the main events DF
all_events_df = pd.concat([events_df, srs_events_df])
all_events_df.head()

Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source
0,1.0,2003-05-13,Broughton Archipelago,3_3,Adeane Point,50.71978,-125.6795,Fisheries and Oceans Canada
1,2.0,2003-05-21,Broughton Archipelago,3_3,Adeane Point,50.71978,-125.6795,Fisheries and Oceans Canada
2,3.0,2003-05-26,Broughton Archipelago,3_3,Adeane Point,50.7197,-125.6795,Fisheries and Oceans Canada
3,4.0,2003-06-02,Broughton Archipelago,3_3,Adeane Point,50.7197,-125.6795,Fisheries and Oceans Canada
4,5.0,2003-05-13,Broughton Archipelago,3_3,Adeane Point,50.7384,-125.67985,Fisheries and Oceans Canada


In [105]:
# write out to a CSV. This file will be used for all
all_events_df.to_csv(all_events_filepath, index=False)

# Combine the fish and lice data files

In [106]:
# load the fish and lice data files
fish_df = pd.read_csv(fish_lice_filepath)
srs_fish_df = pd.read_csv(srs_fish_lice_filepath, encoding='latin-1')

In [107]:
max(fish_df.event_id)

15092

In [108]:
fish_df.head()

Unnamed: 0,event_id,fish_id,length,weight,height,fish_species,lep_cop,lep_chal,lep_motile,lep_unknown,cal_cop,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown
0,5666,1,44.0,0.96,,Chum Salmon,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5666,2,39.0,0.61,,Pink Salmon,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5666,3,45.0,0.94,,Pink Salmon,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5666,4,43.0,0.84,,Chum Salmon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5666,5,38.0,0.53,,Pink Salmon,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
srs_fish_df.head()

Unnamed: 0,fish_id,site_id,year,day,month,location,fish_num,species,length,height,...,pred_scar,mot_scar,hem,mateguarding,eroded_gill,white_eye,blue_blotches,pinched_belly,scales,comments
0,1,1.0,2001,12,6,Burdwood,,pink,50.0,,...,,,,,,,,,,
1,2,1.0,2001,12,6,Burdwood,,pink,50.0,,...,,,,,,,,,,
2,3,1.0,2001,12,6,Burdwood,,pink,50.0,,...,,,,,,,,,,
3,4,1.0,2001,12,6,Burdwood,,pink,53.0,,...,,,,,,,,,,
4,5,1.0,2001,12,6,Burdwood,,pink,54.0,,...,,,,,,,,,,


In [110]:
# rename SRS columns that have equivalent data
srs_fish_df.rename(columns={'species': 'fish_species',
                            'Lep_cope': 'lep_cop',
                            'Caligus_cope': 'cal_cop',
                            'unid_cope': 'unknown_cop'},
                  inplace=True)

## Convert event_id and fish_id

In [111]:
# make sure SRS fish get unique IDs
max_fish_id = max(fish_df.fish_id)
srs_fish_df.fish_id = srs_fish_df.fish_id + max_fish_id

In [112]:
# make a new SRS event_id column that matches the event_id in the all_events_df
srs_fish_df['event_id'] = srs_fish_df.site_id + max_old_event_id

## Convert SRS fish species to match

In [113]:
# check the fish species in the SRS dataset
srs_fish_df.fish_species.unique()

array(['pink', 'chum', 'sockeye'], dtype=object)

In [114]:
fish_df.fish_species.unique()

array(['Chum Salmon', 'Pink Salmon', 'Coho Salmon',
       'Three-Spined Stickleback', 'Chinook Salmon', 'Other Species',
       'Pacific Herring', 'Sockeye Salmon'], dtype=object)

In [115]:
# replace salmon species names as the string literals differ
srs_fish_df.replace({'fish_species': {'chum': 'Chum Salmon',
                                      'pink': 'Pink Salmon',
                                      'sockeye': 'Sockeye Salmon'}},
                   inplace=True)

## Convert SRS lice counts

In [116]:
# convert all the NaN in lice counts to 0
# first, make sure our expression checking for NaN works
assert srs_fish_df.loc[:, 'lep_cop':'unid_adult'].isnull().values.any()

In [117]:
# convert all the NaN lice counts to 0
col_names = srs_fish_df.loc[:, 'lep_cop':'unid_adult'].columns
fill_dict = {col: 0 for col in col_names}
srs_fish_df.fillna(value=fill_dict, inplace=True)

# confirm that they are all converted
assert not srs_fish_df.loc[:, 'lep_cop':'unid_adult'].isnull().values.any()

In [118]:
# confirm there are no NaN lice counts in our data - should return False
assert not fish_df.loc[:, 'lep_cop':'unknown_unknown'].isnull().values.any()

In [119]:
# generate lice count fields to match our data from the SRS lice counts
srs_fish_df['unknown_chal'] = srs_fish_df['chalA'] + \
                              srs_fish_df['chalB'] + \
                              srs_fish_df['chal_unid']

srs_fish_df['lep_motile'] = srs_fish_df['Lep_PAmale'] + \
                            srs_fish_df['Lep_PAfemale'] + \
                            srs_fish_df['Lep_male'] + \
                            srs_fish_df['Lep_nongravid'] + \
                            srs_fish_df['Lep_gravid']

srs_fish_df['cal_motile'] = srs_fish_df['Caligus_mot'] + srs_fish_df['Caligus_gravid']

srs_fish_df['unknown_motile'] = srs_fish_df['unid_PA'] + srs_fish_df['unid_adult']

In [120]:
srs_fish_df.columns

Index(['fish_id', 'site_id', 'year', 'day', 'month', 'location', 'fish_num',
       'fish_species', 'length', 'height', 'lep_cop', 'chalA', 'chalB',
       'Lep_PAmale', 'Lep_PAfemale', 'Lep_male', 'Lep_nongravid', 'Lep_gravid',
       'cal_cop', 'Caligus_mot', 'Caligus_gravid', 'unknown_cop', 'chal_unid',
       'unid_PA', 'unid_adult', 'chal_scar', 'pred_scar', 'mot_scar', 'hem',
       'mateguarding', 'eroded_gill', 'white_eye', 'blue_blotches',
       'pinched_belly', 'scales', 'comments', 'event_id', 'unknown_chal',
       'lep_motile', 'cal_motile', 'unknown_motile'],
      dtype='object')

## Join the two fish/lice dataframes

In [121]:
# make sure only the common columns are in the SRS dataframe
srs_fish_df = srs_fish_df.reindex(columns=fish_df.columns)

In [122]:
srs_fish_df.columns

Index(['event_id', 'fish_id', 'length', 'weight', 'height', 'fish_species',
       'lep_cop', 'lep_chal', 'lep_motile', 'lep_unknown', 'cal_cop',
       'cal_chal', 'cal_motile', 'cal_unknown', 'unknown_cop', 'unknown_chal',
       'unknown_motile', 'unknown_unknown'],
      dtype='object')

In [123]:
# There are some new lice count columns - so again make sure all lice count columns NaN means 0
col_names = srs_fish_df.loc[:, 'lep_cop':'unknown_unknown'].columns
fill_dict = {col: 0 for col in col_names}
srs_fish_df.fillna(value=fill_dict, inplace=True)

# check if there are any NaN in the lice counts
assert not srs_fish_df.loc[:, 'lep_cop':'unknown_unknown'].isnull().values.any()

In [124]:
# join the DFs together
all_fish_df = pd.concat([fish_df, srs_fish_df])

In [125]:
# rename species: Blenny, Cutthroat Trout, Non Salmonid => Other Species
other_species = "Other Species"
all_fish_df['fish_species'].replace({'Blenny': other_species,
                                     'Cutthroat Trout': other_species,
                                     'Non Salmonid': other_species},
                                    inplace=True)
all_fish_df.head()

Unnamed: 0,event_id,fish_id,length,weight,height,fish_species,lep_cop,lep_chal,lep_motile,lep_unknown,cal_cop,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown
0,5666.0,1,44.0,0.96,,Chum Salmon,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5666.0,2,39.0,0.61,,Pink Salmon,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5666.0,3,45.0,0.94,,Pink Salmon,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5666.0,4,43.0,0.84,,Chum Salmon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5666.0,5,38.0,0.53,,Pink Salmon,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
# save to file
all_fish_df.to_csv(all_fish_lice_filepath, index=False)