## This notebook reformats the Salmon Coast Field Station wild sampling data to match the format of our wild sampling data.
### To integrate it with other wild data, run 'Integrate_Wild_Data.ipynb' after generating the formatted data.

In [32]:
import pandas as pd
from pathlib import Path

### Configuration variables for this notebook
Adjust for your local setup

In [33]:
# Inputs: the raw Salmon Coast CSV files (site/event table and per-fish lice table)
scfs_wild_data_dir = Path('SalmonCoast', 'Data')
scfs_events_filepath = scfs_wild_data_dir.joinpath('BroughtonSeaLice_siteData.csv')
scfs_fish_lice_filepath = scfs_wild_data_dir.joinpath('BroughtonSeaLice_fishData.csv')

# Outputs: where the reformatted Salmon Coast Field Station tables are written
wild_data_dir = Path('.')
scfs_formatted_events_filepath = wild_data_dir.joinpath('salmon_coast_wild_sample_events.csv')
scfs_formatted_lice_filepath = wild_data_dir.joinpath('salmon_coast_wild_fish_lice.csv')

# Format the event file

In [34]:
# Load the event (site) table, combining the separate year / month / day
# columns into a single parsed 'sampledate' column as part of the read.
# NOTE(review): building a date from several columns via a dict-valued
# parse_dates is deprecated in recent pandas releases - confirm against the
# pandas version this notebook is run with.
sampledate_parts = {'sampledate': ['year', 'month', 'day']}
scfs_events_df = pd.read_csv(scfs_events_filepath, parse_dates=sampledate_parts)
scfs_events_df.head()

Unnamed: 0,sampledate,site_id,location,salt,temp,salmon_captured,salmon_examined,pink_examined,chum_examined,sockeye_examined,morts_recovery,morts_other,P_ratio,C_ratio,latitude,longitude,bycatch,notes
0,2001-06-12,1.0,Burdwood,,,,18.0,18.0,0.0,0.0,,,,,,,,
1,2001-06-13,2.0,Wicklow,,,,10.0,10.0,0.0,0.0,,,,,,,,
2,2001-06-16,3.0,Glacier,,,,77.0,77.0,0.0,0.0,,,,,,,,
3,2001-06-23,4.0,Burdwood,,,,11.0,11.0,0.0,0.0,,,,,,,,
4,2001-06-24,5.0,Burdwood,,,,12.0,12.0,0.0,0.0,,,,,,,,


In [35]:
# site_id can only double as an event ID if no two rows share a value.
# Count the distinct site_id values (a missing value counts as one value)
# and compare that with the number of rows.
scfs_events_df['site_id'].nunique(dropna=False) == len(scfs_events_df)

True

In [36]:
# Build an event ID that stays unique across all wild data sources:
# the site_id rendered as a string with an 'scfs_' prefix.
# site_id is read as a float, so the IDs look like 'scfs_1.0'.
scfs_events_df['event_id'] = scfs_events_df['site_id'].map('scfs_{}'.format)
scfs_events_df.head()

Unnamed: 0,sampledate,site_id,location,salt,temp,salmon_captured,salmon_examined,pink_examined,chum_examined,sockeye_examined,morts_recovery,morts_other,P_ratio,C_ratio,latitude,longitude,bycatch,notes,event_id
0,2001-06-12,1.0,Burdwood,,,,18.0,18.0,0.0,0.0,,,,,,,,,scfs_1.0
1,2001-06-13,2.0,Wicklow,,,,10.0,10.0,0.0,0.0,,,,,,,,,scfs_2.0
2,2001-06-16,3.0,Glacier,,,,77.0,77.0,0.0,0.0,,,,,,,,,scfs_3.0
3,2001-06-23,4.0,Burdwood,,,,11.0,11.0,0.0,0.0,,,,,,,,,scfs_4.0
4,2001-06-24,5.0,Burdwood,,,,12.0,12.0,0.0,0.0,,,,,,,,,scfs_5.0


In [37]:
# Add the columns that later processing expects.
# The source data has no need for them because it covers a single region
# and comes from a single source, so they are constants here
# (sample_site is simply a copy of the existing location column).
scfs_events_df = scfs_events_df.assign(
    region='Broughton Archipelago',
    dfozone='3_3',
    sample_site=scfs_events_df['location'],
    source='Salmon Coast Field Station',
)

In [38]:
# Longitudes should be around -126, but some were entered as positive
# numbers, some are missing, and some are already correct.
# First force the column to numeric; anything unparseable becomes NaN.
numeric_longitude = pd.to_numeric(scfs_events_df['longitude'], errors='coerce')
# Then flip the sign of the strictly positive values only;
# NaN and non-positive values pass through unchanged.
scfs_events_df['longitude'] = numeric_longitude.mask(numeric_longitude > 0, -numeric_longitude)

In [39]:
# Keep only the columns shared with the main events file, in this order.
common_event_columns = [
    'event_id',
    'sampledate',
    'region',
    'dfozone',
    'sample_site',
    'latitude',
    'longitude',
    'source',
]
scfs_events_df = scfs_events_df.reindex(columns=common_event_columns)

In [40]:
# preview the first five formatted event rows
scfs_events_df.head(n=5)

Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source
0,scfs_1.0,2001-06-12,Broughton Archipelago,3_3,Burdwood,,,Salmon Coast Field Station
1,scfs_2.0,2001-06-13,Broughton Archipelago,3_3,Wicklow,,,Salmon Coast Field Station
2,scfs_3.0,2001-06-16,Broughton Archipelago,3_3,Glacier,,,Salmon Coast Field Station
3,scfs_4.0,2001-06-23,Broughton Archipelago,3_3,Burdwood,,,Salmon Coast Field Station
4,scfs_5.0,2001-06-24,Broughton Archipelago,3_3,Burdwood,,,Salmon Coast Field Station


In [41]:
# Save the formatted events table as CSV, leaving out the DataFrame index.
scfs_events_df.to_csv(path_or_buf=scfs_formatted_events_filepath, index=False)

# Format the fish and lice data file

In [42]:
# Load the per-fish lice table; the file is read with latin-1 encoding.
scfs_fish_df = pd.read_csv(filepath_or_buffer=scfs_fish_lice_filepath, encoding='latin-1')
scfs_fish_df.head()

Unnamed: 0,fish_id,site_id,year,day,month,location,fish_num,species,length,height,...,pred_scar,mot_scar,hem,mateguarding,eroded_gill,white_eye,blue_blotches,pinched_belly,scales,comments
0,1,1.0,2001,12,6,Burdwood,,pink,50.0,,...,,,,,,,,,,
1,2,1.0,2001,12,6,Burdwood,,pink,50.0,,...,,,,,,,,,,
2,3,1.0,2001,12,6,Burdwood,,pink,50.0,,...,,,,,,,,,,
3,4,1.0,2001,12,6,Burdwood,,pink,53.0,,...,,,,,,,,,,
4,5,1.0,2001,12,6,Burdwood,,pink,54.0,,...,,,,,,,,,,


In [43]:
# Rename the Salmon Coast columns that hold the same data as columns in our
# files, so they carry our column names.
equivalent_column_names = {
    'species': 'fish_species',
    'Lep_cope': 'lep_cop',
    'Caligus_cope': 'cal_cop',
    'unid_cope': 'unknown_cop',
}
scfs_fish_df = scfs_fish_df.rename(columns=equivalent_column_names)

### Convert event_id and fish_id

In [44]:
# Make the event_id column in the fish/lice table match the event_id in the
# events table: the site_id as a string with the same 'scfs_' prefix.
scfs_fish_df['event_id'] = scfs_fish_df['site_id'].map('scfs_{}'.format)

In [45]:
# Give the Salmon Coast fish globally unique IDs using the same 'scfs_'
# prefix scheme as the event IDs.
# (An alternative - offsetting the numeric IDs by the largest existing
# fish_id - was tried earlier and is not used.)
scfs_fish_df['fish_id'] = scfs_fish_df['fish_id'].map('scfs_{}'.format)

In [46]:
# preview the first five rows after the ID conversion
scfs_fish_df.head(n=5)

Unnamed: 0,fish_id,site_id,year,day,month,location,fish_num,fish_species,length,height,...,mot_scar,hem,mateguarding,eroded_gill,white_eye,blue_blotches,pinched_belly,scales,comments,event_id
0,scfs_1,1.0,2001,12,6,Burdwood,,pink,50.0,,...,,,,,,,,,,scfs_1.0
1,scfs_2,1.0,2001,12,6,Burdwood,,pink,50.0,,...,,,,,,,,,,scfs_1.0
2,scfs_3,1.0,2001,12,6,Burdwood,,pink,50.0,,...,,,,,,,,,,scfs_1.0
3,scfs_4,1.0,2001,12,6,Burdwood,,pink,53.0,,...,,,,,,,,,,scfs_1.0
4,scfs_5,1.0,2001,12,6,Burdwood,,pink,54.0,,...,,,,,,,,,,scfs_1.0


## Convert Salmon Coast fish species to match the names we use
Our names: ['Chum Salmon', 'Pink Salmon', 'Coho Salmon',
       'Three-Spined Stickleback', 'Chinook Salmon', 'Other Species',
       'Pacific Herring', 'Sockeye Salmon']

In [47]:
# List the distinct species labels present in the Salmon Coast dataset.
scfs_fish_df['fish_species'].unique()

array(['pink', 'chum', 'sockeye'], dtype=object)

In [48]:
# The species labels differ from ours as string literals, so translate
# each Salmon Coast label to the name we use.
species_name_map = {
    'chum': 'Chum Salmon',
    'pink': 'Pink Salmon',
    'sockeye': 'Sockeye Salmon',
}
scfs_fish_df['fish_species'] = scfs_fish_df['fish_species'].replace(species_name_map)

## Convert Salmon Coast lice counts

In [49]:
# Before converting NaN lice counts to 0, confirm that the NaN test itself
# works: the lice count columns (lep_cop through unid_adult, by position)
# are expected to contain at least one NaN at this point.
raw_lice_counts = scfs_fish_df.loc[:, 'lep_cop':'unid_adult']
assert raw_lice_counts.isna().to_numpy().any()

In [50]:
# Replace every NaN in the lice count columns (lep_cop through unid_adult,
# by position) with 0, leaving all other columns alone.
raw_count_columns = list(scfs_fish_df.loc[:, 'lep_cop':'unid_adult'].columns)
scfs_fish_df = scfs_fish_df.fillna(value=dict.fromkeys(raw_count_columns, 0))

# verify that no NaN is left in those columns
assert scfs_fish_df.loc[:, 'lep_cop':'unid_adult'].notna().to_numpy().all()

In [51]:
# Build the lice count fields used in our data by summing the matching
# Salmon Coast count columns. Each target column is the left-to-right sum
# of the source columns listed for it.
derived_count_sources = {
    'unknown_chal': ['chalA', 'chalB', 'chal_unid'],
    'lep_motile': ['Lep_PAmale', 'Lep_PAfemale', 'Lep_male', 'Lep_nongravid', 'Lep_gravid'],
    'cal_motile': ['Caligus_mot', 'Caligus_gravid'],
    'unknown_motile': ['unid_PA', 'unid_adult'],
}

for target_column, source_columns in derived_count_sources.items():
    running_total = scfs_fish_df[source_columns[0]]
    for source_column in source_columns[1:]:
        running_total = running_total + scfs_fish_df[source_column]
    scfs_fish_df[target_column] = running_total

In [52]:
# Remove the redundant working columns by restricting the Salmon Coast frame
# to the column layout of our own fish/lice data file.
# Only the header of that file is needed, so read zero data rows (nrows=0)
# rather than loading the whole file just to get its column names.
our_fish_data_df = pd.read_csv(wild_data_dir / "wild_fish_lice.csv", nrows=0)
# reindex keeps our columns in our order; any of our columns that the
# Salmon Coast data does not have are created and filled with NaN
scfs_fish_df = scfs_fish_df.reindex(columns=our_fish_data_df.columns)
scfs_fish_df.head()

Unnamed: 0,event_id,fish_id,length,weight,height,fish_species,lep_cop,lep_chal,lep_motile,lep_unknown,cal_cop,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown
0,scfs_1.0,scfs_1,50.0,,,Pink Salmon,0.0,,0.0,,0.0,,0.0,,0.0,2.0,0.0,
1,scfs_1.0,scfs_2,50.0,,,Pink Salmon,0.0,,0.0,,0.0,,0.0,,0.0,0.0,3.0,
2,scfs_1.0,scfs_3,50.0,,,Pink Salmon,0.0,,0.0,,0.0,,0.0,,0.0,4.0,0.0,
3,scfs_1.0,scfs_4,53.0,,,Pink Salmon,0.0,,0.0,,0.0,,0.0,,0.0,1.0,1.0,
4,scfs_1.0,scfs_5,54.0,,,Pink Salmon,0.0,,0.0,,0.0,,0.0,,0.0,5.0,1.0,


In [53]:
# The reindex introduced new lice count columns that are entirely NaN, so
# once more treat NaN as 0 across every lice count column
# (lep_cop through unknown_unknown, by position).
final_count_columns = list(scfs_fish_df.loc[:, 'lep_cop':'unknown_unknown'].columns)
scfs_fish_df = scfs_fish_df.fillna(value=dict.fromkeys(final_count_columns, 0))

# verify that no NaN is left in the lice counts
assert scfs_fish_df.loc[:, 'lep_cop':'unknown_unknown'].notna().to_numpy().all()

In [54]:

# Write the complete formatted fish and lice table out to a CSV, without the
# DataFrame index. The whole frame is written: calling .head() first would
# save only the first five rows.
scfs_fish_df.to_csv(scfs_formatted_lice_filepath, index=False)