## This Notebook generates a CSV for Table 1b
The table shows the number of wild fish sampled by each organisation.
The CSV has the same structure as Table 1b, except that it omits the Total row at the bottom. The final presentation, including the Total row shown in the paper, was done in another tool.

In [11]:
from pathlib import Path
import pandas as pd

In [12]:
# locations of the input CSVs and of the folder the generated table is written to
DATA_DIR = Path('..', 'source_data')
WILD_FISH_DATA = DATA_DIR.joinpath('all_wild_fish_lice.csv')
WILD_EVENT_DATA = DATA_DIR.joinpath('all_wild_sample_events.csv')

OUTPUT_DIR = Path('..', 'output', 'Table_1b')

In [13]:
# import and merge the data
# Read event_id as text at parse time so both tables carry the join key exactly as
# written in the files (casting after dtype inference can turn 5666 into '5666.0'
# in one table only). low_memory=False makes pandas infer each column's dtype from
# the whole file rather than chunk by chunk, which avoids mixed-type columns.
events_df = pd.read_csv(WILD_EVENT_DATA, dtype={'event_id': str}, low_memory=False)
fish_df = pd.read_csv(WILD_FISH_DATA, dtype={'event_id': str}, low_memory=False)

# right join: keep every fish record, attaching its sampling-event details where available
wild_df = pd.merge(events_df, fish_df, on='event_id', how='right')
wild_df.head()

  fish_df = pd.read_csv(WILD_FISH_DATA)


Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source,source_code,fish_id,...,lep_unknown,cal_cop,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown,lice_protocol
0,bc_5666,2017-05-15,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,MBC,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lethal
1,bc_5666,2017-05-15,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,MBC,2,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Lethal
2,bc_5666,2017-05-15,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,MBC,3,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,Lethal
3,bc_5666,2017-05-15,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,MBC,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lethal
4,bc_5666,2017-05-15,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,MBC,5,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,Lethal


## Calculate the number of fish sampled each year by each sampling organisation

In [14]:
# derive the sampling year from the sample date
# (errors='coerce' turns unparseable dates into NaT, so their Year is missing)
sample_dates = pd.to_datetime(wild_df['sampledate'], errors='coerce', utc=True)
wild_df['sampledate'] = sample_dates
wild_df['Year'] = sample_dates.dt.year

In [15]:
# count the fish records (non-null fish_id) per sampling organisation and year:
# one row per Year, one column per source, 0 where a source has no records that year
year_source_df = wild_df.pivot_table(
    index='Year',
    columns='source',
    values='fish_id',
    aggfunc='count',
    fill_value=0,
).reset_index()

year_source_df.head()

source,Year,Broughton Archipelago Monitoring Plan,Cedar Coast Field Station,Fisheries and Oceans Canada,Hakai Institute,Kitasoo First Nation,Mainstream Biological Consulting,Marine Environmental Research Program,Marty Krkosek,Pacificus Biological Services,Salmon Coast Field Station
0,2001,0,0,0,0,0,0,0,0,0,268
1,2002,0,0,0,0,0,0,0,0,0,564
2,2003,0,0,29472,0,0,0,535,4333,0,676
3,2004,0,0,8657,0,0,0,4065,11575,0,1087
4,2005,0,0,6198,0,192,0,3422,0,0,2084


In [16]:
# inspect the column labels of the pivoted table ('Year' plus one column per source)
year_source_df.columns

Index(['Year', 'Broughton Archipelago Monitoring Plan',
       'Cedar Coast Field Station', 'Fisheries and Oceans Canada',
       'Hakai Institute', 'Kitasoo First Nation',
       'Mainstream Biological Consulting',
       'Marine Environmental Research Program', 'Marty Krkosek',
       'Pacificus Biological Services', 'Salmon Coast Field Station'],
      dtype='object', name='source')

In [17]:
# rename the columns for the paper
# The insertion order of this mapping is also the left-to-right presentation order,
# so the full names and the column order cannot drift apart.
SOURCE_ABBREVIATIONS = {
    'Salmon Coast Field Station': 'SCS',
    'Cedar Coast Field Station': 'CCS',
    'Hakai Institute': 'Hakai',
    'Marty Krkosek': 'MK',
    'Kitasoo First Nation': 'Kitasoo',
    'Fisheries and Oceans Canada': 'DFO',
    'Marine Environmental Research Program': 'MERP',
    'Mainstream Biological Consulting': 'MBC',
    'Pacificus Biological Services': 'Pacificus',
    'Broughton Archipelago Monitoring Plan': 'BAMP',
}

# reindex() below keeps only the listed columns, so a source missing from the mapping
# would silently vanish from the table; fail loudly instead
unmapped_sources = set(wild_df['source'].dropna()) - set(SOURCE_ABBREVIATIONS)
assert not unmapped_sources, f'sources without an abbreviation: {sorted(unmapped_sources)}'

year_source_df = year_source_df.rename(columns=SOURCE_ABBREVIATIONS)

# re-order the columns for presentation purposes
# fill_value=0: a listed source with no records gets a column of zero counts, not NaN
year_source_df = year_source_df.reindex(columns=['Year', *SOURCE_ABBREVIATIONS.values()],
                                        fill_value=0)

year_source_df

source,Year,SCS,CCS,Hakai,MK,Kitasoo,DFO,MERP,MBC,Pacificus,BAMP
0,2001,268,0,0,0,0,0,0,0,0,0
1,2002,564,0,0,0,0,0,0,0,0,0
2,2003,676,0,0,4333,0,29472,535,0,0,0
3,2004,1087,0,0,11575,0,8657,4065,0,0,0
4,2005,2084,0,0,0,192,6198,3422,0,0,0
5,2006,1708,0,0,12609,1816,7360,3227,0,0,0
6,2007,1650,0,0,17829,1132,9378,4599,0,0,0
7,2008,2345,0,0,21361,954,9170,5197,0,0,0
8,2009,2549,0,0,16577,1675,10320,4160,0,0,0
9,2010,2276,0,0,0,1852,0,3627,0,0,6090


## Calculate the total number of fish sampled each year

In [18]:
# inspect the columns from 'SCS' through 'BAMP' (a .loc label slice includes both endpoints)
year_source_df.loc[:, 'SCS':'BAMP'].columns

Index(['SCS', 'CCS', 'Hakai', 'MK', 'Kitasoo', 'DFO', 'MERP', 'MBC',
       'Pacificus', 'BAMP'],
      dtype='object', name='source')

In [19]:
# make a column for the total fish sampled that year
# Sum only the per-organisation columns. 'All fish' is excluded along with 'Year' so
# that re-running this cell does not add the previous total into the new one.
source_list = year_source_df.columns.drop(['Year', 'All fish'], errors='ignore').tolist()
year_source_df['All fish'] = year_source_df[source_list].sum(axis=1)

year_source_df.head()

source,Year,SCS,CCS,Hakai,MK,Kitasoo,DFO,MERP,MBC,Pacificus,BAMP,All fish
0,2001,268,0,0,0,0,0,0,0,0,0,268
1,2002,564,0,0,0,0,0,0,0,0,0,564
2,2003,676,0,0,4333,0,29472,535,0,0,0,35016
3,2004,1087,0,0,11575,0,8657,4065,0,0,0,25384
4,2005,2084,0,0,0,192,6198,3422,0,0,0,11896


In [20]:
# export the table to a CSV for final formatting
# create the output folder first: to_csv does not create missing directories,
# so the export would fail on a fresh checkout without it
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
year_source_df.to_csv(OUTPUT_DIR / 'Table_1b.csv', index=False)