## This notebook creates CSVs with summary statistics for lethal and non-lethal sampling
The table shows a comparison of lethal and non-lethal sampling.
Non-lethal sampling is source="Salmon Coast Field Station / Raincoast Research" and 'Marty Krkosek'
The data is constrained to
 - zone 3.3,
 - years 2008 and 2009,
 - out-migration months (March-June)
 - Chum and pink salmon

This script produces four CSVs: (non-lethal, lethal) X (2008, 2009)

In [84]:
from pathlib import Path
import pandas as pd

In [85]:
# define data location constants
DATA_DIR = Path('..') / 'source_data'
WILD_FISH_DATA = DATA_DIR / 'all_wild_fish_lice.csv'
WILD_EVENT_DATA = DATA_DIR / 'all_wild_sample_events.csv'

OUTPUT_DIR = Path('..') / 'output' / 'Table_7'

In [86]:
# import and merge the data
events_df = pd.read_csv(WILD_EVENT_DATA)
events_df['event_id'] = events_df['event_id'].astype(str)
fish_df = pd.read_csv(WILD_FISH_DATA)
fish_df['event_id'] = fish_df['event_id'].astype(str)
wild_df = pd.merge(events_df, fish_df, on='event_id', how='right')
wild_df.head()

  fish_df = pd.read_csv(WILD_FISH_DATA)


Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source,fish_id,length,...,lep_unknown,cal_cop,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown,lice_protocol
0,5666,2017-05-15,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,1,44.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
1,5666,2017-05-15,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,2,39.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,
2,5666,2017-05-15,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,3,45.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,
3,5666,2017-05-15,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,4,43.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
4,5666,2017-05-15,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,5,38.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [87]:
# create month and year columns
wild_df['sampledate'] = pd.to_datetime(wild_df['sampledate'], errors='coerce', utc=True)
wild_df['year'] = wild_df.sampledate.dt.year
wild_df['month'] = wild_df.sampledate.dt.month

In [88]:
# make a new column for count of all lice
wild_df['all_lice'] = wild_df.loc[:, 'lep_cop':'unknown_unknown'].sum(axis=1)
wild_df.head()

  wild_df['all_lice'] = wild_df.loc[:, 'lep_cop':'unknown_unknown'].sum(axis=1)


Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source,fish_id,length,...,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown,lice_protocol,year,month,all_lice
0,5666,2017-05-15 00:00:00+00:00,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,1,44.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,2017.0,5.0,0.0
1,5666,2017-05-15 00:00:00+00:00,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,2,39.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,2017.0,5.0,2.0
2,5666,2017-05-15 00:00:00+00:00,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,3,45.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,2017.0,5.0,2.0
3,5666,2017-05-15 00:00:00+00:00,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,4,43.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,2017.0,5.0,0.0
4,5666,2017-05-15 00:00:00+00:00,Broughton Archipelago,3_3,Swanson Island Fish Farm,50.61806666666666,-126.701233,Mainstream Biological Consulting,5,38.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,2017.0,5.0,3.0


In [89]:
wild_df.dfozone.unique()

array(['3_3', '2_3', '3_2', '2_4', '3_4', '3_1', '3_5', nan], dtype=object)

In [90]:
# constrain to the data that we are interested in
wild_df = wild_df[(wild_df.dfozone == "3_3")]
wild_df.year.unique()

array([2017., 2016., 2015., 2003., 2010., 2011., 2012., 2004., 2005.,
       2006., 2008., 2009., 2007., 2018., 2019., 2020., 2021., 2022.,
       2023., 2001.])

In [91]:
wild_df = wild_df[(wild_df.year.isin([2008.0, 2009.0]))]
wild_df.year.unique()

array([2008., 2009.])

In [92]:
wild_df = wild_df[(wild_df.month.isin([3, 4, 5, 6, 7]))]
wild_df.fish_species.unique()

array(['Pink Salmon', 'Chum Salmon', 'Three-Spined Stickleback',
       'Other Species', 'Chinook Salmon', 'Pacific Herring',
       'Coho Salmon'], dtype=object)

In [93]:
wild_df = wild_df[(wild_df.fish_species.isin(['Chum Salmon', 'Pink Salmon']))]
wild_df.head()

Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source,fish_id,length,...,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown,lice_protocol,year,month,all_lice
35679,5371,2008-03-26 00:00:00+00:00,Broughton Archipelago,3_3,Slope Point,50.639999,-126.54,Fisheries and Oceans Canada,35680,35.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,2008.0,3.0,0.0
35680,6027,2008-03-27 00:00:00+00:00,Broughton Archipelago,3_3,Tribune Bay,50.639999,-126.46,Fisheries and Oceans Canada,35681,34.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,2008.0,3.0,0.0
35681,2264,2008-03-27 00:00:00+00:00,Broughton Archipelago,3_3,Gilford Bay,50.650002,-126.38,Fisheries and Oceans Canada,35682,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,2008.0,3.0,0.0
35682,1786,2008-03-28 00:00:00+00:00,Broughton Archipelago,3_3,Doctor Island,50.650002,-126.29,Fisheries and Oceans Canada,35683,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,2008.0,3.0,0.0
35683,1786,2008-03-28 00:00:00+00:00,Broughton Archipelago,3_3,Doctor Island,50.650002,-126.29,Fisheries and Oceans Canada,35684,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,2008.0,3.0,0.0


In [94]:
wild_df.source.unique()

array(['Fisheries and Oceans Canada', 'Marty Krkosek'], dtype=object)

In [95]:
# label the non-lethal sampling
NONLETHAL = 'nonlethal'
LETHAL = 'lethal'
# All sampling from Marty Krkosek and Salmon Coast are non-lethal
nonlethal_sources = ['Marty Krkosek', 'Salmon Coast Field Station', 'Cedar Coast Field Station']

wild_df['sampling'] = wild_df.source.apply(lambda x: NONLETHAL if x in nonlethal_sources else LETHAL)

In [96]:
def fill_year_stats(input_df, lethality_type):
    """
    Creates a dataframe from the input data. Output is number of fish, non-infested fish count, infested fish count, lice count,
    prevalence, intensity. Output is broken down by month.
    :param input_df: restricted to lethal or non-lethal and year
    :type input_df: pandas.Dataframe object
    :param lethality_type: Indicator of type of data, will be used as first column name
    :type lethality_type: str
    :return: Dataframe of summary stats as per table 7
    :rtype: pandas.Dataframe object
    """
    month_list = sorted(list(input_df.month.unique()))

    output_dict = {lethality_type: []}
    for month in month_list:
        output_dict[month] = []

    # keep track of some numbers to help later calculations
    num_all_fish = {}
    num_infested_fish = {}
    num_all_lice = {}

    # get counts of all fish
    output_dict[lethality_type].append('All fish')
    for month in month_list:
        num_fish = len(input_df[input_df.month == month])
        output_dict[month].append(num_fish)

        num_all_fish[month] = num_fish

    # get counts of non-infested fish
    output_dict[lethality_type].append('Non-infested')
    for month in month_list:
        num_fish = len(input_df[(input_df.month == month) & (input_df.all_lice == 0)])
        output_dict[month].append(num_fish)

    # get counts of infested fish
    output_dict[lethality_type].append('Infested')
    for month in month_list:
        num_fish = len(input_df[(input_df.month == month) & (input_df.all_lice > 0)])
        output_dict[month].append(num_fish)

        num_infested_fish[month] = num_fish

    # get counts of all lice
    output_dict[lethality_type].append('All lice')
    for month in month_list:
        num_lice = input_df[(input_df.month == month)].all_lice.sum()
        output_dict[month].append(num_lice)

        num_all_lice[month] = num_lice

    # calculate prevalence
    output_dict[lethality_type].append('Prevalence')
    for month in month_list:
        prevalence = num_infested_fish[month] / num_all_fish[month]
        output_dict[month].append(prevalence)

    # calculate intensity
    output_dict[lethality_type].append('Intensity')
    for month in month_list:
        prevalence = num_all_lice[month] / num_infested_fish[month]
        output_dict[month].append(prevalence)

    # make into a dataframe and return
    output_df = pd.DataFrame(output_dict)

    # give the months friendly names
    output_df.rename(columns={3: 'Mar', 4: 'Apr', 5:'May', 6:'Jun', 7:'Jul'}, inplace=True)

    return output_df

### Non-lethal, 2008

In [97]:
non_lethal_2008_input_df = wild_df[(wild_df.sampling == NONLETHAL) & (wild_df.year == 2008)]
non_lethal_2008_df = fill_year_stats(non_lethal_2008_input_df, 'Non-lethal')

# write to file
non_lethal_2008_df.to_csv(OUTPUT_DIR / 'non_lethal_2008.csv', index=False)

non_lethal_2008_df.head(10)

Unnamed: 0,Non-lethal,Apr,May,Jun
0,All fish,5300.0,8818.0,7243.0
1,Non-infested,4911.0,7922.0,5538.0
2,Infested,389.0,896.0,1705.0
3,All lice,429.0,1152.0,2615.0
4,Prevalence,0.073396,0.10161,0.2354
5,Intensity,1.102828,1.285714,1.533724


### Non-lethal, 2009

In [98]:
non_lethal_2009_input_df = wild_df[(wild_df.sampling == NONLETHAL) & (wild_df.year == 2009)]
non_lethal_2009_df = fill_year_stats(non_lethal_2009_input_df, 'Non-lethal')

# write to file
non_lethal_2009_df.to_csv(OUTPUT_DIR / 'non_lethal_2009.csv', index=False)

non_lethal_2009_df.head(10)


Unnamed: 0,Non-lethal,Apr,May,Jun
0,All fish,3174.0,8948.0,4455.0
1,Non-infested,3081.0,7483.0,3699.0
2,Infested,93.0,1465.0,756.0
3,All lice,94.0,1943.0,1057.0
4,Prevalence,0.029301,0.163724,0.169697
5,Intensity,1.010753,1.32628,1.398148


### Lethal, 2008

In [99]:
lethal_2008_input_df = wild_df[(wild_df.sampling == LETHAL) & (wild_df.year == 2008)]
lethal_2008_df = fill_year_stats(lethal_2008_input_df, 'Lethal')

# write to file
lethal_2008_df.to_csv(OUTPUT_DIR / 'lethal_2008.csv', index=False)

lethal_2008_df.head(10)

Unnamed: 0,Lethal,Mar,Apr,May,Jun
0,All fish,544.0,1432.0,3348.0,2343.0
1,Non-infested,542.0,1375.0,3162.0,2026.0
2,Infested,2.0,57.0,186.0,317.0
3,All lice,2.0,68.0,253.0,489.0
4,Prevalence,0.003676,0.039804,0.055556,0.135297
5,Intensity,1.0,1.192982,1.360215,1.542587


### Lethal, 2009

In [100]:
lethal_2009_input_df = wild_df[(wild_df.sampling == LETHAL) & (wild_df.year == 2009)]
lethal_2009_df = fill_year_stats(lethal_2009_input_df, 'Lethal')

# write to file
lethal_2009_df.to_csv(OUTPUT_DIR / 'lethal_2009.csv', index=False)

lethal_2009_df.head(10)

  prevalence = num_all_lice[month] / num_infested_fish[month]


Unnamed: 0,Lethal,Mar,Apr,May,Jun,Jul
0,All fish,479.0,1092.0,2988.0,2904.0,261.0
1,Non-infested,479.0,1070.0,2790.0,2490.0,206.0
2,Infested,0.0,22.0,198.0,414.0,55.0
3,All lice,0.0,26.0,231.0,770.0,104.0
4,Prevalence,0.0,0.020147,0.066265,0.142562,0.210728
5,Intensity,,1.181818,1.166667,1.859903,1.890909
