## This notebook creates CSVs with summary statistics for lethal and non-lethal sampling
The table shows a comparison of lethal and non-lethal sampling.
Non-lethal sampling is source="Salmon Coast Field Station" and 'Marty Krkosek'
The data is constrained to
 - zone 3.3,
 - years 2008 and 2009,
 - out-migration months (March-June)
 - Chum and pink salmon

This script produces four CSVs: (non-lethal, lethal) X (2008, 2009)

In [344]:
from pathlib import Path
import pandas as pd

In [345]:
# define data location constants
DATA_DIR = Path('..') / 'source_data'
WILD_FISH_DATA = DATA_DIR / 'all_wild_fish_lice.csv'
WILD_EVENT_DATA = DATA_DIR / 'all_wild_sample_events.csv'

OUTPUT_DIR = Path('..') / 'output' / 'Table_7'

In [346]:
# import and merge the data
events_df = pd.read_csv(WILD_EVENT_DATA, parse_dates=['sampledate'])
events_df['event_id'] = events_df['event_id'].astype(str)
fish_df = pd.read_csv(WILD_FISH_DATA)
fish_df['event_id'] = fish_df['event_id'].astype(str)
wild_df = pd.merge(events_df, fish_df, on='event_id', how='left')
wild_df.head()

  fish_df = pd.read_csv(WILD_FISH_DATA)


Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source,source_code,fish_id,...,lep_unknown,cal_cop,cal_chal,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown,lice_protocol
0,1,2003-05-13,Broughton Archipelago,3_3,Adeane Point,50.71978,-125.6795,Fisheries and Oceans Canada,DFO,1715,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lethal
1,1,2003-05-13,Broughton Archipelago,3_3,Adeane Point,50.71978,-125.6795,Fisheries and Oceans Canada,DFO,1716,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lethal
2,1,2003-05-13,Broughton Archipelago,3_3,Adeane Point,50.71978,-125.6795,Fisheries and Oceans Canada,DFO,1717,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lethal
3,1,2003-05-13,Broughton Archipelago,3_3,Adeane Point,50.71978,-125.6795,Fisheries and Oceans Canada,DFO,1718,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lethal
4,1,2003-05-13,Broughton Archipelago,3_3,Adeane Point,50.71978,-125.6795,Fisheries and Oceans Canada,DFO,1719,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lethal


In [347]:
wild_df.loc[wild_df.source_code == 'SCS'].fish_species.unique()

array(['Pink Salmon', 'Chum Salmon', 'Sockeye Salmon', nan], dtype=object)

In [348]:
wild_df.sampledate.dtype

dtype('<M8[ns]')

In [349]:
# create month and year columns
wild_df['year'] = wild_df.sampledate.dt.year
wild_df['month'] = wild_df.sampledate.dt.month

In [350]:
# just the bits we need for this analysis
wild_df = wild_df[(wild_df.dfozone == "3_3")]

wild_df = wild_df[(wild_df.year.isin([2008, 2009]))]

wild_df.loc[wild_df.source_code == 'SCS'].fish_species.unique()
# wild_df.source_code.unique()
# wild_df.month.dtype

array(['Chum Salmon', 'Pink Salmon'], dtype=object)

In [351]:

wild_df = wild_df[(wild_df.month.isin([3, 4, 5, 6, 7]))]

wild_df = wild_df[(wild_df.fish_species.isin(['Chum Salmon', 'Pink Salmon']))]
sc = wild_df.source_code.unique()
sf = wild_df.source.unique()
print("years: {}\nmonths: {}\nspecies: {}\nsources: {}{}".format(
    wild_df.year.unique(), wild_df.month.unique(), wild_df.fish_species.unique(),
    sc, sf
))

years: [2008 2009]
months: [4 5 6 3 7]
species: ['Pink Salmon' 'Chum Salmon']
sources: ['DFO' 'MK' 'SCS']['Fisheries and Oceans Canada' 'Marty Krkosek'
 'Salmon Coast Field Station']


In [352]:
# make a new column for count of all lice
wild_df['all_lice'] = wild_df.loc[:, 'lep_cop':'unknown_unknown'].sum(axis=1)
wild_df.head()

Unnamed: 0,event_id,sampledate,region,dfozone,sample_site,latitude,longitude,source,source_code,fish_id,...,cal_motile,cal_unknown,unknown_cop,unknown_chal,unknown_motile,unknown_unknown,lice_protocol,year,month,all_lice
1868,130,2008-04-28,Broughton Archipelago,3_3,Alder Point,50.869999,-126.87,Fisheries and Oceans Canada,DFO,37358,...,0.0,0.0,0.0,0.0,0.0,0.0,Lethal,2008,4,0.0
1869,130,2008-04-28,Broughton Archipelago,3_3,Alder Point,50.869999,-126.87,Fisheries and Oceans Canada,DFO,37359,...,0.0,0.0,0.0,0.0,0.0,0.0,Lethal,2008,4,0.0
1870,130,2008-04-28,Broughton Archipelago,3_3,Alder Point,50.869999,-126.87,Fisheries and Oceans Canada,DFO,37360,...,0.0,0.0,0.0,0.0,0.0,0.0,Lethal,2008,4,0.0
1871,130,2008-04-28,Broughton Archipelago,3_3,Alder Point,50.869999,-126.87,Fisheries and Oceans Canada,DFO,37361,...,0.0,0.0,0.0,0.0,0.0,0.0,Lethal,2008,4,0.0
1872,130,2008-04-28,Broughton Archipelago,3_3,Alder Point,50.869999,-126.87,Fisheries and Oceans Canada,DFO,37362,...,0.0,0.0,0.0,0.0,0.0,0.0,Lethal,2008,4,0.0


In [353]:
wild_df.source.unique()

array(['Fisheries and Oceans Canada', 'Marty Krkosek',
       'Salmon Coast Field Station'], dtype=object)

In [354]:
# label the non-lethal sampling
NONLETHAL = 'nonlethal'
LETHAL = 'lethal'
# All sampling from Marty Krkosek and Salmon Coast are non-lethal
nonlethal_sources = ['Marty Krkosek', 'Salmon Coast Field Station', 'Cedar Coast Field Station']

wild_df['sampling'] = wild_df.source.apply(lambda x: NONLETHAL if x in nonlethal_sources else LETHAL)

In [355]:
def fill_year_stats(input_df, lethality_type):
    """
    Creates a dataframe from the input data. Output is number of fish, non-infested fish count, infested fish count, lice count,
    prevalence, intensity. Output is broken down by month.
    :param input_df: restricted to lethal or non-lethal and year
    :type input_df: pandas.Dataframe object
    :param lethality_type: Indicator of type of data, will be used as first column name
    :type lethality_type: str
    :return: Dataframe of summary stats as per table 7
    :rtype: pandas.Dataframe object
    """
    month_list = sorted(list(input_df.month.unique()))

    output_dict = {lethality_type: []}
    for month in month_list:
        output_dict[month] = []

    # keep track of some numbers to help later calculations
    num_all_fish = {}
    num_infested_fish = {}
    num_all_lice = {}

    # get counts of all fish
    output_dict[lethality_type].append('All fish')
    for month in month_list:
        num_fish = len(input_df[input_df.month == month])
        output_dict[month].append(num_fish)

        num_all_fish[month] = num_fish

    # get counts of non-infested fish
    output_dict[lethality_type].append('Non-infested')
    for month in month_list:
        num_fish = len(input_df[(input_df.month == month) & (input_df.all_lice == 0)])
        output_dict[month].append(num_fish)

    # get counts of infested fish
    output_dict[lethality_type].append('Infested')
    for month in month_list:
        num_fish = len(input_df[(input_df.month == month) & (input_df.all_lice > 0)])
        output_dict[month].append(num_fish)

        num_infested_fish[month] = num_fish

    # get counts of all lice
    output_dict[lethality_type].append('All lice')
    for month in month_list:
        num_lice = input_df[(input_df.month == month)].all_lice.sum()
        output_dict[month].append(num_lice)

        num_all_lice[month] = num_lice

    # calculate prevalence
    output_dict[lethality_type].append('Prevalence')
    for month in month_list:
        prevalence = num_infested_fish[month] / num_all_fish[month]
        output_dict[month].append(prevalence)

    # calculate intensity
    output_dict[lethality_type].append('Intensity')
    for month in month_list:
        prevalence = num_all_lice[month] / num_infested_fish[month]
        output_dict[month].append(prevalence)

    # make into a dataframe and return
    output_df = pd.DataFrame(output_dict)

    # give the months friendly names
    output_df.rename(columns={3: 'Mar', 4: 'Apr', 5:'May', 6:'Jun', 7:'Jul'}, inplace=True)

    return output_df

### Non-lethal, 2008

In [356]:
non_lethal_2008_input_df = wild_df[(wild_df.sampling == NONLETHAL) & (wild_df.year == 2008)]
non_lethal_2008_df = fill_year_stats(non_lethal_2008_input_df, 'Non-lethal')

# write to file
non_lethal_2008_df.to_csv(OUTPUT_DIR / 'non_lethal_2008.csv', index=False)

non_lethal_2008_df.head(10)

Unnamed: 0,Non-lethal,Mar,Apr,May,Jun,Jul
0,All fish,29.0,5669.0,9451.0,7997.0,560.0
1,Non-infested,25.0,5216.0,8160.0,5978.0,221.0
2,Infested,4.0,453.0,1291.0,2019.0,339.0
3,All lice,5.0,504.0,1738.0,3114.0,645.0
4,Prevalence,0.137931,0.079908,0.136599,0.25247,0.605357
5,Intensity,1.25,1.112583,1.346243,1.542348,1.902655


### Non-lethal, 2009

In [357]:
non_lethal_2009_input_df = wild_df[(wild_df.sampling == NONLETHAL) & (wild_df.year == 2009)]
non_lethal_2009_df = fill_year_stats(non_lethal_2009_input_df, 'Non-lethal')

# write to file
non_lethal_2009_df.to_csv(OUTPUT_DIR / 'non_lethal_2009.csv', index=False)

non_lethal_2009_df.head(10)


Unnamed: 0,Non-lethal,Apr,May,Jun,Jul
0,All fish,3174.0,9929.0,5495.0,528.0
1,Non-infested,3049.0,8160.0,4489.0,396.0
2,Infested,125.0,1769.0,1006.0,132.0
3,All lice,128.0,2322.0,1386.0,187.0
4,Prevalence,0.039382,0.178165,0.183076,0.25
5,Intensity,1.024,1.312606,1.377734,1.416667


### Lethal, 2008

In [358]:
lethal_2008_input_df = wild_df[(wild_df.sampling == LETHAL) & (wild_df.year == 2008)]
lethal_2008_df = fill_year_stats(lethal_2008_input_df, 'Lethal')

# write to file
lethal_2008_df.to_csv(OUTPUT_DIR / 'lethal_2008.csv', index=False)

lethal_2008_df.head(10)

Unnamed: 0,Lethal,Mar,Apr,May,Jun
0,All fish,544.0,1432.0,3348.0,2343.0
1,Non-infested,542.0,1374.0,3138.0,2025.0
2,Infested,2.0,58.0,210.0,318.0
3,All lice,2.0,70.0,302.0,494.0
4,Prevalence,0.003676,0.040503,0.062724,0.135723
5,Intensity,1.0,1.206897,1.438095,1.553459


### Lethal, 2009

In [359]:
lethal_2009_input_df = wild_df[(wild_df.sampling == LETHAL) & (wild_df.year == 2009)]
lethal_2009_df = fill_year_stats(lethal_2009_input_df, 'Lethal')

# write to file
lethal_2009_df.to_csv(OUTPUT_DIR / 'lethal_2009.csv', index=False)

lethal_2009_df.head(10)

  prevalence = num_all_lice[month] / num_infested_fish[month]


Unnamed: 0,Lethal,Mar,Apr,May,Jun,Jul
0,All fish,479.0,1092.0,2988.0,2904.0,261.0
1,Non-infested,479.0,1070.0,2776.0,2486.0,206.0
2,Infested,0.0,22.0,212.0,418.0,55.0
3,All lice,0.0,26.0,251.0,776.0,104.0
4,Prevalence,0.0,0.020147,0.07095,0.143939,0.210728
5,Intensity,,1.181818,1.183962,1.856459,1.890909
