## This notebook creates a CSV with the summary statistics in Table 5
The table shows N, NA (null values as a percentage of all rows), min, max, mean, and median for each of fish length, weight, and height, as well as for the total number of lice of any type on wild fish, and for the abundance on farms of chalimus, lep motile, lep AF, and caligus motile.

In [35]:
import pandas as pd
import numpy as np
from pathlib import Path

In [36]:
# Locations of the two input CSVs and of the folder the finished table is written to
DATA_DIR = Path('..', 'source_data')

wild_fish_filepath = DATA_DIR.joinpath('all_wild_fish_lice.csv')

farm_events_filepath = DATA_DIR.joinpath('industry_farm_abundance.csv')

chart_output_dirpath = Path('..', 'output', 'Table_5')

In [37]:
# read both source CSVs into dataframes
wild_fish_df, farm_events_df = (
    pd.read_csv(csv_path) for csv_path in (wild_fish_filepath, farm_events_filepath)
)

## Construct a dataframe

In [38]:
# One empty list per output column; the row helpers append to these lists and
# pandas later turns the dictionary into a DataFrame.
summary_dict = {
    column: []
    for column in ('Field', 'Dataset', 'N', 'NA', 'Min', 'Max', 'Mean', 'Median')
}

### Wild Fish

In [39]:
# A recorded 0 in length, weight or height means the measurement is null,
# so those zeros are swapped for NaN (in those three columns only).
zero_means_null = dict.fromkeys(['length', 'weight', 'height'], 0)
wild_fish_df.replace(to_replace=zero_means_null, value=np.nan, inplace=True)

In [40]:
def add_wild_row(label, field, num=None):
    """Append one row of summary statistics for a wild_fish_df column to summary_dict.

    Every value is computed before summary_dict is touched, so a failing call
    (for example a misspelled column name) leaves all of its lists the same
    length instead of half-adding a row.

    Args:
        label: text stored in the 'Field' column of the summary.
        field: name of the wild_fish_df column to summarise.
        num: optional override for N. When given, it is stored as N and NA is
            recorded as None. When omitted, N is the count of non-null values
            and NA is the percentage of rows whose value is null.

    Raises:
        KeyError: if `field` is not a column of wild_fish_df.
    """
    values = wild_fish_df[field]

    if num is None:
        # N is a count of all non-null values
        n = values.count()

        # NA is null values as % of total
        na_percent = int(values.isna().sum()) / len(wild_fish_df) * 100
    else:
        # special case, use the supplied num as N
        n = num

        # NA doesn't make sense here, so is None
        na_percent = None

    row = {
        'Field': label,
        'Dataset': wild_fish_filepath.stem,
        'N': n,
        'NA': na_percent,
        'Min': values.min(),
        'Max': values.max(),
        'Mean': values.mean(),
        'Median': values.median(),
    }

    # nothing above mutated summary_dict, so the row is added all-or-nothing
    for column, value in row.items():
        summary_dict[column].append(value)

In [41]:
# summary row for fish length
add_wild_row(label='Length (mm)', field='length')

In [42]:
# summary row for fish weight
add_wild_row(label='Weight (g)', field='weight')

In [43]:
# summary row for fish height
add_wild_row(label='Height (mm)', field='height')

In [44]:
# all lice

# Make a new column that is the number of all types of lice: the row-wise sum
# of every column from 'lep_cop' onwards. The total column itself is appended
# at the end of the frame, so it is excluded from the sum. Otherwise re-running
# this cell would add the previous total into the new one.
louse_columns = wild_fish_df.loc[:, 'lep_cop':].columns.drop('num_all_lice', errors='ignore')
wild_fish_df['num_all_lice'] = wild_fish_df[louse_columns].sum(axis=1)

# N is supplied explicitly as the total number of rows
add_wild_row('Any louse', 'num_all_lice', num=len(wild_fish_df))

### Farm events

In [45]:
def add_farm_row(label, field):
    """Append one row of summary statistics for a farm_events_df column to summary_dict.

    Every value is computed before summary_dict is touched, so a failing call
    (for example a misspelled column name) leaves all of its lists the same
    length instead of half-adding a row.

    Args:
        label: text stored in the 'Field' column of the summary.
        field: name of the farm_events_df column to summarise.

    Raises:
        KeyError: if `field` is not a column of farm_events_df.
    """
    values = farm_events_df[field]

    # N is number of sampling events, i.e. every row, null or not
    n_events = len(farm_events_df)

    # NA is null values as % of total
    na_percent = int(values.isna().sum()) / n_events * 100

    row = {
        'Field': label,
        'Dataset': farm_events_filepath.stem,
        'N': n_events,
        'NA': na_percent,
        'Min': values.min(),
        'Max': values.max(),
        'Mean': values.mean(),
        'Median': values.median(),
    }

    # nothing above mutated summary_dict, so the row is added all-or-nothing
    for column, value in row.items():
        summary_dict[column].append(value)

In [46]:
# summary row for chalimus abundance
add_farm_row(label='Chalimus_ab', field='chalimus_ab')

In [47]:
# summary row for lep motile abundance
add_farm_row(label='Lep_motile_ab', field='lep_motile_ab')

In [48]:
# summary row for lep AF abundance
add_farm_row(label='Lep_AF_ab', field='lep_af_ab')

In [49]:
# summary row for Caligus motile abundance
add_farm_row(label='Cal_motile_ab', field='cal_motile_ab')

In [50]:
# turn the collected column lists into a dataframe and preview the first ten rows
summary_df = pd.DataFrame(data=summary_dict)
summary_df.head(n=10)

Unnamed: 0,Field,Dataset,N,NA,Min,Max,Mean,Median
0,Length (mm),all_wild_fish_lice,351849,0.033526,0.24,553.0,51.947462,46.0
1,Weight (g),all_wild_fish_lice,217655,38.160396,0.04,1100.0,2.617904,0.9
2,Height (mm),all_wild_fish_lice,119297,66.105629,1.0,35.0,8.113069,7.62
3,Any louse,all_wild_fish_lice,351967,,0.0,384.0,0.903948,0.0
4,Chalimus_ab,industry_farm_abundance,10146,0.049281,0.0,46.35,0.961783,0.167
5,Lep_motile_ab,industry_farm_abundance,10146,0.049281,0.0,53.367,1.628119,0.567
6,Lep_AF_ab,industry_farm_abundance,10146,0.049281,0.0,27.831,0.767576,0.208
7,Cal_motile_ab,industry_farm_abundance,10146,0.049281,0.0,41.513,0.410746,0.056


In [51]:
# Create the output folder first: to_csv does not create missing directories,
# so a fresh checkout without ../output/Table_5 would otherwise fail here.
chart_output_dirpath.mkdir(parents=True, exist_ok=True)
summary_df.to_csv(chart_output_dirpath / 'Table_5_summary_statistics.csv', index=False)