## This notebook formats the data published by the Department of Fisheries and Oceans Canada (DFO)
### Note that this formatted data is not merged into the rest of the industry counts and is not used in the figures or tables.

The data from DFO consists of lice counts on salmon farms. The counts are reported by the companies. The data used here was downloaded from [Industry sea lice counts at BC marine finfish aquaculture sites](https://open.canada.ca/data/en/dataset/3cafbe89-c98b-4b44-88f1-594e8d28838d) on the Government of Canada Open Government site on 17 March 2025.

We format the DFO count data to match that in *industry_farm_abundances.csv*. We replace the DFO field *Facility Reference Number* with *facility_id* to link to site descriptions in *industry_farm_details.csv*.

In [65]:
import pandas as pd
from pathlib import Path
import calendar

### Path configuration variables

In [66]:
# Input: sea lice counts as downloaded from DFO
dfo_farm_data_filepath = Path('DFO', 'lice-count-dens-pou-2011-ongoing-rpt-pac-dfo-mpo-aquaculture-eng.csv')
# Input: lookup table from DFO 'Facility Reference Number' to 'facility_id'
ref_to_id_map_filepath = Path('DFO_facility_mapping.csv')
# Input: industry farm information
farm_info_path = Path('industry_farm_details.csv')

# Output: the formatted DFO data
dfo_formatted_farm_filepath = Path('DFO_farm_abundance.csv')

### Load the source data

In [67]:
# Columns in the DFO download that are not carried forward
drop_columns = ['Licence Holder', 'Site Common Name', 'Latitude', 'Longitude', 'Aquaculture Management Unit']

# Read the DFO counts and discard the unneeded columns in one step
dfo_farm_data_df = pd.read_csv(dfo_farm_data_filepath).drop(columns=drop_columns)

dfo_farm_data_df.head()

Unnamed: 0,Year,Month,Facility Reference Number,Sample Type,Incident Date,Number of Pens Sampled,Average L. salmonis motiles per fish,Average L. salmonis females per fish,Average chalimus per fish,Average caligus per fish,Comments,Year Class
0,2011,January,466,Routine monitoring,2011-01-27,3,0.25,0.0,0.2167,0.05,,1
1,2011,January,1144,Routine monitoring,2011-01-01,3,1.3833,0.25,1.1,0.5,,2
2,2011,January,1144,Pre-treatment,2011-01-15,3,2.7833,1.2167,1.8333,0.3667,In-feed treatment,2
3,2011,January,458,Routine monitoring,2011-01-05,2,0.35,0.175,0.1,0.0,Sampling methodology does not meet requirement...,Brood
4,2011,January,1586,Follow-up,2011-01-20,3,0.0667,0.0,0.25,0.0167,,1


In [68]:
# Express the sampling date as numeric year, month and (where a date was given) day
dfo_farm_data_df['year'] = dfo_farm_data_df['Year']

# calendar.month_name is ordered by month number (position 0 is an empty string),
# so pairing each name with its position gives a name -> number lookup.
month_to_num_dict = dict(zip(calendar.month_name, range(len(calendar.month_name))))
# __getitem__ raises KeyError for a month name that is not in the lookup
dfo_farm_data_df['month'] = dfo_farm_data_df['Month'].apply(month_to_num_dict.__getitem__)

# Unparseable or missing incident dates become NaT, and their day is left empty
dfo_farm_data_df['date'] = pd.to_datetime(dfo_farm_data_df['Incident Date'], errors='coerce')
dfo_farm_data_df['day'] = dfo_farm_data_df['date'].apply(
    lambda timestamp: timestamp.day if not pd.isnull(timestamp) else None
)

dfo_farm_data_df.head()

Unnamed: 0,Year,Month,Facility Reference Number,Sample Type,Incident Date,Number of Pens Sampled,Average L. salmonis motiles per fish,Average L. salmonis females per fish,Average chalimus per fish,Average caligus per fish,Comments,Year Class,year,month,date,day
0,2011,January,466,Routine monitoring,2011-01-27,3,0.25,0.0,0.2167,0.05,,1,2011,1,2011-01-27,27.0
1,2011,January,1144,Routine monitoring,2011-01-01,3,1.3833,0.25,1.1,0.5,,2,2011,1,2011-01-01,1.0
2,2011,January,1144,Pre-treatment,2011-01-15,3,2.7833,1.2167,1.8333,0.3667,In-feed treatment,2,2011,1,2011-01-15,15.0
3,2011,January,458,Routine monitoring,2011-01-05,2,0.35,0.175,0.1,0.0,Sampling methodology does not meet requirement...,Brood,2011,1,2011-01-05,5.0
4,2011,January,1586,Follow-up,2011-01-20,3,0.0667,0.0,0.25,0.0167,,1,2011,1,2011-01-20,20.0


### Combine the facility mapping with the DFO data to obtain facility_id

In [69]:
# Lookup table keyed on the DFO 'Facility Reference Number'
facility_map_df = pd.read_csv(ref_to_id_map_filepath)

# Left join: every DFO row is kept; rows without a mapping get null mapping columns
dfo_formatted_df = dfo_farm_data_df.merge(
    facility_map_df,
    how='left',
    on='Facility Reference Number',
)

dfo_formatted_df.head()

Unnamed: 0,Year,Month,Facility Reference Number,Sample Type,Incident Date,Number of Pens Sampled,Average L. salmonis motiles per fish,Average L. salmonis females per fish,Average chalimus per fish,Average caligus per fish,...,year,month,date,day,Site Common Name,Aquaculture Management Unit,separator,facility_id,name,region_name
0,2011,January,466,Routine monitoring,2011-01-27,3,0.25,0.0,0.2167,0.05,...,2011,1,2011-01-27,27.0,Arrow Pass,Broughton Archipelago,,5.0,Arrow Passage,Broughton Archipelago
1,2011,January,1144,Routine monitoring,2011-01-01,3,1.3833,0.25,1.1,0.5,...,2011,1,2011-01-01,1.0,Burdwood,Broughton Archipelago,,25.0,Burdwood,Broughton Archipelago
2,2011,January,1144,Pre-treatment,2011-01-15,3,2.7833,1.2167,1.8333,0.3667,...,2011,1,2011-01-15,15.0,Burdwood,Broughton Archipelago,,25.0,Burdwood,Broughton Archipelago
3,2011,January,458,Routine monitoring,2011-01-05,2,0.35,0.175,0.1,0.0,...,2011,1,2011-01-05,5.0,Cypress Harbour,Broughton Archipelago,,45.0,Cypress Harbour,Broughton Archipelago
4,2011,January,1586,Follow-up,2011-01-20,3,0.0667,0.0,0.25,0.0167,...,2011,1,2011-01-20,20.0,Doctor Islets,Broughton Archipelago,,51.0,Doctor Islets,Broughton Archipelago


### Convert the format to match industry data

In [70]:
# DFO column heading -> column name used in industry_farm_abundance
industry_column_names = {
    # lice abundances
    'Average L. salmonis motiles per fish': 'lep_motile_ab',
    'Average L. salmonis females per fish': 'lep_af_ab',
    'Average chalimus per fish': 'chalimus_ab',
    'Average caligus per fish': 'cal_motile_ab',
    # sampling metadata
    'Number of Pens Sampled': 'num_pens_sampled',
    'Sample Type': 'sample_type',
    'Comments': 'comments',
}
dfo_formatted_df = dfo_formatted_df.rename(columns=industry_column_names)

dfo_formatted_df.head()

Unnamed: 0,Year,Month,Facility Reference Number,sample_type,Incident Date,num_pens_sampled,lep_motile_ab,lep_af_ab,chalimus_ab,cal_motile_ab,...,year,month,date,day,Site Common Name,Aquaculture Management Unit,separator,facility_id,name,region_name
0,2011,January,466,Routine monitoring,2011-01-27,3,0.25,0.0,0.2167,0.05,...,2011,1,2011-01-27,27.0,Arrow Pass,Broughton Archipelago,,5.0,Arrow Passage,Broughton Archipelago
1,2011,January,1144,Routine monitoring,2011-01-01,3,1.3833,0.25,1.1,0.5,...,2011,1,2011-01-01,1.0,Burdwood,Broughton Archipelago,,25.0,Burdwood,Broughton Archipelago
2,2011,January,1144,Pre-treatment,2011-01-15,3,2.7833,1.2167,1.8333,0.3667,...,2011,1,2011-01-15,15.0,Burdwood,Broughton Archipelago,,25.0,Burdwood,Broughton Archipelago
3,2011,January,458,Routine monitoring,2011-01-05,2,0.35,0.175,0.1,0.0,...,2011,1,2011-01-05,5.0,Cypress Harbour,Broughton Archipelago,,45.0,Cypress Harbour,Broughton Archipelago
4,2011,January,1586,Follow-up,2011-01-20,3,0.0667,0.0,0.25,0.0167,...,2011,1,2011-01-20,20.0,Doctor Islets,Broughton Archipelago,,51.0,Doctor Islets,Broughton Archipelago


In [71]:
# Turn the literal string "n/a" into a real missing value.
# An explicit NaN is used instead of None: pandas releases before 1.4 ignored an
# explicit value=None and filled the matching cells from the previous row
# (method='pad'), which would silently copy another sample's value into the cell.
dfo_formatted_df = dfo_formatted_df.replace('n/a', float('nan'))

In [72]:
# Keep only the columns of the industry layout (in that order), plus a few extra
# fields that are useful or interesting. reindex creates any requested column that
# does not exist and fills it with nulls; 'fish_selected' has no counterpart among
# the DFO columns shown in the previews above, so it ends up empty.
output_columns = [
    'facility_id', 'year', 'month', 'day', 'fish_selected', 'num_pens_sampled',
    'chalimus_ab', 'lep_motile_ab', 'lep_af_ab', 'cal_motile_ab',
    'sample_type', 'comments',
]
dfo_formatted_df = dfo_formatted_df.reindex(columns=output_columns)

# row count before rows without a facility_id are removed
len(dfo_formatted_df)

18808

In [73]:
# Discard rows whose facility reference number had no facility_id in the mapping,
# then store the remaining ids as integers (the left merge left them as floats).
dfo_formatted_df = (
    dfo_formatted_df
    .dropna(axis='rows', subset=['facility_id'])
    .assign(facility_id=lambda frame: frame['facility_id'].astype('int64'))
)

# row count after removing unmapped facilities
len(dfo_formatted_df)

18625

### Write formatted data to file

In [74]:
# Save the formatted table as CSV without the DataFrame's row index
dfo_formatted_df.to_csv(path_or_buf=dfo_formatted_farm_filepath, index=False)