## This notebook creates a CSV that maps the facility reference numbers used by the Department of Fisheries and Oceans Canada (DFO) to the facility IDs in the industry data

In [2]:
import pandas as pd
from pathlib import Path

### Path configuration variables

In [28]:
# --- inputs ---
# DFO data file, kept in the 'DFO' sub-directory
dfo_farm_data_filepath = Path('DFO').joinpath(
    'lice-count-dens-pou-2011-ongoing-rpt-pac-dfo-mpo-aquaculture-eng.csv'
)
# industry farm details, read from the working directory
industry_farm_details_filepath = Path('.').joinpath('industry_farm_details.csv')

# --- output ---
# CSV that maps the DFO 'Facility Reference Number' to the industry 'facility_id'
ref_to_id_map_filepath = Path('.').joinpath('DFO_facility_mapping.csv')

In [22]:
# Mapping from the DFO 'Facility Reference Number' (keys) to the industry 'facility_id' (values).
# Maintained by hand because that is more transparent and less error-prone than
# matching the two datasets algorithmically.
# The trailing comment on each entry is the site name. A value of None marks a DFO
# facility flagged "Not included".
# NOTE(review): presumably those facilities have no counterpart in the industry data - confirm.
ref_to_id_map = {
    78: 121,  # Phillips Arm
    100: 92,  # Lees Bay
    136: 35,  # Cliff Bay
    137: 39,  # Conville Bay
    141: 124,  # Port Elizabeth
    143: 91,  # Larsen Island
    144: 88,  # Koskimo Bay
    169: None,  # Barkley. Not included.
    211: 117,  # Okisollo (Sonora Island)
    221: 165,  # Vantage
    227: 12,  # Bawden
    234: 50,  # Dixon Bay
    303: None,  # Glacial Creek. Not included.
    304: 131,  # Raza Island
    306: 166,  # Venture Point
    314: 134,  # Ross Pass
    332: 135,  # Salten
    377: None,  # Bickley Bay. Not included.
    378: 160,  # Thurlow Point
    380: 148,  # Sonora Point
    388: 24,  # Brougham Point
    412: 60,  # Site 9 / Farm 9
    458: 45,  # Cypress Harbour
    465: 156,  # Swanson Island
    466: 5,  # Arrow Passage
    467: 105,  # Midsummer Island
    520: 14,  # Bedwell
    526: 129,  # Rant Point
    527: 139,  # Saranac Island
    540: 62,  # Fortune Channel
    543: 110,  # Mussel Rock
    553: 63,  # Frederick Arm
    728: 146,  # Sir Edmund Bay
    733: 46,  # Cyrus Rocks
    739: 164,  # Upper Retreat
    746: 59,  # Site 13 / Farm 13
    753: None,  # Cormorant. Not included.
    790: 29,  # Chancellor Channel
    819: 27,  # Cecil Island
    820: 173,  # Wicklow Point
    821: 69,  # Glacier Falls
    831: 144,  # Shelter Passage
    869: 103,  # Maude Island
    871: 11,  # Barnes Bay
    884: 96,  # Localsh Bay
    892: 15,  # Bell Island
    1059: 140,  # Sargeaunt Passage
    1078: 97,  # Lutes Creek
    1079: 152,  # Steamer
    1136: 141,  # Shaw Point
    1144: 25,  # Burdwood
    1145: 125,  # Potts Bay
    1148: 19,  # Binns Island
    1164: 61,  # Farside
    1198: 130,  # Raynor Group
    1237: 107,  # Monday Rocks
    1238: 100,  # Mahatta West
    1288: 52,  # Doyle Island
    1291: 98,  # MacIntyre
    1293: 53,  # Duncan Island
    1300: 3,  # Althorpe
    1335: 171,  # Wehlis Bay
    1336: 145,  # Simmonds Point
    1338: 99,  # Mahatta East
    1350: 143,  # Shelter Bay
    1351: 102,  # Marsh Bay
    1376: None,  # Cleagh Creek. Not included.
    1382: 133,  # Robertson Cr
    1401: 23,  # Brent Island
    1472: 172,  # Westside
    1507: 106,  # Millar
    1537: 9,  # Bare Bluff
    1580: 82,  # Jackson Passage
    1581: 73,  # Hardwicke Island
    1586: 51,  # Doctor Islets
    1618: 79,  # Humphrey Rocks
    1691: 85,  # Kid Bay
    1697: 44,  # Culloden
    1698: 1,  # Ahlstrom
    1700: 109,  # Muchalat South
    1702: 70,  # Goat Cove
    1705: 174,  # Williamson
    1738: 7,  # Atrevida
    1762: 72,  # Gore
    1789: 37,  # Concepcion
    1825: 113,  # Noo-la
    1839: 168,  # Wa-kwa
    1849: 108,  # Muchalat North
    1862: 76,  # Hecate
    1863: 58,  # Esperanza
    1895: 142,  # Sheep Passage
    1896: 94,  # Lime Point / Lime Bay
    6668: 123,  # Plover Point
    7053: 67,  # Ghi ya (Bull Harbour)
    7054: 169,  # Wanx talis (Heath Bay)
    7273: 161,  # Tsa-ya
    7713: 42,  # Cougar
    7714: 2,  # Alexander Inlet
}

In [23]:
# Read the raw DFO records.
dfo_df = pd.read_csv(dfo_farm_data_filepath)

# Collapse the records to one row per facility: group on the three identifying
# columns, then turn the group keys back into ordinary columns. The remaining
# columns hold per-group non-null counts.
dfo_ref_group = dfo_df.groupby(
    by=[
        'Facility Reference Number',
        'Site Common Name',
        'Aquaculture Management Unit',
    ]
)
dfo_ref_df = dfo_ref_group.count().reset_index()
dfo_ref_df.head()

Unnamed: 0,Facility Reference Number,Site Common Name,Aquaculture Management Unit,Year,Month,Licence Holder,Latitude,Longitude,Sample Type,Incident Date,Number of Pens Sampled,Average L. salmonis motiles per fish,Average L. salmonis females per fish,Average chalimus per fish,Average caligus per fish,Comments,Year Class
0,78,Phillips Arm,Discovery Islands,212,212,212,212,212,212,189,212,189,189,189,189,90,206
1,100,Lees Bay,Discovery Islands,173,173,173,173,173,173,153,173,153,153,153,153,95,168
2,136,Cliff Bay,Broughton Archipelago,15,15,15,15,15,15,5,15,5,5,5,5,10,13
3,137,Conville Bay,Discovery Islands,23,23,23,23,23,23,18,23,18,18,18,18,7,22
4,141,Port Elizabeth,Broughton Archipelago,227,227,227,227,227,227,206,227,206,206,206,206,91,219


In [24]:
# Translate each DFO 'Facility Reference Number' into the industry 'facility_id'
# using the hand-maintained ref_to_id_map (a None value becomes a missing value).
# The DFO report is updated on an ongoing basis, so first check that every
# reference number in the file has an entry in the map, and report all the
# missing ones together rather than failing on the first lookup.
dfo_ref_numbers = dfo_ref_df['Facility Reference Number']
unmapped_refs = sorted(set(dfo_ref_numbers.tolist()) - set(ref_to_id_map))
if unmapped_refs:
    raise KeyError(
        'No facility_id mapping for DFO Facility Reference Number(s): {}; '
        'add them to ref_to_id_map'.format(unmapped_refs)
    )
dfo_ref_df['facility_id'] = dfo_ref_numbers.map(ref_to_id_map)

In [26]:
# show the distinct DFO reference numbers present in the grouped frame
dfo_ref_df['Facility Reference Number'].unique()

array([  78,  100,  136,  137,  141,  143,  144,  169,  211,  221,  227,
        234,  303,  304,  306,  314,  332,  377,  378,  380,  388,  412,
        458,  465,  466,  467,  520,  526,  527,  540,  543,  553,  728,
        733,  739,  746,  753,  790,  819,  820,  821,  831,  869,  871,
        884,  892, 1059, 1078, 1079, 1136, 1144, 1145, 1148, 1164, 1198,
       1237, 1238, 1288, 1291, 1293, 1300, 1335, 1336, 1338, 1350, 1351,
       1376, 1382, 1401, 1472, 1507, 1537, 1580, 1581, 1586, 1618, 1691,
       1697, 1698, 1700, 1702, 1705, 1738, 1762, 1789, 1825, 1839, 1849,
       1862, 1863, 1895, 1896, 6668, 7053, 7054, 7273, 7713, 7714])

In [27]:
# list the columns of the grouped frame, including the 'facility_id' column assigned above
dfo_ref_df.columns

Index(['Facility Reference Number', 'Site Common Name',
       'Aquaculture Management Unit', 'Year', 'Month', 'Licence Holder',
       'Latitude', 'Longitude', 'Sample Type', 'Incident Date',
       'Number of Pens Sampled', 'Average L. salmonis motiles per fish',
       'Average L. salmonis females per fish', 'Average chalimus per fish',
       'Average caligus per fish', 'Comments', 'Year Class', 'facility_id'],
      dtype='object')

In [29]:
# Read the industry farm details; this frame supplies the 'facility_id' key
# used in the merge with the DFO facilities.
industry_farm_df = pd.read_csv(filepath_or_buffer=industry_farm_details_filepath)

In [32]:
# Outer-join the DFO facilities with the industry farms on 'facility_id' so that
# facilities present on only one side are kept, then keep just the identifying
# columns: DFO columns first, industry columns last.
# NOTE(review): 'separator' is not among the DFO frame's columns and appears empty
# in the output - presumably a deliberately blank divider column; confirm.
ref_to_id_df = (
    pd.merge(left=dfo_ref_df, right=industry_farm_df, how='outer', on='facility_id')
    .reindex(
        columns=[
            'Facility Reference Number',
            'Site Common Name',
            'Aquaculture Management Unit',
            'separator',
            'facility_id',
            'name',
            'region_name',
        ]
    )
)
ref_to_id_df.head()

Unnamed: 0,Facility Reference Number,Site Common Name,Aquaculture Management Unit,separator,facility_id,name,region_name
0,78.0,Phillips Arm,Discovery Islands,,121.0,Phillips Arm,Discovery Islands
1,100.0,Lees Bay,Discovery Islands,,92.0,Lees Bay,Discovery Islands
2,136.0,Cliff Bay,Broughton Archipelago,,35.0,Cliff Bay,Broughton Archipelago
3,137.0,Conville Bay,Discovery Islands,,39.0,Conville Bay,Discovery Islands
4,141.0,Port Elizabeth,Broughton Archipelago,,124.0,Port Elizabeth,Broughton Archipelago


In [33]:
# Save the mapping frame as a CSV (without the row index) for use in formatting.
ref_to_id_df.to_csv(path_or_buf=ref_to_id_map_filepath, index=False)