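This notebook contains automatic data filtering routines for the legacy M2 tagged track data, plus heuristics that convert the old activity tags into the new activity codes.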

In [1]:
import numpy as np
import pandas as pd
from utils.data_ops import *
from utils.constants import *
from utils.time_conversion import *

In [2]:
# Read the original M2 tagged tracks through read_and_init_track_df and preview the first five rows
in_path = "../data/tracks_tagged_modified.csv"
track_data = read_and_init_track_df(in_path)
track_data.head()

Loaded a legacy M2 tagged dataset.


Unnamed: 0,id_track,id_site,id_m2,source,duration,alarm,min_speed,max_speed,avg_speed,curviness,...,repairs,distress,other,miss_ais,violation,le_contact,tdate,ttime,type_m2_agg,activity
0,29980760,10,18-0704-1000,radar,637,0,7.1,14.4,10.636634,1.023164,...,0,0,0,0,0,0,2023-07-04,10:06:47,,
1,30278317,10,158-0714-0950,radar,5463,1,0.1,6.7,3.016,2.591425,...,0,0,0,0,0,0,2023-07-14,10:45:04,sail,
2,30281714,10,196-0714-1032,radar,572,0,0.1,2.1,0.694318,1.300154,...,0,0,0,0,0,0,2023-07-14,10:43:06,,
3,30282211,10,1-0714-1040,radar,581,0,4.0,26.5,12.441758,1.068525,...,0,0,0,0,0,0,2023-07-14,10:42:50,,
4,30282731,10,3-0714-1047,radar,155,0,3.9,5.7,5.2375,1.003332,...,0,0,0,0,0,0,2023-07-14,10:59:59,,


In [3]:
# Keep only the tracks marked as valid (valid > 0), then report how many remain
track_data = track_data.loc[track_data["valid"].gt(0)]
track_data.shape[0]

5588

In [4]:
# Drop tracks with fewer than 50 detection points, then report how many remain
track_data = track_data.loc[track_data["detections"].ge(50)]
track_data.shape[0]

4366

Print out the available columns and the new activity codes (`ACT_CODE_NEW`) for reference.

In [26]:
# List the columns of track_data (the old tag columns used by the heuristics below are among them)
track_data.columns

Index(['id_track', 'id_site', 'id_m2', 'source', 'duration', 'alarm',
       'min_speed', 'max_speed', 'avg_speed', 'curviness', 'heading_mean',
       'heading_std', 'turning_mean', 'turning_std', 'duration_z', 'distance',
       'distance_o', 'assoc_str', 'assoc_id', 'tagged', 'has_photos',
       'confidence', 'detections', 'sdate', 'stime', 'ldate', 'ltime',
       'user_id', 'valid', 'type', 'notes', 'transit', 'overnight', 'loiter',
       'cleanup', 'fishing_c', 'fishing_r', 'research', 'diving', 'repairs',
       'distress', 'other', 'miss_ais', 'violation', 'le_contact', 'tdate',
       'ttime', 'type_m2_agg', 'activity'],
      dtype='object')

In [25]:
# Show the new activity codes. ACT_CODE_NEW is not defined in this notebook;
# presumably it comes from one of the star imports at the top — confirm.
ACT_CODE_NEW

['transit',
 'drifting',
 'fishing',
 'stopped',
 'work',
 'other',
 'reserved1',
 'reserved2',
 'reserved3',
 '']

In [15]:
def _flag_is_set(value):
    """Return True only for a present, truthy tag value (NaN/None count as unset)."""
    # NaN is truthy in Python, so a missing flag has to be ruled out explicitly.
    return bool(pd.notna(value) and value)


def _note_text(raw_note):
    """Return the note as lower-case text, or "" when the note is missing."""
    # str() keeps non-string notes (e.g. a purely numeric note) from raising on .lower().
    return "" if pd.isna(raw_note) else str(raw_note).lower()


def assign_activity(row, slow_speed=5.0, stopped_min_duration=600, stopped_max_speed=0.54):
    """
    User defined function to convert old activity tags to new activity tags.

    Meant to be used row-wise, e.g. ``df.apply(assign_activity, axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one track)
        Must provide "transit", "fishing_c", "notes", "duration" and "avg_speed".
    slow_speed : float, default 5.0
        A track tagged as transit whose avg_speed is below this value is
        re-tagged as "drifting" instead of "transit".
    stopped_min_duration : float, default 600
        Minimum duration for the stopped / anchored rule (600 = 10 minutes).
    stopped_max_speed : float, default 0.54
        avg_speed must be below this value for the stopped / anchored rule
        (1 km/h = 0.54 kts).

    Returns
    -------
    str
        "drifting", "transit", "fishing", "reserved1" (hook and line fishing),
        "stopped", or "" when no rule matched and the track needs further
        investigation. Rules are evaluated in that order of precedence.
    """
    # If originally tagged as transit, assign slow speed / transit activity tags
    if _flag_is_set(row["transit"]):
        return "drifting" if row["avg_speed"] < slow_speed else "transit"

    if _flag_is_set(row["fishing_c"]):
        return "fishing"

    # Use heuristics on the free-text note to determine hook and line
    note = _note_text(row["notes"])
    if "hook" in note or "pole" in note:
        return "reserved1"  # Use reserved1 tag for hook and line fishing

    if "kayak" in note:
        return "drifting"

    # Stopped / Anchored criterion:
    # Vessel is tracked for at least 10 minutes
    # Vessel average speed is less than 1 km/h (0.54 kts)
    if row["duration"] >= stopped_min_duration and row["avg_speed"] < stopped_max_speed:
        return "stopped"

    # Other requires further investigation.
    return ""

# Re-tag every track, one row at a time, with the new activity codes
track_data["activity"] = track_data.apply(assign_activity, axis="columns")
    

In [16]:
# Count how many tracks received each new activity tag ("" = no rule matched, needs further investigation)
track_data["activity"].value_counts()

activity
             1802
stopped      1141
transit       865
fishing       279
drifting      178
reserved1     101
Name: count, dtype: int64

In [17]:
# Write the re-tagged tracks to disk as version 1 (row index omitted)
out_path = "../data/tracks_tagged_v1.csv"
track_data.to_csv(path_or_buf=out_path, index=False)

In [18]:
# Load the second version of data for more heuristics.
# NOTE(review): this notebook only writes tracks_tagged_v1.csv before this point;
# tracks_tagged_v2.csv is presumably produced outside the notebook — confirm.
in_path = "../data/tracks_tagged_v2.csv"
track_data = read_and_init_track_df(in_path)

Loaded a legacy M2 tagged dataset.


In [19]:
def fishing_heuristics(row):
    """
    Further break down fishing activity into seine and trawl using the notes.

    Meant to be used row-wise, e.g. ``df.apply(fishing_heuristics, axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series for one track)
        Must provide "activity" and "notes".

    Returns
    -------
    The new activity tag: "reserved2" when a "fishing" row's note mentions
    "seine", "reserved3" when it mentions "trawl" (seine wins if both appear),
    otherwise the row's existing "activity" value unchanged.

    NOTE(review): the original description also listed hook and line, but no
    such branch exists here; in this notebook the hook-and-line tag
    ("reserved1") is only assigned by assign_activity. Confirm whether
    "fishing" rows with hook/pole notes should be split here as well.
    """
    activity = row["activity"]
    if activity != "fishing":
        # Only rows already tagged as fishing are refined.
        return activity

    raw_note = row["notes"]
    # Missing notes become ""; str() keeps non-string notes from raising on .lower().
    note = "" if pd.isna(raw_note) else str(raw_note).lower()

    if "seine" in note:
        return "reserved2"
    if "trawl" in note:
        return "reserved3"
    return activity

# Refine the fishing tags, one row at a time, using the notes-based heuristics
track_data["activity"] = track_data.apply(fishing_heuristics, axis="columns")

In [20]:
# Tally the activity tags again after applying fishing_heuristics
track_data["activity"].value_counts()

activity
             1724
stopped      1144
transit       867
fishing       240
drifting      193
reserved1     114
work           33
reserved3      28
reserved2      23
Name: count, dtype: int64

In [21]:
# Write the table with the refined fishing tags to disk as version 3 (row index omitted)
out_path = "../data/tracks_tagged_v3.csv"
track_data.to_csv(path_or_buf=out_path, index=False)

In [24]:
# Load the fourth version of the data and tally its activity tags.
# NOTE(review): no cell in this notebook writes tracks_tagged_v4.csv (the previous
# export is v3) — presumably it was edited outside the notebook; confirm.
in_path = "../data/tracks_tagged_v4.csv"
track_data = read_and_init_track_df(in_path)
track_data["activity"].value_counts()

Loaded a legacy M2 tagged dataset.


activity
             1724
stopped      1141
transit       867
fishing       215
drifting      193
reserved1     126
work           33
reserved2      32
reserved3      32
other           3
Name: count, dtype: int64