In [37]:
# Load libraries
import pandas as pd
import numpy as np
import geopy.distance
import os
import math
from datetime import timedelta, datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [76]:
# Read the biotic and accompanying acoustic files
filename = "Biotic_06SL794.csv"
acoustic_1 = pd.read_csv("Data\\Acoustic\\Acoustic" + filename[6:99], header=0)
biotic_1 = pd.read_csv("Data\\Biotic\\" + filename, header=0)

# Filter out the required columns
a_columns = ['LogTime', 'LogLatitude', 'LogLongitude']
b_columns = ['HaulStartTime', 'HaulDuration', 'HaulValidity', 'HaulStartLatitude', 'HaulStartLongitude', 'HaulStopLatitude', 'HaulStopLongitude', 'HaulDistance', 'HaulTowDirection']
acoustic_1_1 = acoustic_1[a_columns]
biotic_1_1 = biotic_1[b_columns]

# Set the datetime columns to correct datatype and drop duplicates
acoustic_1_1['LogTime'] = pd.to_datetime(acoustic_1_1['LogTime'])
biotic_1_1['HaulStartTime'] = pd.to_datetime(biotic_1_1['HaulStartTime'])
acoustic_1_1 = acoustic_1_1.drop_duplicates()

# Prepare the biotic file to have the same structure as the acoustic file
bio_to_aco = biotic_1_1[['HaulStartTime', 'HaulStartLatitude', 'HaulStartLongitude']]
bio_to_aco = bio_to_aco.rename(columns={'HaulStartTime': 'LogTime', 'HaulStartLatitude': 'LogLatitude', 'HaulStartLongitude': 'LogLongitude'})

# Concat the biotic and acoustic file to create a df containing all registered timestamps
acoustic_final = pd.concat([acoustic_1_1, bio_to_aco], ignore_index=True)
acoustic_final = acoustic_final.sort_values('LogTime', ignore_index=True)
acoustic_final['fileIDN'] = 1

# Index both final sets over their respective timestamp columns
acoustic_final2 = acoustic_final.set_index('LogTime')
biotic_final = biotic_1_1.set_index('HaulStartTime')

# Perform final concat adding all available columns to the available timestamps and create new column containing binary classifier for fishing or not.
full = pd.concat([acoustic_final2, biotic_final], axis=1)
full['fishing'] = 0
full.loc[full['HaulValidity'] == 'V', 'fishing'] = 1
full = full.drop('HaulValidity', axis=1)

# Extend the fishing period (fishing = 1) over all rows that are within the start-end haul timeframe
# -----------------------------------------------------------------------------------------------------------------------------
time_str = '1/1/1900 12:00:00.0000'
date_format_str = '%d/%m/%Y %H:%M:%S.%f'

track = 1
start_haul = datetime.strptime(time_str, date_format_str)
end_haul = datetime.strptime(time_str, date_format_str)
index = full.index
while track < full.shape[0]:
        if full['HaulDuration'][track] > 0:
                start_haul = index[track]
                end_haul = index[track] + timedelta(minutes=full['HaulDuration'][track])

        if index[track] >= start_haul and index[track] <= end_haul:
                full['fishing'][track] = 1
        track += 1
# -----------------------------------------------------------------------------------------------------------------------------


# Calculate speed
# -----------------------------------------------------------------------------------------------------------------------------
full = full[['LogLatitude', 'LogLongitude', 'fishing', 'fileIDN', ]]
distances = [0]
durations = [0]
index = full.index
i=0
while i < full.shape[0] - 1:
        distances.append(geopy.distance.distance((full['LogLatitude'][i], full['LogLongitude'][i]), (full['LogLatitude'][i+1], full['LogLongitude'][i+1])).km)
        durations.append((index[i+1] - index[i]).total_seconds())
        i += 1

km_s = [0]
j=1
while j < len(distances):
        if durations[j] != 0:
                km_s.append((distances[j]/durations[j]) * 60 * 60)
        else:
                km_s.append(0)
        j += 1

full['speed'] = km_s

# speed_lag_1 = []
# speed_lead_1 = []
# speed_both_1 = []
# speed_lag_2 = []
# speed_lead_2 = []
# speed_both_2 = []

# full['lag_1'] = full['speed'].shift(1)
# full['lag_2'] = full['speed'].shift(2)
# full['lead_1'] = full['speed'].shift(-1)
# full['lead_2'] = full['speed'].shift(-2)

# m = 0
# while m < full.shape[0]:
#         speed_current = full['speed'][m]
#         lag_1 = full['lag_1'][m]
#         lag_2 = full['lag_2'][m]
#         lead_1 = full['lead_1'][m]
#         lead_2 = full['lead_2'][m]

#         average_speed_lg1 = (speed_current + lag_1) / 2
#         average_speed_lg2 = (speed_current + lag_1 + lag_2) / 3
#         if np.isnan(lead_1):
#                 average_speed_ld1 = speed_current
#         else:
#                 average_speed_ld1 = (speed_current + lead_1) / 2
#         average_speed_ld2 = (speed_current + lead_1 + lead_2) / 3
#         average_speed_b1 = (speed_current + lead_1 + lag_1) / 3
#         average_speed_b2 = (speed_current + lead_1 + lag_1 + lead_2 + lag_2) / 5

#         speed_lag_1.append(average_speed_lg1)
#         speed_lag_2.append(average_speed_lg2)
#         speed_lead_1.append(average_speed_ld1)
#         speed_lead_2.append(average_speed_ld2)
#         speed_both_1.append(average_speed_b1)
#         speed_both_2.append(average_speed_b2)
        
#         m += 1

# # print(speed_lag_1)

# full['speed_lag_1'] = speed_lag_1
# full['speed_lag_2'] = speed_lag_2
# full['speed_lead_1'] = speed_lead_1
# full['speed_lead_2'] = speed_lead_2
# full['speed_both_1'] = speed_both_1
# full['speed_both_2'] = speed_both_2


# Calculate direction based on previous and next
# -----------------------------------------------------------------------------------------------------------------------------
# Create lag and lead of lat/long
full['lat_f1'] = full['LogLatitude'].shift(-1) # latitude with lead 1
full['lon_f1'] = full['LogLongitude'].shift(-1) # Longtitude with lead 1
full['lat_b1'] = full['LogLatitude'].shift(1) # latitude with lag 1
full['lon_b1'] = full['LogLongitude'].shift(1) # Longtitude with lag 1

direction = [0]
geo_track = 1
while geo_track < full.shape[0]:
        delta_lon = full['LogLongitude'][geo_track] - full['lon_b1'][geo_track]
        y2 = math.sin(delta_lon) * math.cos(full['LogLatitude'][geo_track])
        x2 = math.cos(full['lat_b1'][geo_track]) * math.sin(full['LogLatitude'][geo_track]) - math.sin(full['lat_b1'][geo_track]) * math.cos(full['LogLatitude'][geo_track]) * math.cos(delta_lon)

        brng2 = math.atan2(y2, x2)
        brng2 = math.degrees(brng2)

        direction.append(brng2)

        geo_track += 1

full['bearing'] = direction

direction_change = [0]
geo_track2 = 1
while geo_track2 < full.shape[0]:
        change = full['bearing'][geo_track2 - 1] - full['bearing'][geo_track2]
        direction_change.append(change)

        geo_track2 += 1

full['bearing change'] = direction_change
# -----------------------------------------------------------------------------------------------------------------------------

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acoustic_1_1['LogTime'] = pd.to_datetime(acoustic_1_1['LogTime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  biotic_1_1['HaulStartTime'] = pd.to_datetime(biotic_1_1['HaulStartTime'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full['fishing'][track] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,c

In [77]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(full)

Unnamed: 0,LogLatitude,LogLongitude,fishing,fileIDN,speed,lat_f1,lon_f1,lat_b1,lon_b1,bearing,bearing change
2021-07-01 04:23:00,55.895245,6.03053,0,1,0.0,55.89524,6.060218,,,0.0,0.0
2021-07-01 04:29:00,55.89524,6.060218,0,1,18.573427,55.895256,6.089987,55.895245,6.03053,90.529208,-90.529208
2021-07-01 04:35:00,55.895256,6.089987,0,1,18.623732,55.89514,6.119775,55.89524,6.060218,90.479671,0.049537
2021-07-01 04:41:00,55.89514,6.119775,0,1,18.636269,55.89518,6.149489,55.895256,6.089987,90.799815,-0.320144
2021-07-01 04:47:00,55.89518,6.149489,0,1,18.589848,55.895252,6.179245,55.89514,6.119775,90.420428,0.379386
2021-07-01 04:52:00,55.895252,6.179245,0,1,22.338484,55.895176,6.208973,55.89518,6.149489,90.343636,0.076792
2021-07-01 04:58:00,55.895176,6.208973,0,1,18.599033,55.895203,6.238823,55.895252,6.179245,90.702245,-0.358608
2021-07-01 05:04:00,55.895203,6.238823,0,1,18.674075,55.895252,6.26852,55.895176,6.208973,90.454622,0.247623
2021-07-01 05:10:00,55.895252,6.26852,0,1,18.57952,55.89524,6.29819,55.895203,6.238823,90.398179,0.056442
2021-07-01 05:16:00,55.89524,6.29819,0,1,18.561481,55.89518,6.327952,55.895252,6.26852,90.545904,-0.147725
