This file is used to perform all necessary data engineering steps to create the Single-Point dataset

In [1]:
# Load libraries
import pandas as pd
import numpy as np
import geopy.distance
from datetime import timedelta, datetime
import math
import os

In [2]:
# Import worldports dataset
world_ports = pd.read_csv("Data\\wld_trs_ports_wfp.csv", header=0)
world_ports_lim = world_ports[['portname', 'latitude', 'longitude']]

coords_test = tuple(zip(world_ports_lim['latitude'], world_ports_lim['longitude']))

world_ports_lim['coordinates'] = coords_test

array_world_ports_lim = np.array(world_ports_lim['coordinates'].tolist())

#  Define function to find closest port by calculating the distance between all entries in data1 and data2
def broadcasting_based_lng_lat(data1, data2):
    # data1, data2 are the data arrays with 2 cols and they hold
    # lat., lng. values in those cols respectively
    data1 = np.deg2rad(data1)                     
    data2 = np.deg2rad(data2)                     

    lat1 = data1[:,0]                     
    lng1 = data1[:,1]         

    lat2 = data2[:,0]                     
    lng2 = data2[:,1]         

    diff_lat = lat1[:,None] - lat2
    diff_lng = lng1[:,None] - lng2
    d = np.sin(diff_lat/2)**2 + np.cos(lat1[:,None])*np.cos(lat2) * np.sin(diff_lng/2)**2
    return 2 * 6371 * np.arcsin(np.sqrt(d))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  world_ports_lim['coordinates'] = coords_test


In [3]:
# Perform the data engineering
k = 0
list_of_df = []
list_of_acoustic = []
list_of_biotic = []

survey_list = ['Data\\HERAS', 'Data\\IBWSS', 'Data\\IESSNS']

# Loop over every file present in folder "Data\*survey*\Biotic"
for survey in survey_list:
       for filename in os.listdir(survey + "\\Biotic"):
              k += 1
              
              # Read the biotic and accompanying acoustic files
              acoustic_1 = pd.read_csv(survey + "\\Acoustic\\Acoustic" + filename[6:99], header=0)
              biotic_1 = pd.read_csv(survey + "\\Biotic\\" + filename, header=0)

              acoustic_1['Survey'] = survey[5:15]
              acoustic_1['fileIDN'] = k

              biotic_1['Survey'] = survey[5:15]
              biotic_1['fileIDN'] = k

              list_of_acoustic.append(acoustic_1)
              list_of_biotic.append(biotic_1)
              
              # Remove invalid acoustic logs
              acoustic_1 = acoustic_1.loc[acoustic_1['LogValidity'] == 'V']
              
              # Filter out the required columns
              a_columns = ['LogTime', 'LogLatitude', 'LogLongitude']
              b_columns = ['HaulStartTime', 'HaulDuration', 'HaulValidity', 'HaulStartLatitude', 'HaulStartLongitude', 'HaulStopLatitude', 'HaulStopLongitude', 'HaulDistance', 'HaulTowDirection']
              acoustic_1_1 = acoustic_1[a_columns]
              biotic_1_1 = biotic_1[b_columns]

              # Set the datetime columns to correct datatype and drop duplicates
              acoustic_1_1['LogTime'] = pd.to_datetime(acoustic_1_1['LogTime'])
              biotic_1_1['HaulStartTime'] = pd.to_datetime(biotic_1_1['HaulStartTime'])
              acoustic_1_1 = acoustic_1_1.drop_duplicates()
              biotic_1_1 = biotic_1_1.drop_duplicates(subset=['HaulStartTime'])

              # Prepare the biotic file to have the same structure as the acoustic file
              bio_to_aco = biotic_1_1[['HaulStartTime', 'HaulStartLatitude', 'HaulStartLongitude']]
              bio_to_aco = bio_to_aco.rename(columns={'HaulStartTime': 'LogTime', 'HaulStartLatitude': 'LogLatitude', 'HaulStartLongitude': 'LogLongitude'})

              # Concat the biotic and acoustic file to create a df containing all registered timestamps
              acoustic_final = pd.concat([acoustic_1_1, bio_to_aco], ignore_index=True)
              acoustic_final = acoustic_final.sort_values('LogTime', ignore_index=True)
              acoustic_final['fileIDN'] = k

              # Index both final sets over their respective timestamp columns
              acoustic_final2 = acoustic_final.set_index('LogTime')
              biotic_final = biotic_1_1.set_index('HaulStartTime')

              # Perform final concat adding all available columns to the available timestamps and create new column containing binary classifier for fishing or not.
              full = pd.concat([acoustic_final2, biotic_final], axis=1)
              full['fishing'] = 0
              full.loc[full['HaulValidity'] == 'V', 'fishing'] = 1
              full = full.drop('HaulValidity', axis=1)
       
              
              # Extend the fishing period (fishing = 1) over all rows that are within the start-end haul timeframe
              # -----------------------------------------------------------------------------------------------------------------------------
              time_str = '1/1/1900 12:00:00.0000'
              date_format_str = '%d/%m/%Y %H:%M:%S.%f'

              track = 1
              start_haul = datetime.strptime(time_str, date_format_str)
              end_haul = datetime.strptime(time_str, date_format_str)
              index = full.index
              while track < full.shape[0]:
                     if full['HaulDuration'][track] > 0:
                            start_haul = index[track]
                            end_haul = index[track] + timedelta(minutes=full['HaulDuration'][track])
              
                     if index[track] >= start_haul and index[track] <= end_haul:
                            full['fishing'][track] = 1
                     track += 1
              # -----------------------------------------------------------------------------------------------------------------------------


              # Calculate speed
              # -----------------------------------------------------------------------------------------------------------------------------
              full = full[['LogLatitude', 'LogLongitude', 'fishing', 'fileIDN']]
              distances = [0]
              durations = [0]
              index = full.index
              i=0
              while i < full.shape[0] - 1:
                     distances.append(geopy.distance.distance((full['LogLatitude'][i], full['LogLongitude'][i]), (full['LogLatitude'][i+1], full['LogLongitude'][i+1])).km)
                     durations.append((index[i+1] - index[i]).total_seconds())
                     i += 1

              km_s = [0]
              j=1
              while j < len(distances):
                     if durations[j] != 0:
                            km_s.append((distances[j]/durations[j]) * 60 * 60)
                     else:
                            km_s.append(0)
                     j += 1

              full['speed'] = km_s
              # -----------------------------------------------------------------------------------------------------------------------------


              # Calculate speeds based on lag/lead
              # -----------------------------------------------------------------------------------------------------------------------------
              speed_lag_1 = []
              speed_lead_1 = []
              speed_both_1 = []
              speed_lag_2 = []
              speed_lead_2 = []
              speed_both_2 = []

              full['lag_1'] = full['speed'].shift(1)
              full['lag_2'] = full['speed'].shift(2)
              full['lead_1'] = full['speed'].shift(-1)
              full['lead_2'] = full['speed'].shift(-2)

              m = 0
              while m < full.shape[0]:
                     speed_current = full['speed'][m]
                     lag_1 = full['lag_1'][m]
                     lag_2 = full['lag_2'][m]
                     lead_1 = full['lead_1'][m]
                     lead_2 = full['lead_2'][m]

                     if math.isnan(lag_1):
                            average_speed_lg1 = speed_current
                     else:
                            average_speed_lg1 = (speed_current + lag_1) / 2
                     
                     if math.isnan(lag_2) and math.isnan(lag_1):
                            average_speed_lg2 = speed_current
                     elif math.isnan(lag_2) and not math.isnan(lag_1):
                            average_speed_lg2 = (speed_current + lag_1) / 2
                     else:
                            average_speed_lg2 = (speed_current + lag_1 + lag_2) / 3
                     
                     if math.isnan(lead_1):
                            average_speed_ld1 = speed_current
                     else:
                            average_speed_ld1 = (speed_current + lead_1) / 2
                     
                     if math.isnan(lead_2) and math.isnan(lead_1):
                            average_speed_ld2 = speed_current
                     elif math.isnan(lead_2) and not math.isnan(lead_1):
                            average_speed_ld2 = (speed_current + lead_1) / 2
                     else:
                            average_speed_ld2 = (speed_current + lead_1 + lead_2) / 3
                     
                     if math.isnan(lead_1):
                            average_speed_b1 = (speed_current + lag_1) / 2
                     elif math.isnan(lag_1):
                            average_speed_b1 = (speed_current + lead_1) / 2
                     else:
                            average_speed_b1 = (speed_current + lead_1 + lag_1) / 3
                     
                     if math.isnan(lead_1):
                            average_speed_b2 = (speed_current + lag_1 + lag_2) / 3
                     elif math.isnan(lead_2):
                            average_speed_b2 = (speed_current + lag_1 + lag_2 + lead_1) / 4
                     elif math.isnan(lag_1):
                            average_speed_b2 = (speed_current + lead_1 + lead_2) / 3
                     elif math.isnan(lag_2):
                            average_speed_b2 = (speed_current + lead_1 + lead_2 + lag_1) / 4
                     else:
                            average_speed_b2 = (speed_current + lead_1 + lag_1 + lead_2 + lag_2) / 5

                     speed_lag_1.append(average_speed_lg1)
                     speed_lag_2.append(average_speed_lg2)
                     speed_lead_1.append(average_speed_ld1)
                     speed_lead_2.append(average_speed_ld2)
                     speed_both_1.append(average_speed_b1)
                     speed_both_2.append(average_speed_b2)
                     
                     m += 1

              full['speed_lag_1'] = speed_lag_1
              full['speed_lag_2'] = speed_lag_2
              full['speed_lead_1'] = speed_lead_1
              full['speed_lead_2'] = speed_lead_2
              full['speed_both_1'] = speed_both_1
              full['speed_both_2'] = speed_both_2
              # -----------------------------------------------------------------------------------------------------------------------------


              # Calculate direction based on previous and next
              # -----------------------------------------------------------------------------------------------------------------------------
              # Create lag and lead of lat/long
              full['lat_f1'] = full['LogLatitude'].shift(-1) # latitude with lead 1
              full['lon_f1'] = full['LogLongitude'].shift(-1) # Longtitude with lead 1
              full['lat_b1'] = full['LogLatitude'].shift(1) # latitude with lag 1
              full['lon_b1'] = full['LogLongitude'].shift(1) # Longtitude with lag 1

              direction = [0]
              geo_track = 1
              while geo_track < full.shape[0]:
                     delta_lon = full['LogLongitude'][geo_track] - full['lon_b1'][geo_track]
                     y2 = math.sin(delta_lon) * math.cos(full['LogLatitude'][geo_track])
                     x2 = math.cos(full['lat_b1'][geo_track]) * math.sin(full['LogLatitude'][geo_track]) - math.sin(full['lat_b1'][geo_track]) * math.cos(full['LogLatitude'][geo_track]) * math.cos(delta_lon)

                     brng2 = math.atan2(y2, x2)
                     brng2 = math.degrees(brng2)

                     direction.append(brng2)

                     geo_track += 1

              full['bearing'] = direction

              direction_change = [0, 0]
              geo_track2 = 2
              while geo_track2 < full.shape[0]:
                     change = full['bearing'][geo_track2 - 1] - full['bearing'][geo_track2]
                     direction_change.append(change)

                     geo_track2 += 1

              full['bearing change'] = direction_change
              # -----------------------------------------------------------------------------------------------------------------------------

              # Calculate distance to port
              # -----------------------------------------------------------------------------------------------------------------------------
              cur_loc = 0
              port_distances = []

              while cur_loc < full.shape[0]:
                     cord = np.array([(full['LogLatitude'][cur_loc], full['LogLongitude'][cur_loc])])
                     p_dist = broadcasting_based_lng_lat(cord, array_world_ports_lim)
                     port_distances.append(p_dist.min())

                     cur_loc += 1

              full['distance to port'] = port_distances
              # -----------------------------------------------------------------------------------------------------------------------------
              
              
              # add time deltas
              # -----------------------------------------------------------------------------------------------------------------------------
              moment = 1
              time_deltas = [0]
              index = full.index
              while moment < full.shape[0]:
                     time_delta = (index[moment] - index[moment - 1]).total_seconds()
                     time_deltas.append(time_delta)
                     moment += 1

              full['time delta'] = time_deltas   
              # -----------------------------------------------------------------------------------------------------------------------------
              
              full['survey'] = survey[5:15]
              list_of_df.append(full) # Add calculated df to list of lists to be transformed back to a final df at the end

all_acoustic = pd.concat(list_of_acoustic, ignore_index=False)
all_biotic = pd.concat(list_of_biotic, ignore_index=False)

all_acoustic.to_csv('Data/all_acoustic.csv')
all_biotic.to_csv('Data/all_biotic.csv')

full_set = pd.concat(list_of_df,ignore_index=False) # transform list of lists to final df
full_set.reset_index(inplace=True)

# rename columns
full_set = full_set.rename(columns={"index": "datetimestamp"})

# convert datetime to int
full_set['int_datetime'] =  full_set['datetimestamp'].astype('int64')

full_set.to_csv('Data/full_set_SPA.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  acoustic_1_1['LogTime'] = pd.to_datetime(acoustic_1_1['LogTime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  biotic_1_1['HaulStartTime'] = pd.to_datetime(biotic_1_1['HaulStartTime'])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full['fishing'][track] = 1
  full_set['int_datetime'] =  full_set['datetimestamp'].astype('int64')
