In [4]:
# Load libraries
import pandas as pd
import numpy as np
import geopy.distance
from datetime import timedelta, datetime
import math


In [5]:
def PrepDataSPA():
       k = 0
       list_of_df = []

       # Loop over every file present in folder "Data\Biotic"
       for filename in os.listdir("Data\Biotic"):
              k += 1

              # Read the biotic and accompanying acoustic files
              acoustic_1 = pd.read_csv("Data\\Acoustic\\Acoustic" + filename[6:99], header=0)
              biotic_1 = pd.read_csv("Data\\Biotic\\" + filename, header=0)

              # Filter out the required columns
              a_columns = ['LogTime', 'LogLatitude', 'LogLongitude']
              b_columns = ['HaulStartTime', 'HaulDuration', 'HaulValidity', 'HaulStartLatitude', 'HaulStartLongitude', 'HaulStopLatitude', 'HaulStopLongitude', 'HaulDistance', 'HaulTowDirection']
              acoustic_1_1 = acoustic_1[a_columns]
              biotic_1_1 = biotic_1[b_columns]

              # Set the datetime columns to correct datatype and drop duplicates
              acoustic_1_1['LogTime'] = pd.to_datetime(acoustic_1_1['LogTime'])
              biotic_1_1['HaulStartTime'] = pd.to_datetime(biotic_1_1['HaulStartTime'])
              acoustic_1_1 = acoustic_1_1.drop_duplicates()

              # Prepare the biotic file to have the same structure as the acoustic file
              bio_to_aco = biotic_1_1[['HaulStartTime', 'HaulStartLatitude', 'HaulStartLongitude']]
              bio_to_aco = bio_to_aco.rename(columns={'HaulStartTime': 'LogTime', 'HaulStartLatitude': 'LogLatitude', 'HaulStartLongitude': 'LogLongitude'})

              # Concat the biotic and acoustic file to create a df containing all registered timestamps
              acoustic_final = pd.concat([acoustic_1_1, bio_to_aco], ignore_index=True)
              acoustic_final = acoustic_final.sort_values('LogTime', ignore_index=True)
              acoustic_final['fileIDN'] = k

              # Index both final sets over their respective timestamp columns
              acoustic_final2 = acoustic_final.set_index('LogTime')
              biotic_final = biotic_1_1.set_index('HaulStartTime')

              # Perform final concat adding all available columns to the available timestamps and create new column containing binary classifier for fishing or not.
              full = pd.concat([acoustic_final2, biotic_final], axis=1)
              full['fishing'] = 0
              full.loc[full['HaulValidity'] == 'V', 'fishing'] = 1
              full = full.drop('HaulValidity', axis=1)
              
              # Extend the fishing period (fishing = 1) over all rows that are within the start-end haul timeframe
              # -----------------------------------------------------------------------------------------------------------------------------
              time_str = '1/1/1900 12:00:00.0000'
              date_format_str = '%d/%m/%Y %H:%M:%S.%f'

              track = 1
              start_haul = datetime.strptime(time_str, date_format_str)
              end_haul = datetime.strptime(time_str, date_format_str)
              index = full.index
              while track < full.shape[0]:
                     if full['HaulDuration'][track] > 0:
                            start_haul = index[track]
                            end_haul = index[track] + timedelta(minutes=full['HaulDuration'][track])
              
                     if index[track] >= start_haul and index[track] <= end_haul:
                            full['fishing'][track] = 1
                     track += 1
              # -----------------------------------------------------------------------------------------------------------------------------


              # Calculate speed
              # -----------------------------------------------------------------------------------------------------------------------------
              full = full[['LogLatitude', 'LogLongitude', 'fishing', 'fileIDN', ]]
              distances = [0]
              durations = [0]
              index = full.index
              i=0
              while i < full.shape[0] - 1:
                     distances.append(geopy.distance.distance((full['LogLatitude'][i], full['LogLongitude'][i]), (full['LogLatitude'][i+1], full['LogLongitude'][i+1])).km)
                     durations.append((index[i+1] - index[i]).total_seconds())
                     i += 1

              km_s = [0]
              j=1
              while j < len(distances):
                     if durations[j] != 0:
                            km_s.append((distances[j]/durations[j]) * 60 * 60)
                     else:
                            km_s.append(0)
                     j += 1

              full['speed'] = km_s
              # -----------------------------------------------------------------------------------------------------------------------------


              # Calculate speeds based on lag/lead
              # -----------------------------------------------------------------------------------------------------------------------------
              speed_lag_1 = []
              speed_lead_1 = []
              speed_both_1 = []
              speed_lag_2 = []
              speed_lead_2 = []
              speed_both_2 = []

              full['lag_1'] = full['speed'].shift(1)
              full['lag_2'] = full['speed'].shift(2)
              full['lead_1'] = full['speed'].shift(-1)
              full['lead_2'] = full['speed'].shift(-2)

              m = 0
              while m < full.shape[0]:
                     speed_current = full['speed'][m]
                     lag_1 = full['lag_1'][m]
                     lag_2 = full['lag_2'][m]
                     lead_1 = full['lead_1'][m]
                     lead_2 = full['lead_2'][m]

                     if math.isnan(lag_1):
                            average_speed_lg1 = speed_current
                     else:
                            average_speed_lg1 = (speed_current + lag_1) / 2
                     
                     if math.isnan(lag_2) and math.isnan(lag_1):
                            average_speed_lg2 = speed_current
                     elif math.isnan(lag_2) and not math.isnan(lag_1):
                            average_speed_lg2 = (speed_current + lag_1) / 2
                     else:
                            average_speed_lg2 = (speed_current + lag_1 + lag_2) / 3
                     
                     if math.isnan(lead_1):
                            average_speed_ld1 = speed_current
                     else:
                            average_speed_ld1 = (speed_current + lead_1) / 2
                     
                     if math.isnan(lead_2) and math.isnan(lead_1):
                            average_speed_ld2 = speed_current
                     elif math.isnan(lead_2) and not math.isnan(lead_1):
                            average_speed_ld2 = (speed_current + lead_1) / 2
                     else:
                            average_speed_ld2 = (speed_current + lead_1 + lead_2) / 3
                     
                     if math.isnan(lead_1):
                            average_speed_b1 = (speed_current + lag_1) / 2
                     elif math.isnan(lag_1):
                            average_speed_b1 = (speed_current + lead_1) / 2
                     else:
                            average_speed_b1 = (speed_current + lead_1 + lag_1) / 3
                     
                     if math.isnan(lead_1):
                            average_speed_b2 = (speed_current + lag_1 + lag_2) / 3
                     elif math.isnan(lead_2):
                            average_speed_b2 = (speed_current + lag_1 + lag_2 + lead_1) / 4
                     elif math.isnan(lag_1):
                            average_speed_b2 = (speed_current + lead_1 + lead_2) / 3
                     elif math.isnan(lag_2):
                            average_speed_b2 = (speed_current + lead_1 + lead_2 + lag_1) / 4
                     else:
                            average_speed_b2 = (speed_current + lead_1 + lag_1 + lead_2 + lag_2) / 5

                     speed_lag_1.append(average_speed_lg1)
                     speed_lag_2.append(average_speed_lg2)
                     speed_lead_1.append(average_speed_ld1)
                     speed_lead_2.append(average_speed_ld2)
                     speed_both_1.append(average_speed_b1)
                     speed_both_2.append(average_speed_b2)
                     
                     m += 1

              full['speed_lag_1'] = speed_lag_1
              full['speed_lag_2'] = speed_lag_2
              full['speed_lead_1'] = speed_lead_1
              full['speed_lead_2'] = speed_lead_2
              full['speed_both_1'] = speed_both_1
              full['speed_both_2'] = speed_both_2
              # -----------------------------------------------------------------------------------------------------------------------------


              # Calculate direction based on previous and next
              # -----------------------------------------------------------------------------------------------------------------------------
              # Create lag and lead of lat/long
              full['lat_f1'] = full['LogLatitude'].shift(-1) # latitude with lead 1
              full['lon_f1'] = full['LogLongitude'].shift(-1) # Longtitude with lead 1
              full['lat_b1'] = full['LogLatitude'].shift(1) # latitude with lag 1
              full['lon_b1'] = full['LogLongitude'].shift(1) # Longtitude with lag 1

              direction = [0]
              geo_track = 1
              while geo_track < full.shape[0]:
                     delta_lon = full['LogLongitude'][geo_track] - full['lon_b1'][geo_track]
                     y2 = math.sin(delta_lon) * math.cos(full['LogLatitude'][geo_track])
                     x2 = math.cos(full['lat_b1'][geo_track]) * math.sin(full['LogLatitude'][geo_track]) - math.sin(full['lat_b1'][geo_track]) * math.cos(full['LogLatitude'][geo_track]) * math.cos(delta_lon)

                     brng2 = math.atan2(y2, x2)
                     brng2 = math.degrees(brng2)

                     direction.append(brng2)

                     geo_track += 1

              full['bearing'] = direction

              direction_change = [0, 0]
              geo_track2 = 2
              while geo_track2 < full.shape[0]:
                     change = full['bearing'][geo_track2 - 1] - full['bearing'][geo_track2]
                     direction_change.append(change)

                     geo_track2 += 1

              full['bearing change'] = direction_change
              # -----------------------------------------------------------------------------------------------------------------------------

              list_of_df.append(full) # Add calculated df to list of lists to be transformed back to a final df at the end

       full_set = pd.concat(list_of_df,ignore_index=False) # transform list of lists to final df
       full_set.reset_index(inplace=True)

       # rename columns
       full_set = full_set.rename(columns={"index": "datetimestamp"})

       # convert datetime to int
       full_set['int_datetime'] =  full_set['datetimestamp'].astype('int64')

       full_set.to_csv('Data/full_set.csv', index_label='timestamp')

PrepDataSPA()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full['fishing'][track] = 1
  full_set['int_datetime'] =  full_set['datetimestamp'].astype('int64')


Unnamed: 0,datetimestamp,LogLatitude,LogLongitude,fishing,fileIDN,speed,lag_1,lag_2,lead_1,lead_2,...,speed_lead_2,speed_both_1,speed_both_2,lat_f1,lon_f1,lat_b1,lon_b1,bearing,bearing change,int_datetime
0,2021-07-01 04:23:00,55.895245,6.030530,0,1,0.000000,,,18.573427,18.623732,...,12.399053,9.286714,12.399053,55.895240,6.060218,,,0.000000,0.000000,1625113380000000000
1,2021-07-01 04:29:00,55.895240,6.060218,0,1,18.573427,0.000000,,18.623732,18.636269,...,18.611143,12.399053,13.958357,55.895256,6.089987,55.895245,6.030530,90.529208,0.000000,1625113740000000000
2,2021-07-01 04:35:00,55.895256,6.089987,0,1,18.623732,18.573427,0.000000,18.636269,18.589848,...,18.616616,18.611143,14.884655,55.895140,6.119775,55.895240,6.060218,90.479671,0.049537,1625114100000000000
3,2021-07-01 04:41:00,55.895140,6.119775,0,1,18.636269,18.623732,18.573427,18.589848,22.338484,...,19.854867,18.616616,19.352352,55.895180,6.149489,55.895256,6.089987,90.799815,-0.320144,1625114460000000000
4,2021-07-01 04:47:00,55.895180,6.149489,0,1,18.589848,18.636269,18.623732,22.338484,18.599033,...,19.842455,19.854867,19.357473,55.895252,6.179245,55.895140,6.119775,90.420428,0.379386,1625114820000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52329,2022-04-02 03:50:00,58.773000,-7.611000,0,19,18.513005,22.255465,18.545690,22.376517,22.257376,...,21.048966,21.048329,20.789610,58.771000,-7.579000,58.773000,-7.643000,-90.727853,-2.939868,1648871400000000000
52330,2022-04-02 03:55:00,58.771000,-7.579000,0,19,22.376517,18.513005,22.255465,22.257376,22.258013,...,22.297302,21.048966,21.532075,58.770000,-7.547000,58.773000,-7.611000,-96.604723,5.876870,1648871700000000000
52331,2022-04-02 04:00:00,58.770000,-7.547000,0,19,22.257376,22.376517,18.513005,22.258013,27.823313,...,24.112901,22.297302,22.645645,58.769000,-7.515000,58.771000,-7.579000,-93.680934,-2.923789,1648872000000000000
52332,2022-04-02 04:05:00,58.769000,-7.515000,0,19,22.258013,22.257376,22.376517,27.823313,,...,25.040663,24.112901,23.678805,58.768000,-7.483000,58.770000,-7.547000,-93.685363,0.004429,1648872300000000000
