In [2]:
# Load libraries
import pandas as pd
import numpy as np
import geopy.distance
import os
import math
from datetime import timedelta, datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
import geopandas as gpd
from mpl_toolkits.basemap import Basemap
from matplotlib.collections import LineCollection


In [3]:
oman_1 = pd.read_csv("Data/Oman_AIS/raw_ais_20210501_20220531_461000002.txt", sep="\t", header=0)
oman_1['Timestamp'] = pd.to_datetime(oman_1['Timestamp'])
oman_1 = oman_1[['Timestamp', 'Latitude', 'Longitude']]
oman_1 = oman_1.sort_values(by='Timestamp', ignore_index=True)
oman_1 = oman_1.set_index('Timestamp').rename_axis(None)
oman_1.columns = ['LogLatitude', 'LogLongitude']
oman_1 = oman_1.dropna()

display(oman_1)

Unnamed: 0,LogLatitude,LogLongitude
2021-05-01 01:11:24,20.99134,59.45736
2021-05-01 02:05:24,21.10693,59.49646
2021-05-01 02:08:44,21.11462,59.49598
2021-05-01 02:37:04,21.08519,59.49337
2021-05-01 02:45:30,21.06769,59.48729
...,...,...
2022-05-29 17:35:00,19.67523,57.72517
2022-05-29 18:56:05,19.67520,57.72515
2022-05-31 00:20:07,19.67522,57.72517
2022-05-31 02:29:05,19.67522,57.72515


In [12]:
# display(oman_1.loc[oman_1.index > "2021-05-02 17:33:41"])
# display(oman_1.loc[oman_1['LogLatitude'].isnull()])

Unnamed: 0,LogLatitude,LogLongitude


In [4]:
full = oman_1
distances = [0]
durations = [0]
index = full.index
i=0
while i < full.shape[0] - 1:
        # print(str(index[i]) + str(full['LogLatitude'][i]) + str(full['LogLongitude'][i]))
        distances.append(geopy.distance.distance((full['LogLatitude'][i], full['LogLongitude'][i]), (full['LogLatitude'][i+1], full['LogLongitude'][i+1])).km)
        durations.append((index[i+1] - index[i]).total_seconds())
        i += 1

km_s = [0]
j=1
while j < len(distances):
        if durations[j] != 0:
                km_s.append((distances[j]/durations[j]) * 60 * 60)
        else:
                km_s.append(0)
        j += 1

full['speed'] = km_s
# -----------------------------------------------------------------------------------------------------------------------------


# Calculate speeds based on lag/lead
# -----------------------------------------------------------------------------------------------------------------------------
speed_lag_1 = []
speed_lead_1 = []
speed_both_1 = []
speed_lag_2 = []
speed_lead_2 = []
speed_both_2 = []

full['lag_1'] = full['speed'].shift(1)
full['lag_2'] = full['speed'].shift(2)
full['lead_1'] = full['speed'].shift(-1)
full['lead_2'] = full['speed'].shift(-2)

m = 0
while m < full.shape[0]:
        speed_current = full['speed'][m]
        lag_1 = full['lag_1'][m]
        lag_2 = full['lag_2'][m]
        lead_1 = full['lead_1'][m]
        lead_2 = full['lead_2'][m]

        if math.isnan(lag_1):
                average_speed_lg1 = speed_current
        else:
                average_speed_lg1 = (speed_current + lag_1) / 2
        
        if math.isnan(lag_2) and math.isnan(lag_1):
                average_speed_lg2 = speed_current
        elif math.isnan(lag_2) and not math.isnan(lag_1):
                average_speed_lg2 = (speed_current + lag_1) / 2
        else:
                average_speed_lg2 = (speed_current + lag_1 + lag_2) / 3
        
        if math.isnan(lead_1):
                average_speed_ld1 = speed_current
        else:
                average_speed_ld1 = (speed_current + lead_1) / 2
        
        if math.isnan(lead_2) and math.isnan(lead_1):
                average_speed_ld2 = speed_current
        elif math.isnan(lead_2) and not math.isnan(lead_1):
                average_speed_ld2 = (speed_current + lead_1) / 2
        else:
                average_speed_ld2 = (speed_current + lead_1 + lead_2) / 3
        
        if math.isnan(lead_1):
                average_speed_b1 = (speed_current + lag_1) / 2
        elif math.isnan(lag_1):
                average_speed_b1 = (speed_current + lead_1) / 2
        else:
                average_speed_b1 = (speed_current + lead_1 + lag_1) / 3
        
        if math.isnan(lead_1):
                average_speed_b2 = (speed_current + lag_1 + lag_2) / 3
        elif math.isnan(lead_2):
                average_speed_b2 = (speed_current + lag_1 + lag_2 + lead_1) / 4
        elif math.isnan(lag_1):
                average_speed_b2 = (speed_current + lead_1 + lead_2) / 3
        elif math.isnan(lag_2):
                average_speed_b2 = (speed_current + lead_1 + lead_2 + lag_1) / 4
        else:
                average_speed_b2 = (speed_current + lead_1 + lag_1 + lead_2 + lag_2) / 5

        speed_lag_1.append(average_speed_lg1)
        speed_lag_2.append(average_speed_lg2)
        speed_lead_1.append(average_speed_ld1)
        speed_lead_2.append(average_speed_ld2)
        speed_both_1.append(average_speed_b1)
        speed_both_2.append(average_speed_b2)
        
        m += 1

full['speed_lag_1'] = speed_lag_1
full['speed_lag_2'] = speed_lag_2
full['speed_lead_1'] = speed_lead_1
full['speed_lead_2'] = speed_lead_2
full['speed_both_1'] = speed_both_1
full['speed_both_2'] = speed_both_2
# -----------------------------------------------------------------------------------------------------------------------------


# Calculate direction based on previous and next
# -----------------------------------------------------------------------------------------------------------------------------
# Create lag and lead of lat/long
full['lat_f1'] = full['LogLatitude'].shift(-1) # latitude with lead 1
full['lon_f1'] = full['LogLongitude'].shift(-1) # Longtitude with lead 1
full['lat_b1'] = full['LogLatitude'].shift(1) # latitude with lag 1
full['lon_b1'] = full['LogLongitude'].shift(1) # Longtitude with lag 1

direction = [0]
geo_track = 1
while geo_track < full.shape[0]:
        delta_lon = full['LogLongitude'][geo_track] - full['lon_b1'][geo_track]
        y2 = math.sin(delta_lon) * math.cos(full['LogLatitude'][geo_track])
        x2 = math.cos(full['lat_b1'][geo_track]) * math.sin(full['LogLatitude'][geo_track]) - math.sin(full['lat_b1'][geo_track]) * math.cos(full['LogLatitude'][geo_track]) * math.cos(delta_lon)

        brng2 = math.atan2(y2, x2)
        brng2 = math.degrees(brng2)

        direction.append(brng2)

        geo_track += 1

full['bearing'] = direction

direction_change = [0, 0]
geo_track2 = 2
while geo_track2 < full.shape[0]:
        change = full['bearing'][geo_track2 - 1] - full['bearing'][geo_track2]
        direction_change.append(change)

        geo_track2 += 1

full['bearing change'] = direction_change
# -----------------------------------------------------------------------------------------------------------------------------

In [5]:
full.reset_index(inplace=True)

# rename columns
full = full.rename(columns={"index": "datetimestamp"})

# convert datetime to int
full['int_datetime'] =  full['datetimestamp'].astype('int64')

display(full)

  full['int_datetime'] =  full['datetimestamp'].astype('int64')


Unnamed: 0,datetimestamp,LogLatitude,LogLongitude,speed,lag_1,lag_2,lead_1,lead_2,speed_lag_1,speed_lag_2,...,speed_lead_2,speed_both_1,speed_both_2,lat_f1,lon_f1,lat_b1,lon_b1,bearing,bearing change,int_datetime
0,2021-05-01 01:11:24,20.99134,59.45736,0.000000,,,14.919556,15.351916,0.000000,0.000000,...,10.090491,7.459778,10.090491,21.10693,59.49646,,,0.000000,0.000000,1619831484000000000
1,2021-05-01 02:05:24,21.10693,59.49646,14.919556,0.000000,,15.351916,6.924082,7.459778,7.459778,...,12.398518,10.090491,9.298889,21.11462,59.49598,20.99134,59.45736,-12.167251,0.000000,1619834724000000000
2,2021-05-01 02:08:44,21.11462,59.49598,15.351916,14.919556,0.000000,6.924082,14.499446,15.135736,10.090491,...,12.258481,12.398518,10.339000,21.08519,59.49337,21.10693,59.49646,2.287041,-14.454292,1619834924000000000
3,2021-05-01 02:37:04,21.08519,59.49337,6.924082,15.351916,14.919556,14.499446,1.462610,11.137999,12.398518,...,7.628713,12.258481,10.631522,21.06769,59.48729,21.11462,59.49598,176.868047,-174.581005,1619836624000000000
4,2021-05-01 02:45:30,21.06769,59.48729,14.499446,6.924082,15.351916,1.462610,16.928057,10.711764,12.258481,...,10.963371,7.628713,11.033222,21.07503,59.48908,21.08519,59.49337,168.171702,8.696345,1619837130000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13277,2022-05-29 17:35:00,19.67523,57.72517,0.000000,0.000208,0.000212,0.002906,0.000104,0.000104,0.000140,...,0.001003,0.001038,0.000686,19.67520,57.72515,19.67523,57.72517,0.000000,89.999579,1653845700000000000
13278,2022-05-29 18:56:05,19.67520,57.72515,0.002906,0.000000,0.000208,0.000104,0.000976,0.001453,0.001038,...,0.001329,0.001003,0.000839,19.67522,57.72517,19.67523,57.72517,-155.674305,155.674305,1653850565000000000
13279,2022-05-31 00:20:07,19.67522,57.72517,0.000104,0.002906,0.000000,0.000976,0.000316,0.001505,0.001003,...,0.000465,0.001329,0.000860,19.67522,57.72515,19.67520,57.72515,34.139847,-189.814152,1653956407000000000
13280,2022-05-31 02:29:05,19.67522,57.72515,0.000976,0.000104,0.002906,0.000316,,0.000540,0.001329,...,0.000646,0.000465,0.001076,19.67523,57.72513,19.67522,57.72517,-89.999579,124.139426,1653964145000000000


In [15]:
full_x = full[['int_datetime', 'LogLatitude', 'LogLongitude', 'speed', 'speed_lead_1']]
y_pred = clf.predict(full_x)
y_proba = clf.predict_proba(full_x)

In [36]:
full_test = full_x
full_test['y_pred'] = y_pred
y_proba = pd.DataFrame(y_proba)
full_test['y_proba 0'] = y_proba[0].tolist()
full_test['y_proba 1'] = y_proba[1].tolist()

full_test['LogLongitude_2'] = full_test['LogLongitude'].shift(-1)
full_test['LogLatitude_2'] = full_test['LogLatitude'].shift(-1)

route_test = full_test[['LogLongitude', 'LogLatitude']]
route_test = route_test.rename(columns={"LogLatitude": "Latitude", "LogLongitude": "Longitude"})
# route_test['LineStringColor'] = "blue"
route_test['height'] = 0.0

route_test.to_csv('Data/Oman_AIS/route_test.csv', index=False)

display(route_test)

fish_loc = full_test.loc[full_test['y_pred'] == 1]
fish_loc = fish_loc[['LogLatitude', 'LogLongitude']]
# fish_loc.to_csv('Data/Oman_AIS/fish_loc.csv')

display(fish_loc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_test['y_pred'] = y_pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_test['y_proba 0'] = y_proba[0].tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full_test['y_proba 1'] = y_proba[1].tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_

Unnamed: 0,Longitude,Latitude,height
0,59.45736,20.99134,0.0
1,59.49646,21.10693,0.0
2,59.49598,21.11462,0.0
3,59.49337,21.08519,0.0
4,59.48729,21.06769,0.0
...,...,...,...
13277,57.72517,19.67523,0.0
13278,57.72515,19.67520,0.0
13279,57.72517,19.67522,0.0
13280,57.72515,19.67522,0.0


Unnamed: 0,LogLatitude,LogLongitude
9,21.05103,59.48160
10,21.14047,59.51361
11,21.09151,59.49372
48,21.11680,59.49889
70,19.79045,58.47902
...,...,...
12956,19.67630,57.72540
13001,19.67630,57.72537
13083,19.67630,57.72538
13084,19.67632,57.72538
