In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load AIS data
df = pd.read_csv("AIS_2024_12_31.csv")


In [7]:
df.head(5)

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,TransceiverClass
0,367776660,2024-12-31 00:00:08,21.19308,-157.72342,8.0,112.1,106.0,MOUNT BAKER,IMO9822906,WDJ6673,52.0,0.0,40.0,12.0,5.5,52.0,A
1,368095340,2024-12-31 00:00:05,29.76995,-95.07893,0.0,185.5,511.0,GAMBLER,,WDK8562,52.0,15.0,19.0,8.0,0.0,52.0,A
2,366847780,2024-12-31 00:00:00,29.96697,-93.85909,0.1,186.2,220.0,PACIFIC DAWN,IMO7400467,WDA7844,31.0,0.0,30.0,8.0,5.0,31.0,A
3,367481310,2024-12-31 00:00:04,27.68242,-82.58073,11.5,57.6,61.0,OSG COURAGEOUS,IMO9395707,WDF7122,82.0,0.0,204.0,26.0,6.0,82.0,A
4,248669000,2024-12-31 00:00:06,29.85743,-93.94083,3.1,220.9,219.0,EAGLE SAN FRANCISCO,IMO9795127,9HA4753,80.0,0.0,277.0,48.0,10.4,80.0,A


In [9]:
df.columns


Index(['MMSI', 'BaseDateTime', 'LAT', 'LON', 'SOG', 'COG', 'Heading',
       'VesselName', 'IMO', 'CallSign', 'VesselType', 'Status', 'Length',
       'Width', 'Draft', 'Cargo', 'TransceiverClass'],
      dtype='object')

In [10]:
df.shape

(7588976, 17)

In [14]:
# Remove duplicates based on MMSI and timestamp (BaseDateTime)
df = df.drop_duplicates(subset=["MMSI", "BaseDateTime"])

# Check for missing values and handle them (drop or impute)
df.isnull().sum()
df = df.dropna() 

# Reset index for ease of analysis
df.reset_index(drop=True, inplace=True)


In [15]:
df.shape

(3167944, 17)

In [16]:
# Heading vs COG Difference
df["heading_vs_cog_diff"] = abs(df["Heading"] - df["COG"])

# Loiter Time: Calculate time difference between rows
df = df.sort_values(by=["MMSI", "BaseDateTime"])  # Sort by MMSI and time

df["time_diff_min"] = df.groupby("MMSI")["BaseDateTime"].diff().dt.total_seconds() / 60  # Time difference in minutes
df["is_loitering"] = (df["SOG"] < 1) & (df["time_diff_min"] > 5)  # If speed is less than 1, and time spent is >5 mins

# Loiter time: Aggregate loiter time per ship (MMSI)
df["loiter_time"] = df.groupby("MMSI")["is_loitering"].transform("sum")

# Display new features
df[['MMSI', 'heading_vs_cog_diff', 'is_loitering', 'loiter_time']].head()

Unnamed: 0,MMSI,heading_vs_cog_diff,is_loitering,loiter_time
3940,11,287.2,False,52
10799,11,290.2,False,52
22033,11,310.4,True,52
44601,11,288.9,True,52
47642,11,304.0,False,52


In [18]:
pip install geopy

Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
   ---------------------------------------- 0.0/125.4 kB ? eta -:--:--
   --- ------------------------------------ 10.2/125.4 kB ? eta -:--:--
   --------- ----------------------------- 30.7/125.4 kB 435.7 kB/s eta 0:00:01
   ---------------------------- ---------- 92.2/125.4 kB 751.6 kB/s eta 0:00:01
   -------------------------------------- 125.4/125.4 kB 738.4 kB/s eta 0:00:00
Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
   ---------------------------------------- 0.0/40.3 kB ? eta -:--:--
   ---------------------------------------- 40.3/40.3 kB ? eta 0:00:00
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.4.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
df['speed_change'] = df['SOG'].diff()  # Difference between consecutive speed values
df['course_change'] = df['COG'].diff()  # Difference between consecutive course values

In [21]:
df['heading_vs_cog_diff'] = abs(df['Heading'] - df['COG'])

In [24]:
# Function to calculate loiter time per group (MMSI)
def loitering_time(group):
    # Calculate the difference in LAT and LON for consecutive rows
    group['LAT_diff'] = group['LAT'].diff().abs()
    group['LON_diff'] = group['LON'].diff().abs()
    
    # Mark as loitering if the movement is small (threshold < 0.0001)
    group['is_loitering'] = ((group['LAT_diff'] < 0.0001) & (group['LON_diff'] < 0.0001)).astype(int)
    
    # Return the sum of loitering flags per group
    return group['is_loitering']

# Apply the function to the grouped DataFrame
df['loiter_time'] = df.groupby('MMSI').apply(loitering_time).reset_index(drop=True)

# Now, you can check the result
print(df.head())


  df['loiter_time'] = df.groupby('MMSI').apply(loitering_time).reset_index(drop=True)


       MMSI        BaseDateTime       LAT       LON  SOG    COG  Heading  \
3940     11 2024-12-31 00:00:44  27.29231 -90.96791  0.0  223.8    511.0   
10799    11 2024-12-31 00:03:45  27.29230 -90.96785  0.1  220.8    511.0   
22033    11 2024-12-31 00:09:46  27.29234 -90.96789  0.1  200.6    511.0   
44601    11 2024-12-31 00:18:44  27.29226 -90.96779  0.1  222.1    511.0   
47642    11 2024-12-31 00:21:48  27.29235 -90.96787  0.1  207.0    511.0   

         VesselName         IMO CallSign  ...  Width  Draft  Cargo  \
3940   CONSTITUTION  IMO0000007   GC 680  ...   38.0    0.0   90.0   
10799  CONSTITUTION  IMO0000007   GC 680  ...   38.0    0.0   90.0   
22033  CONSTITUTION  IMO0000007   GC 680  ...   38.0    0.0   90.0   
44601  CONSTITUTION  IMO0000007   GC 680  ...   38.0    0.0   90.0   
47642  CONSTITUTION  IMO0000007   GC 680  ...   38.0    0.0   90.0   

       TransceiverClass  heading_vs_cog_diff  time_diff_min is_loitering  \
3940                  A                287.2  

In [25]:
# Apply the function to the grouped DataFrame
df['loiter_time'] = df.groupby('MMSI').apply(loitering_time).reset_index(drop=True)

# Generate a new CSV file with updated attributes
df.to_csv('updated_ships_data_with_loitering.csv', index=False)

# Confirm the action
print("✅ New CSV file 'updated_ships_data_with_loitering.csv' has been created with all attributes and new features.")

  df['loiter_time'] = df.groupby('MMSI').apply(loitering_time).reset_index(drop=True)


✅ New CSV file 'updated_ships_data_with_loitering.csv' has been created with all attributes and new features.


In [27]:
df.head(20)

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,...,Width,Draft,Cargo,TransceiverClass,heading_vs_cog_diff,time_diff_min,is_loitering,loiter_time,speed_change,course_change
3940,11,2024-12-31 00:00:44,27.29231,-90.96791,0.0,223.8,511.0,CONSTITUTION,IMO0000007,GC 680,...,38.0,0.0,90.0,A,287.2,,False,1,,
10799,11,2024-12-31 00:03:45,27.2923,-90.96785,0.1,220.8,511.0,CONSTITUTION,IMO0000007,GC 680,...,38.0,0.0,90.0,A,290.2,3.016667,False,1,0.1,-3.0
22033,11,2024-12-31 00:09:46,27.29234,-90.96789,0.1,200.6,511.0,CONSTITUTION,IMO0000007,GC 680,...,38.0,0.0,90.0,A,310.4,6.016667,True,0,0.0,-20.2
44601,11,2024-12-31 00:18:44,27.29226,-90.96779,0.1,222.1,511.0,CONSTITUTION,IMO0000007,GC 680,...,38.0,0.0,90.0,A,288.9,8.966667,True,1,0.0,21.5
47642,11,2024-12-31 00:21:48,27.29235,-90.96787,0.1,207.0,511.0,CONSTITUTION,IMO0000007,GC 680,...,38.0,0.0,90.0,A,304.0,3.066667,False,0,0.0,-15.1
53178,11,2024-12-31 00:24:44,27.29234,-90.96784,0.1,195.6,511.0,CONSTITUTION,IMO0000007,GC 680,...,38.0,0.0,90.0,A,315.4,2.933333,False,1,0.0,-11.4
57953,11,2024-12-31 00:27:48,27.29231,-90.96779,0.1,190.1,511.0,CONSTITUTION,IMO0000007,GC 680,...,38.0,0.0,90.0,A,320.9,3.066667,False,0,0.0,-5.5
63979,11,2024-12-31 00:30:43,27.29234,-90.96781,0.3,202.7,511.0,CONSTITUTION,IMO0000007,GC 680,...,38.0,0.0,90.0,A,308.3,2.916667,False,0,0.2,12.6
89804,11,2024-12-31 00:42:45,27.29231,-90.96775,0.1,212.0,511.0,CONSTITUTION,IMO0000007,GC 680,...,38.0,0.0,90.0,A,299.0,12.033333,True,0,-0.2,9.3
99592,11,2024-12-31 00:48:45,27.29236,-90.96778,0.1,214.1,511.0,CONSTITUTION,IMO0000007,GC 680,...,38.0,0.0,90.0,A,296.9,6.0,True,0,0.0,2.1
