In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [6]:

# Candidate feature columns.
# NOTE(review): no cell visible in this notebook reads `features`; the model
# below is fit on `numeric_cols` instead -- confirm whether this list is
# still needed before relying on it.
features = [
    'SOG',
    'COG',
    'Heading',
    'LAT',
    'LON',
    'Draft',
    'StatusBinary',
]

# Read the CSV into the frame that every later cell starts from.
# The path is relative, so the file must sit in the kernel's working directory.
df = pd.read_csv("CLEAN_AIS_2024_12_31.csv")

In [8]:
# Columns handed to the anomaly model.
numeric_cols = ['LAT', 'LON', 'SOG', 'COG', 'Heading', 'Length', 'Width', 'Draft']

# Keep only the model columns and discard any row with a missing value in
# one of them. The surviving rows keep their original index labels from `df`,
# which is what lets later cells look the full records back up.
df_clean = df.loc[:, numeric_cols].dropna(axis=0, how='any')

# Optional step: rescale every column to zero mean and unit variance.
scaler = StandardScaler()
scaler.fit(df_clean)
X_scaled = scaler.transform(df_clean)

In [9]:
# Isolation forest of 100 trees. `contamination=0.05` sets the decision
# threshold so that about 5% of the fitted rows are labelled as outliers;
# the fixed `random_state` keeps reruns repeatable.
iso_forest = IsolationForest(
    n_estimators=100,
    contamination=0.05,
    random_state=42,
)

# Train on the scaled features, then score those same rows.
iso_forest.fit(X_scaled)
anomaly_labels = iso_forest.predict(X_scaled)

# Store the labels next to the features: -1 = anomaly, 1 = normal.
# X_scaled was built from df_clean, so the labels line up row for row.
df_clean['anomaly'] = anomaly_labels

In [11]:
# Recover the full original records (every column of `df`) for the rows that
# were scored, and attach the model's label to each one. The labels are passed
# as a bare array so they are attached by position; this is safe because the
# selection uses df_clean's own index, so row order matches.
df_result = (
    df
    .loc[df_clean.index]
    .assign(anomaly=df_clean['anomaly'].to_numpy())
)

In [14]:
# How many rows received each label (1 = normal, -1 = anomaly).
label_counts = df_result['anomaly'].value_counts()
print(label_counts)

# Keep only the rows flagged as anomalous and preview the first 40 of them.
is_anomalous = df_result['anomaly'].eq(-1)
anomalies = df_result[is_anomalous]
print(anomalies.head(40))

anomaly
 1    5083467
-1     267385
Name: count, dtype: int64
          MMSI         BaseDateTime       LAT        LON    SOG    COG  \
0    367776660  2024-12-31 00:00:08  21.19308 -157.72342    8.0  112.1   
6    367178330  2024-12-31 00:00:04  27.54570  -92.44334    0.1  259.6   
28   316001251  2024-12-31 00:00:04  49.25729 -123.72145   20.2   82.8   
46   563163400  2024-12-31 00:00:04  28.46854 -119.03485   18.0  307.9   
89   368172370  2024-12-31 00:00:05  29.67065  -89.95759    0.0  287.9   
91   367098340  2024-12-31 00:00:01  25.19852  -81.90829   32.4  359.5   
116  367637980  2024-12-31 00:00:01  30.04467  -90.64134    0.0   32.2   
124  248392000  2024-12-31 00:00:03  25.20087  -79.28053   16.9  189.0   
159  636093031  2024-12-31 00:00:00  31.06601  -78.06215   12.4   16.7   
195  316011407  2024-12-31 00:00:05  49.21793 -123.81425   20.4   91.1   
207  477348300  2024-12-31 00:00:02  35.14836 -121.71460    9.6  326.6   
231  367008770  2024-12-31 00:00:08  21.22614 -157