# Spoofing Detection for Maritime AIS Data

In [1]:
# Environment setup and imports
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'  # Fix OpenMP conflict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Deep Learning - PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Visualization
import folium
from folium import plugins
import itertools

print(f"PyTorch version: {torch.__version__}")
print(f"GPU Available: {torch.cuda.is_available()}")
print(f"GPU Device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")
print(f"Numpy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print("All packages loaded successfully")

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

PyTorch version: 2.9.1+cu126
GPU Available: True
GPU Device: NVIDIA GeForce RTX 4060 Laptop GPU
Numpy version: 1.26.4
Pandas version: 2.3.3
All packages loaded successfully
Using device: cuda


## 1. Data Loading & Preprocessing

Load AIS data from incident slices created in the incident analysis notebook.

In [2]:
# Configuration
data_root = Path("../dataset/piraeus")
output_root = Path("./models")
output_root.mkdir(parents=True, exist_ok=True)

# AIS dataset configuration (from incident_anomaly_labels.ipynb)
cols_primary = ["timestamp", "vessel_id", "lon", "lat", "speed", "course", "heading"]
cols_alias = ["t", "timestamp", "vessel_id", "lon", "lat", "speed", "course", "heading"]

MONTH_ABBR = {
    1: "jan", 2: "feb", 3: "mar", 4: "apr", 5: "may", 6: "jun",
    7: "jul", 8: "aug", 9: "sep", 10: "oct", 11: "nov", 12: "dec"
}

# Years and months to load for comprehensive training
DATA_PERIODS = [
    (2017, [5, 6, 7, 8, 9, 10, 11, 12]),  # May-Dec 2017
    (2018, list(range(1, 13))),            # Full year 2018
    (2019, list(range(1, 13))),            # Full year 2019
]

print("Configuration loaded")
print(f"Data root: {data_root}")
print(f"Loading periods: 2017-2019 (multiple months)")

Configuration loaded
Data root: ..\dataset\piraeus
Loading periods: 2017-2019 (multiple months)


In [19]:
def load_month(year, month, root, chunk_size=500_000):
    """Load AIS data for a specific month (from incident_anomaly_labels.ipynb)."""
    folder = Path(root) / f"unipi_ais_dynamic_{year}"
    fname = folder / f"unipi_ais_dynamic_{MONTH_ABBR[month]}{year}.csv"
    if not fname.exists():
        raise FileNotFoundError(f"Missing file: {fname}")

    # Discover available columns
    preview = pd.read_csv(fname, nrows=1)
    available = list(preview.columns)
    selected_cols = [c for c in cols_alias if c in available]
    print(f"Selected columns: {selected_cols}")
    
sample_months=12
for year, months in DATA_PERIODS:
    # Sample subset of months to manage memory
    selected_months = np.random.choice(months, min(sample_months, len(months)), replace=False)
    
    for i, month in enumerate(selected_months):
        print(f"{i+1} Loading {year}-{month:02d}...")
        load_month(year, month, data_root)

1 Loading 2017-06...
Selected columns: ['t', 'vessel_id', 'lon', 'lat', 'speed', 'course', 'heading']
2 Loading 2017-10...
Selected columns: ['t', 'vessel_id', 'lon', 'lat', 'speed', 'course', 'heading']
3 Loading 2017-05...
Selected columns: ['t', 'vessel_id', 'lon', 'lat', 'speed', 'course', 'heading']
4 Loading 2017-11...
Selected columns: ['t', 'vessel_id', 'lon', 'lat', 'speed', 'course', 'heading']
5 Loading 2017-12...
Selected columns: ['t', 'vessel_id', 'lon', 'lat', 'speed', 'course', 'heading']
6 Loading 2017-07...
Selected columns: ['t', 'vessel_id', 'lon', 'lat', 'speed', 'course', 'heading']
7 Loading 2017-08...
Selected columns: ['t', 'vessel_id', 'lon', 'lat', 'speed', 'course', 'heading']
8 Loading 2017-09...
Selected columns: ['t', 'vessel_id', 'lon', 'lat', 'speed', 'course', 'heading']
1 Loading 2018-04...
Selected columns: ['timestamp', 'vessel_id', 'lon', 'lat', 'speed', 'course', 'heading']
2 Loading 2018-11...
Selected columns: ['timestamp', 'vessel_id', 'lon', '

Since we have t and timestamp variation, column_primary and col_alias are used

In [None]:
# Load AIS data from full dataset
def load_month(year, month, root, chunk_size=500_000):
    """Load AIS data for a specific month (from incident_anomaly_labels.ipynb)."""
    folder = Path(root) / f"unipi_ais_dynamic_{year}"
    fname = folder / f"unipi_ais_dynamic_{MONTH_ABBR[month]}{year}.csv"
    if not fname.exists():
        raise FileNotFoundError(f"Missing file: {fname}")

    # Discover available columns
    preview = pd.read_csv(fname, nrows=1)
    available = list(preview.columns)
    selected_cols = [c for c in cols_alias if c in available]

    chunks = []
    for chunk in pd.read_csv(fname, usecols=selected_cols, chunksize=chunk_size):
        if "t" in chunk.columns and "timestamp" not in chunk.columns:
            chunk = chunk.rename(columns={"t": "timestamp"})
        chunk["timestamp"] = pd.to_datetime(chunk["timestamp"], unit="ms", utc=True)
        chunk = chunk[[c for c in cols_primary if c in chunk.columns]]
        chunks.append(chunk)
    return pd.concat(chunks, ignore_index=True)


def load_ais_dataset(data_periods, root, sample_months=3):
    """Load multiple months of AIS data for training."""
    all_data = []
    loaded_count = 0

    for year, months in data_periods:
        # Sample subset of months to manage memory
        selected_months = np.random.choice(months, min(sample_months, len(months)), replace=False)
        
        for month in selected_months:
            try:
                print(f"Loading {year}-{month:02d}...")
                df = load_month(year, month, root)
                all_data.append(df)
                loaded_count += 1
                print(f"  Loaded: {len(df):,} records, {df['vessel_id'].nunique()} vessels")
            except FileNotFoundError as e:
                print(f"  Skipped: {e}")
                continue
            except Exception as e:
                print(f"  Error: {e}")
                continue
    
    if not all_data:
        raise FileNotFoundError("No AIS data found. Check data_root path.")
    
    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"\nTotal months loaded: {loaded_count}")
    return combined_df


# Load data (sample 3 months per year for manageable dataset size)
print("Loading AIS dataset...")
ais_data = load_ais_dataset(DATA_PERIODS, data_root, sample_months=3)

print(f"\nDataset Summary:")
print(f"  Total records: {len(ais_data):,}")
print(f"  Unique vessels: {ais_data['vessel_id'].nunique()}")
print(f"  Time range: {ais_data['timestamp'].min()} to {ais_data['timestamp'].max()}")
print(f"  Columns: {list(ais_data.columns)}")

Loading AIS dataset...
Loading 2017-10...
  Loaded: 4,286,717 records, 816 vessels
Loading 2017-09...
  Loaded: 7,622,161 records, 1135 vessels
Loading 2017-06...
  Loaded: 8,107,297 records, 1119 vessels
Loading 2018-03...
  Loaded: 6,338,935 records, 795 vessels
Loading 2018-01...
  Loaded: 5,458,182 records, 659 vessels
Loading 2018-07...
  Loaded: 13,813,130 records, 1344 vessels
Loading 2019-02...
  Loaded: 6,930,667 records, 874 vessels
Loading 2019-05...
  Loaded: 7,015,268 records, 1175 vessels
Loading 2019-04...
  Loaded: 7,139,650 records, 1055 vessels

Total months loaded: 9

Dataset Summary:
  Total records: 66,712,007
  Unique vessels: 4159
  Time range: 2017-05-31 21:00:00+00:00 to 2019-05-30 20:59:59+00:00
  Columns: ['timestamp', 'vessel_id', 'lon', 'lat', 'speed', 'course', 'heading']


In [3]:
def ais_month_generator(year, month, root, chunk_size=500_000):
    folder = Path(root) / f"unipi_ais_dynamic_{year}"
    fname = folder / f"unipi_ais_dynamic_{MONTH_ABBR[month]}{year}.csv"

    preview = pd.read_csv(fname, nrows=1)
    selected_cols = [c for c in cols_alias if c in preview.columns]

    for chunk in pd.read_csv(fname, usecols=selected_cols, chunksize=chunk_size):
        if "t" in chunk.columns and "timestamp" not in chunk.columns:
            chunk = chunk.rename(columns={"t": "timestamp"})
        chunk["timestamp"] = pd.to_datetime(chunk["timestamp"], unit="ms", utc=True)
        chunk = chunk[[c for c in cols_primary if c in chunk.columns]]
        yield chunk


In [4]:
import torch
from torch.utils.data import IterableDataset, DataLoader
from tqdm import tqdm
import numpy as np

class AISDataset(IterableDataset):
    def __init__(self, data_periods, root, window_size=128):
        """
        Iterable dataset for AIS data sliding windows.

        Returns: (window_tensor, vessel_id)
        window_tensor shape: (window_size, 5) -> [lat, lon, speed, course, timestamp]
        vessel_id: string
        """
        self.data_periods = data_periods
        self.root = root
        self.window_size = window_size

    def __iter__(self):
        for year, months in self.data_periods:
            for month in months:
                for chunk in ais_month_generator(year, month, self.root):
                    for vessel_id, df_v in chunk.groupby("vessel_id"):
                        df_v = df_v.sort_values("timestamp")

                        # Extract numeric features
                        values = df_v[["lat", "lon", "speed", "course"]].values.astype(np.float32)

                        # Convert timestamps to float seconds
                        timestamps = (df_v["timestamp"].astype("int64") // 10**9).values.astype(np.float32)
                        timestamps = timestamps.reshape(-1, 1)

                        # Combine into sliding window array
                        window_data = np.hstack([values, timestamps])

                        # Yield sliding windows along with vessel_id
                        for i in range(len(window_data) - self.window_size):
                            yield torch.tensor(window_data[i:i+self.window_size], dtype=torch.float32), vessel_id

# Loader
loader = DataLoader(
    AISDataset(DATA_PERIODS, data_root, window_size=128),
    batch_size=64,        # number of windows per batch
    num_workers=0         # single-threaded safe for IterableDataset
)

Temporal coverage

Each “window” covers 128 consecutive AIS messages for a vessel.

If AIS messages are sent, say, every 1–2 minutes, then 128 messages ≈ 2–4 hours of movement.

If anomalies occur over longer time periods (e.g., abnormal route over a day), 128 may be too short.

In [37]:
# Columns in window tensor
check_cols = ["lat", "lon", "speed", "course", "timestamp"]

# Stats holders
missing_counts = {c: 0 for c in check_cols}
speed_violations = 0
course_violations = 0
duplicate_timestamps = 0
negative_time_diff = 0
stationary_vessels = set()
total_windows = 0

# Iterate through loader
for batch, vessel_ids in tqdm(loader, desc="Checking raw AIS data"):
    # batch shape: (batch_size, window_size, features)
    batch = batch.cpu()
    batch_size, window_size, n_features = batch.shape
    total_windows += batch_size

    # Check NaNs
    for i, col in enumerate(check_cols):
        missing_counts[col] += torch.isnan(batch[:, :, i]).sum().item()

    # Speed violations (>100 or <0)
    speed_idx = check_cols.index("speed")
    speed_violations += ((batch[:, :, speed_idx] > 100) |
                         (batch[:, :, speed_idx] < 0)).sum().item()

    # Course violations (>360 or <0)
    course_idx = check_cols.index("course")
    course_violations += ((batch[:, :, course_idx] > 360) |
                          (batch[:, :, course_idx] < 0)).sum().item()

    # Time differences and duplicates
    ts_idx = check_cols.index("timestamp")
    for b in range(batch_size):
        dt = batch[b, 1:, ts_idx] - batch[b, :-1, ts_idx]
        negative_time_diff += (dt < 0).sum().item()
        duplicate_timestamps += (dt == 0).sum().item()

        # Stationary vessel check (<0.5 knots)
        if (batch[b, :, speed_idx] < 0.5).all():
            stationary_vessels.add(vessel_ids[b])

# Print summary
print("\n=== Pre-Cleaning AIS Data Summary ===")
print(f"Total windows processed: {total_windows:,}")
for col in check_cols:
    print(f"Missing values in {col}: {missing_counts[col]}")
print(f"Speed violations (>100 or <0 knots): {speed_violations}")
print(f"Course violations (>360 or <0 degrees): {course_violations}")
print(f"Duplicate timestamps: {duplicate_timestamps}")
print(f"Negative time differences: {negative_time_diff}")
print(f"Stationary vessels detected: {len(stationary_vessels)}")

Checking raw AIS data: 301474it [43:29, 115.53it/s]


KeyboardInterrupt: 

KeyboardInterrupt: was performed as we had already checked a substantial amount of records close to 19 M records which 9 percent subset of the data.

In [38]:
print("\n=== Pre-Cleaning AIS Data Summary ===")
print(f"Total windows processed: {total_windows:,}")
for col in check_cols:
    print(f"Missing values in {col}: {missing_counts[col]}")
print(f"Speed violations (>100 or <0 knots): {speed_violations}")
print(f"Course violations (>360 or <0 degrees): {course_violations}")
print(f"Duplicate timestamps: {duplicate_timestamps}")
print(f"Negative time differences: {negative_time_diff}")
print(f"Stationary vessels detected: {len(stationary_vessels)}")


=== Pre-Cleaning AIS Data Summary ===
Total windows processed: 19,294,400
Missing values in lat: 0
Missing values in lon: 0
Missing values in speed: 13913925
Missing values in course: 251599680
Missing values in timestamp: 0
Speed violations (>100 or <0 knots): 97400
Course violations (>360 or <0 degrees): 0
Duplicate timestamps: 2018963744
Negative time differences: 0
Stationary vessels detected: 975


In [22]:
for batch in loader:
    batch = batch.to(device)
    print(batch.shape)
    break
    # loss = model(batch)
    # loss.backward()

Vessel 002351f7584dcb3b6ab87557073727eadd310a71e141d35646da5a7e3c577ee0 has 1002 records
Vessel 00f122d09f0a9d8c26eb19e9abcd6ba3463df344117f5631d33ecf1b2e34f94f has 351 records
Vessel 002351f7584dcb3b6ab87557073727eadd310a71e141d35646da5a7e3c577ee0 has 598 records
Vessel 002351f7584dcb3b6ab87557073727eadd310a71e141d35646da5a7e3c577ee0 has 853 records
Vessel 00f122d09f0a9d8c26eb19e9abcd6ba3463df344117f5631d33ecf1b2e34f94f has 353 records
Vessel 001680b0d55b053ae6558a0ce257e717dfd188fd98e6c2cd3b5d4db5c7ae5a63 has 212 records
Vessel 002351f7584dcb3b6ab87557073727eadd310a71e141d35646da5a7e3c577ee0 has 441 records
Vessel 002351f7584dcb3b6ab87557073727eadd310a71e141d35646da5a7e3c577ee0 has 422 records
Vessel 00c7725d1ff8ff36fc4091b1aef03261fb1a9e38b11f20d5604fb5607a94a4f2 has 1 records
Vessel 001680b0d55b053ae6558a0ce257e717dfd188fd98e6c2cd3b5d4db5c7ae5a63 has 763 records
Vessel 001680b0d55b053ae6558a0ce257e717dfd188fd98e6c2cd3b5d4db5c7ae5a63 has 857 records
Vessel 002351f7584dcb3b6ab8755707

----

In [5]:
CLEANED_WINDOWS = []
speed_idx = 2
course_idx = 3

In [None]:
for batch, vessel_ids in tqdm(loader, desc="Cleaning AIS data"):
    batch = batch.cpu()

    valid_mask = (
        ~torch.isnan(batch[:, :, 2]).any(dim=1) &
        ~torch.isnan(batch[:, :, 3]).any(dim=1)
    )

    valid_idx = valid_mask.nonzero(as_tuple=True)[0]

    if len(valid_idx) == 0:
        continue

    batch = batch[valid_idx]
    vessel_ids = [vessel_ids[i] for i in valid_idx.tolist()]

    # Clamp
    batch[:, :, 2] = batch[:, :, 2].clamp(0, 100)
    batch[:, :, 3] = batch[:, :, 3].clamp(0, 360)

    CLEANED_WINDOWS.append({
        "windows": batch,
        "vessel_ids": vessel_ids
    })


Cleaning AIS data: 283052it [07:46, 171.52it/s] 

In [None]:
torch.save(CLEANED_WINDOWS, "cleaned_ais_windows.pt")