# Environment & Data Check Notebook
This notebook verifies the Python environment, demonstrates simple data and visualization tasks, and then labels AIS data around known maritime incidents (incident-based anomaly labeling).

In [47]:
# Verify environment configuration
import platform
import sys
import importlib

# Report the interpreter and operating system this kernel runs on.
print(f"Python version: {sys.version.split()[0]}")
print(f"Platform: {platform.system()} {platform.release()} ({platform.version()})")

# Report each core package's version, or flag the package as missing.
for pkg in ("numpy", "pandas", "matplotlib", "torch"):
    try:
        mod = importlib.import_module(pkg)
    except ImportError:
        print(f"{pkg}: not installed")
    else:
        # Packages that do not expose __version__ are reported as "?".
        version = getattr(mod, "__version__", "?")
        print(f"{pkg}: {version}")

# Optional: check basic GPU/acceleration status via torch if present
try:
    import torch

    cuda_available = torch.cuda.is_available()
    print(f"torch.cuda.is_available: {cuda_available}")
    if cuda_available:
        print(f"CUDA device count: {torch.cuda.device_count()}")
        print(f"CUDA device name: {torch.cuda.get_device_name(0)}")
except Exception as e:
    # Best-effort probe: a missing or broken torch must not abort the report.
    print(f"GPU check skipped: {e}")

Python version: 3.12.4
Platform: Windows 11 (10.0.26100)
numpy: 1.26.4
pandas: 2.3.3
matplotlib: 3.10.7
torch: 2.9.1+cu126
torch.cuda.is_available: True
CUDA device count: 1
CUDA device name: NVIDIA GeForce RTX 4060 Laptop GPU


In [48]:
# Import core libraries
try:
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    print("Imports successful: numpy, pandas, matplotlib")
except ImportError as e:
    print(f"ImportError: {e}")
    raise

# Set matplotlib inline backend if in IPython/Jupyter
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    # get_ipython is not defined in this namespace, so there is no inline
    # backend to configure (presumably a plain Python run rather than IPython).
    pass
except Exception as exc:
    # Still best-effort, but say why the backend was not enabled instead of
    # hiding the failure.
    print(f"Could not enable matplotlib inline backend: {exc}")

Imports successful: numpy, pandas, matplotlib


# Incident-based anomaly labeling
This section sets up known incident metadata (Piraeus/Saronic Gulf 2017–2019), helper utilities, and stubs for slicing AIS data around incidents to produce labeled trajectories.

In [49]:
# Configuration and incident catalog
from datetime import datetime

# Root folders (edit to your actual paths)
# data_root is the default `root` of load_month(), which appends the
# per-year folder name itself.
data_root = "dataset/piraeus"
# NOTE(review): ais_dynamic_root is not referenced by any other cell visible
# in this notebook -- confirm whether it is still needed.
ais_dynamic_root = f"{data_root}/unipi_ais_dynamic_2018"  # change year as needed
# Slices are written to <output_root>/<SLUG>/slice.parquet and the summary to
# <output_root>/labels_summary.csv.
output_root = "labeled_incidents"

# Inline incident catalog; refine times/coords as you verify
# Fields read by the slicing cells:
#   name        - display name; also normalized into the output folder slug
#   date_utc    - "YYYY-MM-DD"; the time window is centered on noon UTC of this date
#   approx_lat / approx_lon - center of the bounding box (degrees)
#   source      - citation copied into the labels summary
# `location` and `type` are descriptive only; no visible cell reads them.
incidents = [
    {
        "name": "Agia Zoni II",
        "date_utc": "2017-09-10",
        "approx_lat": 37.93,
        "approx_lon": 23.52,
        "location": "Salamina WSW of Piraeus",
        "type": "sinking",
        "source": "Reuters KCN1BQ1FF; Maritime Executive"
    },
    {
        "name": "FlyingCat 4",
        "date_utc": "2018-08-29",
        "approx_lat": 37.744,
        "approx_lon": 23.427,
        "location": "Aegina pier strike",
        "type": "allision",
        "source": "MarineLink 443405"
    },
    {
        "name": "Flying Dolphin XVII",
        "date_utc": "2018-09-05",
        "approx_lat": 37.493,
        "approx_lon": 23.453,
        "location": "Poros grounding",
        "type": "grounding",
        "source": "GTP 2018-09-05"
    },
    {
        "name": "Salamina Ferry Collision",
        "date_utc": "2019-01-03",
        "approx_lat": 37.964,
        "approx_lon": 23.488,
        "location": "Salamina harbor pier",
        "type": "collision",
        "source": "Ekathimerini news/236210"
    },
    {
        "name": "Sea Star Piraeus Pier Allision",
        "date_utc": "2019-04-26",
        "approx_lat": 37.940,
        "approx_lon": 23.623,
        "location": "Piraeus Pier II",
        "type": "allision",
        "source": "Maritime Bulletin 2019/04/26"
    },
]

incidents

[{'name': 'Agia Zoni II',
  'date_utc': '2017-09-10',
  'approx_lat': 37.93,
  'approx_lon': 23.52,
  'location': 'Salamina WSW of Piraeus',
  'type': 'sinking',
  'source': 'Reuters KCN1BQ1FF; Maritime Executive'},
 {'name': 'FlyingCat 4',
  'date_utc': '2018-08-29',
  'approx_lat': 37.744,
  'approx_lon': 23.427,
  'location': 'Aegina pier strike',
  'type': 'allision',
  'source': 'MarineLink 443405'},
 {'name': 'Flying Dolphin XVII',
  'date_utc': '2018-09-05',
  'approx_lat': 37.493,
  'approx_lon': 23.453,
  'location': 'Poros grounding',
  'type': 'grounding',
  'source': 'GTP 2018-09-05'},
 {'name': 'Salamina Ferry Collision',
  'date_utc': '2019-01-03',
  'approx_lat': 37.964,
  'approx_lon': 23.488,
  'location': 'Salamina harbor pier',
  'type': 'collision',
  'source': 'Ekathimerini news/236210'},
 {'name': 'Sea Star Piraeus Pier Allision',
  'date_utc': '2019-04-26',
  'approx_lat': 37.94,
  'approx_lon': 23.623,
  'location': 'Piraeus Pier II',
  'type': 'allision',
  'so

In [50]:
# Helper utilities: name normalization, distance, time window
import math
import re
from typing import Tuple
import pandas as pd

_punct_re = re.compile(r"[\.,/\\;:'\"`~!@#$%^&*()\-_=+\[\]{}|<>?]")
_space_re = re.compile(r"\s+")

# Vessel-type prefixes written the way they look AFTER punctuation has been
# replaced by spaces: "M/V" and "M.V." both arrive at the prefix check as "M V".
_NAME_PREFIXES = ("MV ", "M V ", "MT ", "M T ", "MS ", "M S ", "SS ", "R V ")


def normalize_name(name: str) -> str:
    """Normalize a vessel/incident name for matching and slug building.

    Steps: upper-case, replace punctuation with spaces, collapse whitespace
    runs to one space, trim, then drop at most one leading vessel-type prefix
    (MV, M/V, MT, M/T, MS, M/S, SS, R/V).

    Args:
        name: Raw name, or None.

    Returns:
        The normalized name; "" when `name` is None.
    """
    if name is None:
        return ""
    s = name.upper()
    s = _punct_re.sub(" ", s)
    # Trim before the prefix check: leading punctuation such as "(MV)" would
    # otherwise leave a space in front and hide the prefix.
    s = _space_re.sub(" ", s).strip()
    for prefix in _NAME_PREFIXES:
        if s.startswith(prefix):
            s = s[len(prefix):]
            break
    return s.strip()


def haversine_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Return the great-circle distance in kilometres between two points.

    Inputs are latitude/longitude in degrees; the haversine formula is
    evaluated with an Earth radius of 6371.0 km.
    """
    earth_radius_km = 6371.0
    phi_a = math.radians(lat1)
    phi_b = math.radians(lat2)
    delta_phi = phi_b - phi_a
    delta_lambda = math.radians(lon2 - lon1)

    # Haversine of the central angle between the two points.
    lat_term = math.sin(delta_phi / 2) ** 2
    lon_term = math.cos(phi_a) * math.cos(phi_b) * math.sin(delta_lambda / 2) ** 2
    hav = lat_term + lon_term

    return 2 * earth_radius_km * math.asin(math.sqrt(hav))


def in_time_window(df: pd.DataFrame, center_ts: pd.Timestamp, hours: float = 3.0) -> pd.DataFrame:
    """Keep the rows whose "timestamp" lies within +/- `hours` of `center_ts`.

    Both window edges are inclusive. The input frame is not modified; the
    selected rows keep their original index.
    """
    half_width = pd.Timedelta(hours=hours)
    window_start = center_ts - half_width
    window_end = center_ts + half_width

    stamps = df["timestamp"]
    inside = (stamps >= window_start) & (stamps <= window_end)
    return df[inside]


In [51]:
# AIS loading with filename pattern used in dataset; handles 'timestamp' or 't' column
import pandas as pd
from pathlib import Path

# Columns kept in the loaded frame, in output order.
cols_primary = ["timestamp", "vessel_id", "lon", "lat", "speed", "course", "heading"]
# Columns requested from the CSV: the primary set plus "t", the alternative
# name of the time column.
cols_alias = ["t"] + cols_primary

# Month number -> lowercase three-letter abbreviation used in the file names.
MONTH_ABBR = {
    number: abbr
    for number, abbr in enumerate(
        "jan feb mar apr may jun jul aug sep oct nov dec".split(), start=1
    )
}

def load_month(year: int, month: int, root: str = data_root, chunk_size: int = 500_000):
    """Load one month of dynamic AIS records from CSV into a single DataFrame.

    The file is looked up in ``<root>/unipi_ais_dynamic_<year>/``, first as
    ``unipi_ais_dynamic_<mon><year>.csv`` (``<mon>`` taken from MONTH_ABBR),
    then as ``unipi_ais_dynamic_<year>_<MM>.csv``.

    Args:
        year: Year used in the folder and file names.
        month: Month number; must be a key of MONTH_ABBR.
        root: Dataset root folder.
        chunk_size: Number of rows read per chunk.

    Returns:
        A DataFrame holding those of `cols_primary` that exist in the file, in
        that order. "timestamp" is parsed as epoch milliseconds into tz-aware
        UTC datetimes; a time column named "t" is renamed to "timestamp".

    Raises:
        KeyError: `month` is not in MONTH_ABBR, or the file has neither a
            "timestamp" nor a "t" column.
        FileNotFoundError: neither file name exists.
    """
    folder = Path(root) / f"unipi_ais_dynamic_{year}"
    fname = folder / f"unipi_ais_dynamic_{MONTH_ABBR[month]}{year}.csv"
    if not fname.exists():
        alt = folder / f"unipi_ais_dynamic_{year}_{month:02d}.csv"
        if alt.exists():
            fname = alt
        else:
            raise FileNotFoundError(f"Missing file: {fname} (or {alt})")

    # Discover available columns to avoid usecols mismatch (2017/2019 use 't' instead of 'timestamp').
    # nrows=0 reads only the header line; no data row needs to be parsed for this.
    available = list(pd.read_csv(fname, nrows=0).columns)
    selected_cols = [c for c in cols_alias if c in available]

    has_timestamp = "timestamp" in selected_cols
    if not has_timestamp and "t" not in selected_cols:
        # Fail here with the file name instead of a bare KeyError('timestamp')
        # from inside the chunk loop.
        raise KeyError(f"No 'timestamp' or 't' column in {fname}; found columns: {available}")

    chunks = []
    for chunk in pd.read_csv(fname, usecols=selected_cols, chunksize=chunk_size):
        if not has_timestamp:
            chunk = chunk.rename(columns={"t": "timestamp"})
        chunk["timestamp"] = pd.to_datetime(chunk["timestamp"], unit="ms", utc=True)
        # Ensure consistent column order (also drops "t" when both names exist)
        chunk = chunk[[c for c in cols_primary if c in chunk.columns]]
        chunks.append(chunk)
    return pd.concat(chunks, ignore_index=True)

# Example (commented to avoid heavy load):
# df_aug18 = load_month(2018, 8, root=data_root)
# df_aug18.head()

In [52]:
# Slice AIS around incidents and save
from pathlib import Path

bbox_padding = 0.2  # degrees
hours_window = 3.0

Path(output_root).mkdir(parents=True, exist_ok=True)

slices = []

# Example: assuming you loaded df_aug18 = load_month(2018, 8, data_root)
# Replace df_source with the appropriate month DataFrame per incident.
df_source = None  # set to your loaded monthly DataFrame

for inc in incidents:
    # Map incident date to timestamp center (assume noon UTC as placeholder)
    center_ts = pd.Timestamp(inc["date_utc"] + " 12:00:00", tz="UTC")
    lat0, lon0 = inc["approx_lat"], inc["approx_lon"]

    # Nothing can be sliced until a monthly DataFrame is assigned to df_source.
    if df_source is not None:
        # Spatial filter first (square box around the approximate position),
        # then the time window around the assumed incident time.
        lat_in_box = df_source["lat"].between(lat0 - bbox_padding, lat0 + bbox_padding)
        lon_in_box = df_source["lon"].between(lon0 - bbox_padding, lon0 + bbox_padding)
        df_filtered = df_source[lat_in_box & lon_in_box]
        df_filtered = in_time_window(df_filtered, center_ts, hours=hours_window)

        if not df_filtered.empty:
            # Folder name derived from the incident name, e.g. "AGIA_ZONI_II".
            slug = re.sub(r"[^A-Z0-9]+", "_", normalize_name(inc["name"]))
            out_dir = Path(output_root) / slug
            out_dir.mkdir(parents=True, exist_ok=True)
            out_path = out_dir / "slice.parquet"
            df_filtered.to_parquet(out_path, index=False)

            slices.append({
                "incident": inc["name"],
                "slug": slug,
                "rows": len(df_filtered),
                "file": str(out_path)
            })

slices

[]

In [53]:
# Build labels summary from slices
labels_df = pd.DataFrame(slices)
if not labels_df.empty:
    # Look up each incident's citation by name so it is stored next to its label.
    source_by_name = {entry["name"]: entry["source"] for entry in incidents}
    labels_df = labels_df.assign(
        anomaly_label="incident_match",
        source=labels_df["incident"].map(source_by_name),
    )
    summary_path = Path(output_root) / "labels_summary.csv"
    labels_df.to_csv(summary_path, index=False)
labels_df

In [54]:
# Quick visualization stub (optional)
try:
    import matplotlib.pyplot as plt

    # Only the first slice is previewed; with no slices there is nothing to read.
    sample = None
    if slices:
        first_file = slices[0]["file"]
        sample = pd.read_parquet(first_file)

    if sample is None or sample.empty:
        print("No slice loaded yet; run slicing after loading AIS data.")
    else:
        # One dot per AIS position report in the slice.
        plt.figure(figsize=(5, 4))
        plt.scatter(sample["lon"], sample["lat"], s=4, alpha=0.5)
        plt.title(f"Incident slice: {slices[0]['incident']}")
        plt.xlabel("Longitude")
        plt.ylabel("Latitude")
        plt.tight_layout()
        plt.show()
except Exception as e:
    # Best-effort preview: report the reason instead of failing the notebook.
    print(f"Visualization skipped: {e}")


No slice loaded yet; run slicing after loading AIS data.


# Automated slicing per incident (year/month inferred from date_utc)
This cell loads the needed month for each incident (one at a time) and writes slices. Adjust `bbox_padding`, `hours_window`, or add a `time_utc` field to incidents if you know the exact hour.

In [55]:
# Auto-slice incidents by loading the corresponding month
from collections import defaultdict

# Cache of loaded monthly frames, keyed by (year, month). It holds at most one
# month at a time so memory stays bounded while iterating over incidents.
loaded_months = {}


def get_df_for_incident(inc):
    """Return the monthly AIS DataFrame covering an incident's date.

    Args:
        inc: Incident dict with a "date_utc" entry parseable by pd.to_datetime.

    Returns:
        The DataFrame produced by load_month for the incident's year and month.
        Consecutive incidents in the same month reuse the cached frame.

    Raises:
        FileNotFoundError: propagated from load_month when the month's file is
            missing.
    """
    dt = pd.to_datetime(inc["date_utc"], utc=True)
    key = (dt.year, dt.month)
    if key not in loaded_months:
        # Drop the previously cached month BEFORE loading the next one, so the
        # cache itself never holds two monthly frames at once.
        loaded_months.clear()
        loaded_months[key] = load_month(dt.year, dt.month, root=data_root)
    return loaded_months[key]

bbox_padding = 0.2  # degrees
hours_window = 3.0

Path(output_root).mkdir(parents=True, exist_ok=True)

slices = []

for inc in incidents:
    dt = pd.to_datetime(inc["date_utc"], utc=True)
    # Center the window on the known time when the incident carries an optional
    # "time_utc" entry (e.g. "14:30" or "14:30:00"); otherwise assume noon UTC.
    # NOTE: only the month of date_utc is loaded, so a window that crosses a
    # month boundary is truncated at that boundary.
    time_utc = inc.get("time_utc")
    if time_utc:
        center_ts = pd.to_datetime(f"{inc['date_utc']} {time_utc}", utc=True)
    else:
        center_ts = dt + pd.Timedelta(hours=12)
    lat0, lon0 = inc["approx_lat"], inc["approx_lon"]

    try:
        df_src = get_df_for_incident(inc)
    except FileNotFoundError as e:
        print(f"Skipping {inc['name']}: {e}")
        continue

    # Spatial filter (square box in degrees around the approximate position),
    # then the time window around center_ts.
    df_filtered = df_src[
        (df_src["lat"].between(lat0 - bbox_padding, lat0 + bbox_padding)) &
        (df_src["lon"].between(lon0 - bbox_padding, lon0 + bbox_padding))
    ]
    df_filtered = in_time_window(df_filtered, center_ts, hours=hours_window)

    if df_filtered.empty:
        print(f"No data for {inc['name']} in bbox/time window; consider widening.")
        continue

    # Folder name derived from the incident name, e.g. "AGIA_ZONI_II".
    slug = re.sub(r"[^A-Z0-9]+", "_", normalize_name(inc["name"]))
    out_dir = Path(output_root) / slug
    out_dir.mkdir(parents=True, exist_ok=True)
    out_path = out_dir / "slice.parquet"
    df_filtered.to_parquet(out_path, index=False)

    slices.append({
        "incident": inc["name"],
        "slug": slug,
        "rows": len(df_filtered),
        "file": str(out_path)
    })
    print(f"Saved {len(df_filtered)} rows for {inc['name']} -> {out_path}")

# One summary row per saved slice, tagged with the label and the incident's source.
labels_df = pd.DataFrame(slices)
if not labels_df.empty:
    labels_df["anomaly_label"] = "incident_match"
    labels_df["source"] = labels_df["incident"].map({inc["name"]: inc["source"] for inc in incidents})
    labels_df.to_csv(Path(output_root) / "labels_summary.csv", index=False)
labels_df

Saved 76142 rows for Agia Zoni II -> labeled_incidents\AGIA_ZONI_II\slice.parquet
Saved 203088 rows for FlyingCat 4 -> labeled_incidents\FLYINGCAT_4\slice.parquet
Saved 17 rows for Flying Dolphin XVII -> labeled_incidents\FLYING_DOLPHIN_XVII\slice.parquet
Saved 73080 rows for Salamina Ferry Collision -> labeled_incidents\SALAMINA_FERRY_COLLISION\slice.parquet
Saved 55811 rows for Sea Star Piraeus Pier Allision -> labeled_incidents\SEA_STAR_PIRAEUS_PIER_ALLISION\slice.parquet


Unnamed: 0,incident,slug,rows,file,anomaly_label,source
0,Agia Zoni II,AGIA_ZONI_II,76142,labeled_incidents\AGIA_ZONI_II\slice.parquet,incident_match,Reuters KCN1BQ1FF; Maritime Executive
1,FlyingCat 4,FLYINGCAT_4,203088,labeled_incidents\FLYINGCAT_4\slice.parquet,incident_match,MarineLink 443405
2,Flying Dolphin XVII,FLYING_DOLPHIN_XVII,17,labeled_incidents\FLYING_DOLPHIN_XVII\slice.pa...,incident_match,GTP 2018-09-05
3,Salamina Ferry Collision,SALAMINA_FERRY_COLLISION,73080,labeled_incidents\SALAMINA_FERRY_COLLISION\sli...,incident_match,Ekathimerini news/236210
4,Sea Star Piraeus Pier Allision,SEA_STAR_PIRAEUS_PIER_ALLISION,55811,labeled_incidents\SEA_STAR_PIRAEUS_PIER_ALLISI...,incident_match,Maritime Bulletin 2019/04/26


In [56]:
# Sanity check slices: time range, vessel count, bbox
from pathlib import Path
import pandas as pd

def _summarize_slice(entry):
    """Read one saved slice and return its row count, vessel count, time range and lat/lon extent."""
    frame = pd.read_parquet(entry["file"])
    stamps, lats, lons = frame["timestamp"], frame["lat"], frame["lon"]
    return {
        "incident": entry["incident"],
        "rows": len(frame),
        "vessels": frame["vessel_id"].nunique(),
        "t_min": stamps.min(),
        "t_max": stamps.max(),
        "lat_min": lats.min(),
        "lat_max": lats.max(),
        "lon_min": lons.min(),
        "lon_max": lons.max(),
    }


# One summary row per slice, in the order the slices were written.
checks = [_summarize_slice(s) for s in slices]

pd.DataFrame(checks)

Unnamed: 0,incident,rows,vessels,t_min,t_max,lat_min,lat_max,lon_min,lon_max
0,Agia Zoni II,76142,183,2017-09-10 09:00:00+00:00,2017-09-10 15:00:00+00:00,37.756557,38.035723,23.36206,23.705967
1,FlyingCat 4,203088,91,2018-08-29 09:00:01+00:00,2018-08-29 15:00:00+00:00,37.686173,37.944,23.227102,23.627
2,Flying Dolphin XVII,17,4,2018-09-05 09:00:57+00:00,2018-09-05 13:25:35+00:00,37.549263,37.678367,23.3007,23.646347
3,Salamina Ferry Collision,73080,180,2019-01-03 09:00:00+00:00,2019-01-03 15:00:00+00:00,37.769333,38.035877,23.344167,23.686652
4,Sea Star Piraeus Pier Allision,55811,199,2019-04-26 09:00:00+00:00,2019-04-26 15:00:00+00:00,37.740567,38.035967,23.423628,23.725713


In [57]:
# Visualize a slice on a map (all vessels color-coded)
import hashlib
import itertools

import colorsys  # stdlib; builds one valid hex color per vessel

incident_to_plot = "Agia Zoni II"  # change to any incident name in slices
max_points_per_vessel = 2000  # downsample per vessel for display

try:
    import folium
except ImportError:
    folium = None
    print("folium not installed; install via: pip install folium")


def _distinct_hex_colors(n):
    """Return `n` distinct "#rrggbb" colors with evenly spaced hues.

    Lightness and saturation are fixed at mid-dark values so every line stays
    visible on a light basemap.
    """
    palette = []
    for i in range(n):
        r, g, b = colorsys.hls_to_rgb(i / max(n, 1), 0.45, 0.75)
        palette.append("#{:02x}{:02x}{:02x}".format(round(r * 255), round(g * 255), round(b * 255)))
    return palette


# Find slice file
print(f"Looking for incident: {incident_to_plot}")
print(f"Available slices: {[s['incident'] for s in slices]}")

slice_entry = next((s for s in slices if s["incident"] == incident_to_plot), None)

if slice_entry is None:
    print(f"❌ No slice found for incident: {incident_to_plot}")
elif not Path(slice_entry["file"]).exists():
    print(f"❌ Parquet file missing: {slice_entry['file']}")
else:
    print(f"✓ Found slice: {slice_entry['file']}")
    df_plot = pd.read_parquet(slice_entry["file"]).sort_values("timestamp")
    if "vessel_id" not in df_plot.columns:
        print("❌ vessel_id column missing; cannot color by vessel")
    else:
        vessels = df_plot["vessel_id"].unique().tolist()
        print(f"✓ Loaded {len(df_plot)} points across {len(vessels)} vessels")
        if folium is None:
            print("Folium missing; showing head:")
            display(df_plot.head())
        else:
            # Rows without a position cannot be drawn, and folium rejects NaN
            # coordinates, so drop them before building the map.
            df_map = df_plot.dropna(subset=["lat", "lon"])
            if df_map.empty:
                print("❌ No rows with valid lat/lon to plot")
            else:
                vessels_on_map = df_map["vessel_id"].unique().tolist()
                # One generated hex color per vessel. The earlier fixed list mixed in
                # folium Icon names ("lightred", "darkpurple") that are not CSS colors,
                # and its 18 entries repeated across vessels.
                colors = _distinct_hex_colors(len(vessels_on_map))
                m = folium.Map(location=[df_map["lat"].mean(), df_map["lon"].mean()], zoom_start=11, tiles="CartoDB positron")
                for vid, col in zip(vessels_on_map, colors):
                    sub = df_map[df_map["vessel_id"] == vid]
                    if sub.empty:
                        continue
                    full_track = sub[["lat", "lon"]].values.tolist()
                    # Take the endpoints from the full track: strided downsampling
                    # can skip the final fix, which would misplace the "end" marker.
                    start_pt, end_pt = full_track[0], full_track[-1]
                    coords = full_track
                    if len(full_track) > max_points_per_vessel:
                        coords = full_track[:: max(1, len(full_track) // max_points_per_vessel)]
                        if coords[-1] != end_pt:
                            coords.append(end_pt)
                    folium.PolyLine(coords, color=col, weight=3, opacity=0.7, tooltip=str(vid)).add_to(m)
                    folium.CircleMarker(start_pt, radius=4, color=col, fill=True, popup=f"start {vid}").add_to(m)
                    folium.CircleMarker(end_pt, radius=4, color=col, fill=True, popup=f"end {vid}").add_to(m)
                print("✓ Map ready! Coloring each vessel_id differently.")
                display(m)

Looking for incident: Agia Zoni II
Available slices: ['Agia Zoni II', 'FlyingCat 4', 'Flying Dolphin XVII', 'Salamina Ferry Collision', 'Sea Star Piraeus Pier Allision']
✓ Found slice: labeled_incidents\AGIA_ZONI_II\slice.parquet
✓ Loaded 76142 points across 183 vessels
✓ Map ready! Coloring each vessel_id differently.
