## parquet paths

In [17]:
!ls ../dataset/piraeus/

107782 - The Piraeus AIS Dataset for Large-Scale Maritime Data Analytics.pdf
ais_augmented.parquet
ais_cleaned.parquet
ais_loiter.parquet
ais_loiter_pair.parquet
ais_static
geodata
models
noaa_weather
parquet
processed
sar
unipi_ais_dynamic_2017
unipi_ais_dynamic_2018
unipi_ais_dynamic_2019
unipi_ais_dynamic_synopses


In [10]:
!ls ../dataset/piraeus/unipi_ais_dynamic_2017

README.md
unipi_ais_dynamic_aug2017.csv
unipi_ais_dynamic_dec2017.csv
unipi_ais_dynamic_jul2017.csv
unipi_ais_dynamic_jun2017.csv
unipi_ais_dynamic_may2017.csv
unipi_ais_dynamic_nov2017.csv
unipi_ais_dynamic_oct2017.csv
unipi_ais_dynamic_sep2017.csv


In [11]:
!ls ../dataset/piraeus/unipi_ais_dynamic_2018

README.md
unipi_ais_dynamic_apr2018.csv
unipi_ais_dynamic_aug2018.csv
unipi_ais_dynamic_dec2018.csv
unipi_ais_dynamic_feb2018.csv
unipi_ais_dynamic_jan2018.csv
unipi_ais_dynamic_jul2018.csv
unipi_ais_dynamic_jun2018.csv
unipi_ais_dynamic_mar2018.csv
unipi_ais_dynamic_may2018.csv
unipi_ais_dynamic_nov2018.csv
unipi_ais_dynamic_oct2018.csv
unipi_ais_dynamic_sep2018.csv


In [12]:
!ls ../dataset/piraeus/unipi_ais_dynamic_2019

README.md
unipi_ais_dynamic_apr2019.csv
unipi_ais_dynamic_aug2019.csv
unipi_ais_dynamic_dec2019.csv
unipi_ais_dynamic_feb2019.csv
unipi_ais_dynamic_jan2019.csv
unipi_ais_dynamic_jul2019.csv
unipi_ais_dynamic_jun2019.csv
unipi_ais_dynamic_mar2019.csv
unipi_ais_dynamic_may2019.csv
unipi_ais_dynamic_nov2019.csv
unipi_ais_dynamic_oct2019.csv
unipi_ais_dynamic_sep2019.csv


In [13]:
import pandas, pyarrow
print(pandas.__version__)
print(pyarrow.__version__)

2.3.3
23.0.0


In [30]:
from tqdm import tqdm

In [14]:
import pyarrow.parquet as pq

pq_file = pq.ParquetFile("unipi_ais_dynamic_may2017.parquet")

# Suppose row groups are ~500k rows each
row_group_index = 6  # 6*500k = 3Mth row
table = pq_file.read_row_group(row_group_index)

df_chunk = table.to_pandas()  # Only this row group in memory
row = df_chunk.iloc[0]  # Approx 3Mth row
print(row)


t                                                1494345047000
vessel_id    b0b2bd45bbb8911fbea20744b0e8b98bbb0e76f6c3af37...
lat                                                  37.929298
lon                                                  23.682772
heading                                                   30.0
speed                                                      0.0
course                                                   170.0
Name: 0, dtype: object


## Random Access

In [15]:
import platform; print(platform.architecture()); import sys; print(sys.version)

('64bit', 'WindowsPE')
3.10.19 | packaged by conda-forge | (main, Jan 26 2026, 23:39:36) [MSC v.1944 64 bit (AMD64)]


# Clustering based on TRACLUS

## Core data structures

In [18]:
from dataclasses import dataclass
import numpy as np

@dataclass
class Point:
    """
    AIS point.
    """
    x: float
    y: float
    t: int

@dataclass
class Segment:
    """
    Line segment between two points.
    """
    p1: Point
    p2: Point


## Trajectory construction

In [31]:
def build_trajectories(df):
    """
    Group AIS points into trajectories.
    """
    trajectories = {}
    for vid, g in df.groupby("vessel_id"):
        g = g.sort_values("t")
        pts = [Point(r.lon, r.lat, r.t) for r in g.itertuples()]
        if len(pts) >= 2:
            trajectories[vid] = pts
    return trajectories


## MDL-based trajectory partitioning

In [42]:
def mdl_cost(points, i, j):
    """
    MDL cost between points i and j.
    """
    pi, pj = points[i], points[j]
    length = np.hypot(pj.x - pi.x, pj.y - pi.y)
    return np.log2(length + 1e-9)

def partition_trajectory(points):
    """
    Partition trajectory using MDL principle.
    """
    segments = []
    start = 0
    for i in tqdm(range(2, len(points)), desc="partitioning"):
        cost_no_split = mdl_cost(points, start, i)
        cost_split = mdl_cost(points, start, i - 1) + mdl_cost(points, i - 1, i)
        if cost_split < cost_no_split:
            segments.append(Segment(points[start], points[i - 1]))
            start = i - 1
    segments.append(Segment(points[start], points[-1]))
    return segments


## Segment distance function

In [38]:
def segment_distance(s1, s2):
    """
    TRACLUS distance: perpendicular + parallel + angular.
    """
    def vec(s):
        return np.array([s.p2.x - s.p1.x, s.p2.y - s.p1.y])

    v1, v2 = vec(s1), vec(s2)
    ang = np.arccos(
        np.clip(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)), -1, 1)
    )

    mid1 = np.array([(s1.p1.x + s1.p2.x) / 2, (s1.p1.y + s1.p2.y) / 2])
    mid2 = np.array([(s2.p1.x + s2.p2.x) / 2, (s2.p1.y + s2.p2.y) / 2])

    return np.linalg.norm(mid1 - mid2) + ang


## Line-segment clustering

In [43]:
def cluster_segments(segments, eps, min_samples):
    """
    DBSCAN-style clustering for line segments.
    """
    labels = [-1] * len(segments)
    cid = 0

    for i, s in enumerate(tqdm(segments, desc="clustering")):
        if labels[i] != -1:
            continue
        neighbors = [
            j for j, s2 in enumerate(segments)
            if segment_distance(s, s2) < eps
        ]
        if len(neighbors) < min_samples:
            continue

        for j in neighbors:
            labels[j] = cid
        cid += 1

    return labels


# Collecting segments from a chunk

In [44]:
def extract_segments(df):
    """
    Build and partition trajectories into line segments.
    """
    trajectories = build_trajectories(df)
    segments = []
    for pts in trajectories.values():
        segments.extend(partition_trajectory(pts))
    return segments


In [45]:
segments = extract_segments(df_chunk)

labels = cluster_segments(
    segments=segments,
    eps=0.02,
    min_samples=10
)


Exception ignored in: <function tqdm.__del__ at 0x000001826C90D000>
Traceback (most recent call last):
  File "c:\ProgramData\anaconda3\envs\rapids-23.12\lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\ProgramData\anaconda3\envs\rapids-23.12\lib\site-packages\tqdm\std.py", line 1277, in close
    if self.last_print_t < self.start_t + self.delay:
AttributeError: 'tqdm' object has no attribute 'last_print_t'
partitioning: 100%|██████████| 439/439 [00:00<00:00, 219542.08it/s]
partitioning: 100%|██████████| 12/12 [00:00<?, ?it/s]
partitioning: 100%|██████████| 1841/1841 [00:00<00:00, 262965.32it/s]
partitioning: 100%|██████████| 143/143 [00:00<00:00, 143078.60it/s]
partitioning: 100%|██████████| 32/32 [00:00<?, ?it/s]
partitioning: 100%|██████████| 1529/1529 [00:00<00:00, 216519.49it/s]
partitioning: 100%|██████████| 189/189 [00:00<00:00, 190238.41it/s]
partitioning: 100%|██████████| 2/2 [00:00<?, ?it/s]
partitioning: 100%|██████████| 437/437 [00:00<00:00,

KeyboardInterrupt: 

So basically the above program was stopped as time (753:00:57) would be close to ≈ 30 days, which isn't practical.