# paths

In [2]:
!ls ../dataset/piraeus

'107782 - The Piraeus AIS Dataset for Large-Scale Maritime Data Analytics.pdf'
 ais_augmented.parquet
 ais_cleaned.parquet
 ais_loiter.parquet
 ais_loiter_pair.parquet
 ais_static
 geodata
 models
 noaa_weather
 parquet_version
 processed
 sar
 unipi_ais_dynamic_2017
 unipi_ais_dynamic_2018
 unipi_ais_dynamic_2019
 unipi_ais_dynamic_synopses


In [3]:
ls ..\dataset\piraeus\unipi_ais_dynamic_2017

ls: cannot access '..datasetpiraeusunipi_ais_dynamic_2017': No such file or directory


In [4]:
import pandas, pyarrow
print(pandas.__version__)
print(pyarrow.__version__)

2.3.3
21.0.0


In [5]:
import pyarrow.parquet as pq

pq_file = pq.ParquetFile("unipi_ais_dynamic_may2017.parquet")

# Suppose row groups are ~500k rows each
row_group_index = 6  # 6*500k = 3Mth row
table = pq_file.read_row_group(row_group_index)

df_chunk = table.to_pandas()  # Only this row group in memory
row = df_chunk.iloc[0]  # Approx 3Mth row
print(row)


t                                                1494345047000
vessel_id    b0b2bd45bbb8911fbea20744b0e8b98bbb0e76f6c3af37...
lat                                                  37.929298
lon                                                  23.682772
heading                                                   30.0
speed                                                      0.0
course                                                   170.0
Name: 0, dtype: object


## Random Access

In [6]:
import pyarrow.parquet as pq
import pandas as pd

# Load Parquet file metadata
pq_file = pq.ParquetFile("unipi_ais_dynamic_may2017.parquet")
num_rows = pq_file.metadata.num_rows
num_row_groups = pq_file.num_row_groups

print(f"Total rows: {num_rows}, Row groups: {num_row_groups}")

# Function to read a row by index
def read_row(row_idx: int) -> pd.Series:
    if row_idx < 0 or row_idx >= num_rows:
        raise IndexError("Row index out of bounds")

    cum_rows = 0
    for group_idx in range(num_row_groups):
        rg_rows = pq_file.metadata.row_group(group_idx).num_rows
        if row_idx < cum_rows + rg_rows:
            local_idx = row_idx - cum_rows
            table = pq_file.read_row_group(group_idx)
            df = table.to_pandas()
            return df.iloc[local_idx]
        cum_rows += rg_rows

# Example: read 3,000,000th row
row_3m = read_row(3_000_000)
print(row_3m)


Total rows: 4305035, Row groups: 9
t                                                1494345047000
vessel_id    b0b2bd45bbb8911fbea20744b0e8b98bbb0e76f6c3af37...
lat                                                  37.929298
lon                                                  23.682772
heading                                                   30.0
speed                                                      0.0
course                                                   170.0
Name: 0, dtype: object


In [7]:
import platform; print(platform.architecture()); import sys; print(sys.version)

('64bit', 'ELF')
3.12.12 | packaged by conda-forge | (main, Jan 26 2026, 23:51:32) [GCC 14.3.0]


# Clustering

In [26]:
"""
Unsupervised AIS clustering using cuML HDBSCAN (GPU-safe).
"""

import cudf
from cuml.cluster import HDBSCAN
from cuml.preprocessing import StandardScaler


def main():
    """
    Load, clean, scale, and cluster AIS data.
    """

    df = cudf.read_parquet(
        "unipi_ais_dynamic_may2017.parquet",
        row_groups=[6]
    )

    features = df[["lat", "lon", "speed"]].astype("float32")

    features = features.dropna()
    # features = features[~features.isin([float("inf"), float("-inf")]).any(axis=1)]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(features)

    X_scaled = cudf.DataFrame(
        X_scaled,
        columns=["lat", "lon", "speed"]
    ).astype("float32")

    clusterer = HDBSCAN(
        min_cluster_size=3,
        min_samples=3,
        metric="euclidean"
    )

    features["cluster_id"] = clusterer.fit_predict(X_scaled)

    print(features["cluster_id"].value_counts())


if __name__ == "__main__":
    main()


cluster_id
-1    499220
Name: count, dtype: int64


Across a wide range of parameters, the algorithm consistently classified all observations as noise, indicating the absence of stable density structures in the feature space. Possible that there is a need for another model or further parameter tuning. So this will be done later after further analytics on as it reuires more data to set parameters.

The analysis was therefore redirected toward probabilistic modeling of vessel movement, where routes are represented as sequences of spatial transitions and ranked based on their empirical likelihood in the AIS data.