# paths

In [3]:
!ls ../dataset/piraeus

107782 - The Piraeus AIS Dataset for Large-Scale Maritime Data Analytics.pdf
ais_augmented.parquet
ais_cleaned.parquet
ais_loiter.parquet
ais_loiter_pair.parquet
ais_static
geodata
models
noaa_weather
parquet_version
processed
sar
unipi_ais_dynamic_2017
unipi_ais_dynamic_2018
unipi_ais_dynamic_2019
unipi_ais_dynamic_synopses


In [4]:
ls ..\dataset\piraeus\unipi_ais_dynamic_2017

 Volume in drive C is Windows
 Volume Serial Number is 22B1-CD8D

 Directory of c:\Users\BBBS-AI-01\d\anomaly\dataset\piraeus\unipi_ais_dynamic_2017

31/12/2025  02:53 pm    <DIR>          .
03/02/2026  01:28 pm    <DIR>          ..
31/12/2025  02:52 pm             1,831 README.md
31/12/2025  02:52 pm       982,705,862 unipi_ais_dynamic_aug2017.csv
31/12/2025  02:52 pm       734,582,968 unipi_ais_dynamic_dec2017.csv
31/12/2025  02:52 pm     1,022,394,184 unipi_ais_dynamic_jul2017.csv
31/12/2025  02:53 pm       993,088,856 unipi_ais_dynamic_jun2017.csv
31/12/2025  02:53 pm       527,700,689 unipi_ais_dynamic_may2017.csv
31/12/2025  02:53 pm       688,654,355 unipi_ais_dynamic_nov2017.csv
31/12/2025  02:53 pm       527,318,392 unipi_ais_dynamic_oct2017.csv
31/12/2025  02:53 pm       934,818,984 unipi_ais_dynamic_sep2017.csv
               9 File(s)  6,411,266,121 bytes
               2 Dir(s)  95,996,178,432 bytes free


In [1]:
import pandas, pyarrow
print(pandas.__version__)
print(pyarrow.__version__)

2.3.3
23.0.0


In [5]:
import pyarrow.parquet as pq

pq_file = pq.ParquetFile("unipi_ais_dynamic_may2017.parquet")

# Suppose row groups are ~500k rows each
row_group_index = 6  # 6*500k = 3Mth row
table = pq_file.read_row_group(row_group_index)

df_chunk = table.to_pandas()  # Only this row group in memory
row = df_chunk.iloc[0]  # Approx 3Mth row
print(row)


t                                                1494345047000
vessel_id    b0b2bd45bbb8911fbea20744b0e8b98bbb0e76f6c3af37...
lat                                                  37.929298
lon                                                  23.682772
heading                                                   30.0
speed                                                      0.0
course                                                   170.0
Name: 0, dtype: object


## Random Access

In [6]:
import pyarrow.parquet as pq
import pandas as pd

# Load Parquet file metadata
pq_file = pq.ParquetFile("unipi_ais_dynamic_may2017.parquet")
num_rows = pq_file.metadata.num_rows
num_row_groups = pq_file.num_row_groups

print(f"Total rows: {num_rows}, Row groups: {num_row_groups}")

# Function to read a row by index
def read_row(row_idx: int) -> pd.Series:
    if row_idx < 0 or row_idx >= num_rows:
        raise IndexError("Row index out of bounds")

    cum_rows = 0
    for group_idx in range(num_row_groups):
        rg_rows = pq_file.metadata.row_group(group_idx).num_rows
        if row_idx < cum_rows + rg_rows:
            local_idx = row_idx - cum_rows
            table = pq_file.read_row_group(group_idx)
            df = table.to_pandas()
            return df.iloc[local_idx]
        cum_rows += rg_rows

# Example: read 3,000,000th row
row_3m = read_row(3_000_000)
print(row_3m)


Total rows: 4305035, Row groups: 9
t                                                1494345047000
vessel_id    b0b2bd45bbb8911fbea20744b0e8b98bbb0e76f6c3af37...
lat                                                  37.929298
lon                                                  23.682772
heading                                                   30.0
speed                                                      0.0
course                                                   170.0
Name: 0, dtype: object


In [7]:
import platform; print(platform.architecture()); import sys; print(sys.version)

('64bit', 'WindowsPE')
3.10.19 | packaged by conda-forge | (main, Jan 26 2026, 23:39:36) [MSC v.1944 64 bit (AMD64)]


# Clustering

In [9]:
import pyarrow.parquet as pq
import pandas as pd

def load_row_group(path: str, row_group_index: int) -> pd.DataFrame:
    """
    Load a single parquet row group into memory.
    """
    pq_file = pq.ParquetFile(path)
    table = pq_file.read_row_group(row_group_index)
    return table.to_pandas()

df = load_row_group("unipi_ais_dynamic_may2017.parquet", 6)


In [10]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize timestamps and drop unusable rows.
    """
    df = df.dropna(subset=["lat", "lon", "speed", "course"])
    df["timestamp"] = pd.to_datetime(df["t"], unit="ms")
    return df

df = preprocess(df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["timestamp"] = pd.to_datetime(df["t"], unit="ms")


In [11]:
import numpy as np

def compute_behavior_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute per-vessel behavioral features for clustering.
    """
    df = df.sort_values(["vessel_id", "timestamp"])

    df["dt"] = (
        df.groupby("vessel_id")["timestamp"]
        .diff()
        .dt.total_seconds()
    )

    df["d_course"] = (
        df.groupby("vessel_id")["course"]
        .diff()
        .abs()
    )

    df["turn_rate"] = df["d_course"] / df["dt"]

    features = (
        df.groupby("vessel_id")
        .agg(
            speed_mean=("speed", "mean"),
            speed_std=("speed", "std"),
            turn_rate_mean=("turn_rate", "mean"),
            stop_ratio=("speed", lambda x: (x < 0.5).mean()),
            lat_mean=("lat", "mean"),
            lon_mean=("lon", "mean"),
        )
        .fillna(0.0)
        .reset_index()
    )

    return features

feat_df = compute_behavior_features(df)


In [13]:
from sklearn.cluster import DBSCAN

def spatial_clustering(df: pd.DataFrame) -> pd.DataFrame:
    """
    DBSCAN clustering on latitude / longitude.
    """
    coords = df[["lat", "lon"]].to_numpy()

    model = DBSCAN(
        eps=0.002,
        min_samples=50,
        metric="haversine"
    )

    df["spatial_cluster"] = model.fit_predict(
        np.radians(coords)
    )

    return df

df = spatial_clustering(df)


MemoryError: 

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

def behavior_clustering(features: pd.DataFrame) -> pd.DataFrame:
    """
    Cluster vessels by movement behavior.
    """
    X = features[
        [
            "speed_mean",
            "speed_std",
            "turn_rate_mean",
            "stop_ratio",
        ]
    ].to_numpy()

    X = StandardScaler().fit_transform(X)

    model = DBSCAN(
        eps=0.6,
        min_samples=10
    )

    features["behavior_cluster"] = model.fit_predict(X)
    return features

feat_df = behavior_clustering(feat_df)
