# paths

In [1]:
# List the top-level contents of the Piraeus dataset directory.
# Uses an explicit shell escape (like the other cells) instead of the bare
# `ls` automagic alias; -F keeps the trailing / and * type markers.
!ls -F ../dataset/piraeus

[0m[01;32m'107782 - The Piraeus AIS Dataset for Large-Scale Maritime Data Analytics.pdf'[0m*
 [01;32mais_augmented.parquet[0m*
 [01;32mais_cleaned.parquet[0m*
 [01;32mais_loiter.parquet[0m*
 [01;32mais_loiter_pair.parquet[0m*
 [34;42mais_static[0m/
 [34;42mgeodata[0m/
 [34;42mmodels[0m/
 [34;42mnoaa_weather[0m/
 [34;42mparquet[0m/
 [34;42mprocessed[0m/
 [34;42msar[0m/
 [34;42munipi_ais_dynamic_2017[0m/
 [34;42munipi_ais_dynamic_2018[0m/
 [34;42munipi_ais_dynamic_2019[0m/
 [34;42munipi_ais_dynamic_synopses[0m/


In [9]:
# List the subfolders of the raw synopses CSV directory.
!ls ../dataset/piraeus/unipi_ais_dynamic_synopses/ais_synopses

2017  2018  2019


In [13]:
# List the synopsis CSV files available for 2017.
!ls ../dataset/piraeus/unipi_ais_dynamic_synopses/ais_synopses/2017

unipi_ais_synopses_aug_2017.csv  unipi_ais_synopses_may_2017.csv
unipi_ais_synopses_dec_2017.csv  unipi_ais_synopses_nov_2017.csv
unipi_ais_synopses_jul_2017.csv  unipi_ais_synopses_oct_2017.csv
unipi_ais_synopses_jun_2017.csv  unipi_ais_synopses_sep_2017.csv


In [5]:
# pip install cudf

# load df

In [19]:
import tqdm

In [20]:
import ast
from pathlib import Path

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

def _parse_nested_cell(cell):
    """Turn one stringified list/dict CSV cell into the Python object it spells.

    Non-string cells (pandas yields a float NaN for an empty field) are mapped
    to None so they end up as nulls in the Arrow column instead of raising.
    """
    if not isinstance(cell, str):
        return None
    # SECURITY: this used to be a bare eval() on file content, which would run
    # any expression embedded in the CSV. literal_eval accepts only Python
    # literals (lists, dicts, strings, numbers, None, booleans).
    return ast.literal_eval(cell)


def convert_all_synopsis_csv_to_parquet(src_base: str, dst_base: str, chunksize: int = 500_000):
    """
    Convert all CSVs in yearly subfolders from src_base to Parquet in dst_base,
    with tqdm progress bars and handling nested columns.

    Each ``<src_base>/<year>/<name>.csv`` is streamed in chunks and written to
    ``<dst_base>/<year>/<name>.parquet``; every chunk is written with one
    ``write_table`` call.

    Args:
        src_base: Directory whose immediate subdirectories hold the CSV files.
        dst_base: Output root; created (with parents) if it does not exist.
        chunksize: Number of CSV rows read and written per chunk.

    Raises:
        ValueError, SyntaxError: If an ``annotations`` / ``transport_trail``
            cell is not a valid Python literal.
        Exception: Anything raised by pandas / pyarrow while reading or
            writing. The partially written Parquet file is removed first.
    """
    src_base = Path(src_base)
    dst_base = Path(dst_base)
    dst_base.mkdir(parents=True, exist_ok=True)

    schema = pa.schema([
        pa.field("t", pa.int64()),
        pa.field("vessel_id", pa.string()),
        pa.field("lon", pa.float32()),
        pa.field("lat", pa.float32()),
        pa.field("heading", pa.float32()),
        pa.field("speed", pa.float32()),
        pa.field("annotations", pa.list_(pa.string())),
        pa.field("transport_trail", pa.list_(pa.struct([
            pa.field("topic", pa.string()),
            pa.field("timestamp", pa.int64())
        ]))),
    ])
    # Columns stored in the CSV as stringified Python literals.
    nested_columns = ("annotations", "transport_trail")

    # Sorted so the processing order (and progress output) is deterministic;
    # iterdir()/glob() order is filesystem-dependent.
    year_folders = sorted(f for f in src_base.iterdir() if f.is_dir())
    for year_folder in tqdm(year_folders, desc="Years"):
        dst_year_folder = dst_base / year_folder.name
        dst_year_folder.mkdir(parents=True, exist_ok=True)

        csv_files = sorted(year_folder.glob("*.csv"))
        for csv_file in tqdm(csv_files, desc=f"{year_folder.name} CSVs", leave=False):
            parquet_file = dst_year_folder / csv_file.with_suffix(".parquet").name

            try:
                # Open the reader first so an unreadable CSV never creates an
                # output file; both handles are closed even if a chunk fails.
                with pd.read_csv(csv_file, chunksize=chunksize) as reader:
                    with pq.ParquetWriter(parquet_file, schema) as writer:
                        for chunk in tqdm(reader, desc=f"{csv_file.name} chunks", leave=False):
                            # Convert nested string columns to Python objects
                            for column in nested_columns:
                                chunk[column] = chunk[column].apply(_parse_nested_cell)

                            table = pa.Table.from_pandas(chunk, schema=schema, preserve_index=False)
                            writer.write_table(table)
            except BaseException:
                # Closing the writer finalises the file, so a failed or
                # interrupted run would otherwise leave a valid-looking but
                # truncated Parquet file behind.
                parquet_file.unlink(missing_ok=True)
                raise
# Convert every yearly synopsis CSV to Parquet: source root, destination root,
# then rows per chunk. Paths are relative to the notebook's directory.
convert_all_synopsis_csv_to_parquet(
    "../dataset/piraeus/unipi_ais_dynamic_synopses/ais_synopses",
    "../dataset/piraeus/parquet/unipi_ais_dynamic_synopses",
    500_000,
)


Years:   0%|          | 0/3 [00:00<?, ?it/s]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Years:  33%|███▎      | 1/3 [01:38<03:17, 98.54s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Years:  67%|██████▋   | 2/3 [03:29<01:45, 105.67s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Years: 100%|██████████| 3/3 [05:33<00:00, 111.19s/it]


In [22]:
# Check that the conversion created one output folder per year.
!ls ../dataset/piraeus/parquet/unipi_ais_dynamic_synopses

2017  2018  2019


In [23]:
# List the Parquet files written for 2019.
!ls ../dataset/piraeus/parquet/unipi_ais_dynamic_synopses/2019

unipi_ais_synopses_apr_2019.parquet  unipi_ais_synopses_jun_2019.parquet
unipi_ais_synopses_aug_2019.parquet  unipi_ais_synopses_mar_2019.parquet
unipi_ais_synopses_dec_2019.parquet  unipi_ais_synopses_may_2019.parquet
unipi_ais_synopses_feb_2019.parquet  unipi_ais_synopses_nov_2019.parquet
unipi_ais_synopses_jan_2019.parquet  unipi_ais_synopses_oct_2019.parquet
unipi_ais_synopses_jul_2019.parquet  unipi_ais_synopses_sep_2019.parquet


In [24]:
# Report the pandas and pyarrow versions in use, one per line.
import pandas
import pyarrow

print(pandas.__version__, pyarrow.__version__, sep="\n")

2.3.3
21.0.0


In [28]:
# Read a single row group from one converted file instead of loading the whole file.
import pyarrow.parquet as pq

# Opening a ParquetFile does not read any row group yet; data is read below.
pq_file = pq.ParquetFile("../dataset/piraeus/parquet/unipi_ais_dynamic_synopses/2019/unipi_ais_synopses_apr_2019.parquet")

# NOTE(review): row groups are presumably ~500k rows each (the conversion chunk
# size) — confirm with pq_file.metadata before relying on it.
row_group_index = 0  # first row group; under that assumption index 6 would start near row 3M
table = pq_file.read_row_group(row_group_index)

df_chunk = table.to_pandas()  # holds only the rows of the selected row group
row = df_chunk.iloc[0]  # first row of the selected row group (row 0 of the file when index is 0)
print(row)


t                                                      1554066000000
vessel_id          1945c6f118eb4bfae9dd5ccd32cc83126542d1b5341732...
lon                                                         23.64925
lat                                                        37.930935
heading                                                          NaN
speed                                                            NaN
annotations                                                [GAP_END]
transport_trail    [{'topic': 'datacsv_saronikos_3', 'timestamp':...
Name: 0, dtype: object


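So now we can have random access using Parquet: any single row group can be read directly, without scanning the rest of the file (effectively O(1) with respect to file size), and the data is ready for inference.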

In [29]:
# Show the interpreter's architecture tuple and its full version string.
import platform
import sys

print(platform.architecture())
print(sys.version)

('64bit', 'ELF')
3.12.12 | packaged by conda-forge | (main, Jan 26 2026, 23:51:32) [GCC 14.3.0]
