## Parquet paths

In [2]:
# List the top-level contents of the Piraeus dataset directory (shell escape via IPython `!`).
!ls ../dataset/piraeus/

107782 - The Piraeus AIS Dataset for Large-Scale Maritime Data Analytics.pdf
ais_augmented.parquet
ais_cleaned.parquet
ais_loiter.parquet
ais_loiter_pair.parquet
ais_static
geodata
models
noaa_weather
parquet
processed
sar
unipi_ais_dynamic_2017
unipi_ais_dynamic_2018
unipi_ais_dynamic_2019
unipi_ais_dynamic_synopses


In [3]:
# List the files in the 2017 dynamic AIS folder.
!ls ../dataset/piraeus/unipi_ais_dynamic_2017

README.md
unipi_ais_dynamic_aug2017.csv
unipi_ais_dynamic_dec2017.csv
unipi_ais_dynamic_jul2017.csv
unipi_ais_dynamic_jun2017.csv
unipi_ais_dynamic_may2017.csv
unipi_ais_dynamic_nov2017.csv
unipi_ais_dynamic_oct2017.csv
unipi_ais_dynamic_sep2017.csv


In [4]:
# List the files in the 2018 dynamic AIS folder.
!ls ../dataset/piraeus/unipi_ais_dynamic_2018

README.md
unipi_ais_dynamic_apr2018.csv
unipi_ais_dynamic_aug2018.csv
unipi_ais_dynamic_dec2018.csv
unipi_ais_dynamic_feb2018.csv
unipi_ais_dynamic_jan2018.csv
unipi_ais_dynamic_jul2018.csv
unipi_ais_dynamic_jun2018.csv
unipi_ais_dynamic_mar2018.csv
unipi_ais_dynamic_may2018.csv
unipi_ais_dynamic_nov2018.csv
unipi_ais_dynamic_oct2018.csv
unipi_ais_dynamic_sep2018.csv


In [5]:
# List the files in the 2019 dynamic AIS folder.
!ls ../dataset/piraeus/unipi_ais_dynamic_2019

README.md
unipi_ais_dynamic_apr2019.csv
unipi_ais_dynamic_aug2019.csv
unipi_ais_dynamic_dec2019.csv
unipi_ais_dynamic_feb2019.csv
unipi_ais_dynamic_jan2019.csv
unipi_ais_dynamic_jul2019.csv
unipi_ais_dynamic_jun2019.csv
unipi_ais_dynamic_mar2019.csv
unipi_ais_dynamic_may2019.csv
unipi_ais_dynamic_nov2019.csv
unipi_ais_dynamic_oct2019.csv
unipi_ais_dynamic_sep2019.csv


In [6]:
# Record the pandas / pyarrow versions this notebook was run with.
import pandas
import pyarrow

print(pandas.__version__, pyarrow.__version__, sep="\n")

2.3.3
23.0.0


In [7]:
from tqdm import tqdm

In [8]:
import pyarrow.parquet as pq

# Open the Parquet file by bare filename, i.e. relative to the notebook's
# current working directory (not under ../dataset/piraeus/).
pq_file = pq.ParquetFile("unipi_ais_dynamic_may2017.parquet")

# Assumption: row groups hold ~500k rows each. This is not checked here —
# TODO confirm against the file's metadata before relying on the row offset.
row_group_index = 6  # under that assumption: 6 * 500k = roughly the 3-millionth row
table = pq_file.read_row_group(row_group_index)

df_chunk = table.to_pandas()  # Only this row group is converted to a DataFrame
row = df_chunk.iloc[0]  # First row of the selected row group (approx. 3Mth row overall)
print(row)


t                                                1494345047000
vessel_id    b0b2bd45bbb8911fbea20744b0e8b98bbb0e76f6c3af37...
lat                                                  37.929298
lon                                                  23.682772
heading                                                   30.0
speed                                                      0.0
course                                                   170.0
Name: 0, dtype: object


## Random Access

In [9]:
# Show the interpreter's bitness / executable format and the full Python version string.
import platform
import sys

print(platform.architecture())
print(sys.version)

('64bit', 'WindowsPE')
3.10.19 | packaged by conda-forge | (main, Jan 26 2026, 23:39:36) [MSC v.1944 64 bit (AMD64)]


In [10]:
import pandas as pd

# CSV to scan for its geographic extent (adjust to your local path).
file = "../dataset/piraeus/unipi_ais_dynamic_2017/unipi_ais_dynamic_may2017.csv"

# Stream the file in fixed-size chunks so the whole CSV never has to be loaded at once.
chunksize = 1_000_000

# Running extrema: start at +/-inf so the first chunk always replaces them.
min_lat = min_lon = float('inf')
max_lat = max_lon = float('-inf')

for chunk in pd.read_csv(file, usecols=['lat', 'lon'], chunksize=chunksize):
    # Fold this chunk's column extrema into the running bounds.
    min_lat, max_lat = min(min_lat, chunk['lat'].min()), max(max_lat, chunk['lat'].max())
    min_lon, max_lon = min(min_lon, chunk['lon'].min()), max(max_lon, chunk['lon'].max())

print("Bounds:")
print("Latitude:", min_lat, "-", max_lat)
print("Longitude:", min_lon, "-", max_lon)


Bounds:
Latitude: 37.45947 - 38.03808166666671
Longitude: 23.0350833333333 - 23.8806466666667


In [None]:
import pandas as pd
import numpy as np
import folium
from folium.plugins import HeatMap

# ----------------------------
# Config
# ----------------------------
file = "../dataset/piraeus/unipi_ais_dynamic_2017/unipi_ais_dynamic_may2017.csv"
chunksize = 1_000_000  # rows per chunk
cell_size = 0.001      # grid resolution in degrees (~100 m per cell)

# Bounds (hard-coded lat/lon extent used to lay out the grid)
min_lat, max_lat = 37.45947, 38.03808166666671
min_lon, max_lon = 23.0350833333333, 23.8806466666667

# ----------------------------
# Step 1: Create grid
# ----------------------------
# Bin edges; the extra `+ cell_size` guarantees the last edge lies beyond the max bound.
lat_bins = np.arange(min_lat, max_lat + cell_size, cell_size)
lon_bins = np.arange(min_lon, max_lon + cell_size, cell_size)
n_lat_cells = len(lat_bins) - 1
n_lon_cells = len(lon_bins) - 1
# Explicit 64-bit counter: plain `int` maps to a 32-bit integer on Windows with NumPy 1.x.
heatmap_grid = np.zeros((n_lat_cells, n_lon_cells), dtype=np.int64)

# ----------------------------
# Step 2: Count points per cell
# ----------------------------
for chunk in pd.read_csv(file, usecols=['lat','lon'], chunksize=chunksize):
    # Cell index k such that bins[k] <= value < bins[k+1].
    lat_idx = np.searchsorted(lat_bins, chunk['lat'].to_numpy(), side='right') - 1
    lon_idx = np.searchsorted(lon_bins, chunk['lon'].to_numpy(), side='right') - 1

    # Drop points that fall outside the grid. Without this, an index of -1
    # (value below the first edge) would silently wrap around to the last
    # row/column, and NaN or too-large values would raise IndexError.
    in_grid = (
        (lat_idx >= 0) & (lat_idx < n_lat_cells)
        & (lon_idx >= 0) & (lon_idx < n_lon_cells)
    )

    # Unbuffered in-place add: repeated (row, col) pairs are each counted,
    # replacing the per-point Python loop with a single vectorised call.
    np.add.at(heatmap_grid, (lat_idx[in_grid], lon_idx[in_grid]), 1)

# ----------------------------
# Step 3: Convert grid to points for HeatMap
# ----------------------------
# Only non-empty cells become heatmap points; np.nonzero yields them in
# row-major order (latitude index first, then longitude index).
rows, cols = np.nonzero(heatmap_grid)
lat_centers = (lat_bins[:-1] + lat_bins[1:]) / 2
lon_centers = (lon_bins[:-1] + lon_bins[1:]) / 2
cell_counts = heatmap_grid[rows, cols]

# .tolist() converts NumPy scalars to plain Python floats/ints.
heatmap_points = [
    [lat, lon, count]
    for lat, lon, count in zip(
        lat_centers[rows].tolist(),
        lon_centers[cols].tolist(),
        cell_counts.tolist(),
    )
]

if not heatmap_points:
    raise ValueError(
        "No AIS points fell inside the configured bounds; "
        "check `file`, `min_lat`/`max_lat` and `min_lon`/`max_lon`."
    )

# Normalize intensity to (0, 1] relative to the busiest cell
max_count = max(p[2] for p in heatmap_points)
heatmap_points_norm = [[lat, lon, cnt/max_count] for lat, lon, cnt in heatmap_points]

# ----------------------------
# Step 4: Create Folium map in Jupyter
# ----------------------------
center_lat = (min_lat + max_lat) / 2
center_lon = (min_lon + max_lon) / 2

m = folium.Map(location=[center_lat, center_lon], zoom_start=12, tiles="OpenStreetMap")
HeatMap(heatmap_points_norm, radius=10, blur=15, max_zoom=18).add_to(m)

# Display map in Jupyter (bare last expression triggers the rich HTML repr)
m