In [1]:
# List all layers in the GPKG
# The layer names printed here are what later cells pass as `layer=` to
# gpd.read_file when loading from the same file.
import fiona
import geopandas as gpd
layers = fiona.listlayers("dataset/AISVesselTracks2024.gpkg")
print(layers)

['AISVesselTracks2024']


## Load and Explore the Data

In [13]:
# Load the vessel tracks that fall in a region of interest via a spatial filter.
# NOTE: judging by the bounds printed below, the bbox filter returns whole
# features that touch the box rather than clipping them, so the loaded extent
# can be larger than the box itself.
import geopandas as gpd
import pandas as pd

# Bounding box as (minx, miny, maxx, maxy).
# Chosen from the coverage-check cell: the data sits in the Western
# Atlantic/Americas region. Change these coordinates for another area.
bbox = (-85.0, 15.0, -60.0, 46.0)  # Western Atlantic, U.S. East Coast & Caribbean

print(f"Loading data for bounding box: {bbox}")
gdf_sample = gpd.read_file(
    "dataset/AISVesselTracks2024.gpkg",
    layer="AISVesselTracks2024",
    bbox=bbox,
)

# total_bounds is ordered (minx, miny, maxx, maxy)
bounds = gdf_sample.total_bounds

# Emit the whole load report in one go, one entry per output line
print("\n".join([
    f"\nRecords loaded: {len(gdf_sample):,}",
    f"Columns: {list(gdf_sample.columns)}",
    f"CRS: {gdf_sample.crs}",
    f"Data types:\n{gdf_sample.dtypes}",
    f"\nActual bounds of loaded data:",
    f"  Min X (West): {bounds[0]:.6f}",
    f"  Min Y (South): {bounds[1]:.6f}",
    f"  Max X (East): {bounds[2]:.6f}",
    f"  Max Y (North): {bounds[3]:.6f}",
]))

Loading data for bounding box: (-85.0, 15.0, -60.0, 46.0)


  return ogr_read(



Records loaded: 3,591,251
Columns: ['MMSI', 'TrackStartTime', 'TrackEndTime', 'VesselType', 'Length', 'Width', 'Draft', 'DurationMinutes', 'VesselGroup', 'geometry']
CRS: EPSG:4269
Data types:
MMSI                             int32
TrackStartTime     datetime64[ms, UTC]
TrackEndTime       datetime64[ms, UTC]
VesselType                     float64
Length                         float64
Width                          float64
Draft                          float64
DurationMinutes                  int32
VesselGroup                     object
geometry                      geometry
dtype: object

Actual bounds of loaded data:
  Min X (West): -95.332230
  Min Y (South): 15.090980
  Max X (East): -60.000310
  Max Y (North): 48.781920


In [11]:
# First check: What geographic area does the dataset actually cover?
# Load only the first 10,000 rows (no bbox) for a quick look. These rows are
# not a random sample, so the bounds below describe just this slice and may
# understate the extent of the full layer.
print("Loading sample to check geographic coverage...")
gdf_test = gpd.read_file("dataset/AISVesselTracks2024.gpkg", 
                         layer="AISVesselTracks2024", 
                         rows=slice(0, 10000))

if len(gdf_test) > 0:
    bounds = gdf_test.total_bounds  # (minx, miny, maxx, maxy)
    print("\nSample data geographic bounds:")
    print(f"  Longitude: {bounds[0]:.2f}° to {bounds[2]:.2f}°")
    print(f"  Latitude: {bounds[1]:.2f}° to {bounds[3]:.2f}°")
    print("\nThis suggests your data covers approximately:")
    # Rough longitude-only heuristic; latitude is not consulted.
    if bounds[0] > -10 and bounds[2] < 50:
        print("  - Likely Mediterranean/European waters")
    elif bounds[0] > 40 and bounds[2] < 100:
        print("  - Likely Arabian Sea/Indian Ocean region")
    elif bounds[0] < -60:
        print("  - Likely includes Atlantic or Americas")
    else:
        # Without this fallback the header above would be followed by nothing
        print("  - Region not recognized by this longitude heuristic")
    print("\nAdjust your bbox accordingly!")
else:
    print("No data loaded - check file path")

Loading sample to check geographic coverage...


  return ogr_read(



Sample data geographic bounds:
  Longitude: -84.10° to -60.04°
  Latitude: 15.19° to 45.06°

This suggests your data covers approximately:
  - Likely includes Atlantic or Americas

Adjust your bbox accordingly!


In [8]:
# Display first few records from sample
# A bare last expression lets the notebook render the frame as a table
# instead of the plain-text repr produced by print().
gdf_sample.head()

Empty GeoDataFrame
Columns: [MMSI, TrackStartTime, TrackEndTime, VesselType, Length, Width, Draft, DurationMinutes, VesselGroup, geometry]
Index: []


In [9]:
# Data summary: index range, per-column non-null counts and dtypes,
# plus the frame's memory usage
gdf_sample.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 0 entries
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   MMSI             0 non-null      int32         
 1   TrackStartTime   0 non-null      datetime64[ms]
 2   TrackEndTime     0 non-null      datetime64[ms]
 3   VesselType       0 non-null      int32         
 4   Length           0 non-null      float64       
 5   Width            0 non-null      float64       
 6   Draft            0 non-null      float64       
 7   DurationMinutes  0 non-null      int32         
 8   VesselGroup      0 non-null      object        
 9   geometry         0 non-null      geometry      
dtypes: datetime64[ms](2), float64(3), geometry(1), int32(3), object(1)
memory usage: 132.0+ bytes


## Statistical Analysis

In [6]:
# Statistical summary of numerical columns
# (the recorded output also includes the datetime columns TrackStartTime and
# TrackEndTime; on an empty frame every statistic is NaN/NaT)
gdf_sample.describe()

  gdf_sample.describe()
  gdf_sample.describe()


Unnamed: 0,MMSI,TrackStartTime,TrackEndTime,VesselType,Length,Width,Draft,DurationMinutes
count,0.0,0,0,0.0,0.0,0.0,0.0,0.0
mean,,NaT,NaT,,,,,
min,,NaT,NaT,,,,,
25%,,NaT,NaT,,,,,
50%,,NaT,NaT,,,,,
75%,,NaT,NaT,,,,,
max,,NaT,NaT,,,,,
std,,,,,,,,


In [None]:
# Tabulate null counts per column and their share of all rows,
# then show only the columns that have nulls, worst first
missing = gdf_sample.isnull().sum()
missing_pct = missing.div(len(gdf_sample)).mul(100)
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})
missing_df.loc[missing_df['Missing Count'].gt(0)].sort_values(by='Missing Count', ascending=False)

## Spatial Analysis

In [None]:
# Report the overall extent of the loaded sample.
# total_bounds is indexed here as [0]=min X, [1]=min Y, [2]=max X, [3]=max Y.
bounds = gdf_sample.total_bounds
print("\n".join([
    f"Bounding Box (sample data):",
    f"  Min X (West): {bounds[0]:.6f}",
    f"  Min Y (South): {bounds[1]:.6f}",
    f"  Max X (East): {bounds[2]:.6f}",
    f"  Max Y (North): {bounds[3]:.6f}",
]))

In [None]:
# Draw every geometry of the sample on a single lon/lat axes
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(15, 10))
gdf_sample.plot(ax=ax, markersize=0.5, alpha=0.5, color='blue')

# Title and axis labels so the figure can be read on its own
ax.set_title('AIS Vessel Tracks Spatial Distribution (Sample)', fontsize=16)
ax.set(xlabel='Longitude', ylabel='Latitude')

fig.tight_layout()
plt.show()

## Vessel-Specific Analysis

In [None]:
# Count unique vessels (if an MMSI column exists)
# The identifier column may be spelled 'MMSI' or 'mmsi'; resolve the spelling
# once (preferring 'MMSI') instead of duplicating the report per spelling.
mmsi_col = next((name for name in ('MMSI', 'mmsi') if name in gdf_sample.columns), None)

if mmsi_col is None:
    print("MMSI column not found. Available columns:")
    print(gdf_sample.columns.tolist())
else:
    unique_vessels = gdf_sample[mmsi_col].nunique()
    print(f"Total unique vessels (in sample): {unique_vessels:,}")
    print("\nTop 10 vessels by number of records:")
    print(gdf_sample[mmsi_col].value_counts().head(10))

## Temporal Analysis (if timestamp available)

In [None]:
# Check for timestamp columns (any column whose name contains 'time' or 'date')
# str() guards against non-string column labels, which have no .lower()
time_cols = [col for col in gdf_sample.columns
             if 'time' in str(col).lower() or 'date' in str(col).lower()]
print(f"Time-related columns: {time_cols}")

if time_cols:
    # Only the first matching column is summarised below
    time_col = time_cols[0]
    # Convert to datetime if needed; unparseable values become NaT
    if gdf_sample[time_col].dtype == 'object':
        gdf_sample[time_col] = pd.to_datetime(gdf_sample[time_col], errors='coerce')

    # Compute the extremes once instead of re-scanning the column per print
    time_start = gdf_sample[time_col].min()
    time_end = gdf_sample[time_col].max()

    print("\nTime range:")
    print(f"  Start: {time_start}")
    print(f"  End: {time_end}")
    print(f"  Duration: {time_end - time_start}")

## Export Summary Report

In [None]:
# Collect headline facts about the loaded sample into one mapping
summary = {}
summary['Sample Records'] = len(gdf_sample)
summary['Columns'] = len(gdf_sample.columns)
summary['Geometry Type'] = gdf_sample.geometry.geom_type.unique().tolist()
summary['CRS'] = str(gdf_sample.crs)
# Deep memory usage in bytes, converted to mebibytes
summary['Memory Usage (MB)'] = gdf_sample.memory_usage(deep=True).sum() / 1024**2

# Render the report: a ruled banner, one "key: value" line per entry, closing rule
report_lines = ["="*50, "DATASET SUMMARY (Sample)", "="*50]
report_lines.extend(f"{key}: {value}" for key, value in summary.items())
report_lines.append("="*50)
print("\n".join(report_lines))

## Efficient Data Loading Strategies for Large Datasets

In [None]:
# Strategy 1: Load with geographic bounds filter
# Useful if you want to focus on a specific region
# Example: Mediterranean area
bbox = (8.0, 30.0, 45.0, 46.0)  # (minx, miny, maxx, maxy)
# gdf_region = gpd.read_file("dataset/AISVesselTracks2024.gpkg", 
#                              layer="AISVesselTracks2024", 
#                              bbox=bbox)

# Strategy 2: Load data in chunks and process
# gpd.read_file has no chunked-iterator mode, so each chunk is read as an
# explicit row window using the same `rows=slice(...)` argument the
# coverage-check cell uses.
print("Strategy 2: Chunked Loading")
chunk_size = 50000
max_chunks = 3  # Load only first 3 chunks for demo
chunks = []
for i in range(max_chunks):
    start = i * chunk_size
    chunk = gpd.read_file("dataset/AISVesselTracks2024.gpkg", 
                          layer="AISVesselTracks2024", 
                          rows=slice(start, start + chunk_size))
    if len(chunk) == 0:
        # Ran past the end of the layer
        break
    print(f"  Loaded chunk {i+1}: {len(chunk):,} records")
    # Every chunk that is announced is also kept
    chunks.append(chunk)

gdf_chunked = pd.concat(chunks, ignore_index=True)
print(f"Total from chunks: {len(gdf_chunked):,}")

# Strategy 3: SQL-style filtering for large datasets
print("\nStrategy 3: Filter by vessel type or other attributes")
# This would require loading and filtering, but can be done on smaller subsets