In [6]:
# List all layers in the GPKG
from pathlib import Path

import fiona
import geopandas as gpd
import pandas as pd
# Enumerate the layer names stored in the GeoPackage and echo them.
gpkg_file = "dataset/AISVesselTracks2024.gpkg"
layers = fiona.listlayers(gpkg_file)
print(layers)

['AISVesselTracks2024']


## Load and Explore the Data

In [None]:
# Load the vessel tracks layer into a GeoDataFrame and report its size,
# column names and coordinate reference system.
#
# The layer-listing cell opens the file as "dataset/..." while this cell used
# "../dataset/...". Try this cell's original path first and fall back to the
# other one so the notebook runs from either working directory. If neither
# exists, the original path is passed through and read_file raises as before.
gpkg_candidates = (
    Path("../dataset/AISVesselTracks2024.gpkg"),
    Path("dataset/AISVesselTracks2024.gpkg"),
)
gpkg_path = next(
    (candidate for candidate in gpkg_candidates if candidate.exists()),
    gpkg_candidates[0],
)

gdf = gpd.read_file(gpkg_path, layer="AISVesselTracks2024")
print(f"Total records: {len(gdf):,}")
print(f"\nColumns: {list(gdf.columns)}")
print(f"\nCRS: {gdf.crs}")

  return ogr_read(


In [None]:
# Preview the first five rows of the loaded tracks.
gdf.head(n=5)

In [None]:
# Data summary: print the frame's column names, dtypes and non-null counts.
gdf.info()

## Statistical Analysis

In [None]:
# Statistical summary of numerical columns; the returned table is the cell's
# displayed output.
gdf.describe()

In [None]:
# Tabulate per-column null counts and their share of all rows, then show only
# the columns that actually have gaps, worst first.
n_rows = len(gdf)
missing = gdf.isna().sum()
missing_pct = (missing / n_rows) * 100
missing_df = pd.DataFrame({'Missing Count': missing, 'Percentage': missing_pct})
(
    missing_df
    .loc[missing_df['Missing Count'] > 0]
    .sort_values(by='Missing Count', ascending=False)
)

## Spatial Analysis

In [None]:
# Report the overall extent of all geometries.
# total_bounds is ordered (minx, miny, maxx, maxy).
bounds = gdf.total_bounds
min_x, min_y, max_x, max_y = bounds
print("Bounding Box:")
print(f"  Min X (West): {min_x:.6f}")
print(f"  Min Y (South): {min_y:.6f}")
print(f"  Max X (East): {max_x:.6f}")
print(f"  Max Y (North): {max_y:.6f}")

In [None]:
# Draw all geometries on a single axes to show where the tracks lie.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(15, 10))
gdf.plot(color='blue', alpha=0.5, markersize=0.5, ax=ax)
ax.set_title('AIS Vessel Tracks Spatial Distribution', fontsize=16)
# NOTE(review): the axis labels assume a geographic (lon/lat) CRS — confirm
# against the CRS printed when the data was loaded.
ax.set(xlabel='Longitude', ylabel='Latitude')
fig.tight_layout()
plt.show()

## Vessel-Specific Analysis

In [None]:
# Count unique vessels and list the ten with the most records.
#
# The identifier column is resolved once instead of duplicating the report for
# each spelling. Exact 'MMSI' and then 'mmsi' keep priority, so results are
# unchanged for frames that already worked; any other casing (e.g. 'Mmsi') is
# accepted as a fallback.
mmsi_col = next(
    (name for name in ('MMSI', 'mmsi') if name in gdf.columns),
    None,
)
if mmsi_col is None:
    mmsi_col = next(
        (col for col in gdf.columns if str(col).lower() == 'mmsi'),
        None,
    )

if mmsi_col is not None:
    unique_vessels = gdf[mmsi_col].nunique()
    print(f"Total unique vessels: {unique_vessels:,}")
    print(f"\nTop 10 vessels by number of records:")
    print(gdf[mmsi_col].value_counts().head(10))
else:
    # Nothing matched: list the columns so the reader can pick the right one.
    print("MMSI column not found. Available columns:")
    print(gdf.columns.tolist())

## Temporal Analysis (if timestamp available)

In [None]:
# Find timestamp-like columns by name ('time' or 'date' in the column name) and
# report the time span covered by the first one found.
time_cols = [col for col in gdf.columns if 'time' in col.lower() or 'date' in col.lower()]
print(f"Time-related columns: {time_cols}")

if time_cols:
    # Only the first matching column is analysed.
    time_col = time_cols[0]

    # Parse text columns into datetimes. Both object and dedicated string
    # dtypes are checked: a plain `dtype == 'object'` test skips string-dtype
    # columns, leaving text in place and making the subtraction below fail.
    # Numeric and datetime columns are left untouched, as before.
    is_text = (
        pd.api.types.is_object_dtype(gdf[time_col])
        or pd.api.types.is_string_dtype(gdf[time_col])
    )
    if is_text:
        # errors='coerce' turns unparseable values into NaT instead of raising.
        gdf[time_col] = pd.to_datetime(gdf[time_col], errors='coerce')

    # Compute the extremes once and reuse them for the duration.
    time_start = gdf[time_col].min()
    time_end = gdf[time_col].max()

    print(f"\nTime range:")
    print(f"  Start: {time_start}")
    print(f"  End: {time_end}")
    print(f"  Duration: {time_end - time_start}")

## Dataset Summary Report

In [None]:
# Gather headline facts about the loaded GeoDataFrame and print them between
# separator rules.
bytes_per_mb = 1024**2
summary = {
    'Total Records': len(gdf),
    'Columns': len(gdf.columns),
    'Geometry Type': gdf.geometry.geom_type.unique().tolist(),
    'CRS': str(gdf.crs),
    'Memory Usage (MB)': gdf.memory_usage(deep=True).sum() / bytes_per_mb,
}

banner = "=" * 50
print(banner)
print("DATASET SUMMARY")
print(banner)
for label, figure in summary.items():
    print(f"{label}: {figure}")
print(banner)