In [1]:
# Load configuration: execute the shared config script inside this kernel so the
# names it defines become available to the cells below.
# NOTE(review): later cells use `data_dir` without defining it, so it is
# presumably set by config.py — confirm.
%run ../../config.py

Project root: /workspaces/wireless-income
Data directory: /workspaces/wireless-income/data


In [2]:
import boto3
from botocore import UNSIGNED
from botocore.config import Config
import os
from urllib.parse import urlparse

# Create an anonymous (unsigned) S3 client: requests carry no AWS credentials,
# which is all a publicly readable bucket needs.
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

bucket_name = 'ookla-open-data'

# Full s3:// URIs, bucketed by file extension.
parquet_files = []
shapefiles = []  # every '.zip' key is treated as a zipped shapefile

# One list_objects_v2 call returns only a single page of keys. The paginator
# follows the continuation tokens itself, so every object in the bucket is
# visited without hand-written IsTruncated / NextContinuationToken handling.
paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket_name):
    # 'Contents' is missing from a page that holds no objects.
    for obj in page.get('Contents', []):
        key = obj['Key']
        if key.endswith('.parquet'):
            parquet_files.append(f"s3://{bucket_name}/{key}")
        elif key.endswith('.zip'):
            shapefiles.append(f"s3://{bucket_name}/{key}")

# Report how many objects of each kind were discovered.
print(f"Found {len(parquet_files)} parquet files")
print(f"Found {len(shapefiles)} shapefiles")

# Preview the first five URIs of each kind, one heading per group.
for heading, uris in (
    ("\nFirst 5 parquet files:", parquet_files),
    ("\nFirst 5 shapefiles:", shapefiles),
):
    print(heading)
    for uri in uris[:5]:
        print(f"  {uri}")

Found 54 parquet files
Found 54 shapefiles

First 5 parquet files:
  s3://ookla-open-data/parquet/performance/type=fixed/year=2019/quarter=1/2019-01-01_performance_fixed_tiles.parquet
  s3://ookla-open-data/parquet/performance/type=fixed/year=2019/quarter=2/2019-04-01_performance_fixed_tiles.parquet
  s3://ookla-open-data/parquet/performance/type=fixed/year=2019/quarter=3/2019-07-01_performance_fixed_tiles.parquet
  s3://ookla-open-data/parquet/performance/type=fixed/year=2019/quarter=4/2019-10-01_performance_fixed_tiles.parquet
  s3://ookla-open-data/parquet/performance/type=fixed/year=2020/quarter=1/2020-01-01_performance_fixed_tiles.parquet

First 5 shapefiles:
  s3://ookla-open-data/shapefiles/performance/type=fixed/year=2019/quarter=1/2019-01-01_performance_fixed_tiles.zip
  s3://ookla-open-data/shapefiles/performance/type=fixed/year=2019/quarter=2/2019-04-01_performance_fixed_tiles.zip
  s3://ookla-open-data/shapefiles/performance/type=fixed/year=2019/quarter=3/2019-07-01_perform

In [None]:
# Get the latest fixed and mobile shapefiles.
# Each URI carries year=YYYY/quarter=Q path segments and a YYYY-MM-DD file-name
# prefix, so the lexicographically greatest URI is the most recent release.
fixed_shapefiles = [f for f in shapefiles if 'type=fixed' in f]
mobile_shapefiles = [f for f in shapefiles if 'type=mobile' in f]

# Fail with an explicit message rather than an opaque IndexError when the
# bucket listing did not contain one of the two network types.
if not fixed_shapefiles or not mobile_shapefiles:
    raise ValueError(
        "Expected both fixed and mobile shapefiles in the listing, found "
        f"{len(fixed_shapefiles)} fixed and {len(mobile_shapefiles)} mobile"
    )

# max() picks the same element as sorted(...)[-1] without sorting the whole list.
latest_fixed = max(fixed_shapefiles)
latest_mobile = max(mobile_shapefiles)

print("Latest fixed shapefile:")
print(f"  {latest_fixed}")
print("\nLatest mobile shapefile:")
print(f"  {latest_mobile}")

# Download them to local directory
def download_s3_file(s3_uri, dest_dir=None, client=None):
    """Download an S3 object into a local directory, skipping files already present.

    Args:
        s3_uri: Object location in the form ``s3://bucket/key``.
        dest_dir: Directory to save into. Defaults to the notebook-level
            ``data_dir``. Created if it does not exist yet.
        client: Object exposing ``download_file(bucket, key, path)``.
            Defaults to the notebook-level ``s3`` client.

    Returns:
        Local path of the file: ``dest_dir`` joined with the key's base name.

    Raises:
        ValueError: If ``s3_uri`` is not an ``s3://bucket/key`` URI that names
            a file.
    """
    parsed = urlparse(s3_uri)
    bucket = parsed.netloc
    key = parsed.path.lstrip('/')
    filename = os.path.basename(key)

    # A key ending in '/' has an empty base name; local_path would then be the
    # directory itself, which exists and would be reported as already
    # downloaded. Reject such input up front.
    if parsed.scheme != 's3' or not bucket or not filename:
        raise ValueError(f"Not a valid s3://bucket/key file URI: {s3_uri!r}")

    if dest_dir is None:
        dest_dir = data_dir
    if client is None:
        client = s3
    local_path = os.path.join(dest_dir, filename)

    if os.path.exists(local_path):
        print(f"{filename} already exists, skipping download")
    else:
        # The target directory may not exist on a fresh checkout.
        os.makedirs(dest_dir, exist_ok=True)
        print(f"Downloading {filename}...")
        client.download_file(bucket, key, local_path)
        print(f"  Saved to {local_path}")
    return local_path

# Fetch the fixed archive first, then the mobile one; each call returns the
# local path whether the file was downloaded now or was already on disk.
fixed_path = download_s3_file(latest_fixed)
mobile_path = download_s3_file(latest_mobile)

# Single report of where both archives live locally.
print(
    "\nDownloaded files:",
    f"  Mobile: {mobile_path}",
    f"  Fixed: {fixed_path}",
    sep="\n",
)

Latest fixed shapefile:
  s3://ookla-open-data/shapefiles/performance/type=fixed/year=2025/quarter=3/2025-07-01_performance_fixed_tiles.zip

Latest mobile shapefile:
  s3://ookla-open-data/shapefiles/performance/type=mobile/year=2025/quarter=3/2025-07-01_performance_mobile_tiles.zip
Downloading 2025-07-01_performance_fixed_tiles.zip...


In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Load the shapefile directly from the zip archive: the "zip://" prefix lets
# geopandas read the archive in place, so nothing has to be extracted first.
print(f"Loading {fixed_path}...")
gdf = gpd.read_file(f"zip://{fixed_path}")

print(f"\nLoaded {len(gdf)} tiles")
print(f"Columns: {list(gdf.columns)}")
print("\nFirst few rows:")

# Leave the preview as the cell's final expression so Jupyter renders it as a
# formatted table instead of the truncated plain-text repr that print() gives.
gdf.head()

In [None]:
# Create a map of every tile in the loaded fixed-broadband release.

# Derive the reporting period from the file name instead of hard-coding it, so
# the title stays correct when a newer quarter becomes the "latest" file.
# File names start with the quarter's first day, e.g. "2025-07-01_...".
release_date = os.path.basename(fixed_path).split('_', 1)[0]
date_parts = release_date.split('-')
if len(date_parts) == 3 and all(part.isdigit() for part in date_parts):
    # Months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4.
    period = f"Q{(int(date_parts[1]) - 1) // 3 + 1} {date_parts[0]}"
else:
    # Unexpected naming: show the raw prefix rather than a wrong quarter.
    period = release_date

fig, ax = plt.subplots(figsize=(15, 10))
gdf.plot(ax=ax, alpha=0.5, edgecolor='black', linewidth=0.1)
# Use the Axes methods throughout rather than mixing in the pyplot state machine.
ax.set_title(f'Ookla Fixed Broadband Tiles - {period}')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
fig.tight_layout()
plt.show()

# Show some statistics; the final expression is rendered as a formatted table.
print("\nData summary:")
gdf.describe()