In [1]:
import boto3
from botocore import UNSIGNED
from botocore.config import Config

# Anonymous (unsigned) S3 client: requests to this public bucket are sent without credentials
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

bucket_name = 'ookla-open-data'

# Object URIs gathered from the bucket listing, split by file extension
parquet_files = []
shapefiles = []

# Walk the listing page by page; a truncated response carries the token
# needed to request the next page.
response = s3.list_objects_v2(Bucket=bucket_name)
while True:
    page_keys = [entry['Key'] for entry in response.get('Contents', [])]
    parquet_files += [f"s3://{bucket_name}/{k}" for k in page_keys if k.endswith('.parquet')]
    # .zip objects are collected as the shapefile archives
    shapefiles += [f"s3://{bucket_name}/{k}" for k in page_keys if k.endswith('.zip')]

    if not response.get('IsTruncated'):
        break
    response = s3.list_objects_v2(
        Bucket=bucket_name,
        ContinuationToken=response['NextContinuationToken'],
    )

# Report how many objects of each kind were found
print(f"Found {len(parquet_files)} parquet files")
print(f"Found {len(shapefiles)} shapefiles")

# Preview the first five URIs from each list
for heading, uris in (
    ("\nFirst 5 parquet files:", parquet_files),
    ("\nFirst 5 shapefiles:", shapefiles),
):
    print(heading)
    for f in uris[:5]:
        print(f"  {f}")

Found 54 parquet files
Found 54 shapefiles

First 5 parquet files:
  s3://ookla-open-data/parquet/performance/type=fixed/year=2019/quarter=1/2019-01-01_performance_fixed_tiles.parquet
  s3://ookla-open-data/parquet/performance/type=fixed/year=2019/quarter=2/2019-04-01_performance_fixed_tiles.parquet
  s3://ookla-open-data/parquet/performance/type=fixed/year=2019/quarter=3/2019-07-01_performance_fixed_tiles.parquet
  s3://ookla-open-data/parquet/performance/type=fixed/year=2019/quarter=4/2019-10-01_performance_fixed_tiles.parquet
  s3://ookla-open-data/parquet/performance/type=fixed/year=2020/quarter=1/2020-01-01_performance_fixed_tiles.parquet

First 5 shapefiles:
  s3://ookla-open-data/shapefiles/performance/type=fixed/year=2019/quarter=1/2019-01-01_performance_fixed_tiles.zip
  s3://ookla-open-data/shapefiles/performance/type=fixed/year=2019/quarter=2/2019-04-01_performance_fixed_tiles.zip
  s3://ookla-open-data/shapefiles/performance/type=fixed/year=2019/quarter=3/2019-07-01_perform

In [None]:
# Pick the most recent fixed and mobile shapefile archives.
# Each URI carries year=/quarter= path segments and a YYYY-MM-DD file name
# prefix, so plain string ordering places the newest archive at the end.
fixed_shapefiles = list(filter(lambda uri: 'type=fixed' in uri, shapefiles))
mobile_shapefiles = list(filter(lambda uri: 'type=mobile' in uri, shapefiles))

# Descending sort, then take the head == last element of the ascending sort
latest_fixed = sorted(fixed_shapefiles, reverse=True)[0]
latest_mobile = sorted(mobile_shapefiles, reverse=True)[0]

print("Latest fixed shapefile:", f"  {latest_fixed}", sep="\n")
print("\nLatest mobile shapefile:", f"  {latest_mobile}", sep="\n")

# Prepare the local download directory
import os
from urllib.parse import urlparse

os.makedirs('data', exist_ok=True)

def download_s3_file(s3_uri, dest_dir='data', client=None):
    """Download one S3 object into a local directory, skipping files already present.

    Args:
        s3_uri: Object location in the form ``s3://<bucket>/<key>``.
        dest_dir: Directory the file is written to; created if it does not
            exist. Defaults to ``'data'``.
        client: Object exposing ``download_file(bucket, key, local_path)``.
            When omitted, the notebook-level ``s3`` client is used.

    Returns:
        The local path ``<dest_dir>/<basename of key>``, whether the file was
        downloaded now or was already on disk.

    Raises:
        ValueError: If the URI has no bucket or no file name component.
    """
    parsed = urlparse(s3_uri)
    bucket = parsed.netloc
    key = parsed.path.lstrip('/')
    filename = os.path.basename(key)

    # Without this check an empty file name resolves to the directory itself,
    # which always "exists" and would be reported as an already-downloaded file.
    if not bucket or not filename:
        raise ValueError(f"Not a valid S3 object URI: {s3_uri!r}")

    # Create the target directory here so the function does not rely on an
    # earlier cell having done it.
    os.makedirs(dest_dir, exist_ok=True)
    local_path = os.path.join(dest_dir, filename)

    if os.path.exists(local_path):
        print(f"{filename} already exists, skipping download")
        return local_path

    if client is None:
        client = s3  # notebook-level client created in the first cell

    print(f"Downloading {filename}...")
    client.download_file(bucket, key, local_path)
    print(f"  Saved to {local_path}")
    return local_path

# Fetch (or reuse) the local copies: fixed archive first, then mobile
fixed_path = download_s3_file(latest_fixed)
mobile_path = download_s3_file(latest_mobile)


# Summarise where each archive lives locally
print("\nDownloaded files:")
for label, local_copy in (("Mobile", mobile_path), ("Fixed", fixed_path)):
    print(f"  {label}: {local_copy}")

Latest fixed shapefile:
  s3://ookla-open-data/shapefiles/performance/type=fixed/year=2025/quarter=3/2025-07-01_performance_fixed_tiles.zip

Latest mobile shapefile:
  s3://ookla-open-data/shapefiles/performance/type=mobile/year=2025/quarter=3/2025-07-01_performance_mobile_tiles.zip
2025-07-01_performance_fixed_tiles.zip already exists, skipping download
2025-07-01_performance_mobile_tiles.zip already exists, skipping download

Downloaded files:
  Mobile: data/2025-07-01_performance_mobile_tiles.zip
  Fixed: data/2025-07-01_performance_fixed_tiles.zip


: 

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt

# Read the shapefile straight out of the downloaded zip archive (no manual extraction)
print(f"Loading {fixed_path}...")
fixed_zip_uri = f"zip://{fixed_path}"
gdf = gpd.read_file(fixed_zip_uri)

# Quick look at what was loaded: row count, column names, and a preview
tile_count = len(gdf)
column_names = list(gdf.columns)
print(f"\nLoaded {tile_count} tiles")
print(f"Columns: {column_names}")
print("\nFirst few rows:")
print(gdf.head())


Loading data/2025-07-01_performance_fixed_tiles.zip...


In [None]:

# Derive the period label from the downloaded file name (e.g.
# '2025-07-01_performance_fixed_tiles.zip' -> 'Q3 2025') so the title always
# matches whichever quarter was selected as "latest" instead of a fixed string.
tile_date = os.path.basename(fixed_path).split('_', 1)[0]
try:
    year_text, month_text, _day_text = tile_date.split('-')
    period_label = f"Q{(int(month_text) - 1) // 3 + 1} {int(year_text)}"
except ValueError:
    # Unexpected file name: fall back to the raw prefix rather than failing the plot
    period_label = tile_date

# Create a map
fig, ax = plt.subplots(figsize=(15, 10))
gdf.plot(ax=ax, alpha=0.5, edgecolor='black', linewidth=0.1)
ax.set_title(f'Ookla Fixed Broadband Tiles - {period_label}')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
fig.tight_layout()
plt.show()

# Show some statistics
print("\nData summary:")
print(gdf.describe())

In [None]:
# Get Census data - ACS 5-Year median household income by tract.
# The 2022 vintage is queried below (2025 ACS data may not be available yet).
import requests
import pandas as pd

# Census API endpoint for ACS 5-year estimates
ACS_BASE_URL = "https://api.census.gov/data/2022/acs/acs5"
# Number of states to fetch. Kept small for testing; set to None to fetch
# every state/territory returned by the API.
MAX_STATES = 5
# Seconds to wait on each request so a stalled connection cannot hang the cell
REQUEST_TIMEOUT = 60

# B19013_001E is median household income
print("Fetching ACS 5-Year median household income data by tract...")

# Get state FIPS codes first
states_url = f"{ACS_BASE_URL}?get=NAME&for=state:*"
states_response = requests.get(states_url, timeout=REQUEST_TIMEOUT)
states_response.raise_for_status()  # surface HTTP errors instead of a confusing JSON failure
states_data = states_response.json()
state_fips = [row[1] for row in states_data[1:]]  # Skip header

print(f"Found {len(state_fips)} states/territories")

# Fetch median income for all tracts in each selected state
all_tracts = []

for state_fips_code in state_fips[:MAX_STATES]:
    url = f"{ACS_BASE_URL}?get=NAME,B19013_001E&for=tract:*&in=state:{state_fips_code}"
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # Best effort: report the failed state and keep going with the others
        print(f"  Error fetching state {state_fips_code}: {e}")
        continue

    # First row of the payload is the header, the rest are tract records
    state_tracts = pd.DataFrame(data[1:], columns=data[0])
    all_tracts.append(state_tracts)
    print(f"  State {state_fips_code}: {len(state_tracts)} tracts")

if not all_tracts:
    # pd.concat([]) would fail with an unrelated-looking message; be explicit
    raise RuntimeError("No tract data could be fetched from the Census API")

# Combine all data
income_df = pd.concat(all_tracts, ignore_index=True)

# Clean up the data.
# The API returns values as strings; non-numeric entries become NaN. The ACS
# also encodes "no estimate available" as large negative sentinel values
# (e.g. -666666666), which must not be treated as real incomes.
median_income = pd.to_numeric(income_df['B19013_001E'], errors='coerce')
income_df['median_income'] = median_income.where(median_income >= 0)
income_df['GEOID'] = income_df['state'] + income_df['county'] + income_df['tract']

print(f"\nTotal tracts with income data: {len(income_df)}")
print(f"Tracts with valid income: {income_df['median_income'].notna().sum()}")
print("\nIncome statistics:")
print(income_df['median_income'].describe())
print("\nSample data:")
print(income_df[['NAME', 'median_income', 'GEOID']].head(10))

In [None]:
# Join income data to tract geometries
print("Joining income data to tract shapefile...")

tracts_with_income = tracts_gdf.merge(
    income_df[['GEOID', 'median_income', 'NAME']], 
    on='GEOID', 
    how='left',
    suffixes=('_tract', '_income')
)

print(f"\nTracts with income data: {tracts_with_income['median_income'].notna().sum()}")
print(f"Tracts without income data: {tracts_with_income['median_income'].isna().sum()}")

# Quick map of median income
fig, ax = plt.subplots(figsize=(15, 10))
tracts_with_income.plot(
    column='median_income',
    ax=ax,
    legend=True,
    cmap='YlOrRd',
    edgecolor='black',
    linewidth=0.1,
    missing_kwds={'color': 'lightgrey'}
)
plt.title('Median Household Income by Census Tract (2022 ACS 5-Year)')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.tight_layout()
plt.show()

print("\nIncome data ready for analysis!")

In [None]:
# Download Census TIGER tract shapefiles for 2022
# TIGER files are available from Census FTP

print("Downloading Census TIGER tract shapefiles...")

# National tract shapefile URL for 2022
tiger_url = "https://www2.census.gov/geo/tiger/TIGER2022/TRACT/tl_2022_us_tract.zip"
tiger_path = "data/tl_2022_us_tract.zip"

# Download the file if it doesn't exist
if os.path.exists(tiger_path):
    print(f"TIGER shapefile already exists at {tiger_path}, skipping download")
else:
    print(f"Downloading from {tiger_url}...")
    response = requests.get(tiger_url, stream=True)
    with open(tiger_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"Saved to {tiger_path}")

# Load the tract shapefile
print("Loading tract shapefile...")
tracts_gdf = gpd.read_file(f"zip://{tiger_path}")

# Create GEOID to match income data
tracts_gdf['GEOID'] = tracts_gdf['GEOID']

print(f"\nLoaded {len(tracts_gdf)} census tracts")
print(f"Columns: {list(tracts_gdf.columns)}")

print(f"\nSample tracts:")
print(tracts_gdf.head())