# Ookla Broadband Performance Data

This notebook downloads and explores Ookla open data for fixed and mobile broadband performance.

**Important**: Run the cells in order, starting with the configuration cell below.

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to local modules (such as `lib`) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd 
# Load configuration
data_dir = None  # Placeholder to silence Pylance; data_dir is defined by the config.py file run below.
%run ../../config.py

from lib import download_ookla_files, get_ookla_data, list_ookla_objects, build_files_df

Project root: /Users/eric/proj/scratch/WirelessIncome
Data directory: /Users/eric/proj/scratch/WirelessIncome/data


In [2]:
import boto3
from botocore import UNSIGNED
from botocore.config import Config
import os
from urllib.parse import urlparse

# The bucket is public, so requests are sent unsigned and no AWS credentials
# are required.
anonymous_config = Config(signature_version=UNSIGNED)
s3 = boto3.client("s3", config=anonymous_config)

# Enumerate the parquet objects in the Ookla bucket, then tabulate the file
# paths together with their metadata.
parquet_files = list_ookla_objects(s3)
files_df = build_files_df(parquet_files)

files_df.head(20)

Unnamed: 0,path,service_type,year,quarter
0,s3://ookla-open-data/parquet/performance/type=...,fixed,2019,1
1,s3://ookla-open-data/parquet/performance/type=...,fixed,2019,2
2,s3://ookla-open-data/parquet/performance/type=...,fixed,2019,3
3,s3://ookla-open-data/parquet/performance/type=...,fixed,2019,4
4,s3://ookla-open-data/parquet/performance/type=...,fixed,2020,1
5,s3://ookla-open-data/parquet/performance/type=...,fixed,2020,2
6,s3://ookla-open-data/parquet/performance/type=...,fixed,2020,3
7,s3://ookla-open-data/parquet/performance/type=...,fixed,2020,4
8,s3://ookla-open-data/parquet/performance/type=...,fixed,2021,1
9,s3://ookla-open-data/parquet/performance/type=...,fixed,2021,2


In [3]:
# Peek at the first catalogue row to see which fields are recorded per file.
files_df.iloc[0]


path            s3://ookla-open-data/parquet/performance/type=...
service_type                                                fixed
year                                                         2019
quarter                                                         1
Name: 0, dtype: object

In [4]:
# Count how many catalogue entries exist for each year.
files_df["year"].value_counts()

year
2019    8
2020    8
2021    8
2022    8
2023    8
2024    8
2025    6
Name: count, dtype: int64

In [None]:
# Selection parameters; the load cell further down reuses these names.
TARGET_YEAR = 2022
TARGET_QUARTERS = [1, 2, 3, 4]
SERVICE_TYPES = ["mobile", "fixed"]

# Fetch the matching files one service type at a time and collect the
# per-service metadata frames returned by the helper.
downloads = []
for service in SERVICE_TYPES:
    selection = download_ookla_files(
        files_df,
        years=TARGET_YEAR,
        quarters=TARGET_QUARTERS,
        service_type=service,
        data_dir=data_dir,
        s3=s3,
    )
    # Tag with .assign() instead of item assignment: the helper's result keeps
    # files_df's index labels, so it is presumably a filtered subset of that
    # frame. Writing a column into it in place risks chained-assignment
    # warnings and mutates an object this cell does not own; .assign() returns
    # a fresh frame and keeps the cell safe to re-run.
    downloads.append(selection.assign(ookla_service_type=service))

downloaded = pd.concat(downloads, ignore_index=True)
print(
    f"Downloaded metadata for {len(downloaded)} files | "
    f"year={TARGET_YEAR}, quarters={TARGET_QUARTERS}, services={SERVICE_TYPES}"
)
downloaded.head()

Found 4 mobile files for years=[2022], quarters=[1, 2, 3, 4]
2022-01-01_performance_mobile_tiles.parquet already exists, skipping
2022-04-01_performance_mobile_tiles.parquet already exists, skipping
2022-07-01_performance_mobile_tiles.parquet already exists, skipping
2022-10-01_performance_mobile_tiles.parquet already exists, skipping

Downloaded 4 files
Returned rows: 4


Unnamed: 0,path,service_type,year,quarter,local_path
39,s3://ookla-open-data/parquet/performance/type=...,mobile,2022,1,/Users/eric/proj/scratch/WirelessIncome/data/2...
40,s3://ookla-open-data/parquet/performance/type=...,mobile,2022,2,/Users/eric/proj/scratch/WirelessIncome/data/2...
41,s3://ookla-open-data/parquet/performance/type=...,mobile,2022,3,/Users/eric/proj/scratch/WirelessIncome/data/2...
42,s3://ookla-open-data/parquet/performance/type=...,mobile,2022,4,/Users/eric/proj/scratch/WirelessIncome/data/2...


In [None]:
# Read the previously downloaded files for every requested quarter and service
# type in a single call (the download cell must have been run first).
load_request = dict(
    files_df=files_df,
    data_dir=data_dir,
    service_type=SERVICE_TYPES,
    year=[TARGET_YEAR],
    quarter=TARGET_QUARTERS,
)
df = get_ookla_data(**load_request)

df.head()

Loading 2022-01-01_performance_mobile_tiles.parquet...
  Loaded 3820724 rows
Loading 2022-04-01_performance_mobile_tiles.parquet...
  Loaded 4027744 rows
Loading 2022-07-01_performance_mobile_tiles.parquet...
  Loaded 4046154 rows
Loading 2022-10-01_performance_mobile_tiles.parquet...
  Loaded 3838065 rows
Concatenated 4 files -> 15732687 total rows


In [8]:
# List the distinct (year, quarter, service type) combinations that made it
# into the concatenated frame, ordered by service type first.
display_cols = ["ookla_year", "ookla_quarter", "ookla_service_type"]
sort_cols = ["ookla_service_type", "ookla_year", "ookla_quarter"]

unique_metadata = df.loc[:, display_cols].drop_duplicates()
t = unique_metadata.sort_values(by=sort_cols)
t

Unnamed: 0,ookla_year,ookla_quarter,ookla_service_type
0,2022,1,mobile
3820724,2022,2,mobile
7848468,2022,3,mobile
11894622,2022,4,mobile


In [None]:
import geopandas as gpd
from shapely import wkb, wkt
from shapely.geometry import Point


def _load_geometry(value):
    """Best-effort conversion of one cell value into a geometry object.

    Parameters
    ----------
    value : object
        One of:
        * an object exposing ``geom_type`` (treated as an already-built
          geometry and returned unchanged),
        * a bytes-like value (``bytes``, ``bytearray``, ``memoryview``),
          parsed as WKB,
        * a ``str``, parsed as WKT.

    Returns
    -------
    object or None
        The parsed geometry, or ``None`` when the value is of an unsupported
        type or cannot be parsed. Returning ``None`` (instead of raising) lets
        the caller detect failures with ``isna()`` and apply its own fallback.
    """
    # Duck-typed check: anything carrying ``geom_type`` is passed through.
    if hasattr(value, "geom_type"):
        return value

    if isinstance(value, (bytes, bytearray, memoryview)):
        # Normalise every bytes-like input to ``bytes`` before handing it to
        # the WKB parser.
        parser, payload = wkb, bytes(value)
    elif isinstance(value, str):
        parser, payload = wkt, value
    else:
        return None

    try:
        return parser.loads(payload)
    except Exception:
        # Deliberate best-effort: a malformed WKB *or* WKT payload is reported
        # as "no geometry" so a single bad row cannot abort a whole-column
        # ``.apply``; both formats now fail the same way.
        return None

# Build one geometry per row: prefer a parsed "geometry" column when the frame
# has one, otherwise derive points from the tile_x / tile_y coordinates.
if "geometry" in df.columns:
    geometry_series = df["geometry"].apply(_load_geometry)
    missing_mask = geometry_series.isna()
    missing_geometry = missing_mask.sum()
    if missing_geometry:
        # Fall back to tile coordinates where geometry parsing failed. Points
        # are built (vectorised) only for the affected rows and indexed by
        # their original labels, so fillna() aligns them and leaves every
        # successfully parsed geometry untouched.
        fallback_points = pd.Series(
            gpd.points_from_xy(
                df.loc[missing_mask, "tile_x"],
                df.loc[missing_mask, "tile_y"],
            ),
            index=df.index[missing_mask],
        ).astype(object)
        geometry_series = geometry_series.fillna(fallback_points)
else:
    # Vectorised construction; avoids creating one Point at a time in a Python
    # loop, which is slow on frames with millions of rows.
    geometry_series = gpd.points_from_xy(df["tile_x"], df["tile_y"])

# Copy the source frame so attaching the geometry column does not touch `df`.
gdf = gpd.GeoDataFrame(df.copy(), geometry=geometry_series, crs="EPSG:4326")
print(f"Created GeoDataFrame with {len(gdf)} points spanning {SERVICE_TYPES}")
gdf.head()

Created GeoDataFrame with 15732687 points


Unnamed: 0,quadkey,tile,tile_x,tile_y,avg_d_kbps,avg_u_kbps,avg_lat_ms,avg_lat_down_ms,avg_lat_up_ms,tests,devices,ookla_year,ookla_quarter,ookla_service_type,geometry
0,22133222312322,"POLYGON((-160.02685546875 70.6435894914449, -1...",-160.0241,70.6427,19110,7891,77,,,3,2,2022,1,mobile,POINT (-160.0241 70.6427)
1,22133222330023,"POLYGON((-160.043334960938 70.6363054807905, -...",-160.0406,70.6354,21870,11875,83,,,2,1,2022,1,mobile,POINT (-160.0406 70.6354)
2,22133222330032,"POLYGON((-160.037841796875 70.6363054807905, -...",-160.0351,70.6354,14567,14691,75,,,9,2,2022,1,mobile,POINT (-160.0351 70.6354)
3,22133222330100,"POLYGON((-160.02685546875 70.6417687358462, -1...",-160.0241,70.6409,5468,9886,83,,,1,1,2022,1,mobile,POINT (-160.0241 70.6409)
4,22133222330102,"POLYGON((-160.02685546875 70.6399478155463, -1...",-160.0241,70.639,24311,16243,72,,,1,1,2022,1,mobile,POINT (-160.0241 70.639)


In [None]:
# Keep only rows whose quadkey begins with "0", then discard the tile columns
# that are no longer needed once the point geometry exists.
# NOTE(review): this was described as a "continental US" filter, but in the
# quadkey scheme a leading "0" appears to select the whole north-west quadrant
# of the world map; the filtered sample still contains Alaska tiles (around
# lon -160, lat 70). Confirm whether a tighter filter is intended.
quadkey_prefix_mask = gdf["quadkey"].str.startswith("0")
unused_columns = ["quadkey", "tile_x", "tile_y", "tile"]
point_gdf = (
    gdf.loc[quadkey_prefix_mask]
    .copy()
    .drop(columns=unused_columns, errors="ignore")
)

print(f"Filtered to {len(point_gdf):,} Ookla samples after quadkey filter")
point_gdf.head()

Unnamed: 0,avg_d_kbps,avg_u_kbps,avg_lat_ms,avg_lat_down_ms,avg_lat_up_ms,tests,devices,ookla_year,ookla_quarter,ookla_service_type,geometry
0,19110,7891,77,,,3,2,2022,1,mobile,POINT (-160.0241 70.6427)
1,21870,11875,83,,,2,1,2022,1,mobile,POINT (-160.0406 70.6354)
2,14567,14691,75,,,9,2,2022,1,mobile,POINT (-160.0351 70.6354)
3,5468,9886,83,,,1,1,2022,1,mobile,POINT (-160.0241 70.6409)
4,24311,16243,72,,,1,1,2022,1,mobile,POINT (-160.0241 70.639)


In [None]:
# Row and column counts of the filtered frame.
point_gdf.shape

(4066168, 11)