In [None]:
# Enable IPython autoreload for modules
# Mode 2 reloads imported modules before each cell runs, so edits to the
# project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd 
# Load configuration
# The placeholder assignment must come BEFORE %run: %run executes config.py in
# this notebook's namespace, which then rebinds data_dir to its real value.
data_dir = None # To silence Pylance; data_dir is defined by the config.py file.
%run ../../config.py

# Project-local helpers used by the cells below (imported from the `lib` module).
from lib import download_ookla_files, get_ookla_data, list_ookla_objects, build_files_df

In [None]:
import boto3
from botocore import UNSIGNED
from botocore.config import Config
import os
from urllib.parse import urlparse

# Anonymous (unsigned) S3 client: requests carry no signature, which is what
# public buckets accept without credentials.
unsigned_config = Config(signature_version=UNSIGNED)
s3 = boto3.client('s3', config=unsigned_config)

# Ask the project helper for the parquet objects available in the Ookla bucket.
parquet_files = list_ookla_objects(s3)

# Turn the listing into a DataFrame of file paths plus metadata.
files_df = build_files_df(parquet_files)

# Preview the first 20 entries.
files_df.head(n=20)

In [None]:
# Number of listed files per year.
files_df['year'].value_counts()

In [None]:
# Fetch the mobile parquet file(s) for 2019 Q1 into data_dir, reusing the
# unsigned S3 client created earlier in the notebook.
mobile_2019 = download_ookla_files(
    files_df, years=2019, quarters=[1], service_type='mobile', data_dir=data_dir, s3=s3
)

# Report how many rows the helper returned, then preview them.
print(f"Returned rows: {len(mobile_2019)}")
mobile_2019.head()

In [None]:
# Load the 2019 Q1 mobile dataset through the project helper.
df = get_ookla_data(
    files_df,
    year=2019,
    quarter=1,
    service_type='mobile',
    data_dir=data_dir,
)



In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
import geopandas as gpd
from shapely.geometry import Point

# Convert the DataFrame to a GeoDataFrame with one Point per row, using
# tile_x as the x coordinate and tile_y as the y coordinate.
#
# gpd.points_from_xy builds the whole geometry array in one vectorized call.
# The previous per-row `Point(x, y)` list comprehension is a Python-level loop,
# which is very slow once the table has millions of rows.
#
# NOTE(review): the CRS below declares the coordinates to be WGS84 degrees
# (x = longitude, y = latitude) — confirm tile_x/tile_y follow that convention.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df['tile_x'], df['tile_y']),
    crs='EPSG:4326'  # WGS84 lat/lon
)

print(f"Created GeoDataFrame with {len(gdf)} points")
gdf.head()

In [None]:
# Filter the data: keep only rows whose quadkey string begins with '0'.
# (No sampling happens here; every matching row is retained.)
quadkey_starts_with_zero = gdf['quadkey'].str.startswith('0')
t = gdf.loc[quadkey_starts_with_zero]

# Remove the quadkey and the raw tile/coordinate columns; the remaining
# columns (including the geometry) form the working frame for later cells.
columns_to_drop = ['quadkey', 'tile_x', 'tile_y', 'tile']
uss_df = t.drop(columns=columns_to_drop)
uss_df.head()

In [None]:
# Plot a random subset of points colored by avg_d_kbps.
# - A fixed random_state makes the figure reproducible across
#   Restart Kernel -> Run All (the unseeded sample changed on every run).
# - Capping n at len(uss_df) avoids the ValueError that DataFrame.sample raises
#   when asked for more rows than the frame contains (replace=False).
plot_sample = uss_df.sample(n=min(4000, len(uss_df)), random_state=42)
plot_sample.plot(markersize=1, column="avg_d_kbps")


In [None]:
# Summary statistics of avg_d_kbps for a random subset of rows above 10000.
# - A fixed random_state makes the summary reproducible on re-run.
# - Capping n at the number of matching rows avoids the ValueError that
#   DataFrame.sample raises when fewer than 4000 rows pass the filter.
above_10000 = uss_df[uss_df.avg_d_kbps > 10000]
above_10000.sample(n=min(4000, len(above_10000)), random_state=42).avg_d_kbps.describe()

In [None]:
# Persist the filtered frame as a parquet file inside data_dir.
import os

output_name = 'uss_mobile_2019q1.parquet'
output_path = os.path.join(data_dir, output_name)

uss_df.to_parquet(output_path)

# Confirm how many rows were written and where.
print(f"Saved {len(uss_df)} rows to {output_path}")

In [None]:
# Histogram of avg_d_kbps for a random subset of values below .2e6 (200,000).
# - A fixed random_state makes the histogram reproducible on re-run.
# - Capping n at the number of matching values avoids the ValueError that
#   Series.sample raises when fewer than 4000 values pass the filter.
below_cutoff = uss_df.avg_d_kbps[uss_df.avg_d_kbps < .2e6]
below_cutoff.sample(n=min(4000, len(below_cutoff)), random_state=42).hist(bins=100)