In [None]:
# Enable autoreload and import libraries
%load_ext autoreload
%autoreload 2

import pandas as pd
import geopandas as gpd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# Load configuration.
# `data_dir` is bound to None first so the name exists even before the config
# script runs. `%run` then executes ../../config.py and copies the variables that
# script defines into this notebook's namespace.
# NOTE(review): config.py is presumably expected to rebind `data_dir` (the next
# cell builds its parquet paths from it) — confirm against config.py.
data_dir = None
%run ../../config.py


In [None]:
# Inputs: parquet files written by the 2019 Q1 notebooks
import shapely.wkb

mobile_path = f"{data_dir}/uss_mobile_2019q1.parquet"
tracts_path = f"{data_dir}/tracts_with_income_2019.parquet"

# Read both tables from disk
mobile_df = pd.read_parquet(mobile_path)
tracts_gdf = gpd.read_parquet(tracts_path)


def _decode_wkb(raw):
    """Turn one WKB value into a shapely geometry; falsy values become None."""
    if not raw:
        return None
    return shapely.wkb.loads(raw)


# The mobile table stores geometry as WKB, so decode it before wrapping the
# frame in a GeoDataFrame (declared as EPSG:4326).
mobile_df['geometry'] = mobile_df['geometry'].apply(_decode_wkb)
mobile_gdf = gpd.GeoDataFrame(mobile_df, geometry='geometry', crs="EPSG:4326")

print(f"Mobile data rows: {len(mobile_gdf)}")
print(f"Tracts: {len(tracts_gdf)}")

# Put both layers in the same projected CRS before the spatial join
common_epsg = 3857
mobile_gdf = mobile_gdf.to_crs(epsg=common_epsg)
tracts_gdf = tracts_gdf.to_crs(epsg=common_epsg)

# Keep only speed records whose geometry lies within a tract, carrying over
# that tract's GEOID and median income.
tract_columns = ["GEOID", "median_income", "geometry"]
joined = gpd.sjoin(
    left_df=mobile_gdf,
    right_df=tracts_gdf[tract_columns],
    how="inner",
    predicate="within",
)

print(f"Joined rows: {len(joined)}")
joined.head()

In [None]:
# Aggregation helper that builds the tract-level table used by the OLS fit in a later cell
import numpy as np

def aggregate_speeds(joined_gdf, by="GEOID", var="avg_d_kbps", stat="mean"):
    """
    Aggregate speed records to one row per group (per tract by default).

    Parameters
    ----------
    joined_gdf : DataFrame or GeoDataFrame
        Speed records already joined to tracts. Must contain the `by` column,
        the standard columns aggregated below, and `var`.
    by : str, default "GEOID"
        Column to group on.
    var : str, default "avg_d_kbps"
        Dependent-variable column (e.g. 'avg_d_kbps', 'avg_u_kbps'). It is
        guaranteed to appear in the result: if it is not one of the standard
        columns it is aggregated with `stat` as well.
    stat : str or callable, default "mean"
        Aggregation applied to the speed/latency columns, e.g. 'mean' or
        'median' (anything `groupby(...).agg` accepts).

    Returns
    -------
    DataFrame
        One row per `by` value, with `by` as a regular column. Speed and
        latency columns hold `stat`, `tests` and `devices` are summed, and
        `median_income` is the first value seen in each group.

    Raises
    ------
    KeyError
        Raised by pandas (current versions) when a column to aggregate is
        missing from `joined_gdf`.
    """
    agg_dict = {
        'avg_d_kbps': stat,
        'avg_u_kbps': stat,
        'avg_lat_ms': stat,
        'avg_lat_down_ms': stat,
        'avg_lat_up_ms': stat,
        'tests': 'sum',
        'devices': 'sum',
        'median_income': 'first'
    }

    # `var` used to be accepted but ignored, so a non-standard dependent
    # variable silently vanished from the result. Add it when it is not already
    # covered; standard columns keep their predefined aggregation. The grouping
    # key cannot be aggregated, hence the `var != by` guard.
    if var != by:
        agg_dict.setdefault(var, stat)

    return joined_gdf.groupby(by).agg(agg_dict).reset_index()
    
# Example: build the tract-level table using the mean of each speed metric.
# (The OLS fit on this table is done in a later cell.)
agg_df = aggregate_speeds(joined_gdf=joined, var="avg_d_kbps", stat="mean")
agg_df.head()

In [None]:
# Distribution of the total test count per tract (`tests` is summed per GEOID
# when agg_df is built). Drawn on an explicit Axes with labels and a title so
# the figure can be read on its own.
fig, ax = plt.subplots()
agg_df["tests"].hist(bins=50, ax=ax)
ax.set_xlabel("Tests per Tract")
ax.set_ylabel("Number of Tracts")
ax.set_title("Distribution of Test Counts by Tract (2019 Q1)")
plt.tight_layout()
plt.show()

In [None]:
# Summary statistics of the tract-level download speed column
agg_df["avg_d_kbps"].describe()

In [None]:
# Histogram of tract-level download speed, restricted to tracts below a cutoff
# so the bulk of the distribution is visible. The cutoff is named instead of
# being an inline magic number; rows with a missing speed fail the comparison
# and are left out, as before.
speed_cutoff_kbps = 100_000
below_cutoff = agg_df.loc[agg_df["avg_d_kbps"] < speed_cutoff_kbps, "avg_d_kbps"]

fig, ax = plt.subplots()
below_cutoff.hist(bins=50, ax=ax)
ax.set_xlabel("Avg Download Speed (kbps)")
ax.set_ylabel("Number of Tracts")
ax.set_title(f"Download Speed by Tract, below {speed_cutoff_kbps:,} kbps (2019 Q1)")
plt.tight_layout()
plt.show()

In [None]:
# Regress tract-level download speed on median household income
x_var = "median_income"
y_var = "avg_d_kbps"

y = agg_df[y_var]
X = sm.add_constant(agg_df[x_var])  # regressor plus an intercept column

# missing="drop" discards observations with NaN in either variable
model = sm.OLS(endog=y, exog=X, missing="drop").fit()
print(model.summary())

# Scatter of the tract-level values behind the regression
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(agg_df[x_var], agg_df[y_var], s=4, alpha=0.5)
ax.set(
    xlabel="Median Household Income (USD)",
    ylabel="Avg Download Speed (kbps)",
    title="Income vs Download Speed by Tract (2019 Q1)",
)
fig.tight_layout()
plt.show()