In [1]:
from pathlib import Path
import duckdb
import pandas as pd
import numpy as np

PROJECT_ROOT = Path("..").resolve()
RAW_DIR = PROJECT_ROOT / "data" / "raw"
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"

# Inputs
TRIPS_PATH = PROJECT_ROOT / "data" / "interim" / "tripdata_2013_2025_clean.parquet"
BOROUGH_GEOJSON = RAW_DIR / "borough_boundaries.geojson"

# From bike counter pipeline
COUNTER_MONTHLY_PATH = PROCESSED_DIR / "nyc_bike_counters" / "bike_counts_monthly_by_borough.parquet"

# Outputs
OUT_DIR = PROCESSED_DIR / "proxy_test"
OUT_DIR.mkdir(parents=True, exist_ok=True)

CITI_MONTHLY_PATH = OUT_DIR / "citi_exposure_monthly_by_borough.parquet"
PROXY_DATASET_PATH = OUT_DIR / "proxy_test_borough_month.parquet"

# DuckDB
con = duckdb.connect(database=str(OUT_DIR / "proxy_test.duckdb"))
con.execute("PRAGMA threads=4")
con.execute("PRAGMA memory_limit='2GB'")
print("OK. OUT_DIR =", OUT_DIR)


OK. OUT_DIR = /Users/patricknussbaum/Desktop/projects/city-bike/data/processed/proxy_test


In [3]:
con.execute("INSTALL spatial;")
con.execute("LOAD spatial;")

con.execute("DROP TABLE IF EXISTS boroughs")
con.execute(f"""
CREATE TABLE boroughs AS
WITH root AS (
  SELECT * FROM read_json('{BOROUGH_GEOJSON.as_posix()}')
),
feat AS (
  SELECT unnest(features) AS feature
  FROM root
)
SELECT
  COALESCE(
    json_extract_string(feature, '$.properties.boro_name'),
    json_extract_string(feature, '$.properties.BoroName'),
    json_extract_string(feature, '$.properties.borough'),
    json_extract_string(feature, '$.properties.boroname'),
    json_extract_string(feature, '$.properties.name')
  ) AS borough,
  ST_GeomFromGeoJSON(json_extract(feature, '$.geometry')) AS geom
FROM feat
WHERE json_extract(feature, '$.geometry') IS NOT NULL;
""")

print(con.execute("SELECT borough, COUNT(*) n FROM boroughs GROUP BY 1").fetch_df())


         borough  n
0         Queens  1
1       Brooklyn  1
2  Staten Island  1
3          Bronx  1
4      Manhattan  1


In [4]:
START = "2020-01-01"
END_EXCL = "2024-12-31"

# Grid size in degrees (~1km is ~0.01 in lat; NYC scale ok for proxy)
GRID_DEG = 0.025   

con.execute("DROP TABLE IF EXISTS citi_monthly")

con.execute(f"""
CREATE TABLE citi_monthly AS
WITH trips AS (
  SELECT
    date_trunc('month', try_cast(started_at AS TIMESTAMP)) AS month_ts,
    floor(try_cast(start_lat AS DOUBLE) / {GRID_DEG}) * {GRID_DEG} AS glat,
    floor(try_cast(start_lng AS DOUBLE) / {GRID_DEG}) * {GRID_DEG} AS glon,
    try_cast(duration_sec AS DOUBLE)/60.0 AS exposure_min
  FROM read_parquet('{TRIPS_PATH.as_posix()}')
  WHERE try_cast(started_at AS TIMESTAMP) >= TIMESTAMP '{START}'
    AND try_cast(started_at AS TIMESTAMP) <  TIMESTAMP '{END_EXCL}'
    AND start_lat IS NOT NULL AND start_lng IS NOT NULL
    AND duration_sec IS NOT NULL
    AND duration_sec > 0 AND duration_sec < 4*60*60
),
cell_month AS (
  SELECT
    month_ts,
    glat,
    glon,
    COUNT(*) AS citi_trips,
    SUM(exposure_min) AS citi_exposure_min
  FROM trips
  GROUP BY 1,2,3
),
cell_boro AS (
  SELECT
    cm.*,
    COALESCE(b.borough, 'UNKNOWN') AS borough
  FROM cell_month cm
  LEFT JOIN boroughs b
    ON ST_Contains(b.geom, ST_Point(cm.glon, cm.glat))
)
SELECT
  month_ts,
  borough,
  SUM(citi_trips) AS citi_trips,
  SUM(citi_exposure_min) AS citi_exposure_min
FROM cell_boro
WHERE borough <> 'UNKNOWN'
GROUP BY 1,2
ORDER BY 1,2;
""")

con.execute(f"COPY citi_monthly TO '{CITI_MONTHLY_PATH.as_posix()}' (FORMAT PARQUET);")
print("Wrote:", CITI_MONTHLY_PATH)
print(con.execute("SELECT * FROM citi_monthly ORDER BY month_ts, borough LIMIT 10").fetch_df())


Wrote: /Users/patricknussbaum/Desktop/projects/city-bike/data/processed/proxy_test/citi_exposure_monthly_by_borough.parquet
    month_ts    borough  citi_trips  citi_exposure_min
0 2020-01-01   Brooklyn    202985.0       2.273642e+06
1 2020-01-01  Manhattan    677190.0       7.994481e+06
2 2020-01-01     Queens     22315.0       2.522009e+05
3 2020-02-01   Brooklyn    189369.0       2.097711e+06
4 2020-02-01  Manhattan    624848.0       7.301890e+06
5 2020-02-01     Queens     21178.0       2.382814e+05
6 2020-03-01   Brooklyn    187635.0       2.960495e+06
7 2020-03-01  Manhattan    561224.0       8.485047e+06
8 2020-03-01     Queens     25082.0       3.950936e+05
9 2020-04-01   Brooklyn    134350.0       2.967044e+06


In [6]:
con.execute("DROP TABLE IF EXISTS counter_monthly")
con.execute(f"""
CREATE TABLE counter_monthly AS
SELECT
  try_cast(month_ts AS TIMESTAMP) AS month_ts,
  upper(trim(borough)) AS borough,
  try_cast(bike_count AS DOUBLE) AS counter_bike_count
FROM read_parquet('{COUNTER_MONTHLY_PATH.as_posix()}')
WHERE month_ts IS NOT NULL;
""")

con.execute("DROP TABLE IF EXISTS citi_monthly_norm")
con.execute("""
CREATE TABLE citi_monthly_norm AS
SELECT
  month_ts,
  upper(trim(borough)) AS borough,
  citi_trips,
  citi_exposure_min
FROM citi_monthly;
""")

# Join dataset
con.execute("DROP TABLE IF EXISTS proxy_bm")
con.execute("""
CREATE TABLE proxy_bm AS
SELECT
  c.month_ts,
  c.borough,
  c.citi_trips,
  c.citi_exposure_min,
  m.counter_bike_count
FROM citi_monthly_norm c
LEFT JOIN counter_monthly m
  USING(month_ts, borough)
WHERE c.borough <> 'UNKNOWN';
""")

df = con.execute("SELECT * FROM proxy_bm ORDER BY month_ts, borough").fetch_df()
print("Rows:", len(df))
df.head()


Rows: 233


Unnamed: 0,month_ts,borough,citi_trips,citi_exposure_min,counter_bike_count
0,2020-01-01,BROOKLYN,202985.0,2273642.0,188120.0
1,2020-01-01,MANHATTAN,677190.0,7994481.0,361681.0
2,2020-01-01,QUEENS,22315.0,252200.9,107117.0
3,2020-02-01,BROOKLYN,189369.0,2097711.0,171002.0
4,2020-02-01,MANHATTAN,624848.0,7301890.0,327645.0


In [7]:
import scipy.stats as st

tmp = df.dropna(subset=["counter_bike_count"]).copy()
tmp = tmp[(tmp["citi_exposure_min"] > 0) & (tmp["counter_bike_count"] > 0)].copy()

tmp["log_citi"] = np.log(tmp["citi_exposure_min"])
tmp["log_cnt"] = np.log(tmp["counter_bike_count"])

pearson = st.pearsonr(tmp["log_citi"], tmp["log_cnt"])
spearman = st.spearmanr(tmp["log_citi"], tmp["log_cnt"])

print("Proxy correlation in log-space:")
print("  Pearson r =", float(pearson.statistic), "p =", float(pearson.pvalue))
print("  Spearman ρ =", float(spearman.statistic), "p =", float(spearman.pvalue))

print("\nCoverage:")
print("  Months×Borough with counter data:", len(tmp), "out of", len(df))


Proxy correlation in log-space:
  Pearson r = 0.8275254300204932 p = 1.5148082663145897e-53
  Spearman ρ = 0.8911749958660714 p = 1.203369975376121e-72

Coverage:
  Months×Borough with counter data: 208 out of 233


In [8]:
import statsmodels.api as sm

X = sm.add_constant(tmp["log_citi"].values)
y = tmp["log_cnt"].values

ols = sm.OLS(y, X).fit()
a, b = ols.params  # intercept, slope

print(ols.summary())
print("\nKey proxy metrics:")
print("  slope b =", float(b), " (ideal ~ 1.0)")
print("  R^2     =", float(ols.rsquared), "(higher is better)")


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.685
Model:                            OLS   Adj. R-squared:                  0.683
Method:                 Least Squares   F-statistic:                     447.5
Date:                Tue, 30 Dec 2025   Prob (F-statistic):           1.51e-53
Time:                        20:20:17   Log-Likelihood:                -66.952
No. Observations:                 208   AIC:                             137.9
Df Residuals:                     206   BIC:                             144.6
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.1742      0.262     27.388      0.0

In [10]:
tmp["share_idx"] = tmp["citi_exposure_min"] / tmp["counter_bike_count"]

# overall distribution
q = tmp["share_idx"].quantile([0.05, 0.25, 0.5, 0.75, 0.95]).to_dict()
print("Share index quantiles (overall):")
for k,v in q.items():
    print(f"  q{int(k*100):02d} = {float(v):.6g}")

# stability per borough: coefficient of variation (std/mean)
stability = (
    tmp.groupby("borough")["share_idx"]
      .agg(["count","mean","std"])
      .assign(cv=lambda x: x["std"]/x["mean"])
      .sort_values("cv")
)
print("\nShare index stability by borough (lower CV = more stable):")
display(stability)


Share index quantiles (overall):
  q05 = 2.35574
  q25 = 5.16401
  q50 = 18.1539
  q75 = 26.133
  q95 = 30.1864

Share index stability by borough (lower CV = more stable):


Unnamed: 0_level_0,count,mean,std,cv
borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MANHATTAN,60,27.251182,3.985691,0.146258
BROOKLYN,60,21.665361,5.271596,0.243319
QUEENS,60,5.787263,1.883496,0.325455
BRONX,28,4.99762,11.682649,2.337643


In [11]:
# keep only rows with counter data
out = tmp[[
    "month_ts", "borough", "citi_trips", "citi_exposure_min",
    "counter_bike_count", "log_citi", "log_cnt", "share_idx"
]].copy()

out.to_parquet(PROXY_DATASET_PATH, index=False)
print("Wrote dashboard dataset:", PROXY_DATASET_PATH)
out.head()


Wrote dashboard dataset: /Users/patricknussbaum/Desktop/projects/city-bike/data/processed/proxy_test/proxy_test_borough_month.parquet


Unnamed: 0,month_ts,borough,citi_trips,citi_exposure_min,counter_bike_count,log_citi,log_cnt,share_idx
0,2020-01-01,BROOKLYN,202985.0,2273642.0,188120.0,14.636893,12.144835,12.086124
1,2020-01-01,MANHATTAN,677190.0,7994481.0,361681.0,15.894262,12.798518,22.10368
2,2020-01-01,QUEENS,22315.0,252200.9,107117.0,12.437981,11.581677,2.354443
3,2020-02-01,BROOKLYN,189369.0,2097711.0,171002.0,14.556357,12.049431,12.267174
4,2020-02-01,MANHATTAN,624848.0,7301890.0,327645.0,15.803644,12.699686,22.285981
