# Binscatter Examples

This notebook mirrors the main README example and the Optuna-based demos, so we can quickly preview how `binscatter` behaves on the shipped artifacts.

## Setup

In [29]:
from pathlib import Path

import polars as pl
from binscatter import binscatter

# Directory holding the parquet artifacts that the cells below read from.
data_dir = Path("../artifacts")
# Fail fast, reporting the resolved path so a wrong working directory is easy to spot.
assert data_dir.exists(), f"Data directory {data_dir.resolve()} does not exist."
# Cap Polars table previews at six rows. The trailing semicolon keeps the
# notebook from echoing the call's return value (the Config class) as cell output.
pl.Config.set_tbl_rows(6);

polars.config.Config

## README replication data

The processed Moretti (2021) replication frame lives under `artifacts/state_data_processed.parquet`. We'll reuse the same columns showcased in the README's binscatter.

In [43]:
# Columns kept from the processed replication frame: the x/y pair used in the
# plots below, followed by the columns later passed as controls.
readme_columns = (
    "mtr90_lag3",
    "lnpat",
    "top_corp_lag3",
    "real_gdp_pc",
    "population_density",
    "rd_credit_lag3",
    "statenum",
    "year",
)

# Read, project, and cast in a single pipeline so `df` is bound exactly once.
df = (
    pl.read_parquet(data_dir / "state_data_processed.parquet")
    .select(*readme_columns)
    # Store the state and year identifiers as strings.
    # NOTE(review): presumably so binscatter treats them as categorical
    # controls rather than numeric ones — confirm against the binscatter docs.
    .with_columns(pl.col("statenum", "year").cast(pl.String))
)

print("DataFrame loaded with shape:", df.shape)
df.head()

DataFrame loaded with shape: (3744, 8)


mtr90_lag3,lnpat,top_corp_lag3,real_gdp_pc,population_density,rd_credit_lag3,statenum,year
f32,f32,f32,f32,f32,f32,str,str
-0.036664,1.791759,-0.162519,,,0.0,"""2""","""1939"""
-0.036664,1.386294,-0.162519,,,0.0,"""2""","""1940"""
-0.036664,2.302585,-0.210721,,,0.0,"""2""","""1941"""
-0.036664,1.098612,-0.210721,,,0.0,"""2""","""1942"""
-0.040405,1.098612,-0.274437,,,0.0,"""2""","""1943"""


In [None]:
# Baseline: bin `lnpat` against `mtr90_lag3` without any controls.
p_binscatter_bare = binscatter(
    df,
    x="mtr90_lag3",
    y="lnpat",
    title="No controls",
)

# Control columns shared by the two controlled plots below.
controls = [
    "top_corp_lag3",
    "real_gdp_pc",
    "population_density",
    "rd_credit_lag3",
    "statenum",
    "year",
]

# Same x/y pair, now passing the control columns.
p_binscatter_controls = binscatter(
    df,
    x="mtr90_lag3",
    y="lnpat",
    controls=controls,
    title="With controls",
)

# Same controlled specification, additionally requesting `poly_line=2`.
p_binscatter_controls_poly = binscatter(
    df,
    x="mtr90_lag3",
    y="lnpat",
    controls=controls,
    poly_line=2,
    title="With controls and polynomial fit",
)

# Render the three figures in the order they were built.
for figure in (
    p_binscatter_bare,
    p_binscatter_controls,
    p_binscatter_controls_poly,
):
    figure.show()

## Optuna ElasticNet trials

Next we load the ElasticNet tuning trials produced by `scripts/optuna_example/optimize_elasticnet.py`.

In [33]:
# Keep only the two tuned hyperparameters, the score, and the trial duration.
elasticnet_columns = ("alpha", "l1_ratio", "rmse", "duration_seconds")

# Load the trials with Polars, project the columns, then convert to pandas.
elasticnet_trials = pl.read_parquet(data_dir / "optuna_elasticnet_trials.parquet")
elasticnet_df = elasticnet_trials.select(*elasticnet_columns).to_pandas()
elasticnet_df.head()

Unnamed: 0,alpha,l1_ratio,rmse,duration_seconds
0,0.015677,0.715189,58.591274,0.004749
1,0.025766,0.544883,58.616739,0.001791
2,0.00495,0.645894,58.553649,0.002373
3,0.005628,0.891773,58.537361,0.001927
4,0.715568,0.383442,57.398389,0.001162


In [34]:
# Columns passed as controls when binning `rmse` against `alpha`.
elastic_controls = ["l1_ratio", "duration_seconds"]

fig_elastic = binscatter(
    elasticnet_df, x="alpha", y="rmse", controls=elastic_controls, num_bins=18
)
# Leave the figure as the last expression so the notebook displays it.
fig_elastic

## Optuna LightGBM trials

Finally, inspect the LightGBM learning-rate sweeps saved under `artifacts/optuna_lightgbm_trials.parquet`.

In [35]:
# Hyperparameter columns plus the `rmse` score kept from the LightGBM trials.
lightgbm_columns = (
    "learning_rate",
    "num_leaves",
    "min_child_samples",
    "feature_fraction",
    "lambda_l1",
    "rmse",
)

# Load the trials with Polars, project the columns, then convert to pandas.
lightgbm_trials = pl.read_parquet(data_dir / "optuna_lightgbm_trials.parquet")
lightgbm_df = lightgbm_trials.select(*lightgbm_columns).to_pandas()
lightgbm_df.head()

Unnamed: 0,learning_rate,num_leaves,min_child_samples,feature_fraction,lambda_l1,rmse
0,0.224037,96,50,0.772442,2.118274,0.467123
1,0.180659,116,78,0.691721,3.958625,0.554617
2,0.231537,120,10,0.543565,0.101092,0.461091
3,0.313481,114,79,0.899579,2.307397,0.492723
4,0.056127,88,15,0.972334,2.609242,0.439626


In [36]:
# Columns passed as controls when binning `rmse` against `learning_rate`.
lightgbm_controls = [
    "num_leaves",
    "min_child_samples",
    "feature_fraction",
    "lambda_l1",
]

fig_lightgbm = binscatter(
    lightgbm_df, x="learning_rate", y="rmse", controls=lightgbm_controls, num_bins=15
)
# Leave the figure as the last expression so the notebook displays it.
fig_lightgbm