# Nonparametric Quantile Estimation via Quantization

Linear quantile regression assumes the conditional quantile is a linear
function of the covariates.  The `QuantizationQuantileEstimator` provides
a **fully nonparametric** alternative using optimal quantization.

**Reference**: Charlier, Paindaveine & Saracco (2015). *JSPI* 156, 14–30.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pinball import QuantileRegressor
from pinball.nonparametric import QuantizationQuantileEstimator

# Plot styling applied to the figures below, plus one seeded generator that
# the data-simulation cells draw from.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})
rng = np.random.default_rng(seed=42)

## Synthetic Nonlinear Data

We generate data from a sinusoidal model with heteroscedastic noise.

In [None]:
# Covariate: n uniform draws on [0, 2*pi].
n = 800
X_1d = rng.uniform(0, 2 * np.pi, n)

# Response: sine signal plus Gaussian noise whose scale grows linearly
# with the covariate.
noise_scale = 0.3 * (1 + 0.5 * X_1d)
signal = np.sin(X_1d)
y = signal + noise_scale * rng.normal(size=n)

# Column-vector view of the covariate, and the covariate in ascending
# order so the reference curve is drawn left to right.
X = X_1d.reshape(-1, 1)
order = np.argsort(X_1d)
x_sorted = X_1d[order]

plt.scatter(X_1d, y, alpha=0.3, s=10, color='grey')
plt.plot(x_sorted, np.sin(x_sorted), 'k-', lw=2, label='True mean')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Nonlinear Data with Heteroscedastic Noise')
plt.legend()
plt.show()

## Linear vs. Nonparametric: Median Regression

In [None]:
# Linear median regression on the raw covariate.
linear_model = QuantileRegressor(tau=0.5, method='fn')
linear_model.fit(X, y)

# Quantization-based median estimator fitted to the same sample.
quantizer_args = {'tau': 0.5, 'N': 30, 'n_grids': 50, 'random_state': 42}
np_model = QuantizationQuantileEstimator(**quantizer_args)
np_model.fit(X, y)

# Evaluate both fits on an evenly spaced grid over [0, 2*pi].
x_grid = np.linspace(0, 2 * np.pi, 300).reshape(-1, 1)
y_linear = linear_model.predict(x_grid)
y_np = np_model.predict(x_grid)

plt.figure(figsize=(10, 6))
plt.scatter(X_1d, y, alpha=0.2, s=10, color='grey', label='Data')

# (values, format string, extra line properties), drawn in this order so
# the legend lists the truth first.
curves = (
    (np.sin(x_grid), 'k-', {'label': 'True median'}),
    (y_linear, '--', {'color': 'tab:red', 'label': 'Linear QR'}),
    (y_np, '-', {'color': 'tab:blue', 'label': 'Quantization QR'}),
)
for values, fmt, extra in curves:
    plt.plot(x_grid, values, fmt, lw=2, **extra)

plt.xlabel('X')
plt.ylabel('Y')
plt.title('Median Regression: Linear vs. Nonparametric')
plt.legend()
plt.show()

The linear model fits a straight line (missing the curvature), while the
quantization estimator tracks the sinusoidal pattern.

## Multiple Quantiles with the Nonparametric Estimator

In [None]:
# Quantile levels to estimate, one colour per level sampled from the
# reversed red-yellow-blue colormap.
taus = [0.1, 0.25, 0.5, 0.75, 0.9]
colours = plt.cm.RdYlBu_r(np.linspace(0.1, 0.9, len(taus)))

# Estimator settings held fixed across quantile levels; only tau varies.
shared_args = {'N': 30, 'n_grids': 50, 'random_state': 42}

plt.figure(figsize=(10, 6))
plt.scatter(X_1d, y, alpha=0.15, s=8, color='grey')

# Fit a fresh estimator per level and overlay its predicted curve.
for tau, col in zip(taus, colours):
    model = QuantizationQuantileEstimator(tau=tau, **shared_args)
    model.fit(X, y)
    y_hat = model.predict(x_grid)
    plt.plot(x_grid, y_hat, color=col, lw=2, label=f'tau = {tau}')

plt.xlabel('X')
plt.ylabel('Y')
plt.title('Nonparametric Quantile Regression \u2014 Multiple Quantiles')
plt.legend()
plt.show()

## Effect of Grid Size N

In [None]:
# One panel per grid size N, sharing the y-axis so the fits are comparable.
grid_sizes = [5, 20, 50]
fig, axes = plt.subplots(1, len(grid_sizes), figsize=(15, 4), sharey=True)

for ax, N in zip(axes, grid_sizes):
    # Median estimator refitted with only N changed between panels.
    model = QuantizationQuantileEstimator(
        tau=0.5,
        N=N,
        n_grids=50,
        random_state=42,
    )
    model.fit(X, y)
    y_hat = model.predict(x_grid)

    # Data cloud, sine reference curve, then the fitted curve on top.
    ax.scatter(X_1d, y, alpha=0.15, s=5, color='grey')
    ax.plot(x_grid, np.sin(x_grid), 'k-', lw=1.5, label='Truth')
    ax.plot(x_grid, y_hat, '-', color='tab:blue', lw=2, label=f'N={N}')
    ax.set_title(f'N = {N}')
    ax.set_xlabel('X')
    ax.legend(fontsize=9)

# With a shared y-axis only the left-most panel carries the label.
axes[0].set_ylabel('Y')
plt.suptitle('Effect of Grid Size N on the Quantization Estimator', y=1.02)
plt.tight_layout()
plt.show()

## Using in an sklearn Pipeline

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

# Standardise the covariate, then fit the quantization median estimator.
steps = [
    ('scale', StandardScaler()),
    ('quant', QuantizationQuantileEstimator(tau=0.5, N=25, random_state=42)),
]
pipe = Pipeline(steps)

# Five-fold cross-validation of the whole pipeline, scored by R-squared.
scores = cross_val_score(pipe, X, y, cv=5, scoring='r2')
mean_r2 = scores.mean()
std_r2 = scores.std()

print(f'5-fold CV R-squared scores: {scores}')
print(f'Mean R-squared = {mean_r2:.3f} +/- {std_r2:.3f}')

## 2D Example: Bivariate Covariates

In [None]:
# Two uniform covariates on [0, 2*pi]^2; the response is a product of a
# sine and a cosine plus Gaussian noise with constant scale 0.3.
n2 = 1000
X2 = rng.uniform(0, 2 * np.pi, (n2, 2))
y2 = np.sin(X2[:, 0]) * np.cos(X2[:, 1]) + 0.3 * rng.normal(size=n2)

model_2d = QuantizationQuantileEstimator(
    tau=0.5,
    N=40,
    n_grids=50,
    random_state=42,
)
model_2d.fit(X2, y2)

# 50 x 50 evaluation mesh, flattened to (2500, 2) rows for predict() and
# reshaped back to the mesh layout for contouring.
x1_grid = np.linspace(0, 2 * np.pi, 50)
x2_grid = np.linspace(0, 2 * np.pi, 50)
X1g, X2g = np.meshgrid(x1_grid, x2_grid)
X_pred = np.column_stack([X1g.ravel(), X2g.ravel()])
Y_pred = model_2d.predict(X_pred).reshape(X1g.shape)
Y_true = np.sin(X1g) * np.cos(X2g)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: noise-free surface; right panel: fitted surface.  Each panel
# gets its own colourbar.
panels = (
    (Y_true, 'True Median Surface'),
    (Y_pred, 'Estimated Median (N=40, B=50)'),
)
for ax, (surface, panel_title) in zip(axes, panels):
    filled = ax.contourf(X1g, X2g, surface, levels=20, cmap='RdYlBu_r')
    ax.set_title(panel_title)
    ax.set_xlabel('X1')
    ax.set_ylabel('X2')
    plt.colorbar(filled, ax=ax)

plt.suptitle('2D Nonparametric Quantile Regression', y=1.02)
plt.tight_layout()
plt.show()

## Summary

| Feature | Linear QR | Quantization QR |
|---------|-----------|----------------|
| Functional form | Linear | **Any** |
| Coefficients | Yes | No (cell-based) |
| Standard errors | Yes | No |
| Curse of dimensionality | No | Yes |
| Best for | Interpretable linear effects | Nonlinear patterns |