In [1]:
from mitools import reg
import numpy as np
from pandas import DataFrame
import statsmodels.api as sm
from mitools.visuals import LinePlotter, ScatterPlotter

# Quantile Regression from Statsmodels

Source: [statsmodels quantile regression example](https://www.statsmodels.org/stable/examples/notebooks/generated/quantile_regression.html)

In [None]:
# Load the Engel dataset bundled with statsmodels and keep its DataFrame;
# the formulas below use its `income` and `foodexp` columns.
engel_dataset = sm.datasets.engel.load_pandas()
data = engel_dataset.data
data.head()

## Least Absolute Deviation

The least absolute deviation (LAD) model is a special case of quantile regression with $q = 0.5$, i.e. median regression.



In [None]:
# Single-quantile fit at q = 0.5 of food expenditure on income.
model = reg.QuantileRegressionModel(
    data=data,
    formula="foodexp ~ income",
    quantiles=0.5,
)
res = model.fit()

# Print the plain-text summary table of the fitted model.
lad_summary = res.summary()
print(lad_summary)

## Visualizing the results

We estimate the quantile regression model for many quantiles between .05 and .95, and compare the best-fit line from each of these models to the Ordinary Least Squares results.

#### Prepare data for plotting

For convenience, we place the quantile regression results in a Pandas DataFrame, and the OLS results in a dictionary.

In [4]:
# Quantile levels 0.05, 0.15, ..., 0.95; the stop of 0.96 keeps 0.95 in the grid.
quantiles = np.arange(0.05, 0.96, 0.1)
model = reg.QuantileRegressionModel(data=data, formula="foodexp ~ income", quantiles=quantiles)

# With several quantiles the fit is consumed below as a {quantile: result} mapping.
# It is deliberately NOT called `ols`: these are quantile-regression fits, and the
# OLS fit is built separately in a later cell.
quantile_fits = model.fit()

# One row per quantile:
#   q  - quantile level
#   a  - intercept estimate
#   b  - income slope estimate
#   lb - lower bound of conf_int() for the income slope
#   ub - upper bound of conf_int() for the income slope
# Passing `columns=` to the constructor keeps the labels even if no fits come back.
models = DataFrame(
    [
        [
            q,
            fit.params["Intercept"],
            fit.params["income"],
            *fit.conf_int().loc["income"].tolist(),
        ]
        for q, fit in quantile_fits.items()
    ],
    columns=["q", "a", "b", "lb", "ub"],
)

In [5]:
# Fit OLS on the same formula and keep only what the plots need:
# intercept (a), income slope (b) and the slope's confidence bounds (lb, ub).
ols_fit = reg.OLSModel(data=data, formula="foodexp ~ income").fit()
ols_ci = ols_fit.conf_int().loc["income"].tolist()
ols = {
    "a": ols_fit.params["Intercept"],
    "b": ols_fit.params["income"],
    "lb": ols_ci[0],
    "ub": ols_ci[1],
}

In [None]:
# Text dump of the quantile-regression table, then the OLS coefficient dict.
for fitted_summary in (models, ols):
    print(fitted_summary)

#### First plot

This plot compares best fit lines for 10 quantile regression models to the least squares fit. As Koenker and Hallock (2001) point out, we see that:

1. Food expenditure increases with income
2. The dispersion of food expenditure increases with income
3. The least squares estimates fit low income observations quite poorly (i.e. the OLS line passes over most low income households)

In [7]:
# Income grid from the observed minimum up to (but not including) the observed
# maximum, in steps of 50.
income_min, income_max = data.income.min(), data.income.max()
x = np.arange(income_min, income_max, 50)


def get_y(a, b):
    """Evaluate the line ``a + b * x`` on the module-level income grid ``x``.

    a: intercept of the line.
    b: slope of the line.
    """
    return a + b * x

In [None]:
# One dotted grey fitted line per quantile model, all on the shared income grid.
n_quantile_lines = models.shape[0]
quantile_lines = [get_y(a, b) for a, b in zip(models.a, models.b)]
line_plot = LinePlotter([x] * n_quantile_lines, quantile_lines)
line_plot = line_plot.set_linestyle("dotted").set_color('grey')
ax = line_plot.draw()

# Overlay the OLS fit in red on the same axes, with an empty marker string.
y = get_y(ols["a"], ols["b"])
line_plot = LinePlotter(x, y, ax=ax, color='red', label='OLS').set_marker("")
ax = line_plot.draw()

# Add the raw observations, then fix the axis limits and label the axes.
scatter_plot = ScatterPlotter(data.income, data.foodexp, ax=ax, color='blue', label='Data', alpha=0.2)
scatter_plot = (
    scatter_plot
    .set_limits((240, 3000), (240, 2000))
    .set_xlabel("Income", fontsize=16)
    .set_ylabel("Food expenditure", fontsize=16)
)
ax = scatter_plot.draw()
_ = ax.legend()

#### Second plot

The dotted black lines form a 95% point-wise confidence band around the 10 quantile regression estimates (solid black line). The red lines represent the OLS regression results along with their 95% confidence interval.

In most cases, the quantile regression point estimates lie outside the OLS confidence interval, which suggests that the effect of income on food expenditure may not be constant across the distribution.

In [None]:
n = models.shape[0]

# Quantile-regression income slopes (black) with their dotted upper/lower bounds.
ax = LinePlotter(models.q, models.b, color="black", label="Quantile Reg.").draw()
for bound in ("ub", "lb"):
    LinePlotter(models.q, models[bound], linestyle="dotted", color="black", ax=ax).draw()

# The OLS slope and its bounds do not vary with q, so each value is repeated
# n times to draw a horizontal line across the quantile axis.
ols_slope_line = [ols["b"]] * n
ols_lower_line = [ols["lb"]] * n
ols_upper_line = [ols["ub"]] * n
LinePlotter(models.q, ols_slope_line, color="red", label="OLS", ax=ax).draw()
LinePlotter(models.q, ols_lower_line, linestyle="dotted", color="red", ax=ax).draw()

# The last line drawn also carries the axis labels for the whole figure.
ols_upper_plot = LinePlotter(models.q, ols_upper_line, linestyle="dotted", color="red", ax=ax)
(ols_upper_plot
 .set_ylabel(r"$\beta_{income}$")
 .set_xlabel("Quantiles of the conditional food expenditure distribution")
 .draw())
_ = ax.legend()

***