# Why Is the Loess Fit Lower than the Mean Value (and the Linear Regression Fit)?



In [1]:
import pandas as pd
import altair as alt
# alt.data_transformers.enable('data_server')
# Turn off Altair's max-rows check so the full panel can be handed to charts.
alt.data_transformers.disable_max_rows()

# Load the panel data: monthly arrest rates (part 2 crimes per 1,000 people)
# for a number of localities.

panel = pd.read_csv(
    "https://github.com/nickeubank/im_baffled/raw/main/arrest_rates.csv.zip"
)
# Fixed seed so the preview shows the same rows on every Restart & Run All.
panel.sample(5, random_state=0)

Unnamed: 0.1,Unnamed: 0,years_w_decimals,arrest_rate
28459,106783,2018.166667,0.93932
28921,108841,2015.583333,0.775346
26917,101353,2016.583333,0.825034
2673,9885,2018.833333,0.296077
1610,6230,2014.916667,0.576951


In [2]:
# Summary statistics for the arrest rate: the overall sample mean is 1.41
# (the median, 1.14, is lower).
panel["arrest_rate"].describe()

count    31818.000000
mean         1.413783
std          1.107998
min          0.000000
25%          0.745517
50%          1.137404
75%          1.697478
max         13.695871
Name: arrest_rate, dtype: float64

In [3]:
# Averaging the arrest rate within each month gives a
# relatively smooth downward trend.

# One row per month: the month (as a decimal year) and its mean arrest rate.
grouped_means = (
    panel.groupby("years_w_decimals")["arrest_rate"]
    .mean()
    .reset_index()
)

# Scatter of the monthly means; neither axis is forced to include zero.
chart_grouped = (
    alt.Chart(grouped_means)
    .encode(
        x=alt.X("years_w_decimals", scale=alt.Scale(zero=False)),
        y=alt.Y("arrest_rate", scale=alt.Scale(zero=False)),
    )
    .mark_circle(opacity=0.5)
)
chart_grouped

In [4]:
# A first-order polynomial (i.e. linear) regression on the full panel
# passes through the monthly averages, as expected.
reg = (
    alt.Chart(panel)
    .mark_line()
    .transform_regression(
        on="years_w_decimals",
        regression="arrest_rate",
        method="poly",
        order=1,
    )
    .encode(
        x=alt.X("years_w_decimals", scale=alt.Scale(zero=False)),
        y=alt.Y("arrest_rate", scale=alt.Scale(zero=False)),
    )
)
# Overlay the regression line on the monthly-mean scatter.
alt.layer(reg, chart_grouped)

In [5]:
# But not loess: the smoothed line sits below the monthly means.
#
# NOTE(review): a likely explanation to confirm -- the arrest rate is
# right-skewed (mean 1.41 vs. median 1.14 in the summary statistics), and
# Vega's loess implementation appears to run robustness iterations that
# down-weight large residuals, which would pull the fit toward the median
# rather than the mean. Verify against the Vega loess documentation/source.

loess = (
    alt.Chart(panel)
    .encode(
        x=alt.X("years_w_decimals", scale=alt.Scale(zero=False)),
        y=alt.Y("arrest_rate", scale=alt.Scale(zero=False)),
    )
    .transform_loess(
        on="years_w_decimals",
        loess="arrest_rate",
        bandwidth=0.3,
    )
    # Distinct color so the loess curve is not confused with the regression
    # line, which uses the default line color in the same layered chart.
    .mark_line(color="firebrick")
)
reg + chart_grouped + loess