In [34]:
import pandas as pd

# Toy order table, written one tuple per order:
# (revenue, free_shipping, bag_size).
df = pd.DataFrame(
    [
        (100, False, 10),
        (150, True, 10),
        (145, True, 10),
        (50, False, 30),
        (45, False, 30),
        (100, True, 30),
    ],
    columns=["revenue", "free_shipping", "bag_size"],
)
df

Unnamed: 0,revenue,free_shipping,bag_size
0,100,False,10
1,150,True,10
2,145,True,10
3,50,False,30
4,45,False,30
5,100,True,30


In [35]:
# Naive average revenue among free-shipping orders (no adjustment for bag_size).
# `free_shipping` is already boolean, so it is used directly as the row mask.
df.loc[df["free_shipping"], "revenue"].mean()

131.66666666666666

In [36]:
import statsmodels.formula.api as smf

# OLS of revenue on the free-shipping flag with bag_size as an additional regressor.
m = smf.ols(formula="revenue ~ free_shipping + bag_size", data=df).fit()
m.summary()


  warn("omni_normtest is not valid with less than 8 observations; %i "


0,1,2,3
Dep. Variable:,revenue,R-squared:,0.997
Model:,OLS,Adj. R-squared:,0.994
Method:,Least Squares,F-statistic:,450.0
Date:,"Tue, 07 Mar 2023",Prob (F-statistic):,0.000191
Time:,15:34:48,Log-Likelihood:,-13.658
No. Observations:,6,AIC:,33.32
Df Residuals:,3,BIC:,32.69
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,123.3333,3.879,31.796,0.000,110.989,135.678
free_shipping[T.True],50.0000,2.887,17.321,0.000,40.813,59.187
bag_size,-2.5000,0.144,-17.321,0.000,-2.959,-2.041

0,1,2,3
Omnibus:,,Durbin-Watson:,3.0
Prob(Omnibus):,,Jarque-Bera (JB):,1.062
Skew:,-0.707,Prob(JB):,0.588
Kurtosis:,1.5,Cond. No.,72.5


## Matching

In [37]:
import pandas as pd
import numpy as np

# Thirteen hand-written orders (six without free shipping, then seven with it),
# tiled five times to give 65 noise-free rows.
df = pd.DataFrame(
    {
        "revenue": [10, 20, 30, 40, 50, 60, 30, 40, 50, 60, 90, 90, 100] * 5,
        "free_shipping": ([False] * 6 + [True] * 7) * 5,
        "price_per_lbs": [1, 2, 3, 4, 5, 6, 3, 4, 5, 6, 6.5, 7, 8] * 5,
    }
)
import numpy.random as npr

npr.seed(42)
# Jittered copy of df; df itself is left untouched.
# Keyword arguments are evaluated left to right, so the revenue noise is drawn
# from the seeded stream before the price noise.
for_scatter = df.assign(
    revenue=df["revenue"] + npr.normal(scale=2, size=len(df)),
    price_per_lbs=df["price_per_lbs"] + npr.normal(scale=0.2, size=len(df)),
)



In [38]:
import altair as alt

# Point chart of the jittered data, colored by the free-shipping flag.
# zero=False on both scales stops the axes from being forced to include 0.
scatter = (
    alt.Chart(for_scatter)
    .encode(
        alt.X("price_per_lbs:Q", scale=alt.Scale(zero=False)),
        alt.Y("revenue:Q", scale=alt.Scale(zero=False)),
        color="free_shipping",
    )
    .mark_point(size=40)
)
scatter


In [39]:
# OLS of revenue on price per lb and the free-shipping flag, fit on the noise-free df.
m = smf.ols(formula="revenue ~ price_per_lbs + free_shipping", data=df).fit()
m.summary()


0,1,2,3
Dep. Variable:,revenue,R-squared:,0.94
Model:,OLS,Adj. R-squared:,0.938
Method:,Least Squares,F-statistic:,482.2
Date:,"Tue, 07 Mar 2023",Prob (F-statistic):,1.63e-38
Time:,15:34:48,Log-Likelihood:,-214.87
No. Observations:,65,AIC:,435.7
Df Residuals:,62,BIC:,442.3
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-9.3426,2.154,-4.337,0.000,-13.648,-5.037
free_shipping[T.True],3.5657,1.999,1.784,0.079,-0.429,7.561
price_per_lbs,12.6693,0.505,25.110,0.000,11.661,13.678

0,1,2,3
Omnibus:,4.262,Durbin-Watson:,1.216
Prob(Omnibus):,0.119,Jarque-Bera (JB):,2.805
Skew:,0.329,Prob(JB):,0.246
Kurtosis:,2.223,Cond. No.,13.9


In [40]:
import numpy as np

# Prediction grid: three price points for each value of the free-shipping flag.
predicted = pd.DataFrame(
    {
        "price_per_lbs": [1, 4, 8, 1, 4, 8],
        "free_shipping": [False, False, False, True, True, True],
    }
)
# Score every row with the fitted model `m` in a single call. The returned
# Series is aligned on the frame's index, so no per-group loop or NaN pre-fill
# is needed.
predicted["revenue"] = m.predict(predicted[["price_per_lbs", "free_shipping"]])


In [41]:
# One line layer per free-shipping value: lightblue for False, orange for True.
colors = {False: "lightblue", True: "orange"}
fits = []
for t in [True, False]:
    c = (
        alt.Chart(predicted[predicted["free_shipping"] == t])
        .mark_line(color=colors[t])
        .transform_regression("price_per_lbs", "revenue")
        .encode(
            alt.X("price_per_lbs:Q", scale=alt.Scale(zero=False)),
            alt.Y("revenue:Q", scale=alt.Scale(zero=False)),
        )
    )
    fits.append(c)

# Layer order: the two lines first, then the scatter.
alt.layer(*fits, scatter)

In [42]:
# Refit the same specification using only rows with 2.5 < price_per_lbs < 6.5
# (both bounds exclusive).
m = smf.ols(
    formula="revenue ~ price_per_lbs + free_shipping",
    data=df.query("price_per_lbs > 2.5 and price_per_lbs < 6.5"),
).fit()
m.summary()


0,1,2,3
Dep. Variable:,revenue,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,1.647e+31
Date:,"Tue, 07 Mar 2023",Prob (F-statistic):,0.0
Time:,15:34:48,Log-Likelihood:,1225.9
No. Observations:,40,AIC:,-2446.0
Df Residuals:,37,BIC:,-2441.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.132e-14,8.31e-15,2.565,0.015,4.48e-15,3.82e-14
free_shipping[T.True],1.703e-14,3.9e-15,4.370,0.000,9.13e-15,2.49e-14
price_per_lbs,10.0000,1.74e-15,5.74e+15,0.000,10.000,10.000

0,1,2,3
Omnibus:,0.962,Durbin-Watson:,1.721
Prob(Omnibus):,0.618,Jarque-Bera (JB):,0.963
Skew:,-0.225,Prob(JB):,0.618
Kurtosis:,2.387,Cond. No.,20.9


In [43]:
import numpy as np

# Prediction grid: two price points (2.5 and 6) for each value of the
# free-shipping flag.
predicted = pd.DataFrame(
    {
        "price_per_lbs": [2.5, 6, 2.5, 6],
        "free_shipping": [False, False, True, True],
    }
)
# Score every row with the fitted model `m` in a single call. The returned
# Series is aligned on the frame's index, so no per-group loop or NaN pre-fill
# is needed.
predicted["revenue"] = m.predict(predicted[["price_per_lbs", "free_shipping"]])


In [44]:
# One line layer per free-shipping value: lightblue for False, orange for True.
colors = {False: "lightblue", True: "orange"}
fits = []
for t in [True, False]:
    c = (
        alt.Chart(predicted[predicted["free_shipping"] == t])
        .mark_line(color=colors[t])
        .transform_regression("price_per_lbs", "revenue")
        .encode(
            alt.X("price_per_lbs:Q", scale=alt.Scale(zero=False)),
            alt.Y("revenue:Q", scale=alt.Scale(zero=False)),
        )
    )
    fits.append(c)

# Layer order: the two lines first, then the scatter.
alt.layer(*fits, scatter)