# Extending the Linear Model

In [1]:
import pandas as pd
import numpy as np

import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.gam.api import GLMGam, BSplines

from scipy.optimize import minimize

In [2]:
# Movie reviews data. The columns used in this notebook are rating,
# children_in_home, genre and word_count_sc.
df_reviews = pd.read_csv('https://tinyurl.com/moviereviewsdata')

## Interactions

In [3]:
# Additive baseline: children_in_home and genre enter only as main effects.
model_base = smf.ols('rating ~ children_in_home + genre', data = df_reviews).fit()

# `*` keeps both main effects and adds the children_in_home-by-genre interaction.
model_interaction = smf.ols('rating ~ children_in_home * genre', data = df_reviews).fit()

model_interaction.summary()

0,1,2,3
Dep. Variable:,rating,R-squared:,0.244
Model:,OLS,Adj. R-squared:,0.232
Method:,Least Squares,F-statistic:,21.16
Date:,"Sun, 04 Aug 2024",Prob (F-statistic):,3.44e-50
Time:,19:00:36,Log-Likelihood:,-814.77
No. Observations:,1000,AIC:,1662.0
Df Residuals:,984,BIC:,1740.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.7641,0.039,70.058,0.000,2.687,2.842
genre[T.Comedy],0.6371,0.076,8.376,0.000,0.488,0.786
genre[T.Drama],0.5352,0.054,9.968,0.000,0.430,0.641
genre[T.Horror],0.1938,0.101,1.919,0.055,-0.004,0.392
genre[T.Kids],-0.2759,0.079,-3.478,0.001,-0.432,-0.120
genre[T.Other],0.0836,0.097,0.862,0.389,-0.107,0.274
genre[T.Romance],0.2981,0.087,3.436,0.001,0.128,0.468
genre[T.Sci-Fi],-0.1090,0.100,-1.090,0.276,-0.305,0.087
children_in_home,0.1419,0.053,2.695,0.007,0.039,0.245

0,1,2,3
Omnibus:,3.039,Durbin-Watson:,2.099
Prob(Omnibus):,0.219,Jarque-Bera (JB):,2.642
Skew:,-0.032,Prob(JB):,0.267
Kurtosis:,2.757,Cond. No.,13.4


## Mixed Models

In [4]:
# Happiness data; the columns used below are country, year and happiness_score.
df_happiness_all = pd.read_csv("https://tinyurl.com/worldhappinessallyears")

# year_0 is the year measured from the earliest year in the data and divided
# by 10, so 0 is the first year and one unit is a decade.
df_happiness_all = (
    df_happiness_all
    .assign(year_0 = lambda x: (x['year']- x['year'].min())/10)
)

# Random intercept per country only (re_formula='1').
# NOTE(review): the fitted random-intercept term shows up as 'Group' in the
# summaries and a later cell renames that label; check that label before
# changing how `groups` is supplied.
fit_ran_int = sm.MixedLM.from_formula(
    "happiness_score ~ year_0", 
    df_happiness_all, 
    re_formula='1',
    groups=df_happiness_all["country"]  
).fit()

# Same fixed effects, but each country also gets its own year_0 slope
# (re_formula='1 + year_0').
model_ran_slope = sm.MixedLM.from_formula(
    "happiness_score ~ year_0", 
    df_happiness_all, 
    re_formula='1 + year_0',
    groups=df_happiness_all["country"]  
).fit()

In [5]:
# Random-intercept fit: fixed effects for the intercept and year_0, plus the
# variance of the country-level intercepts.
fit_ran_int.summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,happiness_score
No. Observations:,2199,Method:,REML
No. Groups:,165,Scale:,0.1830
Min. group size:,1,Log-Likelihood:,-1610.2402
Max. group size:,17,Converged:,Yes
Mean group size:,13.3,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,5.320,0.084,63.026,0.000,5.154,5.485
year_0,0.092,0.020,4.661,0.000,0.053,0.131
Group Var,1.102,0.301,,,,


In [6]:
# Random-slope fit: adds a country-level variance for year_0 and its
# covariance with the random intercept.
model_ran_slope.summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,happiness_score
No. Observations:,2199,Method:,REML
No. Groups:,165,Scale:,0.1176
Min. group size:,1,Log-Likelihood:,-1326.3751
Max. group size:,17,Converged:,Yes
Mean group size:,13.3,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,5.339,0.092,58.296,0.000,5.160,5.519
year_0,0.057,0.050,1.146,0.252,-0.040,0.154
Group Var,1.309,0.459,,,,
Group x year_0 Cov,-0.248,0.182,,,,
year_0 Var,0.328,0.140,,,,


In [7]:
# Gather the estimated random effects into one frame; each country is a
# column (the next cell selects the 'United States' column).
estimated_RE = pd.DataFrame(model_ran_slope.random_effects)

In [8]:
# Relabel the random intercept from 'Group' to 'Intercept' so the index lines
# up with fe_params; the sum is then the United States' own intercept and
# year_0 slope (fixed effect + country-specific deviation).
ranef_usa = estimated_RE['United States'].rename({'Group': 'Intercept'})
ranef_usa + model_ran_slope.fe_params

Intercept    7.296623
year_0      -0.275586
dtype: float64

## Generalized Additive Models

In [9]:

# plotly is only needed for this illustrative figure; numpy is already
# imported as `np` in the first cell, so it is not re-imported here.
import plotly.graph_objects as go

# Seeded generator so the simulated x values (and therefore the figure) are
# reproducible on Restart & Run All.
rng = np.random.default_rng(123)

# Sort x so the trace is drawn left to right along the sine curve.
x = np.sort(rng.normal(size = 1000))
y = np.sin(x)

fig = go.Figure()
# add_trace returns the figure, so leaving it as the last expression renders it.
fig.add_trace(
    go.Scatter(
        x = x,
        y = y,
        line_shape = 'spline'
    )
)

In [10]:
# Happiness data for the GAM; only happiness_score and
# healthy_life_expectancy_at_birth are used in this cell.
df_happiness_2018 = pd.read_csv('https://tinyurl.com/worldhappiness2018')

# Cubic (degree = 3) B-spline basis for life expectancy with df = 9.
bs = BSplines(
    df_happiness_2018['healthy_life_expectancy_at_birth'],
    df = 9,
    degree = 3
)

# The formula supplies the intercept and a linear life-expectancy term; the
# smoother adds the spline columns (the *_s0, *_s1, ... rows of the summary).
# NOTE(review): no `alpha` penalty weight is passed to GLMGam - confirm that
# relying on the library default is intended.
gam_happiness = GLMGam.from_formula(
    'happiness_score ~ healthy_life_expectancy_at_birth', 
    smoother = bs,
    data = df_happiness_2018
)

gam_happiness_result = gam_happiness.fit()

gam_happiness_result.summary()

0,1,2,3
Dep. Variable:,happiness_score,No. Observations:,112.0
Model:,GLMGam,Df Residuals:,103.0
Model Family:,Gaussian,Df Model:,8.0
Link Function:,Identity,Scale:,0.38282
Method:,PIRLS,Log-Likelihood:,-100.46
Date:,"Sun, 04 Aug 2024",Deviance:,39.43
Time:,19:00:37,Pearson chi2:,39.4
No. Iterations:,3,Pseudo R-squ. (CS):,0.9093
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.6407,0.967,0.662,0.508,-1.255,2.536
healthy_life_expectancy_at_birth,0.0801,0.010,8.416,0.000,0.061,0.099
healthy_life_expectancy_at_birth_s0,0.3574,0.948,0.377,0.706,-1.501,2.216
healthy_life_expectancy_at_birth_s1,-2.0864,0.605,-3.448,0.001,-3.272,-0.900
healthy_life_expectancy_at_birth_s2,-0.4160,0.637,-0.653,0.514,-1.665,0.833
healthy_life_expectancy_at_birth_s3,-0.5786,0.420,-1.378,0.168,-1.402,0.245
healthy_life_expectancy_at_birth_s4,-0.2643,0.486,-0.543,0.587,-1.218,0.689
healthy_life_expectancy_at_birth_s5,0.5566,0.594,0.936,0.349,-0.608,1.722
healthy_life_expectancy_at_birth_s6,0.5079,0.555,0.915,0.360,-0.580,1.596


## Quantile Regression

In [11]:
# Median regression: quantile regression of rating on word_count_sc at q = 0.5.
model_median = smf.quantreg('rating ~ word_count_sc', data = df_reviews).fit(q = .5)

model_median.summary()

0,1,2,3
Dep. Variable:,rating,Pseudo R-squared:,0.1247
Model:,QuantReg,Bandwidth:,0.2375
Method:,Least Squares,Sparsity:,1.221
Date:,"Sun, 04 Aug 2024",No. Observations:,1000.0
Time:,19:00:37,Df Residuals:,998.0
,,Df Model:,1.0

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0886,0.019,160.023,0.000,3.051,3.126
word_count_sc,-0.2852,0.019,-14.770,0.000,-0.323,-0.247


In [12]:
# One model object serves every quantile: the formula and data are identical,
# only `q` changes at fit time. Giving it its own name also keeps
# `model_median` pointing at the q = 0.5 fit from the previous cell instead of
# ending up bound to the 0.75-quantile fit.
model_quantile = smf.quantreg('rating ~ word_count_sc', data = df_reviews)

# Keys follow the pattern "model_q<quantile>", e.g. "model_q0.25".
output_dict = {
    f"model_q{q}": model_quantile.fit(q = q)
    for q in [0.25, 0.5, 0.75]
}

output_dict

{'model_q0.25': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x154a6ad80>,
 'model_q0.5': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x154b183e0>,
 'model_q0.75': <statsmodels.regression.linear_model.RegressionResultsWrapper at 0x154b1a5a0>}

### Quantile Loss Function

In [13]:
def quantile_loss(par, X, y, tau):
    """Total check (pinball) loss of a linear predictor at quantile ``tau``.

    Parameters
    ----------
    par : array-like
        Coefficients, one per column of ``X``.
    X : pandas.DataFrame or numpy.ndarray
        Design matrix; only its ``.dot`` method is used.
    y : array-like
        Observed values, one per row of ``X``.
    tau : float
        Quantile being targeted, e.g. 0.5 for the median.

    Returns
    -------
    float
        Sum over observations of ``tau * r`` where ``r >= 0`` and
        ``(tau - 1) * r`` where ``r < 0``, with ``r = y - X.dot(par)``.
    """
    linear_predictor = X.dot(par)

    residual = y - linear_predictor

    # Under-predictions (residual >= 0) are weighted by tau, over-predictions
    # by (1 - tau); (tau - 1) * residual is positive when residual < 0.
    loss = np.where(
        residual < 0,
        (tau - 1) * residual,
        tau * residual
    )

    # np.sum reduces the array in compiled code; the built-in sum() would loop
    # over it element by element in Python on every optimizer evaluation.
    return np.sum(loss)

In [14]:
# Response and a two-column design matrix: a constant plus word_count_sc.
y = df_reviews['rating']
X = pd.DataFrame({
    'intercept': 1,
    'word_count_sc': df_reviews['word_count_sc'],
})

# Minimize the check loss at tau = 0.5 starting from zeros; the estimates
# should land close to the quantreg median fit shown earlier.
minimize(fun = quantile_loss, x0 = np.array([0, 0]), args = (X, y, .5)).x

array([ 3.09011343, -0.28416408])