# Log-Linear Models and Graphical Models

## Bayesian Network, Data

In [1]:
from pybbn.graph.dag import Bbn
from pybbn.graph.edge import Edge, EdgeType
from pybbn.graph.node import BbnNode
from pybbn.graph.variable import Variable
from pybbn.sampling.sampling import LogicSampler
import pandas as pd

a = BbnNode(Variable(0, 'a', ['on', 'off']), [0.5, 0.5])
b = BbnNode(Variable(1, 'b', ['on', 'off']), [0.5, 0.5, 0.4, 0.6])
c = BbnNode(Variable(2, 'c', ['on', 'off']), [0.7, 0.3, 0.2, 0.8])

bbn = Bbn() \
    .add_node(a) \
    .add_node(b) \
    .add_node(c) \
    .add_edge(Edge(a, b, EdgeType.DIRECTED)) \
    .add_edge(Edge(b, c, EdgeType.DIRECTED))

sampler = LogicSampler(bbn)
df = pd.DataFrame(sampler.get_samples(n_samples=1_000, seed=37)) \
    .rename(columns={0: 'a', 1: 'b', 2: 'c'}) \
    .assign(n=1) \
    .groupby(['a', 'b', 'c']) \
    .agg('sum') \
    .reset_index()

## Likelihood and deviance

In [2]:
from patsy import dmatrices
import statsmodels.api as sm

def get_stats(formula, df):
    y, X = dmatrices(formula, df, return_type='dataframe')
    r = sm.GLM(y, X, family=sm.families.Poisson()).fit()
    
    return {
        'df_model': r.df_model,
        'deviance': r.deviance,
        'log_likelihood': r.llf
    }

formulas = {
    '(A,B,C)': 'n ~ a + b + c',
    '(A,BC)': 'n ~ a + b*c',
    '(B,AC)': 'n ~ b + a*c',
    '(C,AB)': 'n ~ c + a*b',
    '(AC,CB)': 'n ~ a*c + c*b',
    '(AB,BC)': 'n ~ a*b + b*c',
    '(BA,AC)': 'n ~ b*a + a*c',
    '(AB, AC, BC)': 'n ~ (a + b + c)**2'
}

pd.DataFrame([{**{'model': m}, **get_stats(f, df)} for m, f in formulas.items()])

Unnamed: 0,model,df_model,deviance,log_likelihood
0,"(A,B,C)",3,267.85145,-159.940146
1,"(A,BC)",4,16.687786,-34.358314
2,"(B,AC)",4,261.530543,-156.779692
3,"(C,AB)",4,253.136932,-152.582887
4,"(AC,CB)",5,10.366879,-31.19786
5,"(AB,BC)",5,1.973268,-27.001055
6,"(BA,AC)",5,246.816025,-149.422433
7,"(AB, AC, BC)",6,1.445598,-26.73722


## Detecting model differences

In [3]:
result_df = []

for _ in range(10):
    df = pd.DataFrame(sampler.get_samples(n_samples=1_000, seed=37)) \
        .rename(columns={0: 'a', 1: 'b', 2: 'c'}) \
        .assign(n=1) \
        .groupby(['a', 'b', 'c']) \
        .agg('sum') \
        .reset_index()
    
    r = pd.DataFrame([{**{'model': m}, **get_stats(f, df)} for m, f in formulas.items()])
    result_df.append(r)
    
result_df = pd.concat(result_df)

In [4]:
result_df \
    .groupby(['model']) \
    .agg('mean') \
    .sort_values(['log_likelihood', 'deviance'], ascending=[False, False])

Unnamed: 0_level_0,df_model,deviance,log_likelihood
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"(AB, AC, BC)",6,1.445598,-26.73722
"(AB,BC)",5,1.973268,-27.001055
"(AC,CB)",5,10.366879,-31.19786
"(A,BC)",4,16.687786,-34.358314
"(BA,AC)",5,246.816025,-149.422433
"(C,AB)",4,253.136932,-152.582887
"(B,AC)",4,261.530543,-156.779692
"(A,B,C)",3,267.85145,-159.940146


### Deviance

In [5]:
from statsmodels.formula.api import ols

model = ols('deviance ~ model', result_df).fit()
model.summary()

0,1,2,3
Dep. Variable:,deviance,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,4.62e+31
Date:,"Wed, 23 Mar 2022",Prob (F-statistic):,0.0
Time:,13:25:21,Log-Likelihood:,2323.4
No. Observations:,80,AIC:,-4631.0
Df Residuals:,72,BIC:,-4612.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,267.8515,1.97e-14,1.36e+16,0.000,267.851,267.851
"model[T.(A,BC)]",-251.1637,2.78e-14,-9.03e+15,0.000,-251.164,-251.164
"model[T.(AB, AC, BC)]",-266.4059,2.78e-14,-9.58e+15,0.000,-266.406,-266.406
"model[T.(AB,BC)]",-265.8782,2.78e-14,-9.56e+15,0.000,-265.878,-265.878
"model[T.(AC,CB)]",-257.4846,2.78e-14,-9.26e+15,0.000,-257.485,-257.485
"model[T.(B,AC)]",-6.3209,2.78e-14,-2.27e+14,0.000,-6.321,-6.321
"model[T.(BA,AC)]",-21.0354,2.78e-14,-7.56e+14,0.000,-21.035,-21.035
"model[T.(C,AB)]",-14.7145,2.78e-14,-5.29e+14,0.000,-14.715,-14.715

0,1,2,3
Omnibus:,13.354,Durbin-Watson:,1.896
Prob(Omnibus):,0.001,Jarque-Bera (JB):,8.252
Skew:,-0.632,Prob(JB):,0.0162
Kurtosis:,2.062,Cond. No.,8.89


In [6]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
tukey = pairwise_tukeyhsd(endog=result_df['deviance'], groups=result_df['model'], alpha=0.05)
tukey.summary()

group1,group2,meandiff,p-adj,lower,upper,reject
"(A,B,C)","(A,BC)",-251.1637,0.001,-251.1637,-251.1637,True
"(A,B,C)","(AB, AC, BC)",-266.4059,0.001,-266.4059,-266.4059,True
"(A,B,C)","(AB,BC)",-265.8782,0.001,-265.8782,-265.8782,True
"(A,B,C)","(AC,CB)",-257.4846,0.001,-257.4846,-257.4846,True
"(A,B,C)","(B,AC)",-6.3209,0.001,-6.3209,-6.3209,True
"(A,B,C)","(BA,AC)",-21.0354,0.001,-21.0354,-21.0354,True
"(A,B,C)","(C,AB)",-14.7145,0.001,-14.7145,-14.7145,True
"(A,BC)","(AB, AC, BC)",-15.2422,0.001,-15.2422,-15.2422,True
"(A,BC)","(AB,BC)",-14.7145,0.001,-14.7145,-14.7145,True
"(A,BC)","(AC,CB)",-6.3209,0.001,-6.3209,-6.3209,True


### Log-likelihood

In [7]:
model = ols('log_likelihood ~ model', result_df).fit()
model.summary()

0,1,2,3
Dep. Variable:,log_likelihood,R-squared:,1.0
Model:,OLS,Adj. R-squared:,1.0
Method:,Least Squares,F-statistic:,3.6209999999999997e+31
Date:,"Wed, 23 Mar 2022",Prob (F-statistic):,0.0
Time:,13:26:42,Log-Likelihood:,2369.1
No. Observations:,80,AIC:,-4722.0
Df Residuals:,72,BIC:,-4703.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-159.9401,1.11e-14,-1.44e+16,0.000,-159.940,-159.940
"model[T.(A,BC)]",125.5818,1.57e-14,7.99e+15,0.000,125.582,125.582
"model[T.(AB, AC, BC)]",133.2029,1.57e-14,8.48e+15,0.000,133.203,133.203
"model[T.(AB,BC)]",132.9391,1.57e-14,8.46e+15,0.000,132.939,132.939
"model[T.(AC,CB)]",128.7423,1.57e-14,8.19e+15,0.000,128.742,128.742
"model[T.(B,AC)]",3.1605,1.57e-14,2.01e+14,0.000,3.160,3.160
"model[T.(BA,AC)]",10.5177,1.57e-14,6.69e+14,0.000,10.518,10.518
"model[T.(C,AB)]",7.3573,1.57e-14,4.68e+14,0.000,7.357,7.357

0,1,2,3
Omnibus:,8.336,Durbin-Watson:,1.809
Prob(Omnibus):,0.015,Jarque-Bera (JB):,8.23
Skew:,0.778,Prob(JB):,0.0163
Kurtosis:,3.225,Cond. No.,8.89


In [8]:
tukey = pairwise_tukeyhsd(endog=result_df['log_likelihood'], groups=result_df['model'], alpha=0.05)
tukey.summary()

group1,group2,meandiff,p-adj,lower,upper,reject
"(A,B,C)","(A,BC)",125.5818,0.001,125.5818,125.5818,True
"(A,B,C)","(AB, AC, BC)",133.2029,0.001,133.2029,133.2029,True
"(A,B,C)","(AB,BC)",132.9391,0.001,132.9391,132.9391,True
"(A,B,C)","(AC,CB)",128.7423,0.001,128.7423,128.7423,True
"(A,B,C)","(B,AC)",3.1605,0.001,3.1605,3.1605,True
"(A,B,C)","(BA,AC)",10.5177,0.001,10.5177,10.5177,True
"(A,B,C)","(C,AB)",7.3573,0.001,7.3573,7.3573,True
"(A,BC)","(AB, AC, BC)",7.6211,0.001,7.6211,7.6211,True
"(A,BC)","(AB,BC)",7.3573,0.001,7.3573,7.3573,True
"(A,BC)","(AC,CB)",3.1605,0.001,3.1605,3.1605,True
