## Preliminaries

### Study question 1.4.1

In [1]:
import networkx as nx

g = nx.DiGraph()

nodes = ['W', 'X', 'Y', 'Z', 'T']
edges = [('X', 'W'), ('X', 'Y'), ('W', 'Y'), ('W', 'Z'), ('Y', 'Z'), ('Y', 'T'), ('Z', 'T')]

_ = [g.add_node(n) for n in nodes]
_ = [g.add_edge(p, c) for p, c in edges]

In [2]:
list(g.predecessors('Z'))

['W', 'Y']

In [3]:
from networkx.algorithms.dag import ancestors

list(ancestors(g, 'Z'))

['X', 'W', 'Y']

In [4]:
list(g.successors('W'))

['Y', 'Z']

In [5]:
from networkx.algorithms.dag import descendants

list(descendants(g, 'W'))

['T', 'Z', 'Y']

In [6]:
from networkx.algorithms.simple_paths import all_simple_paths

list(all_simple_paths(g, 'X', 'T'))

[['X', 'W', 'Y', 'Z', 'T'],
 ['X', 'W', 'Y', 'T'],
 ['X', 'W', 'Z', 'T'],
 ['X', 'Y', 'Z', 'T'],
 ['X', 'Y', 'T']]

### Study question 1.5.1

In [7]:
import numpy as np
import pandas as pd

np.random.seed(37)

N = 10_000

U_x = np.random.normal(0, 1, N)
U_y = np.random.normal(0, 1, N)
U_z = np.random.normal(0, 1, N)

X = U_x
Y = (1/3) * X + U_y
Z = (1/16) * Y + U_z

df = pd.DataFrame({
    'X': X,
    'Y': Y,
    'Z': Z
})

In [8]:
df.head()

Unnamed: 0,X,Y,Z
0,-0.054464,0.115728,0.229677
1,0.674308,-0.736683,0.766547
2,0.346647,0.011832,0.599696
3,-1.300346,-0.579125,-1.107032
4,1.518512,-0.544518,-0.403825


In [9]:
from sklearn.linear_model import LinearRegression

X = df[['Y']]
y = df.Z

model = LinearRegression().fit(X, y)
model.predict(np.array([[3]]))

array([0.21822383])

In [10]:
X = df[['X']]
y = df.Z

model = LinearRegression().fit(X, y)
model.predict(np.array([[3]]))

array([0.09937442])

In [11]:
X = df[['X', 'Y']]
y = df.Z

model = LinearRegression().fit(X, y)
model.predict(np.array([[1, 3]]))

array([0.21900358])

In [12]:
df.cov()

Unnamed: 0,X,Y,Z
X,0.9907,0.338077,0.031365
Y,0.338077,1.125325,0.079959
Z,0.031365,0.079959,0.990133


## Graphical Models and Their Applications

### Study question 2.3.1

In [13]:
fig2_5 = nx.DiGraph()

nodes = ['X', 'R', 'S', 'T', 'U', 'V', 'Y']
edges = [('X', 'R'), ('R', 'S'), ('S', 'T'), ('U', 'T'), ('V', 'U'), ('V', 'Y')]

_ = [fig2_5.add_node(n) for n in nodes]
_ = [fig2_5.add_edge(p, c) for p, c in edges]

In [14]:
fig2_6 = nx.DiGraph()

nodes = ['X', 'R', 'S', 'T', 'U', 'V', 'Y', 'P']
edges = [('X', 'R'), ('R', 'S'), ('S', 'T'), ('U', 'T'), ('V', 'U'), ('V', 'Y'), ('T', 'P')]

_ = [fig2_6.add_node(n) for n in nodes]
_ = [fig2_6.add_edge(p, c) for p, c in edges]

a. List all pairs of variables in the graph that are independent conditional on the set $Z=\{R,V\}$.

In [15]:
def get_paths(g, source, target):
    return all_simple_paths(g.to_undirected(), source, target)

def get_descendants(g):
    return {n: list(descendants(g, n)) for n in g.nodes()}

def is_path_active(g, path, Z, descendants):
    def get_triplet(i):
        return path[i-1], path[i], path[i+1]
    
    def is_collider(x, z, y):
        if g.has_edge(x, z) and g.has_edge(y, z):
            return True
        return False
    
    def is_collider_active(z):
        if z in Z:
            return True
        if len(set(descendants[z]) & set(Z)) > 0:
            return True
        return False
    
    def is_noncollider_active(z):
        if z in Z:
            return False
        return True
    
    def is_node_active(x, z, y):
        if is_collider(x, z, y):
            return is_collider_active(z)
        return is_noncollider_active(z)
    
    nodes = (i for i in range(len(path)))
    nodes = filter(lambda i: 0 < i < len(path) - 1, nodes)
    nodes = map(lambda i: get_triplet(i), nodes)
    
    for x, z, y in nodes:
        if not is_node_active(x, z, y):
            return False
        
    return True

def is_d_separated(g, source, target, Z=[]):
    m = get_descendants(g)
    paths = get_paths(g, source, target)
    for p in paths:
        if not is_path_active(g, p, Z, m):
            return True
    return False

In [16]:
is_d_separated(fig2_5, 'X', 'Y', ['R', 'V'])

True

In [17]:
from itertools import combinations

for source, target in combinations(['X', 'S', 'T', 'U', 'Y'], 2):
    d_sep = is_d_separated(fig2_5, source, target, ['R', 'V'])
    print(f'I({source}, {target} | [R,V]) = {d_sep}')

I(X, S | [R,V]) = True
I(X, T | [R,V]) = True
I(X, U | [R,V]) = True
I(X, Y | [R,V]) = True
I(S, T | [R,V]) = False
I(S, U | [R,V]) = True
I(S, Y | [R,V]) = True
I(T, U | [R,V]) = False
I(T, Y | [R,V]) = True
I(U, Y | [R,V]) = True


b. For each pair of non-adjacent variables, give a set of variables that, when conditioned on, renders that pair independent.

In [18]:
from itertools import chain

def get_neighbors(g):
    return {n: set(g.predecessors(n)) | set(g.successors(n)) for n in g.nodes()}

def all_nodes_in_paths(g, source, target):
    paths = get_paths(g, source, target)
    paths = (p[1:len(p)-1] for p in get_paths(g, source, target))
    paths = chain(*paths)
    paths = set(paths)
    
    return paths

def get_conditional_independence_stmt(X, Y, Z):
    if len(Z) == 0:
        return f'I({X}, {Y})'
    else:
        given = ','.join(Z)
        return f'I({X}, {Y} | {given})'
    
def get_implied_conditional_independencies(g):
    n_map = get_neighbors(g)
    
    is_non_neighbor = lambda p: p[1] not in n_map[p[0]]
    get_nodes_between = lambda p: (p[0], p[1], all_nodes_in_paths(g, p[0], p[1]))
    to_triplet = lambda t: [(t[0], t[1], [z]) for z in t[2]] + [(t[0], t[1], [])]
    d_sep = lambda t: (t[0], t[1], t[2], is_d_separated(g, t[0], t[1], t[2]))

    pairs = combinations(g.nodes(), 2)
    pairs = filter(is_non_neighbor, pairs)
    pairs = map(get_nodes_between, pairs)
    pairs = map(to_triplet, pairs)
    pairs = chain(*pairs)
    pairs = map(d_sep, pairs)
    pairs = filter(lambda t: t[3], pairs)
    pairs = map(lambda t: get_conditional_independence_stmt(t[0], t[1], t[2]), pairs)
    
    return pairs

In [19]:
list(get_implied_conditional_independencies(fig2_5))

['I(X, S | R)',
 'I(X, T | R)',
 'I(X, T | S)',
 'I(X, U | R)',
 'I(X, U | S)',
 'I(X, U)',
 'I(X, V | R)',
 'I(X, V | U)',
 'I(X, V | S)',
 'I(X, V)',
 'I(X, Y | V)',
 'I(X, Y | S)',
 'I(X, Y | R)',
 'I(X, Y | U)',
 'I(X, Y)',
 'I(R, T | S)',
 'I(R, U | S)',
 'I(R, U)',
 'I(R, V | U)',
 'I(R, V | S)',
 'I(R, V)',
 'I(R, Y | V)',
 'I(R, Y | U)',
 'I(R, Y | S)',
 'I(R, Y)',
 'I(S, U)',
 'I(S, V | U)',
 'I(S, V)',
 'I(S, Y | U)',
 'I(S, Y | V)',
 'I(S, Y)',
 'I(T, V | U)',
 'I(T, Y | U)',
 'I(T, Y | V)',
 'I(U, Y | V)']

c. List all pairs of variables in the graph of Figure 2.6 that are independent conditional on the set $Z=\{R,P\}$.

In [20]:
for source, target in combinations(['X', 'S', 'T', 'U', 'Y', 'V'], 2):
    d_sep = is_d_separated(fig2_6, source, target, ['R', 'P'])
    if d_sep:
        print(f'I({source}, {target} | [R,P]) = {d_sep}')

I(X, S | [R,P]) = True
I(X, T | [R,P]) = True
I(X, U | [R,P]) = True
I(X, Y | [R,P]) = True
I(X, V | [R,P]) = True


d. For each pair of non-adjacent variables in in Figure 2.6, give a set of variables that, when conditioned on,renders that pair independent.

In [21]:
list(get_implied_conditional_independencies(fig2_6))

['I(X, S | R)',
 'I(X, T | R)',
 'I(X, T | S)',
 'I(X, U | R)',
 'I(X, U | S)',
 'I(X, U)',
 'I(X, V | R)',
 'I(X, V | U)',
 'I(X, V | S)',
 'I(X, V)',
 'I(X, Y | V)',
 'I(X, Y | S)',
 'I(X, Y | R)',
 'I(X, Y | U)',
 'I(X, Y)',
 'I(X, P | R)',
 'I(X, P | T)',
 'I(X, P | S)',
 'I(R, T | S)',
 'I(R, U | S)',
 'I(R, U)',
 'I(R, V | U)',
 'I(R, V | S)',
 'I(R, V)',
 'I(R, Y | V)',
 'I(R, Y | U)',
 'I(R, Y | S)',
 'I(R, Y)',
 'I(R, P | T)',
 'I(R, P | S)',
 'I(S, U)',
 'I(S, V | U)',
 'I(S, V)',
 'I(S, Y | U)',
 'I(S, Y | V)',
 'I(S, Y)',
 'I(S, P | T)',
 'I(T, V | U)',
 'I(T, Y | U)',
 'I(T, Y | V)',
 'I(U, Y | V)',
 'I(U, P | T)',
 'I(V, P | T)',
 'I(V, P | U)',
 'I(Y, P | T)',
 'I(Y, P | U)',
 'I(Y, P | V)']

e. Suppose we generate data by the model described in Figure 2.5, and we fit them with the linear equation $Y=a+bX+cZ$. Which of the variables in the model may be chosen for $Z$ so as to guarantee that the slope b would be equal to zero?

In [22]:
[(z, is_d_separated(fig2_5, 'X', 'Y', [z])) 
 for z in fig2_5.nodes() if z not in ['X', 'Y']]

[('R', True), ('S', True), ('T', False), ('U', True), ('V', True)]

In [23]:
from networkx.algorithms.dag import topological_sort

def simulate(g, ceof=0.7, n_samples=10_000):
    parents = {n: set(g.predecessors(n)) for n in g.nodes()}
    values = {}

    nodes = list(topological_sort(g))

    for n in nodes:
        pas = parents[n]

        if len(pas) == 0:
            values[n] = np.random.normal(0, 1, n_samples)
        else:
            v = np.zeros(n_samples)
            for pa in pas:
                v = v + ceof * values[pa]
            values[n] = v

    df = pd.DataFrame(values)
    return df

# df = simulate(fig2_5)
df = pd.read_csv('./fig2_5.csv')

In [24]:
df.head()

Unnamed: 0,R,S,T,U,V,X,Y
0,1.072982,1.276989,-0.086071,-1.208541,-1.382943,0.131102,-0.618901
1,0.156704,0.392222,-0.838862,-1.49783,0.806445,-0.905952,1.360863
2,0.66278,-0.668059,-1.097601,-1.089884,0.169829,0.612817,-0.412453
3,-0.9706,-0.646592,-0.481943,-0.197093,-0.001528,-1.6243,-0.778605
4,0.13171,-0.809683,-1.169966,-0.570742,-0.248697,-0.768592,0.987305


In [25]:
import statsmodels.api as sm
from patsy import dmatrices

y, X = dmatrices('Y ~ X + V', data=df, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.488
Model:,OLS,Adj. R-squared:,0.488
Method:,Least Squares,F-statistic:,4761.0
Date:,"Tue, 05 Oct 2021",Prob (F-statistic):,0.0
Time:,02:52:14,Log-Likelihood:,-10897.0
No. Observations:,10000,AIC:,21800.0
Df Residuals:,9997,BIC:,21820.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0041,0.007,-0.568,0.570,-0.018,0.010
X,-0.0052,0.007,-0.710,0.478,-0.019,0.009
V,0.7116,0.007,97.585,0.000,0.697,0.726

0,1,2,3
Omnibus:,0.358,Durbin-Watson:,2.021
Prob(Omnibus):,0.836,Jarque-Bera (JB):,0.364
Skew:,0.015,Prob(JB):,0.834
Kurtosis:,2.995,Cond. No.,1.02


In [26]:
y, X = dmatrices('Y ~ X + T', data=df, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.138
Model:,OLS,Adj. R-squared:,0.138
Method:,Least Squares,F-statistic:,798.1
Date:,"Tue, 05 Oct 2021",Prob (F-statistic):,2.6e-322
Time:,02:52:14,Log-Likelihood:,-13502.0
No. Observations:,10000,AIC:,27010.0
Df Residuals:,9997,BIC:,27030.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0048,0.009,-0.512,0.609,-0.023,0.014
X,-0.1354,0.010,-13.464,0.000,-0.155,-0.116
T,0.4006,0.010,39.952,0.000,0.381,0.420

0,1,2,3
Omnibus:,0.215,Durbin-Watson:,1.992
Prob(Omnibus):,0.898,Jarque-Bera (JB):,0.186
Skew:,0.003,Prob(JB):,0.911
Kurtosis:,3.02,Cond. No.,1.43


f. Continuing question (e), suppose we fit the data with the equation:

- $Y=a+bX+cR+dS+eT+fP$

which of the coefficients would be zero?

In [27]:
df = pd.read_csv('./fig2_6.csv')

y, X = dmatrices('Y ~ X + R + S + T + P', data=df, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,Y,R-squared:,0.235
Model:,OLS,Adj. R-squared:,0.235
Method:,Least Squares,F-statistic:,613.8
Date:,"Tue, 05 Oct 2021",Prob (F-statistic):,0.0
Time:,02:52:14,Log-Likelihood:,-12909.0
No. Observations:,10000,AIC:,25830.0
Df Residuals:,9994,BIC:,25870.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0082,0.009,0.934,0.350,-0.009,0.025
X,0.0195,0.012,1.566,0.117,-0.005,0.044
R,-0.0146,0.015,-0.968,0.333,-0.044,0.015
S,-0.4554,0.015,-30.616,0.000,-0.485,-0.426
T,0.6771,0.015,45.438,0.000,0.648,0.706
P,-0.0042,0.012,-0.342,0.732,-0.028,0.020

0,1,2,3
Omnibus:,0.41,Durbin-Watson:,2.017
Prob(Omnibus):,0.815,Jarque-Bera (JB):,0.375
Skew:,0.004,Prob(JB):,0.829
Kurtosis:,3.029,Cond. No.,3.93


### Study question 2.4.1

In [28]:
fig2_9 = nx.DiGraph()

nodes = ['Z1', 'Z2', 'Z3', 'X', 'W', 'Y']
edges = [
    ('Z1', 'Z3'), ('Z1', 'X'), 
    ('Z2', 'Z3'), ('Z2', 'Y'), 
    ('Z3', 'X'), ('Z3', 'Y'), 
    ('X', 'W'), ('W', 'Y')
]

_ = [fig2_9.add_node(n) for n in nodes]
_ = [fig2_9.add_edge(p, c) for p, c in edges]

a. For each pair of non-adjacent nodes in this graph, find a set of variables that d-separates that pair. What does this list tell us about independencies in the data?

In [29]:
list(get_implied_conditional_independencies(fig2_9))

['I(Z1, Z2 | X)',
 'I(Z1, Z2 | W)',
 'I(Z1, Z2 | Z3)',
 'I(Z1, Z2)',
 'I(Z1, W | X)',
 'I(Z1, W | Z2)',
 'I(Z1, W | Z3)',
 'I(Z1, W)',
 'I(Z1, Y | X)',
 'I(Z1, Y | Z2)',
 'I(Z1, Y | W)',
 'I(Z1, Y | Z3)',
 'I(Z1, Y)',
 'I(Z2, X | Z1)',
 'I(Z2, X | W)',
 'I(Z2, X | Z3)',
 'I(Z2, X)',
 'I(Z2, W | X)',
 'I(Z2, W | Z1)',
 'I(Z2, W | Z3)',
 'I(Z2, W)',
 'I(Z3, W | X)',
 'I(Z3, W | Z1)',
 'I(Z3, W | Z2)',
 'I(Z3, W)',
 'I(X, Y | Z1)',
 'I(X, Y | Z2)',
 'I(X, Y | W)',
 'I(X, Y | Z3)',
 'I(X, Y)']