In [1]:
import numpy as np
import pandas as pd

np.random.seed(37)

N = 10_000

Xy = pd.DataFrame({'x1': np.random.normal(1, 1, N)}) \
    .assign(x2=lambda d: np.random.normal(1 + 3.5 * d['x1'], 1)) \
    .assign(x3=np.random.normal(2, 1, N)) \
    .assign(x4=lambda d: np.random.normal(3.8 - 2.5 * d['x3'], 1))

Xy.shape

(10000, 4)

In [2]:
from sklearn.linear_model import Lasso
import itertools

def coefs2df(coefs):
    index = [i for i in coefs.keys()]
    data = [coefs[i] for i in index]
    return pd.DataFrame(data, index=index)[index]

def edges2df(edges):
    return pd.DataFrame(edges, columns=['pa', 'ch', 'weight'])

def get_coef(X_cols, y_col, Xy):
    if X_cols is None or len(X_cols) == 0:
        X_cols = Xy.columns.drop([y_col]).tolist()
    X = Xy[X_cols]
    y = Xy[y_col]
    m = Lasso().fit(X, y)
    return {x: c for x, c in zip(X_cols, np.abs(m.coef_))}

def get_parents(y, coef, delta=1e-3):
    def is_parent(k, v):
        if v < delta or coef[y][k] < coef[k][y]:
            return False
        return True
    c = coef[y]
    return {k: v for k, v in coef[y].items() if is_parent(k, v)}

def get_edges(parents):
    edges = [[(pa, ch, w) for pa, w in d.items()] for ch, d in parents.items()]
    edges = itertools.chain(*edges)
    return sorted(list(edges), key=lambda tup: tup[2], reverse=True)

def get_cov(L, E):
    def get_value(r, c):
        if r == c:
            return E.iloc[r, c]
        if L.iloc[r, c] == 0:
            return 0
        return E.iloc[r, c]

    rows, cols = L.shape
    return pd.DataFrame([[get_value(r, c) for c in range(cols)] for r in range(rows)], columns=L.columns, index=L.index)

def get_model_params(Xy):
    coefs = {c: get_coef([], c, Xy) for c in Xy.columns}
    parents = {c: get_parents(c, coefs) for c in Xy.columns}
    edges = edges2df(get_edges(parents))

    H = Xy.columns.tolist()
    M = Xy.mean()
    E = get_cov(coefs2df(coefs), Xy.cov())

    return H, M, E

In [3]:
H, M, E = get_model_params(Xy)

In [4]:
H

['x1', 'x2', 'x3', 'x4']

In [5]:
M

x1    1.001723
x2    4.496861
x3    2.004989
x4   -1.223565
dtype: float64

In [6]:
E

Unnamed: 0,x1,x2,x3,x4
x1,0.9907,3.475295,0.0,0.0
x2,3.475295,13.201002,0.0,0.0
x3,0.0,0.0,0.984534,-2.446672
x4,0.0,0.0,-2.446672,7.094515


In [7]:
E.values[0,0]

0.9907002440358973

In [9]:
m = GaussianInference(H, M, E.values)

In [10]:
m.sample_marginals().mean()

x1    1.060392
x2    4.502321
x3    1.989818
x4   -1.093466
dtype: float64

In [11]:
m.do_inference('x2', 1).P

{'x2': (1, 0),
 'x1': (1.9223066923201397, 0.07579451969769568),
 'x3': (2.004988556873993, 0.9845336964643419),
 'x4': (-1.2235648652888027, 7.094514989457332)}

In [12]:
m.do_inference('x1', 2).P

{'x1': (2, 0),
 'x2': (0.9949891657678243, 1.0099559400241862),
 'x3': (2.004988556873993, 0.9845336964643419),
 'x4': (-1.2235648652888027, 7.094514989457332)}

In [13]:
m.do_inference('x1', 1).P

{'x1': (1, 0),
 'x2': (4.502906486242751, 1.0099559400241862),
 'x3': (2.004988556873993, 0.9845336964643419),
 'x4': (-1.2235648652888027, 7.094514989457332)}

In [14]:
m.do_inferences([('x1', 1), ('x3', 1)]).P

{'x1': (1, 0),
 'x3': (1, 0),
 'x2': (4.502906486242751, 1.0099559400241862),
 'x4': (-3.7210697421092043, 1.0142705232317635)}

In [8]:
class GaussianInference(object):
    """
    Gaussian inference.
    """

    def __init__(self, H, M, E, meta={}):
        """
        ctor.

        :param H: Headers.
        :param M: Means.
        :param E: Covariance matrix.
        :param meta: Dictionary storing observations.
        """
        self.H = H
        self.M = M
        self.E = E
        self.I = {h: i for i, h in enumerate(H)}
        self.meta = meta

    @property
    def marginals(self):
        """
        Gets the marginals.

        :return: List of dictionary. Each element has name, mean and variance.
        """
        return [{'name': name, 'mean': mean, 'var': var}
                for name, (mean, var) in self.P.items()]

    @property
    def P(self):
        """
        Gets the univariate parameters of each variable.

        :return: Dictionary. Keys are variable names. Values are tuples of (mean, variance).
        """
        params = {k: (v, 0) for k, v in self.meta.items()}
        for i, (k, v) in enumerate(zip(self.H, self.M)):
            params[k] = (v, self.E[i][i])
        return params

    def sample_marginals(self, size=1000):
        """
        Samples data from the marginals.

        :param size: Number of samples.
        :return: Dictionary with keys as names and values as pandas series (sampled data).
        """

        def get_samples(m, v):
            if v == 0.0:
                s = 0.01
            else:
                s = np.sqrt(v)
            return pd.Series(np.random.normal(m, s, size=size))

        return pd.DataFrame({m['name']: get_samples(m['mean'], m['var']) for m in self.marginals})

    def do_inference(self, name, observation):
        """
        Performs inference. Simply calls the `do_inferences` method.

        :param name: Name of variable.
        :param observation: Observation value.
        :return: GaussianInference.
        """
        return self.do_inferences([(name, observation)])

    def do_inferences(self, observations):
        """
        Performs inference.

        Denote the following.

        - :math:`z` as the variable observed
        - :math:`y` as the set of other variables
        - :math:`\\mu` as the vector of means
            - :math:`\\mu_z` as the partitioned :math:`\\mu`` of length :math:`|z|`
            - :math:`\\mu_y` as the partitioned :math:`\\mu`` of length :math:`|y|`
        - :math:`\\Sigma` as the covariance matrix
            - :math:`\\Sigma_{yz}` as the partitioned :math:`\\Sigma` of :math:`|y|` rows and :math:`|z|` columns
            - :math:`\\Sigma_{zz}` as the partitioned :math:`\\Sigma` of :math:`|z|` rows and :math:`|z|` columns
            - :math:`\\Sigma_{yy}` as the partitioned :math:`\\Sigma` of :math:`|y|` rows and :math:`|y|` columns

        If we observe evidence :math:`z_e`, then the new means :math:`\\mu_y^{*}` and
        covariance matrix :math:`\\Sigma_y^{*}` corresponding to :math:`y`
        are computed as follows.

        - :math:`\\mu_y^{*} = \\mu_y - \\Sigma_{yz} \\Sigma_{zz} (z_e - \\mu_z)`
        - :math:`\\Sigma_y^{*} = \\Sigma_{yy} \\Sigma_{zz} \\Sigma_{yz}^{T}`

        :param observations: List of observation. Each observation is tuple (name, value).
        :return: GaussianInference.
        """
        z_index = [self.I[name] for name, _ in observations]
        y_index = [i for i in range(self.E.shape[1]) if i not in z_index]

        m_Z = np.array([m for i, m in enumerate(self.M) if i in z_index])
        m_Y = np.array([m for i, m in enumerate(self.M) if i in y_index])

        z = np.array([o for _, o in observations])

        S_YZ = self.E[y_index][:, z_index]
        S_ZZ = np.linalg.inv(self.E[z_index][:, z_index])
        S_YY = self.E[y_index][:, y_index]

        H = [name for i, name in enumerate(self.H) if i in y_index]
        M = m_Y - S_YZ.dot(S_ZZ).dot(z - m_Z)
        E = S_YY - S_YZ.dot(S_ZZ).dot(S_YZ.T)
        meta = {**self.meta, **{n: o for n, o in observations}}

        return GaussianInference(H, M, E, meta)