In [2]:
import numpy as np

In [1]:
from collections import defaultdict


def stratify(X, Y):
    """Groups the elements of Y by the X-value found at the same position.
    Args:
        X (sequence): sequence of discrete outcomes, used as grouping keys
        Y (sequence): sequence of discrete outcomes, indexed by position
    Returns:
        (defaultdict): maps every distinct X-value to the list of Y-values
            seen at the positions where that X-value occurs, in input order
    """
    groups = defaultdict(list)
    for position, key in enumerate(X):
        bucket = groups[key]
        bucket.append(Y[position])
    return groups


def to_nested(X):
    """Wraps every element of the given sequence in its own one-item list.
    Args:
        X (sequence): sequence of discrete outcomes
    Returns:
        (list of lists): one single-element list per element of X, in order
    """
    nested = []
    for item in X:
        nested.append([item])
    return nested

In [3]:
def cisc(X, Y, plain=False):
    """Scores both directions X -> Y and Y -> X by total stochastic complexity.

    Each direction adds the stochastic complexity of the first sequence to
    the conditional complexity of the second one, where the conditional part
    is accumulated over the groups obtained by stratifying on the first.

    Args:
        X (sequence): sequence of discrete outcomes
        Y (sequence): sequence of discrete outcomes
        plain (bool): if true, the per-group complexities are summed as they
            are; otherwise each group is weighted by its share of the sample
            (group size divided by the sample size).
    Returns:
        (float, float): total complexity in the direction from X to Y, and
            total complexity in the direction from Y to X.
    """
    assert len(X) == len(Y)

    n = len(X)

    # Unconditional complexity of each sequence.
    sc_x = sc(X)
    sc_y = sc(Y)

    y_given_x = stratify(X, Y)
    x_given_y = stratify(Y, X)

    # Domain sizes are the numbers of distinct values actually observed.
    n_dom_x = len(y_given_x)
    n_dom_y = len(x_given_y)

    y_groups = y_given_x.values()
    x_groups = x_given_y.values()

    if not plain:
        sc_y_given_x = sum(len(grp) / n * sc(grp, n_dom_y) for grp in y_groups)
        sc_x_given_y = sum(len(grp) / n * sc(grp, n_dom_x) for grp in x_groups)
    else:
        sc_y_given_x = sum(sc(grp, n_dom_y) for grp in y_groups)
        sc_x_given_y = sum(sc(grp, n_dom_x) for grp in x_groups)

    return (sc_x + sc_y_given_x, sc_y + sc_x_given_y)

In [None]:
"""This module implements the linear algorithm for computing the stochastic
complexity of a discrete sequence relative to a parametric family of
multinomial distributions. For more detail, please refer to
http://pgm08.cs.aau.dk/Papers/31_Paper.pdf
"""
from __future__ import division
from collections import Counter
from math import ceil, log, sqrt


def log2(n):
    """Returns the base-2 logarithm of n; any falsy n (such as 0) counts as 1,
    so the result is 0.0 in that case."""
    if not n:
        return log(1, 2)
    return log(n, 2)


def model_cost(ndistinct_vals, n):
    """Computes the logarithm of the normalising term of multinomial
    stochastic complexity.

    The normalising sum for two distinct values is approximated by a
    truncated series, then extended to ``ndistinct_vals`` values with the
    recurrence ``C(j) = C(j - 1) + n / (j - 2) * C(j - 2)``, carried out in the
    log domain so that large sums do not overflow.

    Args:
        ndistinct_vals (int): number of distinct values of a multinomial r.v.
        n (int): number of trials
    Returns:
        float: the model cost (in bits) of the parametric family of
            multinomials; 0.0 when ``ndistinct_vals == 1`` or ``n == 0``.
    """
    # With a single possible value, or with no trials at all, exactly one
    # sequence exists, so the normalising sum is 1 and its logarithm is 0.
    # Returning here also avoids the division by n below when n == 0.
    if ndistinct_vals == 1 or n == 0:
        return 0.0

    total = 1.0
    b = 1.0
    d = 10  # presumably the number of digits of precision for the truncation

    bound = int(ceil(2 + sqrt(2 * n * d * log(10))))  # using equation (38)
    for k in range(1, bound + 1):
        b = (n - k + 1) / n * b
        total += b

    # log_old_sum holds log C(j - 2) and log_total holds log C(j - 1).
    log_old_sum = log2(1.0)
    log_total = log2(total)
    log_n = log2(n)
    for j in range(3, ndistinct_vals + 1):
        # x = n * C(j - 2) / (C(j - 1) * (j - 2)), so C(j) = C(j - 1) * (1 + x)
        log_x = log_n + log_old_sum - log_total - log2(j - 2)
        x = 2 ** log_x
        log_one_plus_x = log2(1 + x)
        log_new_sum = log_total + log_one_plus_x
        log_old_sum = log_total
        log_total = log_new_sum

    return log_total


def sc(X, ndistinct_vals=None):
    """Returns the multinomial stochastic complexity of a discrete sequence.

    The result is the data cost, i.e. the sum over distinct values of
    ``count * (log2(n) - log2(count))``, plus the model cost returned by
    ``model_cost``.

    Args:
        X (sequence): sequence of discrete outcomes
        ndistinct_vals (int): domain size of the multinomial r.v. X. When
            omitted (or falsy), the number of distinct values in X is used.
    Returns:
        float: the multinomial stochastic complexity of X
    """
    counts = Counter(X)
    n = len(X)
    if not ndistinct_vals:
        ndistinct_vals = len(counts)

    log_n = log2(n)
    data_cost = 0.0
    for count in counts.values():
        data_cost += count * (log_n - log2(count))

    return data_cost + model_cost(ndistinct_vals, n)


if __name__ == "__main__":
    # Smoke run: print the stochastic complexity of a small sample sequence.
    print(sc([1, 2, 3, 2, 1, 2]))

In [217]:
from __future__ import division
from math import ceil, log, sqrt
from scipy.stats import binom

def log2(n):
    """Returns the base-2 logarithm of n; any falsy n (such as 0) counts as 1,
    so the result is 0.0 in that case."""
    if not n:
        return log(1, 2)
    return log(n, 2)

def C_MN(n: int, K: int):
    """Computes the normalizing term of P_NML with a linear recurrence.

    The sum for K = 2 is approximated by a truncated series whose length
    grows with sqrt(n); the recurrence C(j) = C(j-1) + n / (j-2) * C(j-2)
    then extends it to K categories, so the running time is O(sqrt(n) + K).

    For more detail, please refer to eq (19) (Theorem 1) in
    "NML Computation Algorithms for Tree-Structured Multinomial Bayesian Networks"
    https://pubmed.ncbi.nlm.nih.gov/18382603/

    and algorithm 2 in
    "Computing the Multinomial Stochastic Complexity in Sub-Linear Time"
    http://pgm08.cs.aau.dk/Papers/31_Paper.pdf

    Args
    ----------
        n (int): sample size of a dataset
        K (int): number of categories of the multinomial distribution

    Returns
    ----------
        float: (approximated) multinomial normalizing sum; exactly 1.0 when
            n == 0 or K == 1.

    """
    # With no samples, or with a single category, exactly one sequence is
    # possible, so the sum is 1. Without this guard K == 1 would fall through
    # to the K == 2 value computed below, and n == 0 would divide by zero.
    if n == 0 or K == 1:
        return 1.0

    total = 1
    b = 1
    d = 16  # 16 digit precision

    # Number of series terms to keep for the requested precision.
    bound = int(ceil(2 + sqrt(-2 * n * np.log(2 * 10**(-d) - 100 ** (-d)))))

    for k in range(1, bound + 1):
        b = (n - k + 1) / n * b
        total += b

    # old_sum holds C(j - 2) and total holds C(j - 1) on entry to each step.
    old_sum = 1
    for j in range(3, K + 1):
        total, old_sum = total + (n * old_sum) / (j - 2), total

    return total
    

def parametric_complexity(X, Y, model_type: str, X_ndistinct_vals=None, Y_ndistinct_vals=None):
    """Returns the parametric complexity (in bits) of the multinomial model.

    For "confounder" a single multinomial over the product of both domain
    sizes is used; for every other model type the complexities of the two
    separate multinomials are added.

    Args
    ----------
        X (sequence): sequence of discrete outcomes
        Y (sequence): sequence of discrete outcomes
        model_type (str): ["to", "gets", "indep", "confounder"]
        X_ndistinct_vals (int): number of distinct values of the multinomial r.v X.
            When omitted (or falsy), counted from X.
        Y_ndistinct_vals (int): number of distinct values of the multinomial r.v Y.
            When omitted (or falsy), counted from Y.

    Returns
    ----------
        float: parametric complexity of the multinomials

    """
    assert len(X)==len(Y)
    n = len(X)

    if not X_ndistinct_vals:
        X_ndistinct_vals = len(np.unique(X))
    if not Y_ndistinct_vals:
        Y_ndistinct_vals = len(np.unique(Y))

    if model_type != "confounder":
        x_term = log2(C_MN(n=n, K=X_ndistinct_vals))
        y_term = log2(C_MN(n=n, K=Y_ndistinct_vals))
        return x_term + y_term

    joint_cardinality = X_ndistinct_vals * Y_ndistinct_vals
    return log2(C_MN(n=n, K=joint_cardinality))

In [188]:
%%time
C_MN(n=10000, K=20)

CPU times: user 175 µs, sys: 0 ns, total: 175 µs
Wall time: 179 µs


8.985119564280846e+29

In [192]:
%%time
C_MN(n=10000000, K=20)

CPU times: user 5.03 ms, sys: 454 µs, total: 5.49 ms
Wall time: 5.1 ms


2.1526114216272685e+58

In [193]:
log2(6.941955710839545e+48)

195.46717166437602

In [53]:
%%time
compute_multinomal_with_recurrence(n=1000000, K=20)

CPU times: user 1.48 ms, sys: 0 ns, total: 1.48 ms
Wall time: 1.48 ms


6.941955712181016e+48

In [55]:
log2(6.941955712181016e+48)

162.24789071578118

In [221]:
import numpy as np
import argparse
# Experiment configuration: sample size and the domain sizes of X and Y.
parser = argparse.ArgumentParser(description="BBCI single experiment")
parser.add_argument("--N", type=int, default=1000, help="number of samples")
parser.add_argument("--m0", type=int, default=4, help="number of distinct values of the multinomial r.v X")
parser.add_argument("--m1", type=int, default=3, help="number of distinct values of the multinomial r.v Y")
# An explicit empty argument list is parsed, so the defaults above are always
# used and the process's own command line is not consulted.
args = parser.parse_args([])


# x0 is drawn uniformly from {0, ..., m0 - 1}.
x0 = np.random.randint(args.m0, size=args.N)
# x1 is x0 plus uniform noise from {0, ..., m1 - 1}, reduced modulo m1.
x1 = (x0 + np.random.randint(args.m1, size=args.N)) % args.m1

In [31]:
len(np.unique(x1))

3

In [179]:
%%time
parametric_complexity(x0, x1, model_type="to")

CPU times: user 2.08 s, sys: 88.1 ms, total: 2.16 s
Wall time: 2.1 s


55.64200710899405

In [180]:
parametric_complexity(x0, x1, model_type="gets")

55.64200710899405

In [35]:
%%time
parametric_complexity(x0, x1, model_type="confounder")

CPU times: user 2min 47s, sys: 2.71 s, total: 2min 50s
Wall time: 2min 48s


62.84586560707742

In [37]:
log2(5.274725986932661e+20)

68.83765804795357

In [48]:
def compute_multinomal_with_recurrence(K: int, n: int):
    """Computes the multinomial normalizing sum with a linear recurrence.

    The sum for K = 2 is approximated by a truncated series whose length
    grows with sqrt(n); the recurrence C(j) = C(j-1) + n / (j-2) * C(j-2)
    then extends it to K categories.

    Args
    ----------
        K (int): number of categories of the multinomial distribution
        n (int): sample size of a dataset

    Returns
    ----------
        float: (approximated) multinomial normalizing sum; exactly 1.0 when
            n == 0 or K == 1.
    """
    # With no samples, or with a single category, exactly one sequence is
    # possible, so the sum is 1. Without this guard K == 1 would fall through
    # to the K == 2 value computed below, and n == 0 would divide by zero.
    if n == 0 or K == 1:
        return 1.0

    total = 1
    b = 1
    d = 16  # digits of precision used to truncate the series

    # Number of series terms to keep for the requested precision.
    bound = int(ceil(2 + sqrt(-2 * n * np.log(2 * 10**(-d) - 100 ** (-d)))))

    for k in range(1, bound + 1):
        b = (n - k + 1) / n * b
        total += b

    # old_sum holds C(j - 2) and total holds C(j - 1) on entry to each step.
    old_sum = 1
    for j in range(3, K + 1):
        total, old_sum = total + (n * old_sum) / (j - 2), total

    return total
        
    

In [207]:
%%time
compute_multinomal_with_recurrence(n=100, K=20)

CPU times: user 51 µs, sys: 249 µs, total: 300 µs
Wall time: 608 µs


1157718227106.408

In [59]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ref: https://github.molgen.mpg.de/EDA/cisc/blob/master/formatter.py
"""This module provides common methods for manipulating data"""
from collections import defaultdict


def stratify(X, Y):
    """Groups the elements of Y by the X-value found at the same position.
    Args:
        X (sequence): sequence of discrete outcomes, used as grouping keys
        Y (sequence): sequence of discrete outcomes, indexed by position
    Returns:
        (defaultdict): maps every distinct X-value to the list of Y-values
            seen at the positions where that X-value occurs, in input order
    """
    groups = defaultdict(list)
    for position, key in enumerate(X):
        bucket = groups[key]
        bucket.append(Y[position])
    return groups


def to_nested(X):
    """Wraps every element of the given sequence in its own one-item list.
    Args:
        X (sequence): sequence of discrete outcomes
    Returns:
        (list of lists): one single-element list per element of X, in order
    """
    nested = []
    for item in X:
        nested.append([item])
    return nested

In [56]:
def map_to_majority(X, Y):
    """Builds a lookup from each X-value to its most frequent companion Y-value.
    Args:
        X (sequence): sequence of discrete outcomes
        Y (sequence): sequence of discrete outcomes
    Returns:
        (dict): map from every distinct X-value to the Y-value that occurs
            most often at the positions where that X-value is observed
    """
    return {
        x_val: Counter(y_vals).most_common(1)[0][0]
        for x_val, y_vals in stratify(X, Y).items()
    }

In [242]:
X = np.random.choice([0, 1, 2, 3], 1000)
Y = np.random.choice([4, 5, 6], 1000)

f = map_to_majority(X, Y)

In [243]:
f

{0: 5, 3: 6, 2: 6, 1: 5}

In [246]:
pair_cnt = defaultdict(lambda: defaultdict(int))
for x, y in zip(X, Y):
    pair_cnt[x][y] += 1

In [247]:
pair_cnt

defaultdict(<function __main__.<lambda>()>,
            {0: defaultdict(int, {6: 69, 4: 69, 5: 80}),
             3: defaultdict(int, {4: 102, 5: 83, 6: 106}),
             2: defaultdict(int, {5: 86, 6: 90, 4: 82}),
             1: defaultdict(int, {5: 87, 4: 72, 6: 74})})

In [77]:
f_inv = dict()
for k, v in f.items():
    f_inv[v] = k
f_inv

{-1: 1, -2: -1, 2: 4}

In [75]:
freqs = Counter(X)
for freq in freqs.items():
    print(freq)

(2, 244)
(1, 260)
(-1, 253)
(4, 243)


In [329]:
def update_regression(C, E, f):
    """Unimplemented stub: the body is only this docstring, so calls return None.

    NOTE(review): presumably meant to refine the mapping f from the data in
    C and E; confirm the intended behaviour before implementing.

    Args
    -------
        C (sequence): sequence of discrete outcomes
        E (sequence): sequence of discrete outcomes
        f (dict): map from Y-values to frequently co-occurring X-values
        
    """
    

SyntaxError: invalid syntax (2264971159.py, line 1)

In [322]:
def cause_effect_ml(C_freqs, E_freqs, n):
    """Compute maximum log likelihood for cause & effect pair.

    Not implemented yet: the body is a bare ``pass``, so the function
    currently returns None regardless of its arguments.
    
    Args
    -------
        C_freqs (Counter): Counter of discrete outcomes (Cause)
        E_freqs (Counter): Counter of discrete outcomes (Effect)
        n (int): sample size 
    
    Returns
    -------
        (float): maximum log likelihood (intended; None at present)
    """
    pass

def _add_code_length(total, counts, log_n):
    """Adds count * (log_n - log2(count)) to total for every count, in order."""
    for count in counts:
        total += count * (log_n - log2(count))
    return total


def maximum_log_likelihood(X, Y, model_type, f=None):
    """Compute the negative maximum log-likelihood of the model given z^n.

    Every term has the form ``count * (log2(n) - log2(count))``:

    * "indep": marginal counts of X plus marginal counts of Y.
    * "confounder": joint counts of the (x, y) pairs.
    * "to": marginal counts of X plus counts of the residuals
      ``(y - f[x]) % |Y|``, with f taken from ``map_to_majority(X, Y)``.
    * "gets": the mirror image of "to", with the roles of X and Y swapped.

    The residual arithmetic subtracts outcomes and reduces them modulo the
    number of distinct values, so the outcomes must be integer-like.
    
    Args
    ------
        X (sequence): sequence of discrete outcomes
        Y (sequence): sequence of discrete outcomes
        model_type (str): one of ["to", "gets", "indep", "confounder"]
        f (dict): currently ignored; for the time being the mapping is always
            re-estimated from the data with ``map_to_majority``.
    Returns
    -----------
        (float): (negative) maximum log likelihood, in bits
    Raises
    -----------
        ValueError: if model_type is not one of the supported values, or if
            X and Y differ in length.
    """
    if model_type not in ("to", "gets", "indep", "confounder"):
        raise ValueError("unknown model_type: %r" % (model_type,))
    if len(X) != len(Y):
        # Pair counts would silently drop the unmatched tail while n would
        # still be taken from X, giving an inconsistent likelihood.
        raise ValueError("X and Y must have the same length")

    log_n = log2(len(X))

    if model_type == "indep":
        total = _add_code_length(0.0, Counter(X).values(), log_n)
        return _add_code_length(total, Counter(Y).values(), log_n)

    if model_type == "confounder":
        joint = Counter(zip(X, Y))
        supp_X = list(set(X))
        supp_Y = list(set(Y))
        pair_counts = (joint[(x, y)] for x in supp_X for y in supp_Y)
        return _add_code_length(0.0, pair_counts, log_n)

    # Directed models: the cause is encoded by its marginal, the effect by
    # the distribution of its residual around the majority mapping.
    if model_type == "to":
        cause, effect = X, Y
    else:
        cause, effect = Y, X

    modulus = len(set(effect))
    predict = map_to_majority(cause, effect)
    residuals = Counter((e - predict[c]) % modulus for c, e in zip(cause, effect))

    total = _add_code_length(0.0, Counter(cause).values(), log_n)
    # Residuals always lie in 0 .. modulus - 1, so enumerate that range rather
    # than the observed effect labels; the latter loses every residual whose
    # value is not itself a label (e.g. when labels are 4, 5, 6).
    residual_counts = (residuals[r] for r in range(modulus))
    return _add_code_length(total, residual_counts, log_n)
        

In [323]:

pair_cnt = defaultdict(lambda: defaultdict(int))
for x, y in zip(x0, x1):
    pair_cnt[x][y] += 1
pair_cnt

defaultdict(<function __main__.<lambda>()>,
            {0: defaultdict(int, {1: 800, 2: 814, 0: 856}),
             3: defaultdict(int, {2: 822, 1: 890, 0: 846}),
             2: defaultdict(int, {1: 831, 0: 826, 2: 811}),
             1: defaultdict(int, {2: 833, 1: 851, 0: 820})})

In [324]:
X_freqs = Counter(x0)
X_freqs, sum(X_freqs.values())

(Counter({0: 2470, 3: 2558, 2: 2468, 1: 2504}), 10000)

In [325]:
maximum_log_likelihood(x0, x1, model_type="to")

35845.02851718127

In [326]:
maximum_log_likelihood(x0, x1, model_type="gets")

35845.445024273075

In [327]:
maximum_log_likelihood(x0, x1, model_type="indep")

35847.11205869069

In [328]:
maximum_log_likelihood(x0, x1, model_type="confounder")

35844.0046083294

In [211]:
maximum_log_likelihood(X=np.random.randint(5, size=100000),
                          Y=np.random.randint(4, size=10000), model_type="to")

514437.27156313084

In [218]:
def sc(X, Y, model_type: str, X_ndistinct_vals=None, Y_ndistinct_vals=None):
    """Returns the stochastic complexity of z^n (two discrete sequences).

    The value is the negative maximum log-likelihood plus the parametric
    complexity of the chosen model. For the directed models ("to", "gets")
    the code length of the function is added as well: the domain size of
    the source times log2 of the domain size of the target.

    Args
    ------
        X (sequence): sequence of discrete outcomes
        Y (sequence): sequence of discrete outcomes
        model_type (str): ["to", "gets", "indep", "confounder"]
        X_ndistinct_vals (int): number of distinct values of the multinomial r.v X.
            When omitted (or falsy), counted from X.
        Y_ndistinct_vals (int): number of distinct values of the multinomial r.v Y.
            When omitted (or falsy), counted from Y.

    Returns
    ----------
        float: stochastic complexity of the given dataset
    """
    assert len(X)==len(Y)

    if not X_ndistinct_vals:
        X_ndistinct_vals = len(set(X))
    if not Y_ndistinct_vals:
        Y_ndistinct_vals = len(set(Y))

    data_term = maximum_log_likelihood(X, Y, model_type)
    model_term = parametric_complexity(
        X, Y, model_type, X_ndistinct_vals, Y_ndistinct_vals
    )
    total = data_term + model_term

    # Directed models additionally pay for describing the function.
    if model_type == "to":
        return total + X_ndistinct_vals * log2(Y_ndistinct_vals)
    if model_type == "gets":
        return total + Y_ndistinct_vals * log2(X_ndistinct_vals)
    return total
    

In [297]:
MODEL_CANDIDATES = ["to", "gets", "indep", "confounder"]

def ndm(X, Y, ):
    """NML Discrete Model: scores every candidate model on the pair (X, Y).

    Returns a list with one ``(stochastic_complexity, model_type)`` tuple per
    entry of MODEL_CANDIDATES, in that order.
    """
    return [(sc(X, Y, model_type), model_type) for model_type in MODEL_CANDIDATES]

        

In [317]:
ndm(x0, x1)

[(35883.9621303126, 'to'),
 (35884.038787401514, 'gets'),
 (35879.70582181913, 'indep'),
 (35905.69288080137, 'confounder')]

In [301]:
c = np.random.randint(args.m0, size=args.N)
x0 = (c + np.random.randint(args.m0, size=args.N)) % args.m0
x1 = (c + np.random.randint(args.m1, size=args.N)) % args.m1

In [306]:
x0 = np.random.randint(args.m0, size=args.N)
x1 = np.random.randint(args.m1, size=args.N)

In [316]:
x0 = np.random.randint(args.m0, size=args.N)
x1 = (x0 + np.random.randint(args.m1, size=args.N)) % args.m1

In [305]:
args.N=10000