# Causality Inference in Continuous Data using Bayesian Model Selection

In [1]:
import pickle
import numpy as np
import os
import sys
import pandas as pd
from sklearn import preprocessing
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.linear_model import BayesianRidge, ARDRegression
from scipy.stats import t

Creating Tübingen data set (Cause Effect Pairs - CEP)

In [13]:
data_dir = "./data/causality/causality-tubingen/"

file_list = pd.Series(os.listdir(data_dir))
file_list = file_list[file_list.str.contains("pair0")]
file_list = file_list[~file_list.str.contains("_des")].sort_values()

all_data = []
data_to_be_excluded = ["pair0052.txt","pair0053.txt","pair0054.txt","pair0055.txt","pair0071.txt"]
for cur_file in file_list:
    if cur_file in data_to_be_excluded:
        continue
    current_dataset = {}
    with open(data_dir + cur_file.replace(".txt", "_des.txt"), 'r', encoding="utf8", errors='ignore') as myfile:
        text_data = pd.Series(myfile.read().lower().split("\n"))
        text_data = text_data.str.replace("x \(int\):","x:").str.replace("y \(float\):","y:").str.replace("x =","x:").str.replace("y =","y:")
        text_data = text_data.str.replace("x\(t\)","x:").str.replace("y\(t\)","y:").str.replace("x is", "x:").str.replace("y is", "y:")
        text_data = text_data.str.replace("x \(first column\):", "x:").str.replace("y \(second column\):", "y:").str.replace("first column \(x\):","x:")
        text_data = text_data.str.replace("second column \(y\):","y:").str.replace("x :","x:").str.replace("y :","y:")
        try:
            current_dataset["file"] = cur_file
            current_dataset["x"] = text_data[text_data.str.contains("x:")].values[0].replace("x:","").strip().split("\t")[0]
            current_dataset["y"] = text_data[text_data.str.contains("y:")].values[0].replace("y:","").strip().split("\t")[0]
            current_dataset["dir"] = text_data[(text_data.str.contains("- - >") | text_data.str.contains("-- >") | text_data.str.contains("-->") | text_data.str.contains("->")) & (text_data.str.contains("x"))  & (text_data.str.contains("y")) & (~text_data.str.contains("very"))].values[0].replace(" ", "").replace("-->", "to").replace("->", "to").replace("groundtruth", "").replace(":", "")
            """print(cur_file)
            print("x: ",current_dataset["x"])
            print("y: ",current_dataset["y"])
            print(current_dataset["dir"])"""
        except: 
            print(text_data)
    current_dataset["data"] = np.loadtxt(data_dir+cur_file)
    all_data.append(current_dataset)
pickle.dump(all_data, open("data/causality/causality-tubingen/causality-tubingen.pkl","wb"))

## Modeling approach

We model causality as the statistical independence between the marginal distribution of the cause variable and the conditional distribution of the effect variable given the cause variable. We construct two versions of the same model in two directions; we deduct the direction of causality by comparing the model marginal likelihood for two directions, and decide for the direction for which the marginal likelihood is higher.

In all the models below, we assume the marginal distribution of the cause variable $P(C)$ is a Gaussian distribution with the variance unknown, thus we will be using a Student's $t$ distribution to model it. Therefore the alternative models presented below will differ from each other according to how the conditional distribution $P(E|C)$ is modeled.

### Modeling the conditional distribution with Bayesian linear regression

We model the conditional distribution with a Bayesian linear regression. The graphical model is presented below:

<img src="img/BLR.png" width="200">

$$
p(y|w,x) = \mathcal{N}(y|f(x;w), \beta) \\
f(x;w) = \sum^B_{i=1}w_i \phi_i (x) = w^\intercal \phi(x)\\
p(w|\alpha) = \mathcal{N} (w|0, \alpha^{-1} I) \\
$$

where $\alpha$ and $\beta$ are precision parameters. The priors for $\alpha$ and $\beta$ are chosen to be gamma distributions. In our construction the basis function corresponds to the powers of $x$ such that $\phi_i(x) = x^i$.  The number of basis functions, as well as the hyperparameters for the prior distributions are chosen such that the marginal likelihood for the given direction is maximized.

#### Implementation through scikit-learn library

In [None]:
def scale_x_y(x, y, scale_x, scale_y):
    if scale_x:
        x = preprocessing.scale(x)
    if scale_y:
        y = preprocessing.scale(y)
    return x, y

def get_likelihood(x_orig, x, y, max_term, params):
    alpha_1, alpha_2, lambda_1, lambda_2 = params
    cond_likelihood =-np.inf
    for i in range(1,max_term+1):
        x = np.hstack([x_orig ** j for j in range(1,i+1)])
        reg = BayesianRidge(compute_score=True, ormalize=normalize,
                            alpha_1=alpha_1, alpha_2=alpha_2, lambda_1=lambda_1, lambda_2=lambda_2)
        reg.fit(x,y)
        if cond_likelihood < reg.scores_[-1]:
            cond_likelihood = reg.scores_[-1]
    likelihood = cond_likelihood + np.sum(np.log(t.pdf(x_orig, len(x_orig)-1))) # xtoy# + 
    return likelihood, cond_likelihood

def compare_bayesian_regression(max_term=5, scale_x=True, scale_y=False, params=(1e-6,1e-6,1e-6,1e-6)):
    correct_exp, total_exp = 0, 0
    
    for entry in all_data:
        data = entry['data']
        
        x, y = data[:,0], data[:,1]
        x, y = scale_x_y(x, y, scale_x, scale_y)
        x_orig = x.reshape(1,len(x)).T
        xtoy, xtoy_cond = get_likelihood(x_orig, x, y, max_term, params)

        x, y = data[:,1], data[:,0]
        x, y = scale_x_y(x, y, scale_x, scale_y)
        x_orig = x.reshape(1,len(x)).T
        ytox, ytox_cond = get_likelihood(x_orig, x, y, max_term, params)
        
        result = "xtoy" if xtoy>ytox else "ytox"
        if result == entry["dir"]:
            correct_exp += 1
        total_exp += 1

    return correct_exp/total_exp

In [None]:
compare_bayesian_regression()

In [None]:
results = np.zeros((12,12))
for i in range(-6,7):
    for j in range(-6,7):
        results[i, j] = compare_bayesian_regression(params=(10**i, 1e-6, 10**j, 1e-6))

In [243]:
results = np.zeros((12,12))
for i in range(-6,7):
    for j in range(-6,7):
        results[i, j] = compare_bayesian_regression(scale_x=True, scale_y=True, params=(10**i, 1e-6, 10**j, 1e-6))

In [215]:
normalize=False
max_term = 1

total_exp = 0
correct_exp = 0.
for entry in all_data:
    data = entry['data']
    #checking x to y
    x, y = data[:,0], data[:,1]
    #x, y = preprocessing.scale(x), preprocessing.scale(y)
    #x = preprocessing.scale(x)
    x_orig = x.reshape(1,len(x)).T
    xtoy=-np.inf
    for i in range(1,max_term+1):
        x = np.hstack([x_orig ** j for j in range(1,i+1)])
        reg = GaussianProcessRegressor()
        reg.fit(x,y)
        if xtoy < reg.log_marginal_likelihood_value_:
            xtoy = reg.log_marginal_likelihood_value_
    xtoy = xtoy + np.sum(np.log(t.pdf(x_orig, len(x_orig)-1))) # xtoy# + 
    #checking y to x
    x, y = data[:,1], data[:,0]
    #x, y = preprocessing.scale(x), preprocessing.scale(y)
    #x = preprocessing.scale(x)
    x_orig = x.reshape(1,len(x)).T
    ytox=-np.inf
    for i in range(1,max_term+1):
        x = np.hstack([x_orig ** j for j in range(1,i+1)])
        reg = GaussianProcessRegressor()
        reg.fit(x,y)
        if ytox < reg.log_marginal_likelihood_value_:
            ytox = reg.log_marginal_likelihood_value_
    ytox = ytox + np.sum(np.log(t.pdf(x_orig, len(x_orig)-1)))  #ytox # +
    if xtoy > ytox:
        result="xtoy"
    else:
        result="ytox"
    
    if result == entry["dir"]:
        correct_exp += 1
    total_exp += 1
    
print(correct_exp/total_exp)

  Px /= np.sqrt(r*np.pi)*(1+(x**2)/r)**((r+1)/2)
  Px /= np.sqrt(r*np.pi)*(1+(x**2)/r)**((r+1)/2)


0.4791666666666667


## For bridges data

In [122]:
s_list = []
s = """   1.  IDENTIF	-	-			identifier of the examples
   2.  RIVER	n	A, M, O
   3.  LOCATION	n       1 to 52
   4.  ERECTED	c,n	1818-1986 ; CRAFTS, EMERGING, MATURE, MODERN
   5.  PURPOSE	n	WALK, AQUEDUCT, RR, HIGHWAY
   6.  LENGTH	c,n	804-4558 ; SHORT, MEDIUM, LONG
   7.  LANES	c,n	1, 2, 4, 6 ; 1, 2, 4, 6
   8.  CLEAR-G	n	N, G
   9.  T-OR-D	n	THROUGH, DECK
   10. MATERIAL	n	WOOD, IRON, STEEL
   11. SPAN	n	SHORT, MEDUIM, LONG
   12. REL-L	n	S, S-F, F
   13. TYPE	n	WOOD, SUSPEN, SIMPLE-T, ARCH, CANTILEV, CONT-T"""
for line in s.split("\n"):
    s_list.append(line[7:].lower().split("\t")[0])
df = pd.read_csv("data/causality/bridges/bridges.data.version2", header=None)
df.columns = s_list

In [124]:
from collections import Counter

In [135]:
df[(df[["erected", "span"]]!="?").all(axis=1)].groupby(["erected", "span"]).count()["identif"]

erected   span  
CRAFTS    MEDIUM     6
          SHORT      6
EMERGING  LONG       2
          MEDIUM     8
          SHORT      1
MATURE    LONG      19
          MEDIUM    30
          SHORT      2
MODERN    LONG       9
          MEDIUM     9
Name: identif, dtype: int64

In [138]:
Counter(df.type)

Counter({'WOOD': 16,
         'SUSPEN': 11,
         'SIMPLE-T': 44,
         'ARCH': 13,
         'CANTILEV': 11,
         'NIL': 1,
         'CONT-T': 10,
         '?': 2})

In [120]:
(df[["material", "lanes"]]!="?").all(axis=1).sum()

92

In [121]:
df[["material", "lanes"]]

Unnamed: 0,material,lanes
0,WOOD,2
1,WOOD,2
2,WOOD,1
3,WOOD,2
4,WOOD,2
5,WOOD,2
6,IRON,1
7,IRON,2
8,WOOD,1
9,WOOD,2


In [31]:
df.groupby([0,1,2]).count()[3]

0      1      2    
high   high   2        27
              3        27
              4        27
              5more    27
       low    2        27
              3        27
              4        27
              5more    27
       med    2        27
              3        27
              4        27
              5more    27
       vhigh  2        27
              3        27
              4        27
              5more    27
low    high   2        27
              3        27
              4        27
              5more    27
       low    2        27
              3        27
              4        27
              5more    27
       med    2        27
              3        27
              4        27
              5more    27
       vhigh  2        27
              3        27
                       ..
med    high   4        27
              5more    27
       low    2        27
              3        27
              4        27
              5more    27
       med    2   

In [22]:
Counter(df[6])

Counter({'unacc': 1210, 'acc': 384, 'vgood': 65, 'good': 69})

In [24]:
Counter(df.groupby(0.his

TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed

In [11]:
df[[0,1]]

Unnamed: 0,0,1
0,vhigh,vhigh
1,vhigh,vhigh
2,vhigh,vhigh
3,vhigh,vhigh
4,vhigh,vhigh
5,vhigh,vhigh
6,vhigh,vhigh
7,vhigh,vhigh
8,vhigh,vhigh
9,vhigh,vhigh


In [None]:
pd.read_csv()

In [68]:
df = pd.read_csv("data/causality/abalone/abalone.data", header=None)

In [78]:
len(df[3].round(2).unique())

28

In [76]:
df[3].round?

In [None]:
df[3].round

In [55]:
len((df[1].unique()))

134

In [62]:
a = np.sort(df[1].unique())

In [65]:
df[1].str.

<pandas.core.strings.StringMethods at 0x7f178754fa90>

In [66]:
df[1].round

<bound method Series.round of 0       0.455
1        0.35
2        0.53
3        0.44
4        0.33
5       0.425
6        0.53
7       0.545
8       0.475
9        0.55
10      0.525
11       0.43
12       0.49
13      0.535
14       0.47
15        0.5
16      0.355
17       0.44
18      0.365
19       0.45
20      0.355
21       0.38
22      0.565
23       0.55
24      0.615
25       0.56
26       0.58
27       0.59
28      0.605
29      0.575
        ...  
4147    0.695
4148     0.77
4149     0.28
4150     0.33
4151     0.35
4152     0.37
4153     0.43
4154    0.435
4155     0.44
4156    0.475
4157    0.475
4158     0.48
4159     0.56
4160    0.585
4161    0.585
4162    0.385
4163     0.39
4164     0.39
4165    0.405
4166    0.475
4167      0.5
4168    0.515
4169     0.52
4170     0.55
4171     0.56
4172    0.565
4173     0.59
4174      0.6
4175    0.625
4176     0.71
Name: 1, Length: 4177, dtype: object>

In [None]:
pd.read_csv("./data/causality/")

In [8]:
from scipy.io import loadmat 
import scipy
import numpy as np

In [9]:
len(loadmat("data/nips/nips_data_yearly_collabs.mat")["yearly_collabs"])

26

In [11]:
sum_matrix = scipy.sparse.csr_matrix.todense(loadmat("data/nips/nips_data_yearly_collabs.mat")["yearly_collabs"][0][0])
for i in range(1,26):
    sum_matrix+=scipy.sparse.csr_matrix.todense(loadmat("data/nips/nips_data_yearly_collabs.mat")["yearly_collabs"][i][0])

In [12]:
np.sum(sum_matrix)

28136.0

In [13]:
13608*2

27216