In [2]:
import pandas as pd
import numpy as np

def is_cloudy():
    """Sample the root variable of the network: is the sky cloudy?

    Cloudiness depends on nothing else. Each call takes one uniform
    draw from NumPy's global RNG and reports True with probability 0.5.
    """
    draw = np.random.random()
    return draw < 0.5
    
def is_sprinkler(cloudy):
    """Sample whether the sprinkler is on, conditioned on cloudiness.

    When it is cloudy the sprinkler is on with probability 0.1;
    when it is sunny it is on with probability 0.5. Exactly one
    uniform draw is taken from NumPy's global RNG per call.
    """
    p_on = .1 if cloudy else .5
    return np.random.random() < p_on

def is_rain(cloudy):
    """Sample whether it rains, conditioned on cloudiness.

    Rain occurs with probability 0.8 when it is cloudy and with
    probability 0.2 otherwise. Exactly one uniform draw is taken
    from NumPy's global RNG per call.
    """
    p_rain = .8 if cloudy else .2
    return np.random.random() < p_rain
    
def is_wet(sprinkler, rain):
    """Sample whether the grass is wet, given the sprinkler and the rain.

    Conditional probability of wet grass:

    * sprinkler on and raining   -> 0.99
    * exactly one of the two     -> 0.90
    * neither                    -> 0 (always dry)

    Args:
        sprinkler: truthy if the sprinkler is on.
        rain: truthy if it is raining.

    Returns:
        bool: True if the grass is wet for this sample.
    """
    if not (sprinkler or rain):
        # Deterministic outcome: return without drawing, so the global
        # RNG stream is left untouched in this case.
        return False

    p_wet = .99 if (sprinkler and rain) else .9
    # Exactly one uniform draw from NumPy's global RNG.
    return bool(np.random.random() < p_wet)

In [71]:
# Generate a toy dataset by sampling each variable after its parents
# (cloudy first, then sprinkler and rain, then wet). We could have obtained
# the joint distribution analytically by multiplying the conditional
# probabilities instead.

n_samples = 10000

# Seed NumPy's global RNG -- the one the is_* samplers draw from -- so that
# Restart & Run All regenerates exactly the same dataset.
SEED = 42
np.random.seed(SEED)

ls_c = []
ls_s = []
ls_r = []
ls_w = []

for _ in range(n_samples):
    # order matters: every variable is drawn after the ones it depends on
    c = is_cloudy()
    s = is_sprinkler(c)
    r = is_rain(c)
    w = is_wet(s, r)

    ls_c.append(c)
    ls_s.append(s)
    ls_r.append(r)
    ls_w.append(w)

# one list of samples per variable, keyed by the variable's name
collection = {"cloudy": ls_c, "sprinkler": ls_s, "rain": ls_r, "wet": ls_w}

In [72]:
# Table of joint observations: one row per sample, one column per variable.
df = pd.DataFrame(data=collection)
df.head(n=5)

Unnamed: 0,cloudy,rain,sprinkler,wet
0,True,True,False,True
1,False,False,True,True
2,False,False,True,True
3,False,False,True,False
4,True,True,False,True


In [120]:
# Column names for a hand-rolled conditional table of `child` given `parent`.
parent, child = ['cloudy'], 'sprinkler'

In [121]:
# Child column, and one column per parent (kept as a list for pd.crosstab).
cs = df[child]
ps = [df[column] for column in parent]

In [122]:
# Row-normalised contingency table: each row sums to 1 across the child's values.
# https://stackoverflow.com/questions/53510319/python-pandas-merging-with-more-than-one-level-overlap-on-a-multi-index-is-not
(
    pd.crosstab(index=ps, columns=cs, normalize='index')
    .reset_index()
)

sprinkler,cloudy,False,True
0,False,0.5001,0.4999
1,True,0.893869,0.106131


In [123]:
class Node(object):
    """A single variable of the Bayesian network.

    Attributes:
        name: label of the variable.
        ls_parents: list of the parent nodes this variable depends on;
            an empty list for a root variable.
    """

    def __init__(self, name, ls_parents=None):
        """Create a node.

        Args:
            name: label of the variable.
            ls_parents: iterable of parent nodes, or None (the default)
                for a variable without parents.
        """
        self.name = name
        # Normalise "no parents" to an empty list so consumers can always
        # iterate, and copy the input so the node never aliases the
        # caller's list.
        self.ls_parents = list(ls_parents) if ls_parents is not None else []

    def __repr__(self):
        parent_names = [getattr(p, "name", p) for p in self.ls_parents]
        return "Node({!r}, parents={!r})".format(self.name, parent_names)

In [124]:
class BN(object):
    
    def __init__(self, ls_nodes, observations):
        self.ls_nodes = ls_nodes
        self.observations = observations
        
        # create dict of nodes for fast lookup
        self.dict_nodes = self._generate_dict_nodes(ls_nodes)
        
    def _generate_dict_nodes(self, ls_nodes):
        d = {}
        for node in ls_nodes:
            d[node.name] = node
        return d
    
    def generate_cpt(self, name):
        
        # first, fetch node
        node = self.dict_nodes[name]
        
        # then find its parents
        parent = node.ls_parents
        
        # subset its corresponding marginals
        ps = [self.observations[x.name] for x in parent]
        cs = self.observations[node.name]
        
        # finally, crosstab
        return pd.crosstab(ps, cs, normalize = 'index').reset_index()

In [125]:
# Network structure: cloudy is the root; rain and sprinkler each depend on
# cloudy; wet depends on both rain and sprinkler.
C = Node(name="cloudy", ls_parents=None)
R = Node(name="rain", ls_parents=[C])
S = Node(name="sprinkler", ls_parents=[C])
W = Node(name="wet", ls_parents=[R, S])

In [126]:
# All four nodes of the network; this list is what gets handed to BN.
ls_n = [C, R, S, W]

In [9]:
# Load observations from disk, discarding the "Unnamed: 0" column
# (presumably the index written out when the file was saved -- confirm).
# NOTE(review): the network below is built from `df`, not from this frame;
# confirm which of the two is intended.
joint_observations = (
    pd.read_csv("observations.csv")
    .drop(columns="Unnamed: 0")
)
joint_observations.columns

Index(['cloudy', 'rain', 'sprinkler', 'wet'], dtype='object')

In [127]:
# Build the network from the node list and the sampled observations.
bayes_network = BN(ls_nodes=ls_n, observations=df)

In [131]:
# Conditional table of "wet" given its parents, estimated from the data.
h = bayes_network.generate_cpt(name="wet")

In [134]:
# Inspect the table's column labels.
h.columns

Index(['rain', 'sprinkler', False, True], dtype='object', name='wet')

In [135]:
# Show the first rows of the estimated table for "wet".
h.head()

wet,rain,sprinkler,False,True
0,False,False,1.0,0.0
1,False,True,0.101908,0.898092
2,True,False,0.097028,0.902972
3,True,True,0.012346,0.987654
