In [28]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Loading data from a text file

The following cell loads the data stored in the text files "train.txt" and "test.txt". This results in two NumPy arrays with shapes 500x5 (train.txt) and 5000x5 (test.txt) - 500 and 5000 samples of the following 5 random variables:

Column 0: S ... stress (false (0) or true (1))     
Column 1: E ... easily catches cold (false (0) or true (1))  
Column 2: G ... genetic disposition (false (0) or true (1))   
Column 3: I ... increased blood pressure (false (0) or true (1))   
Column 4: H ... heart attack (false (0) or true (1))   

In [30]:
# Load the training dataset Y. Note: the file "train.txt" has to be in the
# directory the IPython notebook server was started in.
Y = np.loadtxt('train.txt', dtype=int)
print(Y.shape)

# Load the test dataset Z (used for the last exercise). Again, the file has
# to be in the same directory.
Z = np.loadtxt('test.txt', dtype=int)
print(Z.shape)

# Column indices of the random variables in Y and Z
_s_, _e_, _g_, _i_, _h_ = 0, 1, 2, 3, 4

# Short alias used to insert broadcast axes when probability tables are
# multiplied together.
na = np.newaxis

(500, 5)
(5000, 5)


# Helper functions for probability tables

In [33]:
def mle_1d(X, x):
    """Maximum-likelihood table of a single binary column.

    X -- 2-D sample array, one row per sample
    x -- index of the column to summarise

    Returns a length-2 array [P(col = 0), P(col = 1)], where P(col = 1) is
    the column sum divided by the number of rows. With zero rows both
    entries are NaN.
    """
    column = X[:, x]
    n_samples = column.shape[0]
    if n_samples == 0:
        # No data at all: nothing can be estimated.
        return np.array([np.nan, np.nan])
    p_true = column.sum() / float(n_samples)
    return np.array([1 - p_true, p_true])
    
def mle_2d(X, a, b):
    """Maximum-likelihood conditional table P(a | b) for two binary columns.

    X -- 2-D sample array, one row per sample (entries expected to be 0/1;
         rows holding any other value in column a or b are not counted)
    a -- index of the column being predicted
    b -- index of the column being conditioned on

    Returns a 2x2 array t with t[i, j] = P(X[:, a] = i | X[:, b] = j), i.e.
    every column of t sums to 1. A column j for which no sample has
    X[:, b] == j cannot be estimated and is filled with NaN.
    """
    x = X[:, a]
    y = X[:, b]
    # Count every (a, b) combination directly instead of deriving the last
    # cell as "1 - rest", which leaks floating-point rounding noise into a
    # cell that should be exactly zero.
    counts = np.zeros((2, 2))
    for i, j in np.ndindex(2, 2):
        counts[i, j] = np.count_nonzero((x == i) & (y == j))
    # Normalise over a (axis 0) only. Dividing by the row sums first, as a
    # double normalisation would, turns a whole column into NaN whenever one
    # value of a never occurs in the data.
    with np.errstate(divide='ignore', invalid='ignore'):
        return counts / counts.sum(0)
    

def mle_3d(X, a, b, c):
    """Maximum-likelihood conditional table P(a | b, c) for binary columns.

    X -- 2-D sample array, one row per sample (entries expected to be 0/1;
         rows holding any other value in column a, b or c are not counted)
    a -- index of the column being predicted
    b -- index of the first column being conditioned on
    c -- index of the second column being conditioned on

    Returns a 2x2x2 array t with
    t[i, j, k] = P(X[:, a] = i | X[:, b] = j, X[:, c] = k),
    so t.sum(0) is 1 for every observed (j, k). A combination (j, k) that
    never occurs in the data cannot be estimated and is filled with NaN.
    """
    x = X[:, a]
    y = X[:, b]
    z = X[:, c]
    # Count every (a, b, c) combination directly. Deriving the last cell as
    # "1 - rest" can leave a tiny non-zero (or negative) value in a cell
    # that was never observed, which then normalises to a bogus probability.
    counts = np.zeros((2, 2, 2))
    for i, j, k in np.ndindex(2, 2, 2):
        counts[i, j, k] = np.count_nonzero((x == i) & (y == j) & (z == k))
    # counts.sum(0) has shape (2, 2) over (b, c) and broadcasts against the
    # two trailing axes of counts, normalising over a.
    with np.errstate(divide='ignore', invalid='ignore'):
        return counts / counts.sum(0)

# 1.1.A) Probability tables
of P(S), P(G), P(I), P(E), P(H)

In [None]:
# Marginal tables [P(v = 0), P(v = 1)] of the five variables, estimated from
# the training set. A generator is unpacked so that no loop variable is left
# behind in the notebook namespace.
S, G, I, E, H = (mle_1d(Y, column) for column in (_s_, _g_, _i_, _e_, _h_))

# 1.1.B) log likelihood of model 1 relative to the train dataset.

In [36]:
# Model 1 treats the five variables as independent:
#   P(S, E, G, I, H) = P(S) P(E) P(G) P(I) P(H)
# Every factor is placed on its own axis so that the product broadcasts to a
# 2x2x2x2x2 joint table indexed [s, e, g, i, h].
SEGIH1 = (
    S[:, na, na, na, na]        # P(S)
    * E[na, :, na, na, na]      # P(E)
    * G[na, na, :, na, na]      # P(G)
    * I[na, na, na, :, na]      # P(I)
    * H[na, na, na, na, :]      # P(H)
)

# Look up the joint probability of every training sample and sum the logs.
ll1r = np.log(SEGIH1[Y[:, _s_], Y[:, _e_], Y[:, _g_], Y[:, _i_], Y[:, _h_]]).sum()
print(ll1r)

-970.811902926


# 1.2.A) Probability tables
of P(E | S), P(I | G, S), and P(H | I). (The remaining ones are already given by 1.1.A.)

In [38]:
# Conditional tables for model 2. The predicted variable is always on the
# first axis and the conditioning variables follow in the order given:
#   E_S[e, s], H_I[h, i], I_GS[i, g, s]
E_S = mle_2d(Y, a=_e_, b=_s_)
H_I = mle_2d(Y, a=_h_, b=_i_)
I_GS = mle_3d(Y, a=_i_, b=_g_, c=_s_)

# 1.2.B) log likelihood of model 2 relative to the train dataset

In [40]:
# Model 2 factorises the joint as
#   P(S, E, G, I, H) = P(S) P(E|S) P(G) P(I|G,S) P(H|I)
# Each table is transposed so that its axes appear in s, e, g, i, h order and
# is then expanded with `na` to broadcast into the [s, e, g, i, h] joint.
SEGIH2 = (
    S[:, na, na, na, na]                                # P(S)       -> [s]
    * E_S.transpose((1, 0))[:, :, na, na, na]           # P(E|S)     -> [s, e]
    * G[na, na, :, na, na]                              # P(G)       -> [g]
    * I_GS.transpose((2, 1, 0))[:, na, :, :, na]        # P(I|G,S)   -> [s, g, i]
    * H_I.transpose((1, 0))[na, na, na, :, :]           # P(H|I)     -> [i, h]
)

# Look up the joint probability of every training sample and sum the logs.
ll2r = np.log(SEGIH2[Y[:, _s_], Y[:, _e_], Y[:, _g_], Y[:, _i_], Y[:, _h_]]).sum()
print(ll2r)

-940.762978609


# 1.3.A) Probability tables
of P(S | G), P(E | S, I), and P(H | I, E). (The remaining ones are already given by 1.1.A and 1.2.A.)


In [42]:
# Conditional tables for model 3. The predicted variable is always on the
# first axis and the conditioning variables follow in the order given:
#   S_G[s, g], E_SI[e, s, i], H_IE[h, i, e]
S_G = mle_2d(Y, a=_s_, b=_g_)
E_SI = mle_3d(Y, a=_e_, b=_s_, c=_i_)
H_IE = mle_3d(Y, a=_h_, b=_i_, c=_e_)

# 1.3.B) log likelihood of model 3 relative to the train dataset

In [43]:
# Model 3 factorises the joint as
#   P(S, E, G, I, H) = P(S|G) P(E|S,I) P(G) P(I|G,S) P(H|I,E)
# Each table is transposed so that its axes appear in s, e, g, i, h order and
# is then expanded with `na` to broadcast into the [s, e, g, i, h] joint.
SEGIH3 = (
    S_G[:, na, :, na, na]                               # P(S|G)     -> [s, g]
    * E_SI.transpose((1, 0, 2))[:, :, na, :, na]        # P(E|S,I)   -> [s, e, i]
    * G[na, na, :, na, na]                              # P(G)       -> [g]
    * I_GS.transpose((2, 1, 0))[:, na, :, :, na]        # P(I|G,S)   -> [s, g, i]
    * H_IE.transpose((2, 1, 0))[na, :, na, :, :]        # P(H|I,E)   -> [e, i, h]
)

# Look up the joint probability of every training sample and sum the logs.
ll3r = np.log(SEGIH3[Y[:, _s_], Y[:, _e_], Y[:, _g_], Y[:, _i_], Y[:, _h_]]).sum()
print(ll3r)

-939.734032094


# 2.A
Model 3 achieves the highest log likelihood on the training data (-939.73, versus -940.76 for model 2 and -970.81 for model 1) :-)

# 2.B Compare test data

In [44]:
# Log likelihood of each model's joint table on the test dataset Z, printed
# in the order model 1, model 2, model 3.
for joint in (SEGIH1, SEGIH2, SEGIH3):
    print(np.log(joint[Z[:, _s_], Z[:, _e_], Z[:, _g_], Z[:, _i_], Z[:, _h_]]).sum())

-9545.45920494
-9216.89949044
-9226.91241436


This time model 2 wins (-9216.90, versus -9226.91 for model 3 and -9545.46 for model 1). Model 3 is probably overfitting the training data.