In [5]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Loading data from a text file

The following cell loads the data stored in the text files "train.txt" and "test.txt". This results in two NumPy arrays with shapes 500x5 (train.txt) and 5000x5 (test.txt), i.e. 500 and 5000 samples, respectively, of the following 5 binary random variables (one per column):

Column 0: S ... stress (false (0) or true (1))     
Column 1: E ... easily catches cold (false (0) or true (1))  
Column 2: G ... genetic disposition (false (0) or true (1))   
Column 3: I ... increased blood pressure (false (0) or true (1))   
Column 4: H ... heart attack (false (0) or true (1))   

In [137]:
# Load the training dataset Y. Note: the file "train.txt" has to be in the
# directory the IPython notebook server was started in.
Y = np.loadtxt('train.txt', dtype=int)
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(Y.shape)

# Load the test dataset Z (used for the last exercise). Again, the file has to
# be in that same directory.
Z = np.loadtxt('test.txt', dtype=int)
print(Z.shape)

# Column indices of the random variables S, E, G, I, H in Y and Z
# (each row is one sample, each column one variable).
_s_, _e_, _g_, _i_, _h_ = 0, 1, 2, 3, 4

def counter_1d(X, x, y):
    """Count the rows of X whose column ``x`` holds the value ``y``."""
    matches = X[:, x] == y
    return int(np.count_nonzero(matches))

def counter_2d(X, x, y, v, w):
    """Count the rows of X where column ``x`` equals ``y`` and column ``v`` equals ``w``."""
    first_hits = X[:, x] == y
    second_hits = X[:, v] == w
    return int(np.count_nonzero(first_hits & second_hits))

def mle_1d(X, x):
    """Estimate the distribution of the 0/1-valued column ``x`` of X.

    Returns ``array([p0, p1])`` where ``p1`` is the column sum divided by the
    number of rows and ``p0 = 1 - p1``. Both entries are NaN when X has no
    rows.
    """
    column = X[:, x]
    n_rows = column.shape[0]
    if n_rows == 0:
        # No samples: the estimate is undefined (suggestion from T.A.).
        return np.array([np.nan, np.nan])
    p_one = column.sum() / float(n_rows)
    return np.array([1 - p_one, p_one])
    
def mle_2d(X, a, b):
    """Maximum-likelihood estimate of the joint distribution of two binary columns.

    Parameters
    ----------
    X : 2-D array, one sample per row, with 0/1-valued columns.
    a, b : column indices of the two variables.

    Returns
    -------
    2x2 float array ``P`` where ``P[i, j]`` is the number of rows with
    ``X[:, a] == i`` and ``X[:, b] == j`` divided by the number of rows.
    All entries are NaN when X has no rows, matching ``mle_1d``.
    """
    x = X[:, a]
    y = X[:, b]
    num = X.shape[0]
    joint = np.empty((2, 2))
    if num == 0:
        # No samples: the estimate is undefined; avoid dividing by zero.
        joint.fill(np.nan)
        return joint
    # Count every cell directly instead of deriving the last one as
    # 1 - (sum of the others): that shortcut accumulates rounding error and
    # would silently assign rows holding values other than 0/1 to cell (1, 1).
    for i in (0, 1):
        for j in (0, 1):
            joint[i, j] = np.count_nonzero((x == i) & (y == j)) / float(num)
    return joint
    

def mle_3d(X, a, b, c):
    """Maximum-likelihood estimate of the joint distribution of three binary columns.

    Parameters
    ----------
    X : 2-D array, one sample per row, with 0/1-valued columns.
    a, b, c : column indices of the three variables.

    Returns
    -------
    2x2x2 float array ``P`` where ``P[i, j, k]`` is the number of rows with
    ``X[:, a] == i``, ``X[:, b] == j`` and ``X[:, c] == k`` divided by the
    number of rows. All entries are NaN when X has no rows, matching
    ``mle_1d``.
    """
    x = X[:, a]
    y = X[:, b]
    z = X[:, c]
    num = X.shape[0]
    joint = np.empty((2, 2, 2))
    if num == 0:
        # No samples: the estimate is undefined; avoid dividing by zero.
        joint.fill(np.nan)
        return joint
    # Count every cell directly instead of deriving the last one as
    # 1 - (sum of the others): that shortcut accumulates rounding error and
    # would silently assign rows holding values other than 0/1 to (1, 1, 1).
    for i in (0, 1):
        for j in (0, 1):
            for k in (0, 1):
                hits = (x == i) & (y == j) & (z == k)
                joint[i, j, k] = np.count_nonzero(hits) / float(num)
    return joint


(500, 5)
(5000, 5)


In [142]:
# Marginal distributions of the five variables, each as array([P(X=0), P(X=1)]).
S = mle_1d(Y, _s_)
E = mle_1d(Y, _e_)
G = mle_1d(Y, _g_)
I = mle_1d(Y, _i_)
H = mle_1d(Y, _h_)

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(S)
print(E)
print(G)
print(I)
print(H)

# P(E | S): ES[e, s] estimates the joint P(E=e, S=s). Summing over axis 0
# gives the marginal P(S=s), so dividing by it normalises every column (fixed
# s) over e. The previous form additionally divided by ES.sum(1), which
# broadcast P(E) along the S axis; that factor only cancelled in the final
# normalisation and produced NaN/inf whenever one value of E never occurred.
ES = mle_2d(Y, _e_, _s_)
E_cond_S = ES / ES.sum(0)
print(E_cond_S)

# P(H | I), computed the same way from the joint HI[h, i].
HI = mle_2d(Y, _h_, _i_)
H_cond_I = HI / HI.sum(0)
print(H_cond_I)

# P(I | G, S): IGS[i, g, s] is the joint; summing over axis 0 marginalises I
# out, leaving P(G=g, S=s), which broadcasts against the leading I axis.
IGS = mle_3d(Y, _i_, _g_, _s_)
GS = IGS.sum(0)
I_cond_GS = IGS / GS
print(I_cond_GS)

# Cross-check of P(E | S) computed directly from raw counts; the printed table
# should agree with E_cond_S above.
Ns0 = counter_1d(Y, _s_, 0)
Ns1 = counter_1d(Y, _s_, 1)

Ne0s0 = counter_2d(Y, _e_, 0, _s_, 0)
Ne0s1 = counter_2d(Y, _e_, 0, _s_, 1)
Ne1s0 = counter_2d(Y, _e_, 1, _s_, 0)
Ne1s1 = counter_2d(Y, _e_, 1, _s_, 1)

print(np.array([[Ne0s0 / float(Ns0), Ne0s1 / float(Ns1)],
                [Ne1s0 / float(Ns0), Ne1s1 / float(Ns1)]]))

[ 0.806  0.194]
[ 0.904  0.096]
[ 0.884  0.116]
[ 0.834  0.166]
[ 0.9  0.1]
[[ 0.92059553  0.83505155]
 [ 0.07940447  0.16494845]]
[[ 0.90647482  0.86746988]
 [ 0.09352518  0.13253012]]
[[[ 0.90960452  0.67045455]
  [ 0.69387755  0.22222222]]

 [[ 0.09039548  0.32954545]
  [ 0.30612245  0.77777778]]]
[[ 0.92059553  0.83505155]
 [ 0.07940447  0.16494845]]
[[ 0.95057129  0.80184805]
 [ 0.04942871  0.19815195]]
