In [8]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


# Assignment 3 - Parameter Learning


## Loading data from a text file

The following cell loads the data stored in the text files "train.txt" and "test.txt". This results in two NumPy arrays with shapes 500x5 (train.txt) and 5000x5 (test.txt) - 500 and 5000 samples of the following 5 random variables:

Column 0: S ... stress (false (0) or true (1))     
Column 1: E ... easily catches cold (false (0) or true (1))  
Column 2: G ... genetic disposition (false (0) or true (1))   
Column 3: I ... increased blood pressure (false (0) or true (1))   
Column 4: H ... heart attack (false (0) or true (1))  

In [116]:
# Load the training dataset Y. Note: the file "train.txt" has to be in the
# directory the IPython notebook server was started in.
Y = np.loadtxt('train.txt', dtype=int)
print(Y.shape)
# Load the test dataset Z (used for the last exercise). "test.txt" has to be
# in the same directory.
Z = np.loadtxt('test.txt', dtype=int)
print(Z.shape)

# Column index of each random variable in Y and Z. The name `row_index` is
# kept because later cells look the indices up through it.
row_index = {"_s_" : 0, "_e_" : 1, "_g_" : 2, "_i_" : 3, "_h_" : 4}

(500L, 5L)
(5000L, 5L)


## Helper functions

For each shape of probability table (1-D, 2-D and 3-D) there is a corresponding helper function that takes the column indices of the variables involved and returns the probability table estimated from the training data. These tables are then used to compute the probabilities for each model.

### Model Learning for 1D Probability Distribution Table

In [105]:
def mle_1d(var1_column, data=None):
    """
    Maximum-likelihood estimate of the marginal distribution P(var1) of a
    binary random variable.

    :param var1_column: column index of the random variable
    :param data: optional 2-D array of samples (one row per sample, one column
                 per variable); defaults to the global training set ``Y``
    :returns: list ``[P(var1 = 0), P(var1 = 1)]`` of NumPy floats. Every value
              other than 1 is counted towards the first entry. With no samples
              both entries are ``nan`` (0/0).
    """
    # Only touch the global training set when no data was passed explicitly.
    samples = np.asarray(Y if data is None else data)
    values = samples[:, var1_column]

    total = values.size
    count_pos = int((values == 1).sum())

    # A plain list is returned (not an ndarray) so printing and indexing in
    # the rest of the notebook behave exactly as before.
    return [np.float64(total - count_pos) / total, np.float64(count_pos) / total]

In [70]:
def mle_2d(var1_column, var2_column, data=None):
    """
    Maximum-likelihood estimate of the conditional distribution P(var1|var2)
    of two binary (0/1) random variables.

    :param var1_column: column index of the first random variable
    :param var2_column: column index of the second (conditioning) variable
    :param data: optional 2-D integer array of samples (one row per sample);
                 defaults to the global training set ``Y``
    :returns: 2x2 Numpy array ``p`` with ``p[a][b] = P(var1 = a | var2 = b)``,
              so every column sums to 1. If a value of var2 never occurs in
              the data, its column is ``nan`` (0/0).
    :raises ValueError: if both column indices are the same
    """
    if var1_column == var2_column:
        # Raise instead of returning the message as a string: a string return
        # value would be silently indexed like a table by the callers.
        raise ValueError("Invalid input parameters. Column indices should be different")

    samples = np.asarray(Y if data is None else data)
    var1_val = samples[:, var1_column]
    var2_val = samples[:, var2_column]

    # counts[a, b] = number of samples with var1 == a and var2 == b.
    # np.add.at accumulates correctly when the same (a, b) pair repeats.
    counts = np.zeros((2, 2))
    np.add.at(counts, (var1_val, var2_val), 1)

    # Normalise each column b by the number of samples with var2 == b.
    return counts / counts.sum(axis=0)

In [131]:
def mle_3d(var1_column, var2_column, var3_column, data=None):
    """
    Maximum-likelihood estimate of the conditional distribution
    P(var1|var2, var3) of three binary (0/1) random variables.

    :param var1_column: column index of the first random variable
    :param var2_column: column index of the second random variable
    :param var3_column: column index of the third random variable
    :param data: optional 2-D integer array of samples (one row per sample);
                 defaults to the global training set ``Y``
    :returns: 2x2x2 Numpy array ``p`` with
              ``p[a][b][c] = P(var1 = a | var2 = b, var3 = c)``. If a
              combination (b, c) never occurs in the data, both of its
              entries are ``nan`` (0/0).
    :raises ValueError: if any two column indices are the same
    """
    if len({var1_column, var2_column, var3_column}) != 3:
        # Raise instead of returning the message as a string: a string return
        # value would be silently indexed like a table by the callers.
        raise ValueError("Invalid input parameters. Column indices should be different")

    samples = np.asarray(Y if data is None else data)
    var1_val = samples[:, var1_column]
    var2_val = samples[:, var2_column]
    var3_val = samples[:, var3_column]

    # counts[a, b, c] = number of samples with (var1, var2, var3) == (a, b, c).
    # np.add.at accumulates correctly when the same triple repeats.
    counts = np.zeros((2, 2, 2))
    np.add.at(counts, (var1_val, var2_val, var3_val), 1)

    # Normalise over var1 for every (var2, var3) combination.
    return counts / counts.sum(axis=0)

## Model 1

Learn probability tables for the first model.

In [144]:
# Learn and display the probability tables of the first model, in which all
# five variables are treated as independent (one marginal table each).
S = mle_1d(row_index["_s_"])
E = mle_1d(row_index["_e_"])
G = mle_1d(row_index["_g_"])
I = mle_1d(row_index["_i_"])
H = mle_1d(row_index["_h_"])
# A single formatted string is printed so the statement is valid on both
# Python 2 and Python 3; the printed text is unchanged (two spaces after the
# colon, as produced by the former comma-separated print statement).
for label, table in (("S", S), ("E", E), ("G", G), ("I", I), ("H", H)):
    print('Probability distribution table for P(%s):  %s' % (label, table))

Probability distribution table for P(S):  [0.80600000000000005, 0.19400000000000001]
Probability distribution table for P(E):  [0.90400000000000003, 0.096000000000000002]
Probability distribution table for P(G):  [0.88400000000000001, 0.11600000000000001]
Probability distribution table for P(I):  [0.83399999999999996, 0.16600000000000001]
Probability distribution table for P(H):  [0.90000000000000002, 0.10000000000000001]


### Log Likelihood of the first model


In [173]:
def ml1_loglik(data):
    """
    Log-likelihood of ``data`` under the first model.

    The probability of one sample is the product of the entries of the
    module-level tables S, E, G, I and H selected by columns 0..4 of the row.

    :param data: 2-D array of samples, one row per sample
    :returns: sum over all rows of the natural log of the sample probability
    """
    n_samples = data[:, 0].size
    total = 0
    for idx in range(n_samples):
        sample = data[idx]
        p_s = S[sample[0]]
        p_e = E[sample[1]]
        p_g = G[sample[2]]
        p_i = I[sample[3]]
        p_h = H[sample[4]]
        # Same left-to-right product and sequential accumulation as before.
        total += np.log(p_s * p_e * p_g * p_i * p_h)
    return total

# Log-likelihood of the first model on the training set (Y) and the test set (Z).
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(ml1_loglik(Y))
print(ml1_loglik(Z))

-970.811902926
-9545.45920494


## Model 2

Learn probability tables for the second model.


In [145]:
# Learn and display the probability tables of the second model:
# P(S), P(E|S), P(G), P(I|G, S) and P(H|I). S and G are reused from Model 1.
E_S = mle_2d(row_index["_e_"], row_index["_s_"])
I_GS = mle_3d(row_index["_i_"], row_index["_g_"], row_index["_s_"])
H_I = mle_2d(row_index["_h_"], row_index["_i_"])
# A single formatted string is printed so the statement is valid on both
# Python 2 and Python 3; the printed text is unchanged (two spaces after the
# colon, as produced by the former comma-separated print statement).
for label, table in (("S", S), ("E|S", E_S), ("G", G), ("I|G, S", I_GS), ("H|I", H_I)):
    print('Probability distribution table for P(%s):  %s' % (label, table))

Probability distribution table for P(S):  [0.80600000000000005, 0.19400000000000001]
Probability distribution table for P(E|S):  [[ 0.92059553  0.83505155]
 [ 0.07940447  0.16494845]]
Probability distribution table for P(G):  [0.88400000000000001, 0.11600000000000001]
Probability distribution table for P(I|G, S):  [[[ 0.90960452  0.67045455]
  [ 0.69387755  0.22222222]]

 [[ 0.09039548  0.32954545]
  [ 0.30612245  0.77777778]]]
Probability distribution table for P(H|I):  [[ 0.90647482  0.86746988]
 [ 0.09352518  0.13253012]]


### Log Likelihood of the second model


In [180]:
def ml2_loglik(data):
    """
    Log-likelihood of ``data`` under the second model.

    The probability of one sample is
    P(S) * P(E|S) * P(G) * P(I|G, S) * P(H|I), read from the module-level
    tables S, E_S, G, I_GS and H_I.

    :param data: 2-D array of samples with columns S, E, G, I, H (0..4)
    :returns: sum over all rows of the natural log of the sample probability
    """
    n_samples = data[:, 0].size
    total = 0
    for idx in range(n_samples):
        sample = data[idx]
        s, e, g, i_bp, h = sample[0], sample[1], sample[2], sample[3], sample[4]
        p_s = S[s]
        p_e = E_S[e][s]
        p_g = G[g]
        p_i = I_GS[i_bp][g][s]
        p_h = H_I[h][i_bp]
        # Same left-to-right product and sequential accumulation as before.
        total += np.log(p_s * p_e * p_g * p_i * p_h)
    return total

# Log-likelihood of the second model on the training set (Y) and the test set (Z).
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(ml2_loglik(Y))
print(ml2_loglik(Z))

-940.762978609
-9216.89949044


## Model 3

Learn probability tables for the third model.


In [146]:
# Learn and display the probability tables of the third model:
# P(G), P(S|G), P(E|S, I), P(I|G, S) and P(H|I, E). G is reused from Model 1
# and I_GS from Model 2.
S_G = mle_2d(row_index["_s_"], row_index["_g_"])
E_SI = mle_3d(row_index["_e_"], row_index["_s_"], row_index["_i_"])
H_IE = mle_3d(row_index["_h_"], row_index["_i_"], row_index["_e_"] )
# A single formatted string is printed so the statement is valid on both
# Python 2 and Python 3; the printed text is unchanged (two spaces after the
# colon, as produced by the former comma-separated print statement).
for label, table in (("G", G), ("S|G", S_G), ("E|S, I", E_SI), ("I|G, S", I_GS), ("H|I, E", H_IE)):
    print('Probability distribution table for P(%s):  %s' % (label, table))

Probability distribution table for P(G):  [0.88400000000000001, 0.11600000000000001]
Probability distribution table for P(S|G):  [[ 0.80090498  0.84482759]
 [ 0.19909502  0.15517241]]
Probability distribution table for P(E|S, I):  [[[ 0.91853933  0.93617021]
  [ 0.81967213  0.86111111]]

 [[ 0.08146067  0.06382979]
  [ 0.18032787  0.13888889]]]
Probability distribution table for P(I|G, S):  [[[ 0.90960452  0.67045455]
  [ 0.69387755  0.22222222]]

 [[ 0.09039548  0.32954545]
  [ 0.30612245  0.77777778]]]
Probability distribution table for P(H|I, E):  [[[ 0.9071618  0.9      ]
  [ 0.88       0.75     ]]

 [[ 0.0928382  0.1      ]
  [ 0.12       0.25     ]]]


### Log Likelihood of the third model


In [181]:
def ml3_loglik(data):
    """
    Log-likelihood of ``data`` under the third model.

    The probability of one sample is
    P(G) * P(S|G) * P(E|S, I) * P(I|G, S) * P(H|I, E), read from the
    module-level tables G, S_G, E_SI, I_GS and H_IE.

    :param data: 2-D array of samples with columns S, E, G, I, H (0..4)
    :returns: sum over all rows of the natural log of the sample probability
    """
    n_samples = data[:, 0].size
    total = 0
    for idx in range(n_samples):
        sample = data[idx]
        s, e, g, i_bp, h = sample[0], sample[1], sample[2], sample[3], sample[4]
        p_g = G[g]
        p_s = S_G[s][g]
        p_e = E_SI[e][s][i_bp]
        p_i = I_GS[i_bp][g][s]
        p_h = H_IE[h][i_bp][e]
        # Same left-to-right product and sequential accumulation as before.
        total += np.log(p_g * p_s * p_e * p_i * p_h)
    return total

# Log-likelihood of the third model on the training set (Y) and the test set (Z).
# print(...) with a single argument is valid on both Python 2 and Python 3.
print(ml3_loglik(Y))
print(ml3_loglik(Z))

-939.734032094
-9226.91241436


# Comparison

## Part A - Comparison of the log likelihood of models on the training data

## Part B - Comparison of the log likelihood of models on the test data
