# Worksheet 5 Generative models I 
# author: Michael Galarnyk test

In [20]:
import numpy as np
import sys
import os

def read_data(path, file_name):
    """Load a whitespace-separated ``doc_id word_id count`` file.

    Every non-blank line must begin with three integer fields; any further
    fields on the line are ignored.  Blank lines are skipped.

    Args:
        path: Directory that contains the file.
        file_name: Name of the file inside ``path``.

    Returns:
        Integer ``numpy`` array of shape ``(n_rows, 3)`` whose columns are
        ``[doc_id, word_id, count]``, in file order.  An empty file yields
        an array of shape ``(0, 3)``.

    Raises:
        ValueError: If a non-blank line has fewer than three fields, or one
            of the first three fields is not an integer.
    """
    rows = []
    with open(os.path.join(path, file_name), 'r') as f:
        for line_no, line in enumerate(f, 1):
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 3:
                raise ValueError(
                    "%s, line %d: expected 'doc_id word_id count', got %r"
                    % (file_name, line_no, line)
                )
            rows.append([int(token) for token in fields[:3]])
    # Build the array from a plain list: ``np.array(dict.values())`` only
    # produces a 2-D array on Python 2 (on Python 3 the dict view is wrapped
    # in a 0-d object array), and a list keeps the rows in file order.
    return np.array(rows, dtype=int).reshape(-1, 3)

#path = '20news-bydate/matlab/'
#data_array = read_data(path, 'train.data')

In [21]:
def read_label(path, file_name, num_classes=20):
    """Read one integer class label per line and compute log2 class priors.

    Args:
        path: Directory that contains the label file.
        file_name: Name of the label file inside ``path``.
        num_classes: Number of classes; labels must lie in
            ``1..num_classes``.  Defaults to 20.

    Returns:
        A tuple ``(label_array, pi)``:

        * ``label_array`` -- integer array whose element 0 is a placeholder
          ``0`` and whose element ``k`` is the label on the k-th non-blank
          line of the file.
        * ``pi`` -- float array of length ``num_classes + 1``; ``pi[c]`` is
          ``log2`` of the fraction of labels equal to ``c``.  ``pi[0]`` is
          ``0.0`` (unused slot) and a class that never occurs gets ``-inf``.

    Raises:
        ValueError: If a line is not an integer or the label is outside
            ``1..num_classes``.
    """
    labels = [0]  # placeholder so that labels line up with 1-based positions
    with open(os.path.join(path, file_name), 'r') as f:
        for line_no, line in enumerate(f, 1):
            text = line.strip()
            if not text:
                continue
            value = int(text)
            if not 1 <= value <= num_classes:
                raise ValueError(
                    "%s, line %d: label %d outside 1..%d"
                    % (file_name, line_no, value, num_classes)
                )
            labels.append(value)

    label_array = np.array(labels)
    n_docs = len(labels) - 1  # do not count the placeholder entry

    counts = np.bincount(label_array, minlength=num_classes + 1)
    prior = np.zeros(num_classes + 1)
    if n_docs:
        # Normalise by the real number of labels so the priors sum to 1.
        prior[1:] = counts[1:] / float(n_docs)
    prior[0] = 1.0  # log2(1) == 0 for the unused slot

    # Classes with no examples legitimately map to -inf; silence the warning.
    with np.errstate(divide='ignore'):
        pi = np.log2(prior)
    return label_array, pi

In [22]:
def setup_multinomial_model(label, data, num_classes=20, vocab_size=61188):
    """Estimate per-class log2 word probabilities with add-one smoothing.

    Args:
        label: Sequence indexed by document id giving each document's class
            (index 0 is expected to be a placeholder).
        data: 2-D integer array whose first three columns are
            ``doc_id, word_id, count``.
        num_classes: Number of classes; the result has ``num_classes + 1``
            rows so that class ids can be used directly as row indices.
        vocab_size: Number of words; the result has ``vocab_size + 1``
            columns so that word ids can be used directly as column indices.

    Returns:
        Float array ``m`` of shape ``(num_classes + 1, vocab_size + 1)``
        where ``m[c, w]`` is ``log2((count(c, w) + 1) / row_total(c))`` and
        ``row_total(c)`` is the sum of the smoothed counts in row ``c`` over
        columns ``1..vocab_size``.  Column 0 is ``0.0`` everywhere.
    """
    label = np.asarray(label)
    data = np.asarray(data)

    m = np.zeros((num_classes + 1, vocab_size + 1))
    if data.size:
        doc_ids = data[:, 0]
        word_ids = data[:, 1]
        counts = data[:, 2]
        # ``np.add.at`` accumulates repeated (class, word) pairs, which a
        # plain fancy-indexed ``+=`` would not.
        np.add.at(m, (label[doc_ids], word_ids), counts)

    # Stop words: their observed counts are discarded *before* smoothing, so
    # each of these columns ends up with the smoothed value 1 / row_total
    # (the same as a word never seen in that class).
    stop_word = {12:"of", 23:"and",139:"an",978:"am",297:"at",51:"but",52:"with",33:"to",48:"on",27:"are",29:"the",72:"can",1367:"else",81:"for",301:"he",389:"she",99:"so"}
    for k in stop_word:
        if k <= vocab_size:  # ids beyond a reduced vocabulary do not exist
            m[:, k] = 0.0

    m += 1           # add-one (Laplace) smoothing
    m[:, 0] = 0.0    # column 0 is not a word: keep it out of the row totals
    m /= m.sum(axis=1, keepdims=True)
    m[:, 0] = 1.0    # log2(1) == 0, so column 0 never contributes to a score
    return np.log2(m)

In [23]:
def naive_bayes(m, pi, test_data, test_label):
    """Classify the test documents and return the error rate in percent.

    The score of document ``d`` for class ``c`` is
    ``pi[c] + sum_w log2(1 + f[d, w]) * m[c, w]`` where ``f[d, w]`` is the
    total count of word ``w`` in document ``d``.  The predicted class is the
    arg-max over classes ``1..m.shape[0] - 1``.

    Args:
        m: 2-D array of per-class log word weights; row 0 and column 0 are
            placeholder slots.
        pi: 1-D array of per-class log priors, same length as ``m`` has rows.
        test_data: 2-D integer array whose first three columns are
            ``doc_id, word_id, count``.
        test_label: Sequence whose element ``d`` is the true class of
            document ``d``; element 0 is a placeholder and is not scored.

    Returns:
        Percentage (0-100, float) of documents ``1..len(test_label) - 1``
        whose predicted class differs from ``test_label``.

    Raises:
        ValueError: If ``test_label`` holds no documents besides the
            placeholder.
        IndexError: If a document id or word id in ``test_data`` falls
            outside the range implied by ``test_label`` / ``m``.
    """
    m = np.asarray(m)
    pi = np.asarray(pi)
    test_data = np.asarray(test_data)
    test_label = np.asarray(test_label)

    number_doc_plus_1 = len(test_label)
    n_docs = number_doc_plus_1 - 1
    if n_docs < 1:
        raise ValueError("test_label contains no documents to classify")

    n_slots, n_words = m.shape  # vocabulary width comes from the model
    scores = np.zeros((number_doc_plus_1, n_slots))

    if test_data.size:
        doc_ids = test_data[:, 0].astype(np.int64)
        word_ids = test_data[:, 1].astype(np.int64)
        counts = test_data[:, 2].astype(float)
        if doc_ids.min() < 0 or doc_ids.max() >= number_doc_plus_1:
            raise IndexError("doc_id in test_data is out of range")
        if word_ids.min() < 0 or word_ids.max() >= n_words:
            raise IndexError("word_id in test_data is out of range")

        # Merge repeated (doc, word) rows first: the damping log2(1 + f)
        # must be applied to the summed count, not to each row separately.
        keys = doc_ids * n_words + word_ids
        uniq_keys, inverse = np.unique(keys, return_inverse=True)
        totals = np.bincount(inverse.ravel(), weights=counts,
                             minlength=len(uniq_keys))
        doc_ids = uniq_keys // n_words
        word_ids = uniq_keys % n_words

        # log(1+f)
        weights = np.log2(1 + totals)

        # Accumulate scores straight from the sparse triples; this avoids a
        # dense (documents x vocabulary) matrix, which is mostly zeros.
        for c in range(1, n_slots):
            scores[:, c] = np.bincount(doc_ids,
                                       weights=weights * m[c, word_ids],
                                       minlength=number_doc_plus_1)

    # Drop the placeholder document row and class column before the arg-max;
    # "+ 1" maps the column position back to a 1-based class id.
    predicted = np.argmax(scores[1:, 1:] + pi[1:], axis=1) + 1
    error = np.count_nonzero(predicted != test_label[1:])
    return error * 100.0 / n_docs

In [24]:
# Train on the 20 Newsgroups training split, then report the test error.
path = '20news-bydate/matlab/'
label_array, pi = read_label(path, 'train.label')
data_array = read_data(path, 'train.data')
m = setup_multinomial_model(label_array, data_array)

# Only the labels are needed from the test label file; its priors are unused.
test_label, _ = read_label(path, 'test.label')
test_data = read_data(path, 'test.data')

err = naive_bayes(m, pi, test_data, test_label)
# A single-argument print(...) is valid both as a Python 2 print statement
# and as a Python 3 function call; the emitted text is unchanged.
print("Error Rate:  %s" % (err,))

Error Rate:  20.546302465


In [25]:
#data_array

In [26]:
# Disabled scratch cell: the triple-quoted string below is a bare expression
# with no effect when run. It holds an inline draft of the test-set
# evaluation (dense count matrix, log2(1+f) damping, per-document arg-max).
# The draft ends with a bare ``return``, so it would not run as top-level
# code if the quotes were removed.
"""

len_test_data = test_data.shape[0]
number_doc_plus_1 = len(test_label)
test_m = np.zeros((number_doc_plus_1, 61189))
for i in range(len_test_data):
    data_iter = iter(test_data[i])
    doc_id = next(data_iter)
    word_id = next(data_iter)
    count = next(data_iter)
    test_m[doc_id][word_id] += count

# log(1+f)
test_m = np.log2(1+test_m)
error = 0
for i in range(1, number_doc_plus_1):
    cur_doc = test_m[i]
    cur_s = np.sum(cur_doc * m, axis = 1)
    final = cur_s + pi
    final = final[1:]
    label = np.argmax(final) + 1
    if label != test_label[i]:
        error += 1
return error * 100.0 / (number_doc_plus_1 - 1)

"""

'\n\nlen_test_data = test_data.shape[0]\nnumber_doc_plus_1 = len(test_label)\ntest_m = np.zeros((number_doc_plus_1, 61189))\nfor i in range(len_test_data):\n    data_iter = iter(test_data[i])\n    doc_id = next(data_iter)\n    word_id = next(data_iter)\n    count = next(data_iter)\n    test_m[doc_id][word_id] += count\n\n# log(1+f)\ntest_m = np.log2(1+test_m)\nerror = 0\nfor i in range(1, number_doc_plus_1):\n    cur_doc = test_m[i]\n    cur_s = np.sum(cur_doc * m, axis = 1)\n    final = cur_s + pi\n    final = final[1:]\n    label = np.argmax(final) + 1\n    if label != test_label[i]:\n        error += 1\nreturn error * 100.0 / (number_doc_plus_1 - 1)\n\n'

In [27]:

#next(data_iter)