In [1]:
# encoding:utf-8

# 朴素贝叶斯分类器

import math, random
import os
import numpy as np
from util.file_utils import *
from util.utils import *

In [2]:
def data_loader(data_dir):
    """Load a one-directory-per-label corpus and split it into train/test sets.

    Every sub-directory of ``data_dir`` is treated as one class: the directory
    name is the label and every file inside it is one document. Each document
    is read with ``read_lines(path, 'gbk')``, its lines are joined with
    newlines, and the text is passed to ``tokenizer(text, 'zh')``.
    (``read_lines`` and ``tokenizer`` come from the project's ``util``
    package; presumably they return decoded lines and a token sequence
    respectively -- confirm against those helpers.)

    Args:
        data_dir: path of the corpus root directory.

    Returns:
        A ``(train, test)`` pair of lists of ``(tokens, label)`` tuples. The
        documents are shuffled with the global ``random`` state (unseeded
        here, so the split differs between runs). With
        ``part = int(n * 0.1)``, the first ``7 * part`` shuffled documents
        form the training set and all remaining ones the test set. Note that
        fewer than 10 documents therefore yields an empty training set.
    """
    total_instances = list()
    for label in os.listdir(data_dir):
        sub_dir = os.path.join(data_dir, label)
        # Stray files in the corpus root (e.g. .DS_Store, READMEs) are not
        # class directories; listing them would raise OSError.
        if not os.path.isdir(sub_dir):
            continue
        for name in os.listdir(sub_dir):
            one_line = '\n'.join(read_lines(os.path.join(sub_dir, name), 'gbk'))
            total_instances.append((tokenizer(one_line, 'zh'), label))

    # A real list is required here: shuffling, len() and slicing do not work
    # on the lazy iterator that zip() returns on Python 3.
    random.shuffle(total_instances)
    inst_num_part = int(len(total_instances) * 0.1)
    split_at = 7 * inst_num_part
    return total_instances[:split_at], total_instances[split_at:]

In [3]:
def init_vocab(instances):
    """Build the label and token vocabularies from labelled instances.

    Args:
        instances: iterable of ``(tokens, label)`` pairs.

    Returns:
        A 4-tuple ``(words, word2inds, labels, label2inds)``. ``words`` and
        ``labels`` list the distinct tokens / labels in order of first
        appearance; ``word2inds`` and ``label2inds`` map each item to its
        position in the corresponding list.
    """
    vocab, vocab_index = [], {}
    classes, class_index = [], {}

    def _register(item, ordered, index):
        # Assign the next free position to an item seen for the first time.
        if item in index:
            return
        index[item] = len(ordered)
        ordered.append(item)

    for doc_tokens, doc_label in instances:
        _register(doc_label, classes, class_index)
        for tok in doc_tokens:
            _register(tok, vocab, vocab_index)

    return vocab, vocab_index, classes, class_index

In [4]:
def train(train_insts, words, word2inds, labels, label2inds, alpha=1.0):
    """Estimate a multinomial Naive Bayes model with additive smoothing.

    Args:
        train_insts: iterable of ``(tokens, label)`` pairs.
        words: vocabulary list; only its length is used here.
        word2inds: dict mapping each token to its column index.
        labels: label list; only its length is used here.
        label2inds: dict mapping each label to its row index.
        alpha: additive-smoothing pseudo-count added to every count. The
            default ``1.0`` is add-one (Laplace) smoothing. With ``alpha=0``
            unseen events get probability 0, whose logarithm is undefined.

    Returns:
        ``(model_pc, model_pw)`` where ``model_pc`` is a 1-D array of
        smoothed class priors (length ``len(labels)``) and ``model_pw`` is a
        2-D array of shape ``(len(labels), len(words))`` holding the smoothed
        probability of each token given each class; every row sums to 1.

    Raises:
        KeyError: if an instance contains a label or token that is missing
            from ``label2inds`` / ``word2inds``.
    """
    num_labels, vocab_size = len(labels), len(words)
    label_freqs = np.zeros(num_labels)
    # Dense count matrix: one row per label, one column per vocabulary token.
    label_token_freqs = np.zeros((num_labels, vocab_size))

    for tokens, label in train_insts:
        label_id = label2inds[label]
        label_freqs[label_id] += 1
        for token in tokens:
            label_token_freqs[label_id, word2inds[token]] += 1

    # keepdims keeps the per-label totals as a column so the division below
    # broadcasts row-wise without transposing back and forth.
    label_token_freq_sum = np.sum(label_token_freqs, axis=1, keepdims=True)

    model_pc = (label_freqs + alpha) / (np.sum(label_freqs) + alpha * num_labels)
    model_pw = (label_token_freqs + alpha) / (label_token_freq_sum + alpha * vocab_size)

    return model_pc, model_pw

In [5]:
def classify(tokens, classifier, words, word2inds, labels, label2inds):
    """Return the most probable label for a tokenised document.

    The model has two parts: the class prior probabilities and, for every
    class, the probability of that class generating each vocabulary token.
    Each label is scored with ``log P(c) + sum(log P(w | c))`` over the
    document's tokens; tokens missing from ``word2inds`` are ignored.

    Args:
        tokens: iterable of tokens of the document to classify. It is
            consumed exactly once, so a one-shot iterator is acceptable.
        classifier: ``(model_pc, model_pw)`` pair of numpy arrays, indexed as
            ``model_pc[label_id]`` and ``model_pw[label_id, token_id]``.
        words: vocabulary list; unused, kept for interface compatibility.
        word2inds: dict mapping each known token to its column index.
        labels: list of candidate labels.
        label2inds: dict mapping each label to its row index.

    Returns:
        The element of ``labels`` with the highest log-score; on ties the
        earliest one in ``labels`` wins.

    Raises:
        ValueError: if ``labels`` is empty (``np.argmax`` of an empty array).
    """
    model_pc, model_pw = classifier

    # Resolve the in-vocabulary token ids once. They do not depend on the
    # label, and materialising them keeps one-shot iterables from being
    # exhausted after the first label.
    known_ids = [word2inds[token] for token in tokens if token in word2inds]

    scores = list()  # log-score of the document under each label, in `labels` order
    for label in labels:
        label_id = label2inds[label]
        prior = model_pc[label_id]
        log_likelihood = 0
        for token_id in known_ids:
            log_likelihood += math.log(model_pw[label_id, token_id])
        scores.append(math.log(prior) + log_likelihood)

    best = np.argmax(np.asarray(scores))
    return labels[best]

In [6]:
# End-to-end run: load the corpus, build the vocabulary from the training
# split, fit the classifier, then measure accuracy on the held-out split.
# NOTE(review): this is an absolute, machine-specific path -- consider moving
# it to a configurable location before sharing the notebook.
dir_name = '/home/dl/codes/day1/tc-corpus-answer-test/'
train_insts, test_insts = data_loader(dir_name)
words, word2inds, labels, label2inds = init_vocab(train_insts)
classifier = train(train_insts, words, word2inds, labels, label2inds)
accs = list()  # 1 for every correctly classified test document, else 0
# The loop variable is `tokens`, not `words`: reusing `words` would overwrite
# the vocabulary list and pass the document itself as the vocabulary argument.
for tokens, real_label in test_insts:
    pred_label = classify(tokens, classifier, words, word2inds, labels, label2inds)
    accs.append(1 if pred_label == real_label else 0)
if accs:
    # Single pre-formatted string: prints identically on Python 2 and 3.
    print("accuracy:  %s" % (sum(accs) * 100.0 / len(accs)))
else:
    # An empty test split would otherwise raise ZeroDivisionError.
    print("accuracy:  n/a (no test instances)")

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.765 seconds.
Prefix dict has been built succesfully.


accuracy:  88.671875
