In [53]:
import math

class UnigramModel:
    """Unigram language model with interpolation against a uniform unknown-word model.

    Typical use is ``struct()`` to estimate probabilities from a corpus and
    write them to a model file, ``load()`` to read that file into
    ``self.probs``, then ``evaluate()`` to score a test corpus.
    """

    def __init__(self):
        # Maps each known word to its unigram probability; filled by load().
        self.probs = {}

    def struct(self, input_file, model_file):
        """Build a model from a corpus and write it to a file.

        Every non-blank line of ``input_file`` is split on single spaces and
        the end-of-sentence token ``</s>`` is appended. Each word's
        probability is its count divided by the total token count.

        Args:
            input_file: Path of the training corpus, one sentence per line.
            model_file: Path of the model file to write; each output line is
                ``"<word> <probability>"``.
        """
        word_count = {}
        total_count = 0
        for line in UnigramModel._read_lines(input_file):
            words = line.split(" ")
            words.append("</s>")
            for word in words:
                word_count[word] = word_count.get(word, 0) + 1
            total_count += len(words)
        with open(model_file, "w") as writer:
            for word, count in word_count.items():
                prob = count / total_count
                writer.write("{} {}\n".format(word, prob))

    def load(self, model_file):
        """Load a model from a file into ``self.probs``.

        Entries are added to the existing ``self.probs`` mapping; a word that
        is already present is overwritten.

        Args:
            model_file: Path of a model file whose lines are
                ``"<word> <probability>"`` separated by one space.

        Raises:
            ValueError: If a line does not consist of exactly two
                space-separated fields, or the probability is not a float.
        """
        for line in UnigramModel._read_lines(model_file):
            word, prob = line.split(" ")
            self.probs[word] = float(prob)

    def evaluate(self, test_file, lambda_1=0.95, vocab_size=1000000):
        """Evaluate the loaded model on a test corpus.

        Each word is scored as
        ``lambda_1 * P(word) + (1 - lambda_1) / vocab_size``, where
        ``P(word)`` is taken as 0 for words missing from ``self.probs``.

        Args:
            test_file: Path of the test corpus, one sentence per line.
            lambda_1: Weight given to the model's word probability; the
                remaining ``1 - lambda_1`` goes to unknown words.
            vocab_size: Vocabulary size assumed for the uniform unknown-word
                distribution.

        Returns:
            A dict with ``"entropy"`` (mean negative log2 likelihood per
            word) and ``"coverage"`` (fraction of words found in the model).

        Raises:
            ZeroDivisionError: If ``test_file`` contains no non-blank lines.
        """
        lambda_unk = 1 - lambda_1  # weight assigned to unknown words
        W = 0  # number of words seen
        H = 0  # negative base-2 log likelihood
        unk = 0  # number of unknown words
        for line in UnigramModel._read_lines(test_file):
            words = line.split(" ")
            words.append("</s>")
            for word in words:
                W += 1
                prob = lambda_unk / vocab_size  # unknown-word probability mass
                if word in self.probs:
                    prob += lambda_1 * self.probs[word]  # known-word probability
                else:
                    unk += 1
                H -= math.log2(prob)
        return {
            "entropy": H / W,
            "coverage": (W - unk) / W
        }

    @staticmethod
    def _read_lines(file):
        """Yield the non-blank lines of ``file`` without their line endings.

        Blank lines are skipped rather than treated as end of input, so an
        empty line in the middle of a file does not truncate the data.

        Args:
            file: Path of the text file to read.
        """
        with open(file) as fh:
            for raw in fh:
                line = raw.rstrip("\n\r")
                if line:
                    yield line

In [54]:
# Build, reload and evaluate a unigram model using the "01-*" input files.
input_file = "01-train-input.txt"  # corpus passed to struct()
test_file = "01-test-input.txt"  # corpus passed to evaluate()
model_file = "model_sample.txt"  # written by struct(), then read by load()

model = UnigramModel()
# Order matters: the model file must be written before it can be loaded.
model.struct(input_file, model_file)
model.load(model_file)
# Left as the final expression so the notebook displays the returned dict.
model.evaluate(test_file) # recorded result: {'entropy': 6.709899494272102, 'coverage': 0.8}

{'entropy': 6.709899494272102, 'coverage': 0.8}

In [55]:
# Build, reload and evaluate a unigram model using the "wiki-en-*" word files.
input_file = "wiki-en-train.word"  # corpus passed to struct()
test_file = "wiki-en-test.word"  # corpus passed to evaluate()
model_file = "model_wiki.txt"  # written by struct(), then read by load()

model = UnigramModel()
# Order matters: the model file must be written before it can be loaded.
model.struct(input_file, model_file)
model.load(model_file)
# Left as the final expression so the notebook displays the returned dict.
model.evaluate(test_file) # recorded result: {'entropy': 10.527337238682652, 'coverage': 0.895226024503591}

{'entropy': 10.527337238682652, 'coverage': 0.895226024503591}