In [1]:
import pandas as pd
import re

In [2]:
text = 'di45ri78()(rt)\n\drn f \n tt </doc>\n'
p1 = re.compile(u"[a-z]")
text_1 = p1.sub('', text)
text_1

'4578()()\n\\  \n  </>\n'

In [3]:
file_0 = open('data/pro_zh_wiki_00',mode='r',encoding='utf-8')
file_1 = open('data/pro_zh_wiki_01',mode='r',encoding='utf-8')
text_0 = file_0.readlines()
text_1 = file_1.readlines()
file_0.close()
file_1.close()

In [4]:
text = text_0+text_1
dic = ''.join(text)
len(dic)

441472442

$$Unigram¶ $$
$$P(W_oW_1W_2Wn) = P(W_o) *P(W_1) * P(W_2)*P(W_n) $$

$$2-Gram¶$$
$$ Pr(w_ow_1w_2...w_n) = Pr(w_1 | w_0) \cdot Pr(w_2 | w_1) ... \cdot Pr(w_n | w_{n-1})  $$
$$ Pr(w_1 | w_0) = \frac{Pr(w_1 w_0)}{Pr(w_0)} $$

In [5]:
from collections import Counter
from functools import reduce
from operator import mul, add

In [6]:
def tokenize(string): 
    return ''.join(re.findall('[\w|\d]+', string))

In [7]:
def get_pram_split(dic,gram_length):
    gram_counts = Counter(dic[i:i+gram_length] for i in range(len(dic) - gram_length))
    return gram_counts

In [8]:
one_gram_counts = get_pram_split(dic,1)
one_gram_counts.most_common(10)

[('，', 15566659),
 ('的', 9972240),
 ('。', 9106752),
 ('1', 7472419),
 ('"', 6625737),
 ('0', 5905550),
 ('2', 5165412),
 ('=', 4159276),
 ('年', 4124627),
 ('9', 4030322)]

In [9]:
two_gram_counts = get_pram_split(dic,2)
two_gram_counts.most_common(10)

[('="', 3111460),
 ('"=', 2075073),
 ('20', 1683234),
 ('19', 1577953),
 ('00', 1298046),
 ('..', 1056849),
 (':.', 1032851),
 ('":', 1031468),
 ('.?', 1029985),
 ('?=', 1029944)]

In [10]:
def get_probability_from_counts(count):
    all_occurences = sum(count.values())
    def get_prob(item): 
        return count[item] / all_occurences
    return get_prob

In [11]:
get_char_prob = get_probability_from_counts(one_gram_counts)
get_pair_prob = get_probability_from_counts(two_gram_counts)

In [12]:
def get_2_gram_prob(word, prev):
    if get_pair_prob(word+prev) > 0: 
        return get_pair_prob(word+prev) / get_char_prob(prev)
    else:
        return get_char_prob(word)

In [13]:
def prob_of_string(string,gram_length):
    if gram_length ==1:
        return reduce(mul, [get_char_prob(c) for c in string])
    elif gram_length ==2:
        probablities = []
        for i, c in enumerate(string):
            prev = '<s>' if i == 0 else string[i-1]
            probablities.append(get_2_gram_prob(c, prev))
        return reduce(mul, probablities)
    else:
        print('gram_length error')
        return

In [14]:
def get_probability_prefromance(language_model_func, pairs,gram_length):
    for (p1, p2) in pairs:
        print('*'*18)
        print('\t\t {} with probability {}'.format(p1, language_model_func(tokenize(p1),gram_length)))
        print('\t\t {} with probability {}'.format(p2, language_model_func(tokenize(p2),gram_length)))

In [15]:
pair = """前天晚上吃晚饭的时候
前天晚上吃早饭的时候""".split('\n')

pair2 = """正是一个好看的小猫
真是一个好看的小猫""".split('\n')

pair3 = """我无言以对，简直
我简直无言以对""".split('\n')

pairs = [pair, pair2, pair3]

In [16]:
get_probability_prefromance(prob_of_string, pairs,1)

******************
		 前天晚上吃晚饭的时候 with probability 3.794135938930681e-33
		 前天晚上吃早饭的时候 with probability 7.642373144714525e-33
******************
		 正是一个好看的小猫 with probability 1.7028656272430964e-26
		 真是一个好看的小猫 with probability 5.4148708183350224e-27
******************
		 我无言以对，简直 with probability 8.250759781675059e-23
		 我简直无言以对 with probability 8.250759781675057e-23


In [17]:
get_probability_prefromance(prob_of_string, pairs,2)

******************
		 前天晚上吃晚饭的时候 with probability 1.5935503139766104e-33
		 前天晚上吃早饭的时候 with probability 1.1867303075229088e-34
******************
		 正是一个好看的小猫 with probability 9.72439908858432e-27
		 真是一个好看的小猫 with probability 9.057076964894018e-27
******************
		 我无言以对，简直 with probability 1.4112642911854104e-25
		 我简直无言以对 with probability 1.9633623601780332e-25
