In [1]:
%load_ext autotime

time: 0 ns


In [2]:
import sys
sys.version

'3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]'

time: 0 ns


## 資料前處理

確認版本為 python3

In [3]:
import re

time: 0 ns


In [4]:
def prepocess_line(line):
    """
    僅僅挑出中文字元，並且斷開不連續的中文字
    """
    # 先定義中文的範圍
    pattern_cn = r"[\u4e00-\u9fa5]"
    pattern_not_cn = r"[^\u4e00-\u9fa5]"    
    
    tmp = re.sub(pattern_not_cn, ' ', line)
    segments = [i for i in tmp.split(' ') if len(i)>0]
    
    # END YOUR CODE
    return segments

time: 0 ns


In [5]:
prepocess_line('“英語”一詞源於遷居英格蘭的日耳曼部落盎格魯（），而“盎格魯”得名於')  

['英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

time: 0 ns


#### 讀入資料並處理

In [6]:
segments = []
with open('./wiki_zh_small.txt', encoding='utf-8') as fr:
    for line in fr.readlines():
        segments += prepocess_line(line)

time: 156 ms


## Ngram

一開始要先計算字詞出現的次數

In [9]:
from collections import Counter

class Counters:
    def __init__(self, n):
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]  # 分別代表計算0、1、...個字的出現次數

    def fit(self, segments):
        # 因為 self.counters 分別代表計算0、1、...個字的出現次數
        # 請在此實作利用 segments 以及函式 _skip 來統計次數
        for k in range(self.n):
            for segment in segments:
                self.counters[k].update(i for i in self._skip(segment, k+1))
        
    def __getitem__(self, k):
        return self.counters[k]

    def _skip(self, segment, n):
        assert n > 0
        if len(segment) < n:
            return []
        shift = n - 1
        for i in range(len(segment) - shift):
            yield segment[i:i+shift+1]

time: 0 ns


In [10]:
counters = Counters(n=3)
counters.fit(segments)

time: 593 ms


In [49]:
counters.n

3

time: 0 ns


In [130]:
class Ngram:
    def __init__(self, n, counters):
        assert n <= counters.n or n > 0
        self.n = n
        self.major_counter = counters[n]
        self.minor_counter = counters[n-1]

    def predict_proba(self, prefix='', top_k=5):
        assert len(prefix) >= self.n - 1
        # 使用 Ngram 的公式計算出下一個字出現的機率
        # 輸出為機率與字的tuple列表，詳見下方輸出範例
        wd = prefix[::-1][:self.n][::-1]  #取出要用 prefix的幾個字來做查詢
        sorted_probs = []
        
        tmp_b = self.minor_counter[wd] #分母
        
        for i in self.major_counter.keys():
            if  i.startswith(wd):
                tmp_a = self.major_counter[i]
                sorted_probs.append((tmp_a / (tmp_b+0.001), i[-1])) #避免找不到字除以零
        
        sorted_probs = sorted(sorted_probs, key=lambda s: s[0], reverse=True) #按機率大到小排序

        return sorted_probs[:top_k] if top_k > 0 else sorted_probs

    def get_proba_dict(self, prefix=''):
        return {word: prob for prob, word in self.predict_proba(prefix, top_k=-1)}


time: 0 ns


In [114]:
unigram = Ngram(1, counters)
unigram.predict_proba('我思')

wd: 思


[(0.3370147126258971, '想'),
 (0.12154628979950387, '考'),
 (0.09944696438141226, '維'),
 (0.044198650836183226, '是'),
 (0.03867381948166032, '汗')]

time: 15 ms


In [118]:
unigram = Ngram(2, counters)
unigram.predict_proba('我思')

wd: 我思


[(0.7498125468632841, '故'), (0.2499375156210947, '維')]

time: 31 ms


In [116]:
unigram = Ngram(3, counters)
unigram.predict_proba('我思')

wd: 我思


[]

time: 0 ns


In [None]:
# unigram.predict_proba('我思')
# # 應該為：[(0.035732269174118744, '的'),
# #         (0.012927703414087723, '國'),
# #         (0.010620050461395955, '中'),
# #         (0.009984570768472667, '在'),
# #         (0.009852627950874188, '一')]

[(0.035732269174118744, '的'),
 (0.012927703414087723, '國'),
 (0.010620050461395955, '中'),
 (0.009984570768472667, '在'),
 (0.009852627950874188, '一')]

In [131]:
unigram = Ngram(1, counters)
bigram = Ngram(2, counters)
trigram = Ngram(3, counters)

time: 0 ns


## 使用Ngram來建立第一版選字系統

In [149]:
class ChineseWordRecommenderV1:
    def __init__(self, unigram, bigram, trigram):
        self.unigram = unigram
        self.bigram = bigram
        self.trigram = trigram
    
    def predict_proba(self, prefix='', top_k=5):
        # 使用Ngram來建立選字系統
        tmp_probs = []
        sorted_probs = []
        
        # 依模型複雜到簡單加入機率
        tmp_probs.extend(self.trigram.predict_proba(prefix, top_k))
        tmp_probs.extend(self.bigram.predict_proba(prefix, top_k))
        tmp_probs.extend(self.unigram.predict_proba(prefix, top_k))
        
        list_wd = [] #為了避免不同模型重複選字
        for i in tmp_probs:
            if i[1] in list_wd:
                continue
            else:
                list_wd.extend(i[1])
            
            sorted_probs.append(i)
            
            if len(list_wd) == top_k:
                break
        
        return sorted_probs[:top_k]
        

time: 0 ns


In [150]:
model = ChineseWordRecommenderV1(unigram, bigram, trigram)

time: 0 ns


In [151]:
probs = model.predict_proba('我思', top_k=10)
probs

[(0.7498125468632841, '故'),
 (0.2499375156210947, '維'),
 (0.3370147126258971, '想'),
 (0.12154628979950387, '考'),
 (0.044198650836183226, '是'),
 (0.03867381948166032, '汗'),
 (0.022099325418091613, '爲'),
 (0.011049662709045806, '源'),
 (0.011049662709045806, '無'),
 (0.011049662709045806, '一')]

time: 47 ms


In [None]:
# probs = model.predict_proba('我思', top_k=10)
# probs

[(0.75, '故'), (0.25, '維')]

## Demo

In [None]:
!pip install -U pip
!pip install -q ipywidgets

Requirement already up-to-date: pip in /Users/ycchen/.pyenv/versions/3.6.5/lib/python3.6/site-packages (20.1.1)


In [153]:
import ipywidgets as widgets

text = widgets.Textarea()
label = widgets.Label()
display(label, text)

def func(change):
    probs = model.predict_proba(change.new, top_k=10)
    label.value = ' ' + '\t'.join([word for prob, word in probs])

text.observe(func, names='value')

Label(value='')

Textarea(value='')

time: 15 ms


In [154]:
text.observe(func, names='我思')

time: 0 ns
