In [1]:
import srilm.discount
import srilm.vocab
import srilm.stats

In [2]:
# Build a discount object configured for interpolated Kneser-Ney smoothing.
discount = srilm.discount.Discount(interpolate=True, method='kneser-ney')
# Show which discounting method the object reports.
print("折扣方法 = ", discount.method)
# Show whether interpolation is enabled, i.e. whether the estimate mixes all
# orders: lambda_1*p(n-gram) + lambda_2*p((n-1)-gram) + ... + lambda_n*p(1-gram)
print("是否进行插值 = ", discount.interpolate)

折扣方法 =  kneser-ney
是否进行插值 =  True


## 定义训练集，词汇表，统计n-gram计数

In [3]:
# Training corpus: a short passage, tokenized below on whitespace only
# (punctuation therefore stays attached to the preceding word).
text = """
It was the best of times,
it was the worst of times,
it was the age of wisdom,
it was the age of foolishness,
it was the epoch of belief,
it was the epoch of incredulity, it was the season of Light,
it was the season of Darkness, it was the spring of hope,
it was the winter of despair,
"""

# Vocabulary object that maps words to integer indices.
v = srilm.vocab.Vocab()
# N-gram statistics bound to that vocabulary; the second argument (3) is the
# n-gram order, so plain iteration over `ts` below yields the 3-grams.
ts = srilm.stats.Stats(v, 3)

# Register every whitespace-delimited token of the corpus in the vocabulary.
for token in text.split():
    v.add(token)

# List each vocabulary entry together with its index.
print("词汇表：")
for word, index in v:
    print(word, index)

# Accumulate n-gram counts from the corpus string.
ts.count_string(text)

# 3-gram counts (default iteration over the Stats object).
print("3-gram计数：")
for ngram, count in ts:
    print(v.string(ngram), count)

# 2-gram counts.
print("2-gram计数：")
for ngram, count in ts.iter(2):
    print(v.string(ngram), count)

# 1-gram counts.
print("1-gram计数：")
for ngram, count in ts.iter(1):
    print(v.string(ngram), count)

词汇表：
-pau- 3
times, 9
worst 11
<s> 1
<unk> 0
hope, 22
the 6
age 12
foolishness, 14
It 4
it 10
winter 23
of 8
epoch 15
incredulity, 17
was 5
Darkness, 20
season 18
best 7
Light, 19
</s> 2
belief, 16
spring 21
despair, 24
wisdom, 13
3-gram计数：
['<s>', 'It', 'was'] 1
['It', 'was', 'the'] 1
['was', 'the', 'best'] 1
['was', 'the', 'worst'] 1
['was', 'the', 'age'] 2
['was', 'the', 'epoch'] 2
['was', 'the', 'season'] 2
['was', 'the', 'spring'] 1
['was', 'the', 'winter'] 1
['the', 'best', 'of'] 1
['the', 'worst', 'of'] 1
['the', 'age', 'of'] 2
['the', 'epoch', 'of'] 2
['the', 'season', 'of'] 2
['the', 'spring', 'of'] 1
['the', 'winter', 'of'] 1
['best', 'of', 'times,'] 1
['of', 'times,', 'it'] 2
['of', 'wisdom,', 'it'] 1
['of', 'foolishness,', 'it'] 1
['of', 'belief,', 'it'] 1
['of', 'incredulity,', 'it'] 1
['of', 'Light,', 'it'] 1
['of', 'Darkness,', 'it'] 1
['of', 'hope,', 'it'] 1
['of', 'despair,', '</s>'] 1
['times,', 'it', 'was'] 2
['it', 'was', 'the'] 9
['worst', 'of', 'times,'] 1
['age',

## 加法平滑

In [4]:
# Build a discount object configured for additive smoothing with interpolation.
discount = srilm.discount.Discount(interpolate=True, method='additive')
# Show which discounting method the object reports.
print("折扣方法 = ", discount.method)
# Show whether interpolation is enabled.
print("是否进行插值 = ", discount.interpolate)
# Estimate the discount from the n-gram counts at order 3 and report whether
# the estimation succeeded.
estimated = discount.estimate(ts, 3)
print("估计折扣值是否成功：", estimated)
# Print the discount value; for additive smoothing it is the delta in
# p = (c + delta) / (T + N * delta).
print("折扣值", discount.discount)

折扣方法 =  additive
是否进行插值 =  True
估计折扣值是否成功： True
折扣值 1.0


## Witten-Bell平滑

<pre>
Witten-Bell平滑不需要调用estimate方法：在SRILM的C++源码中，它的estimate实现不做任何估计，直接返回true，如下所示。
Boolean estimate(NgramStats &counts, unsigned order)
{ return true; } ;
Boolean estimate(NgramCounts&lt;FloatCount&gt; &counts, unsigned order)
{ return true; } ;
</pre>

In [5]:
# Build a discount object configured for Witten-Bell smoothing with interpolation.
discount = srilm.discount.Discount(interpolate=True, method='witten-bell')
# Show which discounting method the object reports.
print("折扣方法 = ", discount.method)
# Show whether interpolation is enabled.
print("是否进行插值 = ", discount.interpolate)

折扣方法 =  witten-bell
是否进行插值 =  True
