In [1]:
import array
import os
import random

import srilm.stats
import srilm.vocab

In [2]:
# Build an order-3 (trigram) count table on top of an empty vocabulary and
# display its n-gram order; the second constructor argument comes back as `order`.
vocab = srilm.vocab.Vocab()
stats = srilm.stats.Stats(vocab, 3)
stats.order

3

In [3]:
# Look up the count of a 3-gram, addressed by an unsigned-int array of word indices.
# Nothing has been counted yet, so the lookup gives 0.
words = array.array('I', [1, 2, 3])
stats[words]

0

In [4]:
# Set the count of a 3-gram by item assignment, then read it back.
words = array.array('I', [1, 2, 3])
stats[words] = 100
stats[words]

100

In [5]:
# Add 10 on top of the count the 3-gram already has
# (100 from the previous cell, hence 110 afterwards).
words = array.array('I', [1, 2, 3])
stats.add(words, 10)
stats[words]

110

In [6]:
# Delete a single 3-gram: once it is removed with `del`, looking it up gives 0 again.
words = array.array('I', [1, 2, 3])
stats[words] = 100
del stats[words]
stats[words]

0

In [7]:
# An n-gram key is a plain array of unsigned ints; show it as a whole and
# then position by position.
words = array.array('I', [1, 2, 3])
print("words = ", words)
for position, word_id in enumerate(words):
    print(f"words[{position}] = ", word_id)

words =  array('I', [1, 2, 3])
words[0] =  1
words[1] =  2
words[2] =  3


In [8]:
# Fill a fresh order-3 table with random counts: 20 iterations, each replacing one
# randomly chosen position of `words` with a random index in [0, 100] and then
# assigning that 3-gram a random count in [1, 1000].
# NOTE(review): the RNG is not seeded, so the n-grams listed by the following
# cells change on every run.
stats = srilm.stats.Stats(vocab, 3)
words = array.array('I', [1, 2, 3])
for i in range(20):
    words[random.randint(0, 2)] = random.randint(0, 100)
    stats[words] = random.randint(1, 1000)

In [9]:
# Iterate over every 3-gram stored in the table and print it with its count.
for ngram, count in stats.iter(3):
    print(ngram, count)

array('I', [1, 2, 14]) 911
array('I', [40, 26, 90]) 644
array('I', [40, 26, 41]) 34
array('I', [40, 26, 70]) 45
array('I', [44, 2, 14]) 812
array('I', [44, 80, 14]) 429
array('I', [83, 80, 70]) 221
array('I', [83, 80, 64]) 574
array('I', [12, 80, 32]) 603
array('I', [12, 80, 39]) 566
array('I', [12, 80, 70]) 317
array('I', [12, 80, 14]) 102
array('I', [12, 80, 36]) 104
array('I', [50, 26, 90]) 140
array('I', [56, 80, 64]) 300
array('I', [92, 26, 64]) 47
array('I', [92, 26, 90]) 309
array('I', [31, 80, 14]) 995
array('I', [70, 80, 64]) 349
array('I', [70, 26, 64]) 394


In [10]:
# Iterate over every 2-gram in the table and print it with its count.
# Only 3-gram counts have been assigned so far, so each 2-gram shows 0.
for ngram, count in stats.iter(2):
    print(ngram, count)

array('I', [1, 2]) 0
array('I', [40, 26]) 0
array('I', [44, 2]) 0
array('I', [44, 80]) 0
array('I', [83, 80]) 0
array('I', [12, 80]) 0
array('I', [50, 26]) 0
array('I', [56, 80]) 0
array('I', [92, 26]) 0
array('I', [31, 80]) 0
array('I', [70, 80]) 0
array('I', [70, 26]) 0


In [11]:
vocab = srilm.vocab.Vocab()
stats = srilm.stats.Stats(vocab, 3)
# Save the counts to a file.
words = array.array('I', [1, 2, 3])
stats[words] = 15
words = array.array('I', [1, 2, 0])
stats[words] = 2
fname = 'data/count.txt'
# Make sure the output directory exists so the cell also runs on a fresh
# checkout where `data/` has not been created yet.
os.makedirs(os.path.dirname(fname), exist_ok=True)
stats.write(fname)

# Read the count file back into a new table; the vocabulary may be an empty object.
new_stats = srilm.stats.Stats(vocab, 3)
new_stats.read(fname)
for w, cnt in new_stats:
    print(w, cnt)

array('I', [1, 2, 3]) 15
array('I', [1, 2, 0]) 2


In [12]:
# Save the counts with binary=True. This cell reuses the `stats` and `vocab`
# objects created in the previous cell and assigns the same two counts again.
fname = 'data/count.bin'
for ids, count in (([1, 2, 3], 15), ([1, 2, 0], 2)):
    words = array.array('I', ids)
    stats[words] = count
stats.write(fname, binary=True)

# Load the binary count file into a new table and list what it contains.
new_stats = srilm.stats.Stats(vocab, 3)
new_stats.read(fname, binary=True)
for ngram, count in new_stats:
    print(ngram, count)

array('I', [1, 2, 0]) 2
array('I', [1, 2, 3]) 15


## 由文件生成计数stats

In [13]:
# Write a one-sentence corpus to disk.
text = 'this is a test\n'
fname = 'data/text.txt'
# Make sure the output directory exists so the cell also runs on a fresh
# checkout where `data/` has not been created yet.
os.makedirs(os.path.dirname(fname), exist_ok=True)
with open(fname, 'w', encoding='utf-8') as fout:
    fout.write(text)

# Add the words of the corpus to the vocabulary.
vocab = srilm.vocab.Vocab()
stats = srilm.stats.Stats(vocab, 3)
for w in text.split():
    vocab.add(w)

# Define a word sequence.
a = ['this', 'is', 'a']
# Map the word sequence to its sequence of vocabulary indices.
b = vocab.index(a)
print("a对应索引序列 = ", b)
# Count of the word sequence before anything has been counted.
print("stats[b] = ", stats[b])
# Count the n-grams of the file; the return value is the number of words read.
print("从文件中读入的词语数（注意：每个句子添加<bos>和<eos>） = ", stats.count_file(fname))
# Count of the same word sequence after counting the file.
print("after count from file stats[b] = ", stats[b])

a对应索引序列 =  array('I', [4, 5, 6])
stats[b] =  0
从文件中读入的词语数（注意：每个句子添加<bos>和<eos>） =  6
after count from file stats[b] =  1


## 由字符串String生成计数stats

In [14]:
vocab = srilm.vocab.Vocab()
stats = srilm.stats.Stats(vocab, 3)
# By default '\n' marks the end of a sentence.
text = 'this is a test\n'
for token in text.split():
    vocab.add(token)
# 3-gram that ends with the end-of-sentence marker.
a = ['a', 'test', '</s>']
b = vocab.index(a)
print("stats[b] = ", stats[b])
# Count the n-grams of the string; the return value is the number of words read.
n_words_read = stats.count_string(text)
print("从字符串中读入的词语数（注意：每个句子添加<bos>和<eos>） = ", n_words_read)
print("after count from string stats[b] = ", stats[b])

stats[b] =  0
从字符串中读入的词语数（注意：每个句子添加<bos>和<eos>） =  6
after count from string stats[b] =  1


## 由词语序列生成计数stats

In [15]:
vocab = srilm.vocab.Vocab()
stats = srilm.stats.Stats(vocab, 3)
text = 'this is a test\n'
words = text.split()
for token in words:
    vocab.add(token)
# Index sequence for the whole word sequence.
b = vocab.index(words)
print("stats[b] = ", stats[b])
# Count directly from the index sequence instead of from text.
n_words_read = stats.count(b)
print("从字符串序列中读入的词语数（注意：每个句子添加<bos>和<eos>） = ", n_words_read)
print("after count from string_li stats[b] = ", stats[b])

stats[b] =  0
从字符串序列中读入的词语数（注意：每个句子添加<bos>和<eos>） =  6
after count from string_li stats[b] =  1


## 3-gram统计后的stats计数之和

In [16]:
vocab = srilm.vocab.Vocab()
stats = srilm.stats.Stats(vocab, 3)
# len(stats) reports the sum of the 3-gram counts, not the number of distinct
# 3-grams: it is 0 for an empty table and 3 once a single 3-gram has count 3.
print("空stats len = ", len(stats))
a = array.array('I', [1, 2, 3])
stats[a] = 3
print("含有1个3次的3-gram的stats len = ", len(stats))
# No word was added to `vocab`, so every token of the sentence shows up as
# index 0 in the listing (presumably the unknown-word index -- TODO confirm).
stats.count_string('this is a test')
for w, cnt in stats:
    print(w, cnt)
# The counted sentence adds 1 + 2 + 1 = 4 to the previous total of 3.
print("count之后的stats len = ", len(stats))

空stats len =  0
含有1个3次的3-gram的stats len =  3
array('I', [1, 2, 3]) 3
array('I', [1, 0, 0]) 1
array('I', [0, 0, 0]) 2
array('I', [0, 0, 2]) 1
count之后的stats len =  7


In [17]:
# After changing a 3-gram count, the lower-order counts have to be recomputed.
vocab = srilm.vocab.Vocab()
stats = srilm.stats.Stats(vocab, 3)
text = 'this is a test'
for token in text.split():
    vocab.add(token)
# Indices of the 3-gram "this is a".
a = vocab.index(['this', 'is', 'a'])
# Indices of the 2-gram "this is".
b = vocab.index(['this', 'is'])
# Assign the 3-gram count; the 2-gram count is still untouched at this point.
stats[a] = 1
print("2-gram计数 c(this is) = ", stats[b])
# Recompute the lower-order counts from the 3-gram counts.
stats.sum_counts()
print("更新2-gram计数后，2-gram计数 c(this is) = ", stats[b])

2-gram计数 c(this is) =  0
更新2-gram计数后，2-gram计数 c(this is) =  1
