In [1]:
import kenlm
import cPickle as pickle
import numpy as np
import copy

In [2]:
# Width of the beam used by the beam-search decoding further down.
beam_size = 10

# Which summary field of the dataset to build the vocabulary from: sentences with
# surface form tuples and property placeholders, OR the pure text.
#   summaries_type = 'summary_with_surf_forms_and_types'
summaries_type = 'original_summary'

# Destination of the pickled templates written by the last cell.
#   templates_dump_location = './eo_with_surf_forms_templates.p'
templates_dump_location = './ar_without_surf_forms_templates.p'

### Loading trained $n$-gram model along with the original dataset file

In [3]:
# Original dataset, stored as a pickle.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
dataset_file_location = '../Datasets/ar/Dataset/with-Surface-Forms/splitDataset_with_targets.p'
with open(dataset_file_location, 'rb') as f:
    dataset = pickle.load(f)

# Trained n-gram language model, queried through the KenLM Python wrapper.
model = kenlm.LanguageModel('ar_without_surf_forms.klm')

### Building vocabulary

In [4]:
# Token <-> id lookup tables. Ids start at 1 and are handed out in order of
# first appearance in the training summaries.
token2id = {}
id2token = {}

for summary in dataset['train'][summaries_type]:
    # Summaries are encoded to UTF-8 byte strings and split on whitespace.
    for token in summary.encode('utf-8').split():
        if token in token2id:
            continue
        next_id = len(token2id) + 1
        token2id[token] = next_id
        id2token[next_id] = token

# The end-of-sentence marker always receives the last (largest) id.
vocab_len = len(token2id) + 1
token2id['</s>'] = vocab_len
id2token[vocab_len] = '</s>'

# Make sure that the sequences are free of start- and end-of-sequence tokens.
# These are not handled very well by this model.
assert '<start>' not in token2id
assert '<end>' not in token2id

### Beam-search decoding on the trained $n$-gram model
We initialise the beams with the `beam_size` most probable words given the begin-of-sentence token `<s>` (the token that plays the role of `<start>` in the KenLM Language Model Toolkit).

More information at: https://kheafield.com/code/kenlm/

In [5]:
# Beam-search decoding over the n-gram model.
#
# Results:
#   sentences      -- completed hypotheses, each a list of tokens ending in '</s>'
#   sentences_prob -- the KenLM score of each completed hypothesis (same order)
#
# Flat score arrays are laid out as `beam * vocab_len + (token_id - 1)`, so a flat
# index is mapped back to (beam, token) with divmod(index, vocab_len).
#
# NOTE(review): the main loop only stops once every active beam has produced
# '</s>'; there is no cap on the hypothesis length.
sentences = []
sentences_prob = []
candidates = []
# Never open more beams than there are distinct tokens to seed them with
# (otherwise the seeding loop below would index past the end of `indices`).
num_active_beams = min(beam_size, vocab_len)

# Step 1: seed the beams with the best-scoring single-token sentences.
# eos = False keeps KenLM from appending its own end-of-sentence token.
beam_probabilities = np.zeros(vocab_len)
for token in token2id:
    beam_probabilities[token2id[token] - 1] = model.score(token, eos = False)
indices = np.argsort(beam_probabilities)[-num_active_beams:]
print(indices)
# argsort is ascending, so walk backwards to store the best beam first.
for j in range(len(indices) - 1, -1, -1):
    candidates.append([id2token[indices[j] + 1]])

# Step 2: repeatedly extend every active beam by every vocabulary token and keep
# the overall best `num_active_beams` extensions.
while num_active_beams > 0:
    beam_probabilities = np.full(num_active_beams * vocab_len, -np.inf)
    for s in range(num_active_beams):
        # The prefix is the same for every token, so join it once per beam.
        prefix = ' '.join(candidates[s])
        offset = s * vocab_len
        for token in token2id:
            beam_probabilities[offset + token2id[token] - 1] = model.score(prefix + ' ' + token, eos = False)

    indices = np.argsort(beam_probabilities)[-num_active_beams:]
    print(indices)
    previous_candidates = candidates
    completed_beams_counter = 0
    candidates = []
    for j in range(num_active_beams - 1, -1, -1):
        # Explicit integer arithmetic: the result is used as a list index and a
        # dict key, so it must not depend on the meaning of `/`.
        beam_id, token_offset = divmod(int(indices[j]), vocab_len)
        token = id2token[token_offset + 1]
        # List concatenation builds a fresh list; tokens are immutable strings,
        # so no deep copy of the parent beam is needed.
        extended = previous_candidates[beam_id] + [token]
        if token == '</s>':
            # A finished hypothesis leaves the beam and frees its slot.
            completed_beams_counter += 1
            sentences.append(extended)
            sentences_prob.append(beam_probabilities[indices[j]])
        else:
            candidates.append(extended)
    num_active_beams -= completed_beams_counter

[423458     23   1197    487   3478    639    421     10      7     43]
[ 852805  864784    1078  848042    6411  423502 1694807     784  847022
  423553]
[1271582      95 2545349 2541879 3812537 1693845 2964375  847080  437077
     120]
[3813604    1706 3394202 3813713 3387768 2541328 1693846 1271165  847703
     121]
[1270519  847997 2964238 2541725  861699     504 2117392  846968 1270518
  423585]
[ 865303  856119  854492 3387682    2804 2964333 2117305  864152  423469
     786]
[3811141 2975390 3396874 2541140 1700102 2120100 1270763  846928  423601
     386]
[1694222 1277951    3619  847304   19845    6889    2628    4774     787
   10919]
[3811181 3398849 2965001 2118083 2540804 1694624 1278554  851664  424247
     788]
[  10920  436005 3387682  423509     400   19710 2134529      50 1271165
  847706]
[2565451 2129236 3387800 2964312    6168      50 1693964  431033 1270387
  859251]
[3811259 3387800 2964274 1698610 2555194 2117423 1270438  863225  423558
     128]
[3811192 338773

### Purely-greedy decoding on the trained $n$-gram model

In [6]:
# sentences = [[]]
# sentences_prob = [np.NINF]
# selectedToken = ''

# while selectedToken != '</s>':
#     max_prob = np.NINF
#     for token in token2id:
#         tempCandidate = ' '.join(sentences[0] + [token])
#         sentences_prob[-1] = tempScore
#         tempScore = model.score(tempCandidate, eos = False)
#         if tempScore > max_prob:
#             selectedToken = token
#             max_prob = tempScore
#     sentences[0].append(selectedToken)

#     print(sentences[0])

In [7]:
# Show every completed hypothesis as its raw list of tokens.
for s in sentences:
    print(s)

['\xd9\x82\xd8\xb1\xd9\x8a\xd8\xa9', '\xd8\xb3\xd9\x88\xd8\xb1\xd9\x8a\xd8\xa9', '\xd8\xaa\xd8\xaa\xd8\xa8\xd8\xb9', '\xd9\x86\xd8\xa7\xd8\xad\xd9\x8a\xd8\xa9', '\xd8\xb3\xd9\x84\xd9\x88\xd9\x83', '\xd9\x81\xd9\x8a', '\xd9\x85\xd9\x86\xd8\xb7\xd9\x82\xd8\xa9', '\xd8\xaa\xd9\x84', '\xd8\xa3\xd8\xa8\xd9\x8a\xd8\xb6', '\xd9\x81\xd9\x8a', '\xd9\x85\xd8\xad\xd8\xa7\xd9\x81\xd8\xb8\xd8\xa9', '\xd8\xa7\xd9\x84\xd8\xb1\xd9\x82\xd8\xa9.', '</s>']
['\xd9\x82\xd8\xb1\xd9\x8a\xd8\xa9', '\xd8\xb3\xd9\x88\xd8\xb1\xd9\x8a\xd8\xa9', '\xd8\xaa\xd8\xaa\xd8\xa8\xd8\xb9', '\xd9\x86\xd8\xa7\xd8\xad\xd9\x8a\xd8\xa9', '\xd9\x85\xd8\xb1\xd9\x83\xd8\xb2', '\xd8\xa7\xd9\x84\xd8\xb1\xd9\x82\xd8\xa9', '\xd9\x81\xd9\x8a', '\xd9\x85\xd9\x86\xd8\xb7\xd9\x82\xd8\xa9', '\xd9\x85\xd8\xb1\xd9\x83\xd8\xb2', '\xd8\xa7\xd9\x84\xd8\xb1\xd9\x82\xd8\xa9', '\xd9\x81\xd9\x8a', '\xd9\x85\xd8\xad\xd8\xa7\xd9\x81\xd8\xb8\xd8\xa9', '\xd8\xa7\xd9\x84\xd8\xb1\xd9\x82\xd8\xa9.', '</s>']
['\xd9\x82\xd8\xb1\xd9\x8a\xd8\xa9', '\xd8\xb3\x

In [8]:
for s in range(0, len(sentences)):
    assert(sentences[s][-1] == '</s>')
    sentences[s] = ' '.join(sentences[s][:-1]).decode('utf-8')
    print sentences[s], sentences_prob[s]

قرية سورية تتبع ناحية سلوك في منطقة تل أبيض في محافظة الرقة. -4.30183410645
قرية سورية تتبع ناحية مركز الرقة في منطقة مركز الرقة في محافظة الرقة. -4.10472583771
قرية سوريّة تتبع إداريّاً لمحافظة حلب منطقة منبج ناحية مركز منبج، بلغ تعداد سكانها 195 نسمة حسب تعداد اليمن لعام 2004. -6.04410171509
قرية سوريّة تتبع إداريّاً لمحافظة حلب منطقة منبج ناحية مركز منبج، بلغ تعداد سكانها 282 نسمة حسب تعداد اليمن لعام 2004. -6.12608671188
قرية سوريّة تتبع إداريّاً لمحافظة حلب منطقة منبج ناحية مركز منبج، بلغ تعداد سكانها 846 نسمة حسب تعداد اليمن لعام 2004. -6.34612083435
قرية سوريّة تتبع إداريّاً لمحافظة حلب منطقة جبل سمعان ناحية تل الضمان، بلغ تعداد سكانها 332 نسمة حسب تعداد اليمن لعام 2004. -6.11217737198
قرية سوريّة تتبع إداريّاً لمحافظة حلب منطقة جبل سمعان ناحية تل الضمان، بلغ تعداد سكانها 295 نسمة حسب تعداد اليمن لعام 2004. -6.14283323288
قرية سوريّة تتبع إداريّاً لمحافظة حلب منطقة عين العرب ناحية مركز عين العرب، بلغ تعداد سكانها 76 نسمة حسب تعداد اليمن لعام 2004. -6.26259183884
قرية سوريّة تتبع

### Computing probabilities
KenLM scores are $\log_{10}$ probabilities; we convert them back to actual probabilities (i.e. $0 \leq p_i \leq 1$).
We normalise as follows:
\begin{equation}
\widetilde{p_i} = \frac{p_i}{\sum_j p_j}
\end{equation}

In [9]:
# Undo the log10: one unnormalised probability per completed sentence.
nominator = np.power(10, np.asarray(sentences_prob))
# Normalisation constant: the total mass of those probabilities.
denominator = nominator.sum()

In [10]:
# Rescale the per-sentence probabilities so that they sum to one.
prob_distribution = np.divide(nominator, denominator)

In [11]:
# Display the normalised distribution over the extracted sentences.
print(prob_distribution)

[ 0.37392506  0.58869839  0.00676888  0.00560444  0.00337674  0.00578684
  0.00539244  0.00409286  0.00308995  0.0032644 ]


### Saving everything in a dictionary
```python
{'sentences': list, 'prob_distribution': numpy.ndarray}
```

In [12]:
# Bundle the sentences with their normalised probabilities and pickle the
# result to `templates_dump_location`.
templates = {'sentences': sentences, 'prob_distribution': prob_distribution}
with open(templates_dump_location, 'wb') as f:
    pickle.dump(templates, f)