In [1]:
import tqdm
import re
import numpy as np
from collections import Counter

In [2]:
class LanguageModel():
    def __init__(self, order=2):
        self.order=order
        
    def product(self, nums):
        "Multiply the numbers together.  (Like `sum`, but with multiplication.)"
        result = 1
        for x in nums: result *= x
        return result

    def get_ngrams(self, tokens, n):
        return [' '.join(tokens[i:i+n]) for i, token in enumerate(tokens)]
    
    def get_counts(self, corpus, order):  
        counts = {'n' + str(i) : Counter(self.get_ngrams(corpus, n=i)) for i in range(1, order+1)}
        counts['n0'] = {'':len(corpus)}
        return counts
    
    def get_prob(self, counts, word, context=''):
        '''With Laplace shoothing as yet.
        Not for public use.'''
        order = len(context.split())+1
        separator = ' ' if order > 1 else ''
        return (counts['n'+str(order)][separator.join([context, word])] + 1) / \
               (counts['n'+str(order-1)][context] + len(counts['n'+str(order)]))
        
    def get_logprob(self, counts, word, context=''):
        return np.log(self.get_prob(counts, word, context))
    
    def get_following(self, counts, context):
        '''Slow as hell. 
        To optimize might use embedded dictionaries.'''
        order = len(context.split())+1
        return sorted(
            [(k.split()[-1], v, self.get_prob(counts, k.split()[-1], context)) \
            for k, v in counts['n'+str(order)].items()                         \
            if re.match(context+' '+'\w+', k)],                                \
            key=lambda x:x[1], reverse=True)   
    
    def get_string_probs(self, counts, string, order, log=True):
        prob_fun = self.get_logprob if log else self.get_prob
        tokens = string.split()
        probs = []
        for i in range(len(tokens)):
            context = ' '.join(tokens[i-order+1:i]) if i>=order else ' '.join(tokens[:i])
            prob = prob_fun(counts, word = tokens[i], context = context)
            probs.append(prob)
        return probs
    
    def interpolate(self, counts, string, order, log=True, lambdas='default'):
        lmbd = [0.3, 0.7, 0.0] if lambdas == 'default' else lambdas
        aggregate = sum if log else self.product
        probs = [self.get_string_probs(counts, string, order=i, log=log) \
                 for i in range(1, order+1)]
        probs_interpolated = []
        for tup in zip(*probs):
            prob_token = 0
            for i in range(len(tup)):
                prob_token += tup[i] * lmbd[i]
            probs_interpolated.append(prob_token)
        return aggregate(probs_interpolated)
    
    def fit(self, corpus):
        self.counts = self.get_counts(corpus, self.order)
        
    def prob(self, string, log=False):
        return self.interpolate(self.counts, string, self.order, log=log)
    
    def context_prob(self, word, context='', log=False):
        prob_fun = self.get_logprob if log else self.get_prob
        c = context.split()
        history = ' '.join(c) if len(c) < self.order else ' '.join(c[-self.order+1:])
        return prob_fun(self.counts, word, history)  
    
    def following(self, context):
        c = context.split()
        history = ' '.join(c) if len(c) < self.order else ' '.join(c[-self.order+1:])
        return self.get_following(self.counts, history)

In [3]:
def cleanse(s, rgxp = '[\W\da-z]'):
    return re.sub(' +', ' ', re.sub(rgxp, ' ', s.lower()))

In [4]:
with open('lt1.txt', encoding='utf-8') as f:
    tokens = cleanse(f.read().lower()).split()

In [5]:
%%time
model = LanguageModel(order=2)
model.fit(tokens)

Wall time: 962 ms


In [6]:
%%time
model.prob('наташа')

Wall time: 0 ns


0.001646777278875629

In [7]:
%%time
model.prob('череззаборногузадирищекно')

Wall time: 0 ns


2.0009444457784076e-06

In [8]:
%%time
model.prob('наташа и пьер не хотели ехать')

Wall time: 0 ns


5.515250993684362e-19

In [9]:
%%time
model.prob('наташа и череззаборногузадирищекно не хотели ехать')

Wall time: 0 ns


1.6336287741983484e-21

In [10]:
%%time
model.context_prob('наташа', 'и')

Wall time: 0 ns


0.00019359544774132881

In [11]:
%%time
model.context_prob('наташа', 'герцог чубакка и')
# same as previous cause model with order 2 (bigram) takes into consideration only k-1 word

Wall time: 0 ns


0.00019359544774132881

In [12]:
%%time
model.following('увидел')

Wall time: 570 ms


[('что', 2, 1.1212354519700106e-05),
 ('бы', 1, 7.474903013133404e-06),
 ('обман', 1, 7.474903013133404e-06),
 ('в', 1, 7.474903013133404e-06),
 ('неприятеля', 1, 7.474903013133404e-06),
 ('выезжая', 1, 7.474903013133404e-06),
 ('старшего', 1, 7.474903013133404e-06),
 ('молодое', 1, 7.474903013133404e-06)]

In [13]:
%%time
model.following('когда утуб наконец-то загрузился он увидел')
# same as previous cause model with order 2 (bigram) takes into consideration only k-1 word

Wall time: 559 ms


[('что', 2, 1.1212354519700106e-05),
 ('бы', 1, 7.474903013133404e-06),
 ('обман', 1, 7.474903013133404e-06),
 ('в', 1, 7.474903013133404e-06),
 ('неприятеля', 1, 7.474903013133404e-06),
 ('выезжая', 1, 7.474903013133404e-06),
 ('старшего', 1, 7.474903013133404e-06),
 ('молодое', 1, 7.474903013133404e-06)]

In [14]:
%%time
model.following('чубакка')

Wall time: 558 ms


[]

In [15]:
for k, v in model.counts.items():
    if k != 'n0': print(k, v.most_common(6))

n1 [('и', 21710), ('в', 11173), ('не', 8781), ('что', 8367), ('он', 7493), ('на', 6796)]
n2 [('что он', 861), ('князь андрей', 778), ('и не', 725), ('то что', 666), ('и в', 498), ('сказал он', 488)]
