In [1]:
import tqdm
import re
import numpy as np
from collections import Counter

In [2]:
class LanguageModel():
    def __init__(self, order=2):
        self.order=order
        
    def product(self, nums):
        "Multiply the numbers together.  (Like `sum`, but with multiplication.)"
        result = 1
        for x in nums: result *= x
        return result

    def get_ngrams(self, tokens, n):
        return [' '.join(tokens[i:i+n]) for i, token in enumerate(tokens)]
    
    def get_counts(self, corpus, order):  
        counts = {'n' + str(i) : Counter(self.get_ngrams(corpus, n=i)) for i in range(1, order+1)}
        counts['n0'] = {'':len(corpus)}
        return counts
    
    def get_prob(self, counts, word, context=''):
        '''With Laplace shoothing as yet.
        Not for public use.'''
        order = len(context.split())+1
        separator = ' ' if order > 1 else ''
        return (counts['n'+str(order)][separator.join([context, word])] + 1) / \
               (counts['n'+str(order-1)][context] + len(counts['n'+str(order)]))
        
    def get_logprob(self, counts, word, context=''):
        return np.log(self.get_prob(counts, word, context))
    
    def get_following(self, counts, context):
        '''Slow as hell. 
        To optimize might use embedded dictionaries.'''
        order = len(context.split())+1
        return sorted(
            [(k.split()[-1], v, self.get_prob(counts, k.split()[-1], context)) \
            for k, v in counts['n'+str(order)].items()                         \
            if re.match(context+' '+'\w+', k)],                                \
            key=lambda x:x[1], reverse=True)   
    
    def get_string_probs(self, counts, string, order, log=True):
        prob_fun = self.get_logprob if log else self.get_prob
        tokens = string.split()
        probs = []
        for i in range(len(tokens)):
            context = ' '.join(tokens[i-order+1:i]) if i>=order else ' '.join(tokens[:i])
            prob = prob_fun(counts, word = tokens[i], context = context)
            probs.append(prob)
        return probs
    
    def interpolate(self, counts, string, order, log=True, lambdas='default'):
        lmbd = [0.3, 0.7, 0.0] if lambdas == 'default' else lambdas
        aggregate = sum if log else self.product
        probs = [self.get_string_probs(counts, string, order=i, log=log) \
                 for i in range(1, order+1)]
        probs_interpolated = []
        for tup in zip(*probs):
            prob_token = 0
            for i in range(len(tup)):
                prob_token += tup[i] * lmbd[i]
            probs_interpolated.append(prob_token)
        return aggregate(probs_interpolated)
    
    def fit(self, corpus):
        self.counts = self.get_counts(corpus, self.order)
        
    def prob(self, string, log=False):
        return self.interpolate(self.counts, string, self.order, log=log)
    
    def context_prob(self, word, context='', log=False):
        prob_fun = self.get_logprob if log else self.get_prob
        c = context.split()
        history = ' '.join(c) if len(c) < self.order else ' '.join(c[-self.order+1:])
        return prob_fun(self.counts, word, history)  
    
    def following(self, context):
        c = context.split()
        history = ' '.join(c) if len(c) < self.order else ' '.join(c[-self.order+1:])
        return self.get_following(self.counts, history)

In [3]:
def cleanse(s, rgxp = '[\W\da-z]'):
    return re.sub(' +', ' ', re.sub(rgxp, ' ', s.lower()))

In [12]:
with open('corpora.txt', encoding='utf-8') as f:
    tokens1 = cleanse(f.read().lower()).split()

In [13]:
with open('lt1.txt', encoding='utf-8') as f:
    tokens2 = cleanse(f.read().lower()).split()

In [15]:
len(tokens1)
# tokens[:100]

1549458

In [27]:
%%time
model = LanguageModel(order=3)
model.fit(tokens1)

Wall time: 5.94 s


In [28]:
%%time
model.prob('наташа')

Wall time: 0 ns


4.095833133417668e-06

In [29]:
%%time
model.prob('череззаборногузадирищекно')

Wall time: 0 ns


5.851190190596669e-07

In [32]:
%%time
model.prob('сегодня я купил газету')

Wall time: 0 ns


1.7681723525851077e-17

In [42]:
%%time
model.prob('сегодня я купил кукушку')

Wall time: 0 ns


2.7964746635527687e-18

In [43]:
%%time
model.prob('сегодня я купил уключинозакрыватель')

Wall time: 0 ns


2.7964746635527687e-18

In [34]:
%%time
model.prob('наташа и череззаборногузадирищекно не хотели ехать')

Wall time: 0 ns


3.0274188709422195e-26

In [35]:
%%time
model.context_prob('наташа', 'и')

Wall time: 0 ns


1.9372730363558344e-06

In [36]:
%%time
model.context_prob('наташа', 'герцог чубакка и')
# same as previous cause model with order 2 (bigram) takes into consideration only k-1 word

Wall time: 0 ns


7.13835488045754e-07

In [37]:
%%time
model.following('увидел')

Wall time: 2.09 s


[('как', 7, 8.169475774951365e-06),
 ('в', 6, 7.148291303082445e-06),
 ('что', 4, 5.1059223593446035e-06),
 ('на', 4, 5.1059223593446035e-06),
 ('группу', 3, 4.084737887475683e-06),
 ('бы', 2, 3.063553415606762e-06),
 ('я', 2, 3.063553415606762e-06),
 ('его', 2, 3.063553415606762e-06),
 ('себя', 2, 3.063553415606762e-06),
 ('фильм', 2, 3.063553415606762e-06),
 ('но', 2, 3.063553415606762e-06),
 ('ее', 2, 3.063553415606762e-06),
 ('он', 2, 3.063553415606762e-06),
 ('духов', 2, 3.063553415606762e-06),
 ('чудо', 1, 2.0423689437378413e-06),
 ('то', 1, 2.0423689437378413e-06),
 ('результаты', 1, 2.0423689437378413e-06),
 ('совершенно', 1, 2.0423689437378413e-06),
 ('наготу', 1, 2.0423689437378413e-06),
 ('плюсы', 1, 2.0423689437378413e-06),
 ('признаки', 1, 2.0423689437378413e-06),
 ('своего', 1, 2.0423689437378413e-06),
 ('извозчика', 1, 2.0423689437378413e-06),
 ('пост', 1, 2.0423689437378413e-06),
 ('геракл', 1, 2.0423689437378413e-06),
 ('множество', 1, 2.0423689437378413e-06),
 ('еще',

In [38]:
%%time
model.following('когда утуб наконец-то загрузился он увидел')
# same as previous cause model with order 2 (bigram) takes into consideration only k-1 word

Wall time: 3.19 s


[('как', 3, 2.8553174934595385e-06),
 ('группу', 2, 2.1414881200946537e-06),
 ('чудо', 1, 1.4276587467297692e-06),
 ('результаты', 1, 1.4276587467297692e-06),
 ('но', 1, 1.4276587467297692e-06),
 ('еще', 1, 1.4276587467297692e-06),
 ('практически', 1, 1.4276587467297692e-06),
 ('в', 1, 1.4276587467297692e-06),
 ('себя', 1, 1.4276587467297692e-06)]

In [39]:
%%time
model.following('чубакка')

Wall time: 2.13 s


[]

In [40]:
for k, v in model.counts.items():
    if k != 'n0': print(k, v.most_common(6))

n1 [('в', 60069), ('и', 53202), ('на', 25000), ('не', 20776), ('с', 17257), ('что', 16570)]
n2 [('в году', 1801), ('и в', 1444), ('российской федерации', 1253), ('в том', 1248), ('и не', 1153), ('том что', 1075)]
n3 [('о том что', 639), ('в том числе', 579), ('в том что', 381), ('в соответствии с', 379), ('и т д', 323), ('в связи с', 308)]
