In [1]:
from collections import defaultdict
import random

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/don/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
def read_file(fname, encoding='utf-8'):
    """Return the whole contents of a text file as one string.

    :fname: path of the file to read
    :encoding: codec used to decode the file. It defaults to UTF-8 so the
        result does not depend on the platform's locale encoding; pass a
        different codec if the files are stored in another encoding.
    :returns: the decoded file contents
    """
    with open(fname, 'r', encoding=encoding) as fin:
        return fin.read()

def tokenize_text(text):
    """Split raw text into a flat list of word tokens.

    The text is first cut into sentences with NLTK's sentence tokenizer
    (Russian model); each sentence is then word-tokenized and the resulting
    tokens are concatenated in order.

    :text: raw text (a string)
    :returns: list of tokens, in order of appearance
    """
    # Fresh accumulator for every call; nothing is shared between calls.
    tokens = []
    for sent in nltk.sent_tokenize(text, language='russian'):
        tokens.extend(nltk.word_tokenize(sent))
    return tokens

def tokenize_text_from_file(fname):
    """Read the file at `fname` and return its contents as a token list.

    :fname: path handed to `read_file`
    :returns: whatever `tokenize_text` produces for the file's text
    """
    return tokenize_text(read_file(fname))

def get_corpus(flist=None):
    """Tokenize every corpus file and return one concatenated token list.

    :flist: optional iterable of file paths; when omitted, the notebook's
        default list of five text files is used (paths are taken as given,
        i.e. relative to the current working directory)
    :returns: list with the tokens of all files, in file order
    """
    if flist is None:
        flist = [
            'anna-karenina.txt',
            'alisa.txt',
            'prestuplenie_i_nakazanie.txt',
            'mertvye-dushi.txt',
            'idiot.txt',
        ]
    tokens = []
    for fname in flist:
        # Tokenize the file's *contents*; handing the path to tokenize_text
        # would tokenize the file name itself.
        tokens.extend(tokenize_text_from_file(fname))
    return tokens

In [3]:
# Load the full token stream once; the frequency table and the list of
# adjacent-token pairs are both derived from it.
corpus = get_corpus()
fdist = nltk.FreqDist(corpus)
bigrams = list(nltk.bigrams(corpus))

# Quick sanity check: token count, bigram count and the first ten pairs.
print(len(corpus), len(bigrams))
print(bigrams[:10])

1029965 1029964
[('Annotation', '«'), ('«', 'Анна'), ('Анна', 'Каренина'), ('Каренина', '»'), ('»', ','), (',', 'один'), ('один', 'из'), ('из', 'самых'), ('самых', 'знаменитых'), ('знаменитых', 'романов')]


In [11]:
def _model_add_k_smoothing(model, vocab_len, k=0.00017):
    for w1 in model:
        frac = float(sum(model[w1].values())) + k*vocab_len
        for w2 in model[w1]:
            model[w1][w2] = (model[w1][w2] + k) / frac
    return model

def _model_add_one_smoothing(model, vocab_len):
    for w1 in model:
        history_count = 
        frac = float(sum(model[w1].values())) + vocab_len
        for w2 in model[w1]:
            model[w1][w2] = (model[w1][w2] + 1) / frac
    return model

def _model_without_smoothing(model):
    for w1 in model:
        history_count = float(sum(model[w1].values()))
        model[w1]['HISTORY_COUNT'] = history_count
        for w2 in model[w1]:
            model[w1][w2] /= history_count
    return model

def _mark_unk(corpus, thresh=3):
    fdist = nltk.FreqDist(corpus)
    unks = [w for w in corpus if fdist[w] < thresh]
    for i, w in enumerate(corpus):
        if w in unks:
            corpus[i] = '<UNK>'
    return corpus

def train(corpus, n=2, smoothing=None):
    """
    Build an n-gram model from a tokenized corpus.

    Rare tokens are first replaced by '<UNK>' (see `_mark_unk`), n-gram counts
    are collected, and the counts are then handed to the selected smoothing
    helper.

    :corpus: word tokenized text (list of words); it is copied, so the
        caller's list is left untouched
    :n: n value for n-gram (2 or 3)
    :smoothing: type of smoothing to apply (None, 'add-one', 'add-k',);
        any other value returns the raw, unnormalised counts
    :returns: nested mapping ``history -> {word: value}``; the history is a
        single token for n=2 and a ``(w1, w2)`` tuple for n=3
    :raises ValueError: if `n` is neither 2 nor 3
    """
    if n not in (2, 3):
        # Without this check the function would hand back an empty model.
        raise ValueError(f"unsupported n-gram order: {n!r} (expected 2 or 3)")

    # _mark_unk rewrites its argument in place; work on a private copy so
    # the caller's corpus keeps its original tokens.
    corpus = _mark_unk(list(corpus))
    vocab = set(corpus)
    model = defaultdict(lambda: defaultdict(int))
    if n == 2:
        for w1, w2 in nltk.bigrams(corpus):
            model[w1][w2] += 1
    else:
        for w1, w2, w3 in nltk.trigrams(corpus):
            model[(w1, w2)][w3] += 1

    if smoothing is None:
        return _model_without_smoothing(model)
    elif smoothing == 'add-one':
        return _model_add_one_smoothing(model, len(vocab))
    elif smoothing == 'add-k':
        return _model_add_k_smoothing(model, len(vocab))
    return model

In [None]:
def phrase_probability(model, fdist, vocab, phrase, n=2):
    """Estimate the probability of `phrase` under an n-gram model.

    The phrase is tokenized with `tokenize_text`, tokens that are not in
    `vocab` are mapped to '<UNK>', and the model values of all n-grams in the
    phrase are multiplied together. An n-gram whose model value is missing or
    zero contributes ``1 / (fdist[w1] + len(vocab))`` instead, where ``w1``
    is the first token of the n-gram.

    :model: nested mapping ``history -> {word: probability}``; the history is
        a token for n=2 and a ``(w1, w2)`` tuple for n=3. It is only read.
    :fdist: mapping from token to its count, indexed as ``fdist[token]``
    :vocab: collection of known tokens; must be non-empty for the fallback
        denominator to be positive
    :phrase: raw text to score
    :n: n-gram order (2 or 3); any other value leaves the result at 1
    :returns: the product described above (1 if the phrase has no n-grams)
    """
    tokens = tokenize_text(phrase)
    # Out-of-vocabulary tokens map to '<UNK>'. Frequency-based marking
    # (_mark_unk) would not work here: in a short phrase every token is
    # rare, so every token would be replaced.
    tokens = [t if t in vocab else '<UNK>' for t in tokens]
    prob = 1
    if n == 2:
        for w1, w2 in nltk.bigrams(tokens):
            # .get() keeps the lookup read-only even when `model` is a
            # nested defaultdict (indexing would insert new keys).
            seen = model.get(w1, {}).get(w2, 0)
            prob *= seen or 1 / (fdist[w1] + len(vocab))
    elif n == 3:
        for w1, w2, w3 in nltk.trigrams(tokens):
            seen = model.get((w1, w2), {}).get(w3, 0)
            # Same fallback as the bigram branch; adding len(vocab) keeps
            # the denominator positive when fdist[w1] is 0.
            prob *= seen or 1 / (fdist[w1] + len(vocab))
    return prob

In [5]:
# Two-level table: model[history][word] starts at 0 on first access.
model = defaultdict(lambda: defaultdict(int))

In [6]:
# Start from an empty table on every run of this cell. Re-running it on an
# already normalised `model` would add raw counts onto probabilities.
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count how often each token follows each preceding token.
for w1, w2 in bigrams:
    model[w1][w2] += 1

# Turn the counts stored under each history into relative frequencies.
for successors in model.values():
    total_count = float(sum(successors.values()))
    for w2 in successors:
        successors[w2] /= total_count

In [7]:
def phrase_prob(phrase):
    """Score a token sequence with the notebook-level bigram `model`.

    The score starts from the relative frequency of the first token (or
    ``1 / len(bigrams)`` when that frequency is zero) and is multiplied by
    the model value of every adjacent token pair. A pair whose model value is
    missing or zero contributes ``1 / (fdist.freq(w1) + len(bigrams))``
    instead. Each pair's model value is printed as it is used.

    Reads the globals `model`, `fdist` and `bigrams`; none of them is
    modified.

    :phrase: non-empty sequence of tokens (indexing ``phrase[0]`` fails on an
        empty sequence)
    :returns: the resulting score
    """
    prob = fdist.freq(phrase[0]) or 1 / len(bigrams)
    for w1, w2 in nltk.bigrams(phrase):
        # Read-only lookup: indexing the nested defaultdict would insert an
        # empty history / zero-valued word into the shared model.
        pair_prob = model.get(w1, {}).get(w2, 0)
        prob *= pair_prob or 1 / (fdist.freq(w1) + len(bigrams))
        print(f"model['{w1}']['{w2}']: {pair_prob}")
    return prob

In [8]:
def build_sentence(text_in, word_cnt=10):
    """Extend a seed with words sampled from the global bigram `model` and print it.

    Starting from the last token of `text_in`, a next word is repeatedly
    drawn according to the values stored in ``model[last_token]`` and
    appended, until `word_cnt` words have been added or the current last
    token has nothing to continue with. The result is printed as one
    space-joined line (falsy tokens are skipped); nothing is returned.

    :text_in: iterable of seed tokens; it is copied, not modified
    :word_cnt: maximum number of words to append; values <= 0 append nothing
    """
    text = list(text_in)
    cnt = 0
    while text and cnt < word_cnt:
        history = text[-1]
        # Read-only lookup so that an unseen history does not get inserted
        # into the (defaultdict) model.
        successors = model.get(history)
        # Dead end: no continuation, or no probability mass to sample from.
        # Looping on would never append a word.
        if not successors or sum(successors.values()) <= 0:
            break

        r = random.random()
        accum = .0
        for w, p in successors.items():
            accum += p
            if accum >= r:
                text.append(w)
                cnt += 1
                break
        # If the accumulated mass stayed below r (values that do not sum to
        # exactly 1), nothing was appended and the next pass draws again.

        # NOTE(review): (None, None) looks like an end-of-text sentinel from
        # a padded-trigram variant -- kept as a stop condition; confirm.
        if history == (None, None):
            break

    print(' '.join([t for t in text if t]))

In [9]:
from pprint import pprint

# From the 29050 most frequent tokens, keep the (token, count) pairs whose
# count is below 3, then show both ends of the frequency table.
freqs = fdist.most_common(29050)
least = [entry for entry in freqs if entry[1] < 3]
print(len(freqs), len(least))
pprint(least[:10])
pprint(fdist.most_common(10))

29050 4643
[('достоинстве', 2),
 ('Широкого', 2),
 ('XXXV', 2),
 ('пятаяI', 2),
 ('шестаяI', 2),
 ('психологическую', 2),
 ('небывалый', 2),
 ('изображения', 2),
 ('Манн', 2),
 ('эстетической', 2)]
[(',', 117169),
 ('.', 42756),
 ('и', 35021),
 ('–', 28482),
 ('не', 17585),
 ('в', 16976),
 ('что', 14844),
 ('на', 10154),
 ('он', 10040),
 ('!', 9558)]
