# Языковая модель

Построим языковую модель сначала вручную на простом синтетическом корпусе, затем обучим модель из пакета `nltk` на стихотворении "Дом, который построил Джек".

## Импорты

In [None]:
from collections import Counter

import nltk.lm as lm
from nltk.util import ngrams as nltk_ngrams
import numpy as np
import scipy.stats as st

## Пример

In [None]:
# Synthetic corpus: two start markers, one two-token pattern repeated
# 100 times, and a single end marker.
text = ''.join(('SOS SOS ', 'А Б ' * 100, 'EOS'))
tokens = text.split()
n = len(tokens)
# Trailing expression: the notebook displays the first ten tokens.
tokens[:10]

In [None]:
def ngrams_and_prefix_counts(tokens, n_max):
    """Count all n-grams of orders 1..n_max together with their prefixes.

    Args:
        tokens: list of tokens; window slices are concatenated with a list,
            so a plain ``list`` is expected.
        n_max: highest n-gram order to collect.

    Returns:
        A pair ``(ngrams_counts, prefix_counts)`` of dicts keyed by the
        order ``k``. ``ngrams_counts[k]`` is a ``Counter`` over k-token
        tuples. ``prefix_counts[k]`` is a ``Counter`` over the same windows
        with the last token replaced by the wildcard ``'*'``.
    """
    total = len(tokens)
    ngrams_counts = {}
    prefix_counts = {}

    for order in range(1, n_max + 1):
        # Start positions of every complete window of this order; the range
        # is empty when the corpus is shorter than the window.
        starts = range(total - order + 1)
        ngrams_counts[order] = Counter(
            tuple(tokens[s:s + order]) for s in starts
        )
        prefix_counts[order] = Counter(
            tuple(tokens[s:s + order - 1] + ['*']) for s in starts
        )

    return ngrams_counts, prefix_counts

In [None]:
# Count every n-gram of order 1..3 in the corpus, plus the matching
# '*'-terminated prefixes.
ngram_counts, prefix_counts = ngrams_and_prefix_counts(tokens, 3)

In [None]:
# Display the n-gram counts for every order.
ngram_counts

In [None]:
# Display the prefix counts for every order.
prefix_counts

### N-граммы и их частотные вероятности

$\hat p_i = \hat p(w_i)$

In [None]:
def unigram_probas(ngram_counts):
    """Turn unigram counts into relative frequencies.

    Args:
        ngram_counts: dict keyed by n-gram order; only ``ngram_counts[1]``
            (a mapping from unigram to count) is read.

    Returns:
        Dict mapping each unigram key to ``count / total``, where ``total``
        is the sum of all unigram counts.
    """
    unigrams = ngram_counts[1]
    total = sum(unigrams.values())
    return {gram: count / total for gram, count in unigrams.items()}

In [None]:
# Relative frequency of every unigram; the trailing expression displays it.
p1 = unigram_probas(ngram_counts)
p1

$\hat p_{i, i - 1} = \hat p(w_i|w_{i - 1})$

In [None]:
def bigram_probas(ngram_counts, prefix_counts):
    """Estimate conditional bigram probabilities from raw counts.

    Args:
        ngram_counts: dict keyed by order; ``ngram_counts[2]`` maps each
            bigram tuple to its count.
        prefix_counts: dict keyed by order; ``prefix_counts[2]`` maps
            ``(first_token, '*')`` to the count of that prefix.

    Returns:
        Dict keyed by the string ``'<second>|<first>'`` whose value is the
        bigram count divided by the count of its prefix.
    """
    contexts = prefix_counts[2]
    probas = {}
    for gram, count in ngram_counts[2].items():
        context = (gram[0], '*')
        probas[u'{1}|{0}'.format(*gram)] = count / contexts[context]
    return probas

In [None]:
# Conditional bigram probabilities keyed as 'word|previous'; the trailing
# expression displays them.
p2 = bigram_probas(ngram_counts, prefix_counts)
p2

$\hat p_{i, i - 1, i - 2} = \hat p(w_i|w_{i - 1}, w_{i - 2})$

In [None]:
def trigram_probas(ngram_counts, prefix_counts):
    """Estimate conditional trigram probabilities from raw counts.

    Args:
        ngram_counts: dict keyed by order; ``ngram_counts[3]`` maps each
            trigram tuple to its count.
        prefix_counts: dict keyed by order; ``prefix_counts[3]`` maps
            ``(first, second, '*')`` to the count of that prefix.

    Returns:
        Dict keyed by the string ``'<third>|<second>,<first>'`` whose value
        is the trigram count divided by the count of its two-token prefix.
    """
    contexts = prefix_counts[3]
    probas = {}
    for gram, count in ngram_counts[3].items():
        context = gram[:2] + ('*',)
        probas[u'{2}|{1},{0}'.format(*gram)] = count / contexts[context]
    return probas

In [None]:
# Conditional trigram probabilities keyed as 'word|previous,before_previous';
# the trailing expression displays them.
p3 =  trigram_probas(ngram_counts, prefix_counts)
p3

### Проверка гипотезы, что триграммную модель можно свести к биграммной против правосторонней альтернативы

Статистика:

$$-2 \log \prod_{i, j, k = 1}^m \left(\frac{\hat p(O_k|O_j)}{\hat p(O_k|O_i, O_j)}\right)^{n_{ijk}} = \sum_{i, j, k = 1}^m \left(-2 n_{ijk} \log \hat p(O_k|O_j) + 2 n_{ijk} \log \hat p(O_k|O_i, O_j)\right) = \sum_{i = 3}^N \left(-2 \log \hat p_{i,i - 1} + 2 \log \hat p_{i, i - 1, i - 2}\right),$$
$$n_{ijk} = |\{t: X_t = O_i, X_{t + 1} = O_j, X_{t + 2} = O_k\}|$$

In [None]:
def chi2_statistic(p2, p3, tokens):
    """Likelihood-ratio statistic for "the trigram model reduces to a bigram model".

    For every window ``(w0, w1, w2)`` of three consecutive tokens the
    statistic accumulates ``-2 * log p2(w2 | w1) + 2 * log p3(w2 | w1, w0)``.

    Args:
        p2: dict of bigram probabilities keyed as ``'word|previous'``
            (the key format produced by ``bigram_probas``).
        p3: dict of trigram probabilities keyed as
            ``'word|previous,before_previous'`` (the key format produced by
            ``trigram_probas``).
        tokens: sequence of tokens the probabilities were estimated on.

    Returns:
        The value of the statistic; ``0.0`` when there are fewer than three
        tokens.

    Raises:
        KeyError: if a window's bigram or trigram key is missing from
            ``p2`` / ``p3``.
    """
    log_p2 = []
    log_p3 = []
    for start in range(len(tokens) - 2):
        window = tokens[start:start + 3]
        # Both models must predict the same token, the last one of the
        # window. Pairing the trigram with the bigram of the first two
        # tokens would compare predictions of different tokens.
        trigram_key = '{2}|{1},{0}'.format(*window)
        bigram_key = '{2}|{1}'.format(*window)

        log_p2.append(np.log(p2[bigram_key]))
        log_p3.append(np.log(p3[trigram_key]))
    return -2 * np.sum(log_p2) + 2 * np.sum(log_p3)

In [None]:
# m is the number of distinct states (vocabulary size): the statistic sums
# over i, j, k = 1..m, and the degrees of freedom of the test are built from
# it. Counting distinct trigrams (len(p3)) instead would inflate them.
m = len(p1)
stat = chi2_statistic(p2, p3, tokens)

In [None]:
# Right-tailed p-value of the statistic under a chi-square distribution.
# The survival function is used instead of 1 - cdf because the subtraction
# loses precision when the p-value is tiny.
# NOTE(review): the degrees-of-freedom expression is kept exactly as written;
# confirm the trailing "- 1" against the intended test.
print(f'p-value = {st.chi2.sf(stat, m * ((m - 1) ** 2) - 1)}')

Гипотеза не отвергается

## Другой пример

In [None]:
# Second synthetic corpus: two start markers followed by a 12-token pattern
# repeated 100 times (no end marker).
text = ''.join(('SOS SOS ', 'А Б Б А Б А Б А Б Б А А ' * 100))
tokens = text.split()
# Trailing expression: the notebook displays the first ten tokens.
tokens[:10]

In [None]:
# Recount n-grams of order 1..3 and their prefixes for the new corpus.
ngram_counts, prefix_counts = ngrams_and_prefix_counts(tokens, 3)

In [None]:
# Display the n-gram counts of the new corpus.
ngram_counts

In [None]:
# Display the prefix counts of the new corpus.
prefix_counts

In [None]:
# Unigram relative frequencies for the new corpus; displayed by the
# trailing expression.
p1 = unigram_probas(ngram_counts)
p1

In [None]:
# Conditional bigram probabilities for the new corpus; displayed by the
# trailing expression.
p2 = bigram_probas(ngram_counts, prefix_counts)
p2

In [None]:
# Conditional trigram probabilities for the new corpus; displayed by the
# trailing expression.
p3 =  trigram_probas(ngram_counts, prefix_counts)
p3

### Проверка той же гипотезы

In [None]:
# m must describe the current corpus: it is the number of distinct tokens,
# matching the i, j, k = 1..m sums of the statistic. Without this line the
# p-value cell would silently reuse m from the previous example.
m = len(p1)
stat = chi2_statistic(p2, p3, tokens)

In [None]:
# Right-tailed p-value of the statistic under a chi-square distribution.
# The survival function is used instead of 1 - cdf because the subtraction
# rounds a very small p-value to 0 or to floating-point noise.
# NOTE(review): the degrees-of-freedom expression is kept exactly as written;
# confirm the trailing "- 1" against the intended test.
print(f'p-value = {st.chi2.sf(stat, m * ((m - 1) ** 2) - 1)}')

Гипотеза отвергается

### Сглаживание Лапласа

In [None]:
# Materialise the unigram, bigram and trigram lists of the current corpus.
n1, n2, n3 = (list(nltk_ngrams(tokens, order)) for order in (1, 2, 3))
# Trailing expression: the notebook displays the first ten trigrams.
n3[:10]

In [None]:
# Two order-3 models trained on identical data: one with Laplace smoothing,
# one plain maximum-likelihood model for comparison.
laplace = lm.Laplace(order=3)
regular_lm = lm.MLE(order=3)
# Each n-gram list is passed as one training "sentence"; the vocabulary is
# the set of distinct corpus tokens.
laplace.fit([n1, n2, n3], vocabulary_text=list(set(tokens)))
regular_lm.fit([n1, n2, n3], vocabulary_text=list(set(tokens)))

#### Перплексия

(Меньше => лучше)

In [None]:
# Perplexity of the corpus unigram list under the smoothed and the
# unsmoothed model, displayed as a pair.
laplace.perplexity(n1), regular_lm.perplexity(n1)

In [None]:
# Unigrams that never occur in the corpus. Each item must be a 1-tuple, like
# the items of n1: ('b') without the trailing comma is just the string 'b'
# and would only work by accident.
foo = [('b',), ('a',), ('r',)]
laplace.perplexity(foo), regular_lm.perplexity(foo)

#### Сглаженная по Лапласу оценка вероятности

$$p_L(w_i) = \frac{c_i + 1}{\sum_{k = 1}^v c_k + v}$$
$$p_L(w_j|w_i) = \frac{c_{ij} + 1}{\sum_{k=1}^v (c_{ik} + 1)} = \frac{c_{ij} + 1}{c_i + v}$$

Здесь $c_i$ — частота слова $w_i$, $c_{ij}$ — частота биграммы $w_i w_j$, $v$ — размер словаря.

$p_L('А'|'SOS')$

In [None]:
# Score of the same word in the same one-token context under the smoothed
# and the unsmoothed model, displayed as a pair.
laplace.score('А', context=['SOS']), regular_lm.score('А', context=['SOS'])

$p_L('SOS')$

In [None]:
# Context-free score of 'SOS' under both models, displayed as a pair.
laplace.score('SOS'), regular_lm.score('SOS')

Эти n-граммы не встречались в тексте:

In [None]:
# Smoothed-model scores for three queries that do not occur in the corpus.
laplace.score('C', context=['SOS']), laplace.score('ыаываа', context=['B']), laplace.score('B')

In [None]:
# The same three unseen queries scored by the unsmoothed model.
regular_lm.score('C', context=['SOS']), regular_lm.score('ыаываа', context=['B']), regular_lm.score('B')

## Генерация текста

"Дом, который построил Джек"

In [None]:
from nltk.tokenize import RegexpTokenizer

# Tokenizer that keeps runs of word characters. The pattern is a raw string
# because '\w' in an ordinary literal is an invalid escape sequence, which
# newer Python versions warn about.
rt = RegexpTokenizer(r'\w+')

In [None]:
# Read the poem and lower-case it. The encoding is explicit so the result
# does not depend on the platform default.
# NOTE(review): assumes jack.txt is stored as UTF-8 — confirm.
with open('jack.txt', encoding='utf-8') as f:
    text = f.read().lower()

In [None]:
# Tokenize the text, then display the total and the distinct token counts.
tokens = rt.tokenize(text)
len(tokens), len(set(tokens))

In [None]:
# Materialise the unigram, bigram and trigram lists of the tokenized text.
n1, n2, n3 = (list(nltk_ngrams(tokens, order)) for order in (1, 2, 3))

In [None]:
# Order-3 model with Laplace smoothing trained on the text. Each n-gram list
# is passed as one training "sentence"; the vocabulary is the set of distinct
# tokens.
laplace = lm.Laplace(order=3)
laplace.fit([n1, n2, n3], vocabulary_text=list(set(tokens)))

In [None]:
# Generate 50 tokens with a fixed random seed and join them with spaces.
' '.join(laplace.generate(50, random_seed=42))

In [None]:
# Generate 50 tokens, passing a tokenized phrase as text_seed. No random seed
# is given, so the output is presumably different on every run.
' '.join(laplace.generate(50, text_seed='вот дом который построил джек'.split()))

In [None]:
# Same generation call with a different phrase passed as text_seed.
' '.join(laplace.generate(50, text_seed='привет как дела'.split()))