# Import libraries

In [1]:
import numpy as np
import pandas as pd

# Ngram Language Model

In [2]:
from nltk.util import ngrams
from nltk import word_tokenize, sent_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.lm import Vocabulary, MLE, Lidstone, WittenBellInterpolated
from nltk.lm.models import InterpolatedLanguageModel, KneserNeyInterpolated
from nltk.lm.smoothing import KneserNey, WittenBell

In [3]:
from modifications.models import Lidstone as MLidstone
from modifications.models import KneserNeyInterpolated as MKneserNeyInterpolated
from modifications.models import WittenBellInterpolated as MWittenBellInterpolated

## Helper functions

In [4]:
import nltk
# Download the 'europarl_raw' NLTK data package; the `nltk.corpus.europarl_raw`
# import in the next cell needs it to be present locally.
nltk.download('europarl_raw')

[nltk_data] Downloading package europarl_raw to
[nltk_data]     /home/palasso/nltk_data...
[nltk_data]   Package europarl_raw is already up-to-date!


True

In [5]:
from nltk.corpus.europarl_raw import english

In [6]:
def language_model_method(smoothing_method, order, smoothing_parameter, vocabulary):
    """Build an unfitted n-gram language model for the given smoothing code.

    Parameters
    ----------
    smoothing_method : str
        One of 'LID', 'LIDM', 'KNI', 'KNIM', 'WBI' or 'WBIM'. The codes ending
        in 'M' select the classes imported from ``modifications.models``; the
        others select the stock ``nltk.lm`` classes.
    order : int
        N-gram order, forwarded to the model constructor.
    smoothing_parameter : float
        Forwarded as ``gamma`` for the Lidstone models and as ``discount`` for
        the Kneser-Ney models. It is ignored for the Witten-Bell models.
    vocabulary
        Vocabulary object forwarded to the model constructor.

    Returns
    -------
    The constructed (not yet fitted) language model.

    Raises
    ------
    ValueError
        If ``smoothing_method`` is not one of the supported codes.
    """
    if smoothing_method == 'LID':
        return Lidstone(gamma=smoothing_parameter, order=order, vocabulary=vocabulary)
    if smoothing_method == 'LIDM':
        return MLidstone(gamma=smoothing_parameter, order=order, vocabulary=vocabulary)
    if smoothing_method == 'KNI':
        return KneserNeyInterpolated(order=order, discount=smoothing_parameter, vocabulary=vocabulary)
    if smoothing_method == 'KNIM':
        return MKneserNeyInterpolated(order=order, discount=smoothing_parameter, vocabulary=vocabulary)
    if smoothing_method == 'WBI':
        return WittenBellInterpolated(order=order, vocabulary=vocabulary)
    if smoothing_method == 'WBIM':
        return MWittenBellInterpolated(order=order, vocabulary=vocabulary)
    # Previously an unknown code fell through to `return lm` with `lm` unbound,
    # surfacing as an UnboundLocalError; fail with a clear message instead.
    raise ValueError(
        "Unknown smoothing_method {!r}; expected one of "
        "'LID', 'LIDM', 'KNI', 'KNIM', 'WBI', 'WBIM'".format(smoothing_method)
    )


def train_ngram_lm(smoothing_method, order, smoothing_parameter):
    """Fit a single n-gram language model on the English Europarl sentences.

    The sentences from ``english.sents()`` are run through
    ``padded_everygram_pipeline``; its second return value is used to build a
    ``Vocabulary`` with ``unk_cutoff=10`` and its first return value is the
    training data passed to ``fit``.

    Parameters
    ----------
    smoothing_method : str
        Smoothing code understood by ``language_model_method``.
    order : int
        N-gram order used for both the pipeline and the model.
    smoothing_parameter : float
        Smoothing parameter forwarded to ``language_model_method``.

    Returns
    -------
    The fitted language model.
    """
    everygrams_per_sentence, padded_words = padded_everygram_pipeline(order, english.sents())
    vocab = Vocabulary(padded_words, unk_cutoff=10)
    model = language_model_method(
        smoothing_method=smoothing_method,
        order=order,
        smoothing_parameter=smoothing_parameter,
        vocabulary=vocab,
    )
    model.fit(everygrams_per_sentence)
    return model


def train_ngram_language_models(order=3):
    """Train one language model per smoothing configuration.

    Parameters
    ----------
    order : int, default 3
        N-gram order shared by all trained models.

    Returns
    -------
    dict
        Maps each smoothing code ('LID', 'LIDM', 'KNI', 'KNIM', 'WBI', 'WBIM')
        to the model returned by ``train_ngram_lm`` for that code.
    """
    # Smoothing code -> parameter handed to train_ngram_lm for that method.
    parameter_by_method = {'LID': 0.005, 'LIDM': 0.005, 'KNI': 0.93, 'KNIM': 0.93, 'WBI': 1, 'WBIM': 1}
    return {
        method: train_ngram_lm(method, order, parameter)
        for method, parameter in parameter_by_method.items()
    }

In [7]:
# Display the installed NLTK version so the benchmark results below can be
# tied to a specific library release.
nltk.__version__

'3.5'

## Train ngram language models

In [8]:
# Dict of fitted order-3 models, keyed by smoothing code ('LID', 'LIDM', ...).
trigram_lm = train_ngram_language_models(order=3)

In [9]:
# Dict of fitted order-5 models, keyed by smoothing code ('LID', 'LIDM', ...).
fivegram_lm = train_ngram_language_models(order=5)

## Generate ngrams to calculate their logscores

In [10]:
# Collect the length-3 and length-5 n-grams from the everygram pipeline
# (max order 5) over the English Europarl sentences.
#
# This is done in a single pass that fills both lists directly. The previous
# version first stored every n-gram in a global list called `ngrams`, which
# rebound the `ngrams` function imported from `nltk.util` and kept all n-grams
# of every length in memory only to filter them twice.
training_ngrams, padded_sents = padded_everygram_pipeline(5, english.sents())
trigrams = []
fivegrams = []
for sentence_ngrams in training_ngrams:
    for gram in sentence_ngrams:
        gram_length = len(gram)
        if gram_length == 3:
            trigrams.append(gram)
        elif gram_length == 5:
            fivegrams.append(gram)

In [11]:
# Reshape every n-gram into a (word, context) pair: the last token is the word
# to score and the tokens before it are its context. `calc` unpacks each pair
# into `lm.logscore(word, context)`.
trigrams_test = [(gram[-1], gram[:-1]) for gram in trigrams]
fivegrams_test = [(gram[-1], gram[:-1]) for gram in fivegrams]

## Test speed

In [12]:
import timeit

In [13]:
def calc(lm, test_set):
    """Return ``lm.logscore(*item)`` for every item of ``test_set``, in order.

    Parameters
    ----------
    lm
        Object exposing a ``logscore`` method.
    test_set : iterable
        Items that are unpacked as the positional arguments of ``logscore``
        (in this notebook: ``(word, context)`` pairs).

    Returns
    -------
    list
        One log-score per item of ``test_set``.
    """
    scores = []
    for arguments in test_set:
        scores.append(lm.logscore(*arguments))
    return scores

In [14]:
# Benchmark every trigram model on the first 40 trigram test pairs.
# times[key]  -> value returned by timeit for 100 executions of the statement
# values[key] -> the log-scores themselves, kept so the next cell can compare
#                the stock and modified implementations.
times = {}
values = {}
for key, lm in trigram_lm.items():
    # The statement string is evaluated against globals(), so `calc`, `lm` and
    # `trigrams_test` are looked up by name there; `lm` is this loop's variable.
    times[key] = timeit.timeit("[calc(lm, trigrams_test[:40])]", globals=globals(), number=100)
    values[key] = calc(lm, trigrams_test[:40])

In [15]:
# For each smoothing family, print whether the modified model's log-scores are
# exactly equal to those of the stock NLTK model.
for reference, modified in (('LID', 'LIDM'), ('KNI', 'KNIM'), ('WBI', 'WBIM')):
    print(values[reference] == values[modified])

True
True
True


In [16]:
times

{'LID': 44.19068663800135,
 'LIDM': 0.044285511001362465,
 'KNI': 47.008701104001375,
 'KNIM': 0.42280687999664224,
 'WBI': 161.33350436899855,
 'WBIM': 0.6393959919987537}

In [17]:
# Benchmark every five-gram model on the first 40 five-gram test pairs.
# times[key]  -> value returned by timeit for 100 executions of the statement
# values[key] -> the log-scores themselves, kept so the next cell can compare
#                the stock and modified implementations.
times = {}
values = {}
for key, lm in fivegram_lm.items():
    # The statement string is evaluated against globals(), so `calc`, `lm` and
    # `fivegrams_test` are looked up by name there; `lm` is this loop's variable.
    times[key] = timeit.timeit("[calc(lm, fivegrams_test[:40])]", globals=globals(), number=100)
    # Score the same five-gram inputs that were timed. This line used to score
    # `trigrams_test` (copy-paste slip), so the equality check that follows did
    # not validate the five-gram contexts being benchmarked.
    values[key] = calc(lm, fivegrams_test[:40])

In [18]:
# For each smoothing family, print whether the modified model's log-scores are
# exactly equal to those of the stock NLTK model.
for reference, modified in (('LID', 'LIDM'), ('KNI', 'KNIM'), ('WBI', 'WBIM')):
    print(values[reference] == values[modified])

True
True
True


In [19]:
times

{'LID': 47.535225217001425,
 'LIDM': 0.058239693000359694,
 'KNI': 47.33269546700103,
 'KNIM': 0.5418092250001791,
 'WBI': 1277.4404161340026,
 'WBIM': 1.4441447879980842}