In [None]:
!pip install datasets tqdm

## Download the WikiText dataset and convert it to a list of token lists

In [None]:
from nltk.tokenize import word_tokenize
from datasets import load_dataset
from tqdm import tqdm

In [None]:
# Load the WikiText-103 configuration of the "wikitext" dataset, then show
# the returned object as the cell output.
dataset = load_dataset(path="wikitext", name="wikitext-103-v1")
dataset

In [None]:
import nltk

# word_tokenize relies on NLTK's Punkt tokenizer data, which is not bundled
# with the package. Older NLTK releases look for "punkt", newer ones for
# "punkt_tab"; requesting both keeps this cell working on a fresh environment.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Tokenize every non-blank training line into a list of word tokens.
sentences = []
for each_sentence in tqdm(dataset['train']['text'], desc="For every sentence"):
    # Strip before the emptiness check so whitespace-only lines are skipped
    # too, instead of being stored as empty token lists.
    each_sentence = each_sentence.strip()
    if not each_sentence:
        continue
    sentences.append(word_tokenize(each_sentence))

print(sentences[0])

### Let us create padded everygrams (all n-grams up to order 4)

In [None]:
from nltk.lm.preprocessing import padded_everygram_pipeline

# Build the training n-grams (orders 1 through 4) and the padded word stream
# used for the vocabulary. Both results are lazy, single-use iterators, so
# they must be rebuilt before fitting another model.
everygram_sentence, padded_stream_words = padded_everygram_pipeline(
    order=4,
    text=sentences,
)


### Let us train a Kneser-Ney interpolated language model

In [None]:
from nltk.lm.models import KneserNeyInterpolated

# Fit a 4-gram Kneser-Ney interpolated model on the prepared everygrams,
# building its vocabulary from the padded word stream.
lm = KneserNeyInterpolated(order=4)
lm.fit(text=everygram_sentence, vocabulary_text=padded_stream_words)

### We can now score a word given its preceding context

In [None]:
# An n-gram model of order n conditions on at most n - 1 preceding tokens,
# so trim the context to that length instead of passing the whole prefix.
context = ["The", "sun", "rises", "in", "the"]
lm.score("west", context[-(lm.order - 1):])

In [None]:
# An n-gram model of order n conditions on at most n - 1 preceding tokens,
# so trim the context to that length instead of passing the whole prefix.
context = ["The", "sun", "rises", "in", "the"]
lm.score("east", context[-(lm.order - 1):])

### We can also let the model generate sentences/words

In [None]:
# Sample five words that continue the seed text; the fixed random seed makes
# the sample repeatable.
lm.generate(
    num_words=5,
    text_seed=["The", "sun", "rises", "in", "the"],
    random_seed=3,
)

### Let us train a Laplace (add-one) smoothed language model

In [None]:
from nltk.lm.models import Laplace

# The pipeline outputs are single-use iterators, so rebuild them before
# fitting a second model on the same sentences.
everygram_sentence, padded_stream_words = padded_everygram_pipeline(
    order=4,
    text=sentences,
)

# Fit a 4-gram model with Laplace smoothing.
lm = Laplace(order=4)
lm.fit(text=everygram_sentence, vocabulary_text=padded_stream_words)

### We can now score a word given its preceding context

In [None]:
# An n-gram model of order n conditions on at most n - 1 preceding tokens.
# The Laplace model does not back off to shorter contexts: given a longer
# context it looks up an n-gram order that was never counted, so every word
# receives the same smoothed score. Trim the context to order - 1 tokens.
context = ["The", "sun", "rises", "in", "the"]
lm.score("west", context[-(lm.order - 1):])

In [None]:
# An n-gram model of order n conditions on at most n - 1 preceding tokens.
# The Laplace model does not back off to shorter contexts: given a longer
# context it looks up an n-gram order that was never counted, so every word
# receives the same smoothed score. Trim the context to order - 1 tokens.
context = ["The", "sun", "rises", "in", "the"]
lm.score("east", context[-(lm.order - 1):])

### We can also let the model generate sentences/words

In [None]:
# Sample five words that continue the seed text; the fixed random seed makes
# the sample repeatable.
lm.generate(
    num_words=5,
    text_seed=["The", "sun", "rises", "in", "the"],
    random_seed=3,
)