# Word Predictor: n-gram model

In [1]:
import os
import io
from collections import defaultdict
import math
from tqdm import tqdm

In [2]:
from config import MIN_N, MAX_N, TRAINED_PATH, START_SYMBOL
from prepare import tokenize_sentences, train_val_test_split, prepend_start_symbol, buildWordIdMappings

In [3]:
# Location of the raw corpus, named so that it is easy to spot and change.
CORPUS_PATH = "data/HP_all.txt"

# Tokenize the corpus into sentences and split them into three subsets.
sentences = tokenize_sentences(CORPUS_PATH)

# The middle (validation) split is not used in this notebook. It is bound to a
# regular name rather than `_`: in IPython, assigning to `_` shadows the
# last-output variable and stops it from being updated.
train_sentences, val_sentences, test_sentences = train_val_test_split(sentences)

# The vocabulary mappings are built from the training split only.
word2id, id2word = buildWordIdMappings(train_sentences)

Tokenizing input corpus...
Tokenization ready.
Train dataset size: 51035
Validation dataset size: 17012
Test dataset size: 17012


In [4]:
class NGramTrainer(object):
    """
    Counts k-grams (k = 1, ..., n) over tokenized sentences and serializes the
    counts together with their maximum-likelihood log-probabilities.

    Typical usage: construct, call ``process_sentences`` once with the training
    sentences, then call ``save_model`` with the output path.

    NOTE(review): ``num_total_words`` is incremented by the length of the
    token tuples handed to ``_process_tokens``; when those come from
    ``process_sentences`` they presumably include the ``n - 1`` prepended
    start symbols (depends on ``prepend_start_symbol`` -- confirm). Only the
    full (n-1)-long start context is counted explicitly, so shorter
    start-symbol contexts are absent from the tables and the affected
    lower-order rows are written with log-probability ``-inf``.
    """

    def __init__(self, n, word2id, id2word):
        """
        NGramTrainer constructor

        :param n: Size of grams (must be at least 1)
        :param word2id: Mapping from each word to its ID
        :param id2word: Mapping from each ID to its word
        :raises ValueError: If n is smaller than 1
        """
        if n < 1:
            raise ValueError("n must be at least 1, got " + str(n))

        # Size of grams
        self.n = n

        # For each word, its ID.
        self.word2id = word2id

        # For each ID, its word
        self.id2word = id2word

        # Sentence starter that adds 'padding' to each sentence
        # (empty when n == 1)
        self.sentence_starter = [START_SYMBOL] * (n - 1)

        # Sentence tokens from corpus
        self.sentences = None

        # Collection of n-gram counts (pos i corresponds to (i+1)-ngrams)
        self.ngrams = [defaultdict(int) for _ in range(n)]

        # Total number of tokens seen by _process_tokens
        self.num_total_words = 0

    def process_sentences(self, sentences):
        """
        Processes the given sentences.

        The sentences are passed through ``prepend_start_symbol`` together
        with the sentence starter, stored on the instance, and every resulting
        sentence is counted. Counts accumulate across calls.

        :param sentences: The tokenized sentences to be processed.
        :return: The sentences as returned by ``prepend_start_symbol``.
        """
        self.sentences = prepend_start_symbol(sentences, self.sentence_starter)
        for sentence in tqdm(self.sentences, desc="Processing corpus", colour='green'):
            self._process_tokens(tuple(sentence))
        return self.sentences

    def _process_tokens(self, tokens):
        """
        Processes the list of tokens, and
        adjusts the ngram counts.

        :param tokens: The tuple of tokens to be processed.
        """

        assert isinstance(tokens, tuple)

        self.num_total_words += len(tokens)

        # We will count one start context (n-1 start symbols) per sentence.
        # A unigram model has no start context: without this guard the index
        # n - 2 == -1 would wrap around and insert an empty tuple into the
        # unigram table, which later breaks _get_stats.
        if self.n > 1:
            self.ngrams[self.n - 2][tuple(self.sentence_starter)] += 1

        # Iterate over all possible n-grams
        for i in range(self.n - 1, len(tokens)):
            # Obtain the n-gram stretching from pos i-n+1 to i --> interval [i-n+1, i+1)
            ngram = tokens[i - self.n + 1:i + 1]

            # Update the count for each l-gram, l = 1, ..., n:
            # the (k+1)-gram ending at position i is the last k+1 tokens of ngram.
            for k in range(self.n):  # k = 0, ..., n-1
                self.ngrams[k][ngram[self.n - 1 - k:]] += 1

    def _get_stats(self):
        """
        Returns the model statistics as a list of text rows.

        Layout of the returned rows:
          * first row: "<vocabulary size> <total number of tokens>"
          * for each k = 1, ..., n: one row holding the number of distinct
            k-grams, followed by one row per k-gram:
              - unigrams: "<id> <word> <count> <log_prob>"
              - others:   "<id_1> ... <id_k> <count> <log_prob>"
          * last row: "-1" (end-of-file marker)

        Log-probabilities are natural logarithms printed with 15 decimals.

        :return: The list of rows (strings without trailing newline).
        """

        log = math.log

        # Initial row
        rows = [str(len(self.word2id)) + " " + str(self.num_total_words)]

        # For each k-grams, print their stats
        for k in range(self.n):

            # Get the k-grams
            kgrams = self.ngrams[k]

            # Record how many lines are gonna follow
            rows.append(str(len(kgrams)))

            # For each kgram (tuple) in the kgrams dict
            for kgram in kgrams:

                # Transform the words into string ids
                ids = ' '.join(str(self.word2id[word]) for word in kgram)

                # Compute the (log) probability
                # P(w_i | w_{i-n+1}, ..., w_{i-1}) =
                # c(w_{i-n+1}, ..., w_{i-1}, w_i) / c(w_{i-n+1}, ..., w_{i-1})

                # Get the number of occurrences of this kgram
                kgram_count = kgrams[kgram]
                if k == 0:  # Uni-gram --> Use log_prob for unigram_count
                    ids += ' ' + kgram[0]  # Append word.
                    log_prob = log(kgram_count) - log(self.num_total_words)
                else:  # Dealing with 2, 3, ... -grams.
                    # If the previous kgram doesn't exist (start symbols)
                    # `in` is used on purpose: indexing the defaultdict would
                    # insert the missing context with a zero count.
                    if kgram[:-1] not in self.ngrams[k - 1]:
                        log_prob = -float('inf')  # So that e^(-inf) = 0
                    else:
                        prev_kgram_count = self.ngrams[k - 1][kgram[:-1]]
                        log_prob = log(kgram_count) - log(prev_kgram_count)
                log_prob = format(log_prob, '.15f')
                rows.append(ids + " " + str(kgram_count) + " " + str(log_prob))
        rows.append(str(-1))  # EOF
        return rows

    def save_model(self, file):
        """
        Save model stats in the provided file

        The rows produced by ``_get_stats`` are written one per line, encoded
        as 'utf-8-sig'. I/O errors are reported on stdout and not re-raised.

        :param file: Path of the file to write (overwritten if it exists).
        """
        print("Saving model...")

        # Build the rows before opening the target, so that a failure while
        # computing the statistics cannot leave a truncated file behind.
        rows = self._get_stats()

        try:
            with io.open(file, mode='w', encoding='utf-8-sig') as f:
                f.writelines(row + '\n' for row in rows)
            print("Model saved!")
        except FileNotFoundError:
            print("The file", file, " was not found.")
        except IOError:
            print("An IOError occurred while saving the model.")

In [5]:
# Train one model per gram size in [MIN_N, MAX_N] and store each of them in
# its own file, whose name is appended to TRAINED_PATH.
for n in range(MIN_N, MAX_N + 1):
    print(f"\nTraining {n}-gram model:")

    # A fresh trainer per size, so that counts never leak between models.
    trainer = NGramTrainer(n, word2id, id2word)
    trainer.process_sentences(train_sentences)

    trainer.save_model(f"{TRAINED_PATH}model_{n}gram_hp_all.txt")


Training 2-gram model:


Processing corpus: 100%|[32m██████████[0m| 51035/51035 [00:00<00:00, 56234.00it/s]


Saving model...
Model saved!

Training 3-gram model:


Processing corpus: 100%|[32m██████████[0m| 51035/51035 [00:01<00:00, 38752.80it/s]


Saving model...
Model saved!

Training 4-gram model:


Processing corpus: 100%|[32m██████████[0m| 51035/51035 [00:01<00:00, 26662.15it/s]


Saving model...
Model saved!

Training 5-gram model:


Processing corpus: 100%|[32m██████████[0m| 51035/51035 [00:02<00:00, 21032.54it/s]


Saving model...
Model saved!
