### Load trained data

In [1]:
import math
import random
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
from sklearn.model_selection import train_test_split
import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omidt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
vocabulary = joblib.load('vocabulary.pkl')
n_gram_count_list = joblib.load('n_gram_count_list.pkl')

In [9]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0
    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts else 0
    numerator = n_plus1_gram_count + k
    denominator = previous_n_gram_count + k*vocabulary_size
    probability = numerator / denominator
    return probability

In [10]:
def probability_of_words(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token = "<e>", unknown_token = "<unk>", k = 1.0):
    previous_n_gram = tuple(previous_n_gram)
    vocabulary += [end_token, unknown_token]
    vocabulary_size = len(vocabulary)
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word,previous_n_gram,n_gram_counts,n_plus1_gram_counts,vocabulary_size,k)
        probabilities[word] = probability
    return probabilities

In [11]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token = "<e>", unknown_token = "<unk>", k = 1.0):
    n = len(list(n_gram_counts.keys())[0])
    previous_tokens = ['<s>'] * n + previous_tokens
    previous_n_gram = previous_tokens[-n:]
    probabilities = probability_of_words(previous_n_gram,n_gram_counts,n_plus1_gram_counts,vocabulary,end_token,unknown_token,k)
    suggestion = None
    max_prob = 0.0
    for word, prob in probabilities.items():
        if prob > max_prob:
            suggestion = word
            max_prob = prob
    return suggestion,max_prob

In [12]:
def get_multiple_suggestions(previous_tokens, n_gram_count_list,vocabulary,k = 1.0):
    model_counts = len(n_gram_count_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_gram_count_list[i]
        n_plus1_gram_counts = n_gram_count_list[i+1]
        suggestion = suggest_a_word(previous_tokens,n_gram_counts,n_plus1_gram_counts,vocabulary,k)
        suggestions.append(suggestion)
    return suggestions

In [16]:
previous_tokens = ["hey", "how", "are", "you"]
tmp_suggest4 = get_multiple_suggestions(previous_tokens, n_gram_count_list, vocabulary, k=1.0)

print(f"The previous words are {previous_tokens}, the suggestions are:")
display(tmp_suggest4)

The previous words are ['hey', 'how', 'are', 'you'], the suggestions are:


[('can', 0.060112804708675555),
 ('going', 0.002383565199948081),
 ('?', 0.0005271093568040012)]