# AutoComplete System -- Predict the next word

In [3]:
import import_ipynb
from N_Gram_Language_Model import (tokenized_sentences, count_n_grams, estimate_probabilities, 
                      get_words_with_nplus_frequency, count_words)

In [4]:
# Build the vocabulary via get_words_with_nplus_frequency with a threshold of 2
# (presumably: keep only words that occur at least twice -- confirm against the helper)
vocab = get_words_with_nplus_frequency(tokenized_sentences, 2)

### Get single word suggestion

In [5]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    
    """
    Get suggestion for the next word
    
    previous_tokens: The sentence you input where each token is a word.
                     Only the last n tokens are used as context.
    n_gram_counts: Dictionary of counts of n-grams. Must be non-empty, because
                   n is read from the length of one of its keys.
    n_plus1_gram_counts: Dictionary of counts of (n+1)-grams
    vocabulary: List of words
    k: positive constant, smoothing parameter
    start_with: If not None, specifies the first few letters of the next word
    
    Returns a tuple (suggestion, probability): the candidate word with the
    highest estimated probability and that probability. If no candidate
    qualifies (e.g. nothing matches 'start_with'), returns (None, 0).
    
    """
    
    # n is the length of the n-gram keys; peek at a single key instead of
    # copying every key into a list just to read the first one
    n = len(next(iter(n_gram_counts)))
    
    # get the most recent 'n' words as the previous n-gram
    previous_n_gram = previous_tokens[-n:]

    # Estimate the probability of each candidate next word,
    # given the previous n-gram, the dictionary of n-gram counts,
    # the dictionary of n plus 1 gram counts, and the smoothing constant
    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)
    
    # No suggestion yet: stays None if every candidate is filtered out,
    # so callers can tell "no match" apart from a real word
    suggestion = None
    
    # Highest probability seen so far among the accepted candidates
    max_prob = 0
    
    for word, prob in probabilities.items():
        
        # Skip words that do not begin with the requested prefix
        if start_with is not None and not word.startswith(start_with):
            continue
        
        # Strict '>' keeps the first word encountered when probabilities tie
        if prob > max_prob:
            suggestion = word
            max_prob = prob
    
    return suggestion, max_prob

### Test for single word suggestion

In [6]:
# Count dictionaries built with n=2 and n=3; used below as the
# (n_gram_counts, n_plus1_gram_counts) pair for suggest_a_word
bi_counts = count_n_grams(tokenized_sentences, 2)
tri_counts = count_n_grams(tokenized_sentences, 3)

In [7]:
# Example context: the words typed so far, one token per word
previous_tokens = ['hello','how','are']

In [8]:
# Single-model suggestion using the n=2 / n=3 count dictionaries
suggest_a_word(previous_tokens,bi_counts,tri_counts,vocab,k=1.0)

('you', 0.0007144132880871584)

## Get Suggestions on the basis of multiple N-Grams

In [9]:
def get_suggestions(previous_tokens, n_gram_counts_list, vocabulary, k=1.0, start_with=None):
    
    """
    Get one next-word suggestion per pair of adjacent n-gram models
    
    previous_tokens: The sentence you input where each token is a word
    n_gram_counts_list: Sequence of n-gram count dictionaries; each entry is
                        paired with the entry that follows it, and the pair is
                        passed to suggest_a_word as (n_gram_counts, n_plus1_gram_counts)
    vocabulary: List of words
    k: positive constant, smoothing parameter
    start_with: If not None, specifies the first few letters of the next word
    
    Returns a list holding the result of suggest_a_word for every adjacent
    pair, in order. The list is empty when fewer than two count dictionaries
    are supplied.
    
    """
    
    # Walk the list alongside a copy shifted by one to obtain adjacent pairs
    adjacent_pairs = zip(n_gram_counts_list, n_gram_counts_list[1:])
    
    return [
        suggest_a_word(previous_tokens, lower_order, higher_order, vocabulary,
                       k=k, start_with=start_with)
        for lower_order, higher_order in adjacent_pairs
    ]

In [10]:
# Count dictionaries for n = 2, 3, 4, 5, stored in increasing order of n so that
# get_suggestions can pair each entry with the one that follows it
n_gram_counts_list = []

for n in range(2, 6):
    # Progress message, printed before each (potentially slow) counting pass
    print("Computing n-gram counts with n =", n, "...")
    n_model_counts = count_n_grams(tokenized_sentences, n)
    n_gram_counts_list.append(n_model_counts)

Computing n-gram counts with n = 2 ...
Computing n-gram counts with n = 3 ...
Computing n-gram counts with n = 4 ...
Computing n-gram counts with n = 5 ...


### Testing for multiple N-Grams model suggestion

In [11]:
# Example context for the multi-model test, one token per word
previous_tokens = ['hello','where','are','you'] 

In [12]:
# One (word, probability) suggestion per adjacent pair in n_gram_counts_list
get_suggestions(previous_tokens,n_gram_counts_list,vocab,k=1.0)

[('going', 0.0009766781105718006),
 ('from', 0.00011918951132300358),
 ('oge', 1.4903795996840395e-05)]