# Typing Assistant

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omidt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Load the Data


In [2]:
def load_data():
    with open("./data/en_US.blogs.txt", "r", encoding = "utf8") as f:
        data = f.read()
    return data

In [3]:
data = load_data()
len(data)

207723793

### Pre-process the Data

1. Split data into sentences using "\n" as the delimiter.
1. Split each sentence into tokens.
1. Create Vocabulary by tokens that appear at least K times in the training data.
1. Replace tokens that appear less than K times by `<unk>`

In [4]:
def tokenize(data):
    sentences = data.split("\n")
    sentences = [sentence.strip() for sentence in sentences if len(sentence) > 0]
    tokenized_sentences = []
    for sentence in sentences:
        tokenized_sentences.append(nltk.word_tokenize(sentence.lower()))
    return tokenized_sentences

In [5]:
train = tokenize(data)
len(train)

899288

In [6]:
def count_words(tokenized_sentences):
    word_counts = {}
    for sentence in tokenized_sentences:
        for token in sentence:
            if token not in word_counts.keys():
                word_counts[token] = 0
            word_counts[token] += 1
    return word_counts

def create_vocab(tokenized_sentences, count_threshold):
    vocabulary = []
    word_counts = count_words(tokenized_sentences)
    for word, count in word_counts.items():
        if count >= count_threshold:
            vocabulary.append(word)
    return vocabulary

In [7]:
count_threshold = 2
unknown_token = "<unk>"
vocabulary = create_vocab(train,count_threshold)
len(vocabulary)

180314

In [8]:
def replace_oov_words(tokenized_sentences, vocabulary, unknown_token = "<unk>"):
    vocabulary = set(vocabulary)
    replaced_tokenized_sentences = []
    for sentence in tokenized_sentences:
        replaced_sentence = []
        for token in sentence:
            if token in vocabulary:
                replaced_sentence.append(token)
            else:
                replaced_sentence.append(unknown_token)
        replaced_tokenized_sentences.append(replaced_sentence)
    return replaced_tokenized_sentences

In [9]:
train_replaced = replace_oov_words(train,vocabulary,unknown_token)
len(train_replaced)

899288

## Create N-Grams

In [10]:
def create_n_gram(tokenized_sentences, n, start_token = "<s>", end_token = "<e>"):
    n_grams = {}
    for sentence in tokenized_sentences:
        sentence = [start_token] * n + sentence + [end_token]
        sentence = tuple(sentence) # n_grams are immutable so we use tuple
        m = len(sentence) if n == 1 else len(sentence) - n + 1
        for i in range(m):
            n_gram = sentence[i:i+n]
            if n_gram not in n_grams.keys():
                n_grams[n_gram] = 0
            n_grams[n_gram] += 1
    return n_grams

def get_n_grams(tokenized_sentences):
    n_gram_count_list = []
    for n in range(1,6):
        n_model_counts = create_n_gram(tokenized_sentences,n)
        n_gram_count_list.append(n_model_counts)
    return n_gram_count_list

In [11]:
n_grams_count_list = get_n_grams(train_replaced)
len(n_grams_count_list)
print(n_grams_count_list[0])



## Estimate Probabilities

In [12]:
def estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    previous_n_gram_count = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0
    n_plus1_gram = previous_n_gram + (word,)
    n_plus1_gram_count = n_plus1_gram_counts[n_plus1_gram] if n_plus1_gram in n_plus1_gram_counts else 0
    numerator = n_plus1_gram_count + k
    denominator = previous_n_gram_count + k*vocabulary_size
    probability = numerator / denominator
    return probability

def probability_of_words(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token = "<e>", unknown_token = "<unk>", k = 1.0):
    previous_n_gram = tuple(previous_n_gram)
    vocabulary += [end_token, unknown_token]
    vocabulary_size = len(vocabulary)
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word,previous_n_gram,n_gram_counts,n_plus1_gram_counts,vocabulary_size,k)
        probabilities[word] = probability
    return probabilities

## Get Suggestions

In [13]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, end_token = "<e>", unknown_token = "<unk>", k = 1.0):
    n = len(list(n_gram_counts.keys())[0])
    previous_tokens = ['<s>'] * n + previous_tokens
    previous_n_gram = previous_tokens[-n:]
    probabilities = probability_of_words(previous_n_gram,n_gram_counts,n_plus1_gram_counts,vocabulary,end_token,unknown_token,k)
    suggestion = None
    max_prob = 0.0
    for word, prob in probabilities.items():
        if prob > max_prob:
            suggestion = word
            max_prob = prob
    return suggestion,max_prob

def get_multiple_suggestions(previous_tokens, n_grams_count_list,vocabulary,k = 1.0):
    model_counts = len(n_grams_count_list)
    suggestions = []
    for i in range(model_counts-1):
        n_gram_counts = n_grams_count_list[i]
        n_plus1_gram_counts = n_grams_count_list[i+1]
        suggestion = suggest_a_word(previous_tokens,n_gram_counts,n_plus1_gram_counts,vocabulary,k)
        suggestions.append(suggestion)
    return suggestions

## Get User Input

In [14]:
while True:
    str_input = input(f"\nEnter a string (# for quit): ")
    if str_input != "#":
        print("Sentence:",str_input)
        tokenized_input = nltk.word_tokenize(str_input.strip())
        print("Suggestions:")
        print(get_multiple_suggestions(tokenized_input,n_grams_count_list,vocabulary))
        print()
    else:
        break

Sentence: Just started a new job, and I'm so
Suggestions:
[('i', 0.06096398320433279), ('excited', 0.0011578987996449111), ('in', 5.5456965394853596e-06), ('in', 5.545635030667362e-06)]

Sentence: She loves drinking coffee every
Suggestions:
[('day', 0.015035608123358257), ('morning', 3.881600106466746e-05), ('in', 5.545450512399627e-06), ('in', 5.545389009038984e-06)]

Sentence: Reading books can improve your
Suggestions:
[('own', 0.012910143684323397), ('writing', 8.310617645103384e-05), ('health', 1.66347833319471e-05), ('in', 5.545143009238209e-06)]

Sentence: My favorite color is
Suggestions:
[('a', 0.0732703694768322), ('a', 0.00012185462742947664), ('blue', 2.772110352168899e-05), ('in', 5.5448970312621296e-06)]

Sentence: No pain, no
Suggestions:
[('one', 0.019981127814697724), ('matter', 0.005008992564357233), ('gain', 2.2177374642389832e-05), ('in', 5.5446510751078435e-06)]

Sentence: He saved money to buy a new
Suggestions:
[('york', 0.021672235671178697), ('one', 0.00158168