# Typing Assistant

In [1]:
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
from sklearn.model_selection import train_test_split
import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\omidt\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load the Data


In [2]:
# Read the entire blog corpus into memory as one UTF-8 decoded string.
with open("./data/en_US.blogs.txt", encoding = "utf8") as corpus_file:
    data = corpus_file.read()

### Pre-process the Data

1. Split data into sentences using "\n" as the delimiter.
1. Split each sentence into tokens.
1. Build the vocabulary from the tokens that appear at least K times in the training data.
1. Replace tokens that appear fewer than K times with `<unk>`.

In [3]:
def tokenize(data):
    """Split raw text into lower-cased, word-tokenized sentences.

    Each line of ``data`` (delimited by "\\n") is treated as one sentence.
    Lines are stripped of surrounding whitespace first, and lines that are
    empty after stripping are dropped, so the result never contains an
    empty token list produced by a whitespace-only line.

    Args:
        data: The full corpus as a single string.

    Returns:
        A list of sentences, each a list of token strings as produced by
        ``nltk.word_tokenize`` on the lower-cased line.
    """
    tokenized_sentences = []
    for line in data.split("\n"):
        # Strip before testing for emptiness: a line made only of spaces or
        # tabs would otherwise pass the check and yield an empty sentence.
        line = line.strip()
        if not line:
            continue
        tokenized_sentences.append(nltk.word_tokenize(line.lower()))
    return tokenized_sentences

In [4]:
# Tokenize the whole corpus, then hold out 20% of the sentences for testing.
tokenized_data = tokenize(data)
train, test = train_test_split(
    tokenized_data,
    test_size=0.2,
    random_state=87,  # fixed seed passed to the splitter
)

Sentence 100000
Sentence 200000
Sentence 300000
Sentence 400000
Sentence 500000
Sentence 600000
Sentence 700000
Sentence 800000


In [5]:
def count_words(tokenized_sentences):
    """Count how many times each token occurs across all sentences.

    Args:
        tokenized_sentences: Iterable of sentences, each an iterable of
            hashable tokens.

    Returns:
        A dict mapping each token to its total number of occurrences, with
        keys in order of first appearance.
    """
    frequencies = {}
    for tokens in tokenized_sentences:
        for tok in tokens:
            frequencies[tok] = frequencies.get(tok, 0) + 1
    return frequencies

In [6]:
def create_vocab(tokenized_sentences, count_threshold):
    """Collect the tokens that occur at least ``count_threshold`` times.

    Args:
        tokenized_sentences: List of sentences, each a list of tokens.
        count_threshold: Minimum number of occurrences (inclusive) a token
            needs in order to be kept.

    Returns:
        A list of the retained tokens, in the order ``count_words`` yields
        them.
    """
    frequencies = count_words(tokenized_sentences)
    return [
        token
        for token, occurrences in frequencies.items()
        if occurrences >= count_threshold
    ]

In [7]:
def replace_oov_words(tokenized_sentences, vocabulary, unknown_token = "<unk>"):
    """Substitute every out-of-vocabulary token with ``unknown_token``.

    Args:
        tokenized_sentences: List of sentences, each a list of tokens.
        vocabulary: Iterable of tokens that should be kept as-is.
        unknown_token: Replacement used for any token not in ``vocabulary``.

    Returns:
        A new list of sentences with the same shape as the input; the input
        lists are not modified.
    """
    # Convert once to a set so each membership test is a hash lookup.
    known = set(vocabulary)
    return [
        [tok if tok in known else unknown_token for tok in tokens]
        for tokens in tokenized_sentences
    ]

In [8]:
# Minimum number of occurrences a token needs to be kept in the vocabulary.
count_threshold = 6
# Placeholder substituted for every out-of-vocabulary token.
unknown_token = "<unk>"

# Build the vocabulary from the training split only. Counting over the full
# corpus would let held-out test sentences influence which words are "known",
# leaking test information into the model.
vocabulary = create_vocab(train, count_threshold)

# Map rare/unseen tokens in both splits onto the unknown token, using the
# train-derived vocabulary for each.
train_replaced = replace_oov_words(train, vocabulary, unknown_token)
test_replaced = replace_oov_words(test, vocabulary, unknown_token)

### Building the model

In [9]:
def count_n_grams(tokenized_sentences, n, start_token = "<s>", end_token = "<e>"):
    """Count every n-gram in the padded sentences.

    Each sentence is padded with ``n`` copies of ``start_token`` on the left
    and ``n`` copies of ``end_token`` on the right before a window of width
    ``n`` is slid across it.

    Args:
        tokenized_sentences: Iterable of sentences, each a list of tokens.
        n: Size of the n-grams to count.
        start_token: Token used for left padding.
        end_token: Token used for right padding.

    Returns:
        A dict mapping each n-gram (a tuple of ``n`` tokens) to the number
        of times it occurs.
    """
    counts = {}
    left_pad = [start_token] * n
    right_pad = [end_token] * n
    for tokens in tokenized_sentences:
        # Tuples are hashable, so slices of the padded sentence can be used
        # directly as dictionary keys.
        padded = tuple(left_pad + tokens + right_pad)
        # Number of full-width windows; for n == 1 this equals len(padded).
        for start in range(len(padded) - n + 1):
            window = padded[start:start + n]
            counts[window] = counts.get(window, 0) + 1
    return counts

### Get list of N-Grams

In [10]:
# Collect the n-gram count tables for n = 1..4; index i holds the (i+1)-gram counts.
n_gram_count_list = []
for n in (1, 2, 3, 4):
    n_model_counts = count_n_grams(train_replaced, n)
    n_gram_count_list += [n_model_counts]

### Save Vocabulary and N-Gram counts

In [11]:
# Persist the list of n-gram count tables to disk.
joblib.dump(n_gram_count_list, 'n_gram_count_list.pkl')

['n_gram_count_list.pkl']

In [15]:
# Persist the vocabulary list to disk.
joblib.dump(vocabulary, 'vocabulary.pkl')

['vocabulary.pkl']