In [1]:
import math
import random
import numpy as np
import pandas as pd
import nltk
nltk.data.path.append('.')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/piotrek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Part 1 - Loading and preprocessing data

In [3]:
# Part 1.1 - Loading the data - data utilized in this are tweets

In [4]:
with open('en_US.twitter.txt','r') as f:
    data = f.read()
print('Data type: ',type(data))
print('Number of letters: ',len(data))
print('First 300 letters of the data')
print('------')
display(data[-300:])
print('------')

Data type:  <class 'str'>
Number of letters:  3335477
First 300 letters of the data
------


"ust had one a few weeks back....hopefully we will be back soon! wish you the best yo\nColombia is with an 'o'...“: We now ship to 4 countries in South America (fist pump). Please welcome Columbia to the Stunner Family”\n#GutsiestMovesYouCanMake Giving a cat a bath.\nCoffee after 5 was a TERRIBLE idea.\n"

------


In [5]:
# Part 1.2 - Preprocessing data

In [6]:
# splitting to sentences with '\n' delimiter

In [7]:
def split_to_sentences(data):
    return [sentence.strip() for sentence in data.split('\n') if len(sentence.strip())>0]

In [8]:
x = """
I have a pen.\nI have an apple. \nAh\nApple pen.\n
"""
print(x)

split_to_sentences(x)


I have a pen.
I have an apple. 
Ah
Apple pen.




['I have a pen.', 'I have an apple.', 'Ah', 'Apple pen.']

In [9]:
# tokenizing using nltk.word_tokenize() function

In [10]:
def tokenize_sentences(sentences):
    return [nltk.word_tokenize(sentence.lower()) for sentence in sentences]

In [11]:
sentences = ["Sky is blue.", "Leaves are green.", "Roses are red."]
tokenize_sentences(sentences)

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green', '.'],
 ['roses', 'are', 'red', '.']]

In [12]:
# getting tokenized data in one list

In [13]:
def get_tokenized_data(data):
    sentences=split_to_sentences(data)
    tokenized_sentences=tokenize_sentences(sentences)
    return tokenized_sentences

In [14]:
x = "Sky is blue.\nLeaves are green\nRoses are red."
get_tokenized_data(x)

[['sky', 'is', 'blue', '.'],
 ['leaves', 'are', 'green'],
 ['roses', 'are', 'red', '.']]

In [15]:
tokenized_data=get_tokenized_data(data)
random.seed(87)
random.shuffle(tokenized_data)

train_size=int(len(tokenized_data)*.8)
train_data=tokenized_data[:train_size]
test_data=tokenized_data[train_size:]

In [16]:
print("{} data are split into {} train and {} test set".format(
    len(tokenized_data), len(train_data), len(test_data)))

print("First training sample:")
print(train_data[0])
      
print("First test sample")
print(test_data[0])

47961 data are split into 38368 train and 9593 test set
First training sample:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the', 'team', 'local', 'company', 'and', 'quality', 'production']
First test sample
['that', 'picture', 'i', 'just', 'seen', 'whoa', 'dere', '!', '!', '>', '>', '>', '>', '>', '>', '>']


In [17]:
# creating word count dictionary

In [18]:
def count_words(tokenized_sentences):
    word_counts={}
    for sentence in tokenized_sentences:
        for token in sentence:
            word_counts[token]=word_counts.get(token,0)+1
    return word_counts

In [19]:
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
count_words(tokenized_sentences)

{'sky': 1,
 'is': 1,
 'blue': 1,
 '.': 3,
 'leaves': 1,
 'are': 2,
 'green': 1,
 'roses': 1,
 'red': 1}

In [20]:
# handling out of vocabulary words - by setting a threshold

In [21]:
def get_words_with_nplus_frequency(tokenized_sentences,count_threshold):
    return [word for word,count in count_words(tokenized_sentences).items() if count>=count_threshold]

In [22]:
tokenized_sentences = [['sky', 'is', 'blue', '.'],
                       ['leaves', 'are', 'green', '.'],
                       ['roses', 'are', 'red', '.']]
tmp_closed_vocab = get_words_with_nplus_frequency(tokenized_sentences, count_threshold=2)
print(f"Closed vocabulary:")
print(tmp_closed_vocab)

Closed vocabulary:
['.', 'are']


In [23]:
def replace_oov_words_by_unk(tokenized_sentences,vocabulary,unknown_token='<unk>'):
    return [[token if token in vocabulary else unknown_token for token in sentence] for sentence in tokenized_sentences]

In [24]:
tokenized_sentences = [["dogs", "run"], ["cats", "sleep"]]
vocabulary = ["dogs", "sleep"]
tmp_replaced_tokenized_sentences = replace_oov_words_by_unk(tokenized_sentences, vocabulary)
print(f"Original sentence:")
print(tokenized_sentences)
print(f"tokenized_sentences with less frequent words converted to '<unk>':")
print(tmp_replaced_tokenized_sentences)

Original sentence:
[['dogs', 'run'], ['cats', 'sleep']]
tokenized_sentences with less frequent words converted to '<unk>':
[['dogs', '<unk>'], ['<unk>', 'sleep']]


In [25]:
# combining preprocessing procedures

In [26]:
def preprocess_data(train_data,test_data,count_threshold):
    vocabulary=get_words_with_nplus_frequency(train_data,count_threshold)
    train_data_replaced=replace_oov_words_by_unk(train_data,vocabulary)
    test_data_replaced=replace_oov_words_by_unk(test_data,vocabulary)
    return train_data_replaced,test_data_replaced,vocabulary

In [27]:
tmp_train = [['sky', 'is', 'blue', '.'],
     ['leaves', 'are', 'green']]
tmp_test = [['roses', 'are', 'red', '.']]

tmp_train_repl, tmp_test_repl, tmp_vocab = preprocess_data(tmp_train, 
                                                           tmp_test, 
                                                           count_threshold = 1)

print("tmp_train_repl")
print(tmp_train_repl)
print()
print("tmp_test_repl")
print(tmp_test_repl)
print()
print("tmp_vocab")
print(tmp_vocab)

tmp_train_repl
[['sky', 'is', 'blue', '.'], ['leaves', 'are', 'green']]

tmp_test_repl
[['<unk>', 'are', '<unk>', '.']]

tmp_vocab
['sky', 'is', 'blue', '.', 'leaves', 'are', 'green']


In [28]:
minimum_freq = 2
train_data_processed, test_data_processed, vocabulary = preprocess_data(train_data, 
                                                                        test_data, 
                                                                        minimum_freq)

In [29]:
print("First preprocessed training sample:")
print(train_data_processed[0])
print()
print("First preprocessed test sample:")
print(test_data_processed[0])
print()
print("First 10 vocabulary:")
print(vocabulary[0:10])
print()
print("Size of vocabulary:", len(vocabulary))

First preprocessed training sample:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the', 'team', 'local', 'company', 'and', 'quality', 'production']

First preprocessed test sample:
['that', 'picture', 'i', 'just', 'seen', 'whoa', 'dere', '!', '!', '>', '>', '>', '>', '>', '>', '>']

First 10 vocabulary:
['i', 'personally', 'would', 'like', 'as', 'our', 'official', 'glove', 'of', 'the']

Size of vocabulary: 14821


In [30]:
# Part 2 - N-gram language model

In [31]:
# creating N-gram frequency dictionary

In [32]:
def count_n_grams(data, n, start_token='<s>', end_token = '<e>'):
    n_grams={}
    for sentence in data:
        sentence=tuple([start_token]*n+sentence+[end_token])
        for i in range(len(sentence)-n+1):
            n_gram=sentence[i:i+n]
            n_grams[n_gram]=n_grams.get(n_gram,0)+1
    return n_grams

In [33]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
print("Uni-gram:")
print(count_n_grams(sentences, 1))
print("Bi-gram:")
print(count_n_grams(sentences, 2))

Uni-gram:
{('<s>',): 2, ('i',): 1, ('like',): 2, ('a',): 2, ('cat',): 2, ('<e>',): 2, ('this',): 1, ('dog',): 1, ('is',): 1}
Bi-gram:
{('<s>', '<s>'): 2, ('<s>', 'i'): 1, ('i', 'like'): 1, ('like', 'a'): 2, ('a', 'cat'): 2, ('cat', '<e>'): 2, ('<s>', 'this'): 1, ('this', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1}


In [34]:
# estimating word probability given N-gram with k-smoothing

In [35]:
def estimate_probability(word,previous_n_gram,n_gram_counts,n_plus1_gram_counts,vocabulary_size,k=1.0):
    previous_n_gram=tuple(previous_n_gram)
    previous_n_gram_count=n_gram_counts[previous_n_gram]
    denominator=previous_n_gram_count+k*vocabulary_size
    n_plus1_gram=previous_n_gram+tuple([word])
    n_plus1_gram_count=n_plus1_gram_counts.get(n_plus1_gram,0)
    numerator=n_plus1_gram_count+k
    probability=numerator/denominator
    return probability

In [36]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
tmp_prob = estimate_probability("cat", "a", unigram_counts, bigram_counts, len(unique_words), k=1)

print(f"The estimated probability of word 'cat' given the previous n-gram 'a' is: {tmp_prob:.4f}")

The estimated probability of word 'cat' given the previous n-gram 'a' is: 0.3333


In [39]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    previous_n_gram=tuple(previous_n_gram)
    vocabulary=vocabulary+['<e>'+'<unk>']
    vocabulary_size=len(vocabulary)
    probabilities={}
    for word in vocabulary:
        probability=estimate_probability(word,previous_n_gram,n_gram_counts,n_plus1_gram_counts,vocabulary_size,k=k)
        probabilities[word]=probability
    return probabilities

In [43]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0):
    previous_n_gram = tuple(previous_n_gram)
    vocabulary = vocabulary + ["<e>", "<unk>"]
    vocabulary_size = len(vocabulary)
    probabilities = {}
    for word in vocabulary:
        probability = estimate_probability(word, previous_n_gram,n_gram_counts,n_plus1_gram_counts,vocabulary_size, k=k)
        probabilities[word] = probability
    return probabilities

In [44]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
estimate_probabilities("a", unigram_counts, bigram_counts, unique_words, k=1)

{'a': 0.09090909090909091,
 'like': 0.09090909090909091,
 'i': 0.09090909090909091,
 'dog': 0.09090909090909091,
 'cat': 0.2727272727272727,
 'this': 0.09090909090909091,
 'is': 0.09090909090909091,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [45]:
trigram_counts = count_n_grams(sentences, 3)
estimate_probabilities(["<s>", "<s>"], bigram_counts, trigram_counts, unique_words, k=1)

{'a': 0.09090909090909091,
 'like': 0.09090909090909091,
 'i': 0.18181818181818182,
 'dog': 0.09090909090909091,
 'cat': 0.09090909090909091,
 'this': 0.18181818181818182,
 'is': 0.09090909090909091,
 '<e>': 0.09090909090909091,
 '<unk>': 0.09090909090909091}

In [51]:
# creating matrix of word frequencies given N-gram

In [47]:
def make_count_matrix(n_plus1_gram_counts,vocabulary):
    vocabulary=vocabulary+['<e>','<unk>']
    n_grams=[]
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram=n_plus1_gram[:-1]
        n_grams.append(n_gram)
    n_grams=list(set(n_grams))
    row_index={n_gram:i for i,n_gram in enumerate(n_grams)}
    col_index={word:j for j,word in enumerate(vocabulary)}
    nrow=len(n_grams)
    ncol=len(vocabulary)
    count_matrix=np.zeros((nrow,ncol))
    for n_plus1_gram,count in n_plus1_gram_counts.items():
        n_gram=n_plus1_gram[:-1]
        word=n_plus1_gram[-1]
        if word not in vocabulary:
            continue
        i=row_index[n_gram]
        j=col_index[word]
        count_matrix[i,j]=count
    count_matrix=pd.DataFrame(count_matrix,index=n_grams,columns=vocabulary)
    return count_matrix

In [48]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)

print('bigram counts')
display(make_count_matrix(bigram_counts, unique_words))

bigram counts


Unnamed: 0,a,like,i,dog,cat,this,is,<e>,<unk>
"(like,)",2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(a,)",0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
"(this,)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(is,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(i,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(cat,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"(<s>,)",0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
"(dog,)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [49]:
print('\ntrigram counts')
trigram_counts = count_n_grams(sentences, 3)
display(make_count_matrix(trigram_counts, unique_words))


trigram counts


Unnamed: 0,a,like,i,dog,cat,this,is,<e>,<unk>
"(like, a)",0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
"(<s>, i)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(i, like)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(dog, is)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, <s>)",0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
"(is, like)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>, this)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(this, dog)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
"(a, cat)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0


In [52]:
# creating matrix of word probabilities given N-gram

In [50]:
def make_probability_matrix(n_plus1_gram_counts,vocabulary,k):
    count_matrix=make_count_matrix(n_plus1_gram_counts,unique_words)
    count_matrix+=k
    prob_matrix=count_matrix.div(count_matrix.sum(axis=1),axis=0)
    return prob_matrix

In [53]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))
bigram_counts = count_n_grams(sentences, 2)
print("bigram probabilities")
display(make_probability_matrix(bigram_counts, unique_words, k=1))

bigram probabilities


Unnamed: 0,a,like,i,dog,cat,this,is,<e>,<unk>
"(like,)",0.272727,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(a,)",0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909,0.090909
"(this,)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1
"(is,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(i,)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(cat,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909
"(<s>,)",0.090909,0.090909,0.181818,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909
"(dog,)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1


In [54]:
print("trigram probabilities")
trigram_counts = count_n_grams(sentences, 3)
display(make_probability_matrix(trigram_counts, unique_words, k=1))

trigram probabilities


Unnamed: 0,a,like,i,dog,cat,this,is,<e>,<unk>
"(like, a)",0.090909,0.090909,0.090909,0.090909,0.272727,0.090909,0.090909,0.090909,0.090909
"(<s>, i)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(i, like)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(dog, is)",0.1,0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, <s>)",0.090909,0.090909,0.181818,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909
"(is, like)",0.2,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
"(<s>, this)",0.1,0.1,0.1,0.2,0.1,0.1,0.1,0.1,0.1
"(this, dog)",0.1,0.1,0.1,0.1,0.1,0.1,0.2,0.1,0.1
"(a, cat)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.272727,0.090909


In [58]:
# Part 3 - computing perplexity

In [56]:
def calculate_perplexity(sentence, n_gram_counts, n_plus1_gram_counts, vocabulary_size, k=1.0):
    n = len(list(n_gram_counts.keys())[0])
    sentence = ["<s>"] * n + sentence + ["<e>"]
    sentence = tuple(sentence)
    N=len(sentence)
    product_pi=1.0
    for t in range(n,N):
        n_gram=sentence[t-n:t]
        word=sentence[t]
        probability=estimate_probability(word,n_gram,n_gram_counts,n_plus1_gram_counts,vocabulary_size,k=k)
        product_pi*=probability
    perplexity=product_pi**(-1/N)
    return perplexity

In [57]:
sentences = [['i', 'like', 'a', 'cat'],
                 ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)


perplexity_train1 = calculate_perplexity(sentences[0],
                                         unigram_counts, bigram_counts,
                                         len(unique_words), k=1.0)
print(f"Perplexity for first train sample: {perplexity_train1:.4f}")

test_sentence = ['i', 'like', 'a', 'dog']
perplexity_test = calculate_perplexity(test_sentence,
                                       unigram_counts, bigram_counts,
                                       len(unique_words), k=1.0)
print(f"Perplexity for test sample: {perplexity_test:.4f}")

Perplexity for first train sample: 2.8040
Perplexity for test sample: 3.9654


In [59]:
# Part 4 - autocomplete system

In [62]:
def suggest_a_word(previous_tokens, n_gram_counts, n_plus1_gram_counts, vocabulary, k=1.0, start_with=None):
    n = len(list(n_gram_counts.keys())[0]) 
    previous_n_gram = previous_tokens[-n:]
    probabilities = estimate_probabilities(previous_n_gram,
                                           n_gram_counts, n_plus1_gram_counts,
                                           vocabulary, k=k)
    suggestion = None
    max_prob = 0
    for word, prob in probabilities.items():
        if start_with:
            if not word.startswith(start_with):
                continue
        if prob > max_prob:
            suggestion = word
            max_prob = prob
    return suggestion, max_prob

In [63]:
sentences = [['i', 'like', 'a', 'cat'],
             ['this', 'dog', 'is', 'like', 'a', 'cat']]
unique_words = list(set(sentences[0] + sentences[1]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)

previous_tokens = ["i", "like"]
tmp_suggest1 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0)
print(f"The previous words are 'i like',\n\tand the suggested word is `{tmp_suggest1[0]}` with a probability of {tmp_suggest1[1]:.4f}")

print()
# test your code when setting the starts_with
tmp_starts_with = 'c'
tmp_suggest2 = suggest_a_word(previous_tokens, unigram_counts, bigram_counts, unique_words, k=1.0, start_with=tmp_starts_with)
print(f"The previous words are 'i like', the suggestion must start with `{tmp_starts_with}`\n\tand the suggested word is `{tmp_suggest2[0]}` with a probability of {tmp_suggest2[1]:.4f}")

The previous words are 'i like',
	and the suggested word is `a` with a probability of 0.2727

The previous words are 'i like', the suggestion must start with `c`
	and the suggested word is `cat` with a probability of 0.0909
