# Data Preparation

In [1]:
import re
import random
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, TweetTokenizer

In [2]:
df = pd.read_csv('npr.csv')

In [3]:
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
df['Article'][3][74:75]

'’'

In [5]:
df['Article'][173][1101:1120]

'#themostserioussmog'

### Data Cleaning

In [6]:
### Lowercasing ### 
df['Article'] = df['Article'].str.lower()

In [7]:
### Using correct apostrophe ###
def replace_apostrophe(text):
    apostrophe = re.compile(r"’")
    return apostrophe.sub(r"'", text)

df['Article'] = df['Article'].map(lambda x: replace_apostrophe(x))

In [8]:
### Removing hashtag words: (#brexit, #dumpsterfyre) ###  
def remove_hash_words(text):
    hashtag_words = re.compile(r"#[A-Za-z0-9]+")
    return hashtag_words.sub(r"", text)

df['Article'] = df['Article'].map(lambda x: remove_hash_words(x))

### Creating Corpus

In [9]:
def create_corpus(df):
    
    corpus = []
    
    for x in df['Article'].str.split():
        for i in x:
            corpus.append(i)
             
    text = " " 
    
    return (text.join(corpus))

In [10]:
corpus = create_corpus(df)

In [11]:
corpus[:200]

'in the washington of 2016, even when the policy can be bipartisan, the politics cannot. and in that sense, this year shows little sign of ending on dec. 31. when president obama moved to sanction russ'

#### Only including a-z,'?!.

In [12]:
corpus = re.sub(r"[^a-z.?!, ]+", "", corpus)

### Converting into Tokenized Sentences

In [13]:
def text_to_sentences(text):
    
    sentences = sent_tokenize(text)
    
    sentences = [s.strip() for s in sentences]
    sentences = [s for s in sentences if len(s) > 0]
    
    return sentences


def tokenize_sentences(sentences):
    
    tokenized_senteces = []
    
    for sentence in sentences:
        
        tokenized = TweetTokenizer().tokenize(sentence)
        tokenized_senteces.append(tokenized)
        
    return tokenized_senteces

In [14]:
def get_tokenized_sentences(text):
    
    sentences = text_to_sentences(text)
    tokenized_sentences = tokenize_sentences(sentences)
    
    return tokenized_sentences

### Splitting into Train and Test Sets

In [22]:
tokenized_sentences = get_tokenized_sentences(corpus)
random.seed(1)
random.shuffle(tokenized_sentences)

train_size = int(len(tokenized_sentences) * 0.8)
train_data = tokenized_sentences[0:train_size]
test_data = tokenized_sentences[train_size:]

In [23]:
print(f"{len(tokenized_sentences)} data are split into {len(train_data)} train and {len(test_data)} test set")
print('\n')
print("First training sample:")
print(train_data[0])
print('\n') 
print("First test sample")
print(test_data[0])

491790 data are split into 393432 train and 98358 test set


First training sample:
['i', 'think', ',', 'in', 'general', ',', 'im', 'a', 'big', 'fan', 'of', 'graphic', 'novels', 'if', 'they', 'can', 'be', 'a', 'gateway', 'drug', 'just', 'to', 'reach', 'kids', 'who', 'think', 'that', 'print', 'is', 'not', 'worth', 'their', 'time', '.']


First test sample
['lessler', 'and', 'his', 'colleagues', 'report', 'evidence', 'thursday', 'that', 'the', 'epidemic', 'has', 'peaked', 'and', 'started', 'to', 'subside', '.']


### Word Count, Vocabulary and Handling Unknown Words

In [24]:
def count_words(tokenized_sentences):
    
    word_counts = {}
    
    for sentence in tokenized_sentences:
        for token in sentence:
            
            if token not in word_counts.keys():
                word_counts[token] = 1
            else:
                word_counts[token] += 1
    
    return word_counts


#Find the words that appear N times or more
def get_words_with_nplus_frequency(tokenized_sentences, count_threshold):
    
    closed_vocab = []
    word_counts = count_words(tokenized_sentences)
    
    for word, count in word_counts.items():
        if count >= count_threshold:
            closed_vocab.append(word)
            
    return closed_vocab


# Replace words not in the given vocabulary with '<unk>' token
def oov_words_to_unk(tokenized_sentences, vocab, unknown_token='<unk>'):
    
    vocab = set(vocab)
    replaced_tokenized_sentences = []
    
    for sentence in tokenized_sentences:
        
        replaced_sentence = []
        
        for token in sentence:
            
            if token in vocab:
                replaced_sentence.append(token)
            else:
                replaced_sentence.append(unknown_token)
                
        replaced_tokenized_sentences.append(replaced_sentence)
        
    return replaced_tokenized_sentences

In [25]:
count_words(tokenized_sentences[:1])

{'i': 1,
 'think': 2,
 ',': 2,
 'in': 1,
 'general': 1,
 'im': 1,
 'a': 2,
 'big': 1,
 'fan': 1,
 'of': 1,
 'graphic': 1,
 'novels': 1,
 'if': 1,
 'they': 1,
 'can': 1,
 'be': 1,
 'gateway': 1,
 'drug': 1,
 'just': 1,
 'to': 1,
 'reach': 1,
 'kids': 1,
 'who': 1,
 'that': 1,
 'print': 1,
 'is': 1,
 'not': 1,
 'worth': 1,
 'their': 1,
 'time': 1,
 '.': 1}

In [26]:
get_words_with_nplus_frequency(tokenized_sentences[:1], 2)

['think', ',', 'a']

In [28]:
oov_words_to_unk(tokenized_sentences[:1],vocab=[',','think','a','who','big','kids','reach','to'])

[['<unk>',
  'think',
  ',',
  '<unk>',
  '<unk>',
  ',',
  '<unk>',
  'a',
  'big',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  'a',
  '<unk>',
  '<unk>',
  '<unk>',
  'to',
  'reach',
  'kids',
  'who',
  'think',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>',
  '<unk>']]

In [29]:
def preprocess_data(train_data, test_data, count_threshold):
    
    vocab = get_words_with_nplus_frequency(train_data, count_threshold)
    
    train_data_replaced = oov_words_to_unk(train_data,vocab)
    
    test_data_replaced = oov_words_to_unk(test_data,vocab)
    
    return train_data_replaced, test_data_replaced, vocab

In [30]:
minimum_freq = 2
train_data_processed, test_data_processed, vocab = preprocess_data(train_data,test_data,minimum_freq)

In [31]:
print("First preprocessed training sample:")
print(train_data_processed[0])
print()
print("First preprocessed test sample:")
print(test_data_processed[0])
print()
print("First 10 vocabulary:")
print(vocab[0:10])
print()
print("Size of vocabulary:", len(vocab))

First preprocessed training sample:
['i', 'think', ',', 'in', 'general', ',', 'im', 'a', 'big', 'fan', 'of', 'graphic', 'novels', 'if', 'they', 'can', 'be', 'a', 'gateway', 'drug', 'just', 'to', 'reach', 'kids', 'who', 'think', 'that', 'print', 'is', 'not', 'worth', 'their', 'time', '.']

First preprocessed test sample:
['lessler', 'and', 'his', 'colleagues', 'report', 'evidence', 'thursday', 'that', 'the', 'epidemic', 'has', 'peaked', 'and', 'started', 'to', 'subside', '.']

First 10 vocabulary:
['i', 'think', ',', 'in', 'general', 'im', 'a', 'big', 'fan', 'of']

Size of vocabulary: 61453


# N-Gram Model

### Compute number of n-grams for a given n

In [32]:
def count_n_grams(tokenized_sentences, n, start_token='<s>', end_token='<e>'):
    
    n_grams = {}
    
    for sentence in tokenized_sentences:
        
        sentence = [start_token]*(n-1) + sentence + [end_token]
        sentence = tuple(sentence)
        
        m = len(sentence) if n==1 else len(sentence)-1
        for i in range(m):
            
            n_gram = sentence[i:i+n]
            
            if n_gram in n_grams.keys():
                n_grams[n_gram] += 1
            else:
                n_grams[n_gram] = 1
    
    return n_grams

In [33]:
sentences = tokenized_sentences[:1]
print("Uni-gram:")
print(count_n_grams(sentences, 1))
print('\n')
print("Bi-gram:")
print(count_n_grams(sentences, 2))

Uni-gram:
{('i',): 1, ('think',): 2, (',',): 2, ('in',): 1, ('general',): 1, ('im',): 1, ('a',): 2, ('big',): 1, ('fan',): 1, ('of',): 1, ('graphic',): 1, ('novels',): 1, ('if',): 1, ('they',): 1, ('can',): 1, ('be',): 1, ('gateway',): 1, ('drug',): 1, ('just',): 1, ('to',): 1, ('reach',): 1, ('kids',): 1, ('who',): 1, ('that',): 1, ('print',): 1, ('is',): 1, ('not',): 1, ('worth',): 1, ('their',): 1, ('time',): 1, ('.',): 1, ('<e>',): 1}


Bi-gram:
{('<s>', 'i'): 1, ('i', 'think'): 1, ('think', ','): 1, (',', 'in'): 1, ('in', 'general'): 1, ('general', ','): 1, (',', 'im'): 1, ('im', 'a'): 1, ('a', 'big'): 1, ('big', 'fan'): 1, ('fan', 'of'): 1, ('of', 'graphic'): 1, ('graphic', 'novels'): 1, ('novels', 'if'): 1, ('if', 'they'): 1, ('they', 'can'): 1, ('can', 'be'): 1, ('be', 'a'): 1, ('a', 'gateway'): 1, ('gateway', 'drug'): 1, ('drug', 'just'): 1, ('just', 'to'): 1, ('to', 'reach'): 1, ('reach', 'kids'): 1, ('kids', 'who'): 1, ('who', 'think'): 1, ('think', 'that'): 1, ('that', 'pri

### Estimate Probability for a single word

In [34]:
def estimate_probability(word, previous_n_gram, n_gram_counts, nplus1_gram_counts, vocab_size, k=1.0):
    
    previous_n_gram = tuple(previous_n_gram)
    
    
    previous_n_gram_count = n_gram_counts[previous_n_gram] if previous_n_gram in n_gram_counts else 0
    
    denominator = previous_n_gram_count + k*vocab_size
    
    nplus1_gram = previous_n_gram + (word,)
    
    nplus1_gram_counts = nplus1_gram_counts[nplus1_gram] if nplus1_gram in nplus1_gram_counts else 0
    
    numerator = nplus1_gram_counts + k
    
    probability = numerator/denominator
    
    
    return probability    

In [35]:
tokenized_sentences[:1]

[['i',
  'think',
  ',',
  'in',
  'general',
  ',',
  'im',
  'a',
  'big',
  'fan',
  'of',
  'graphic',
  'novels',
  'if',
  'they',
  'can',
  'be',
  'a',
  'gateway',
  'drug',
  'just',
  'to',
  'reach',
  'kids',
  'who',
  'think',
  'that',
  'print',
  'is',
  'not',
  'worth',
  'their',
  'time',
  '.']]

In [37]:
sentences = tokenized_sentences[:1]
unique_words = list(set(sentences[0]))

unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
tmp_prob = estimate_probability("think", "who", unigram_counts, bigram_counts, len(unique_words), k=1)
print(f"The estimated probability of word 'can' given the previous n-gram 'go' is: {tmp_prob:.4f}")

The estimated probability of word 'can' given the previous n-gram 'go' is: 0.0323


### Estimate Probability for a all words

In [38]:
def estimate_probabilities(previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocab, k=0.1):
    
    previous_n_gram = tuple(previous_n_gram)
    
    vocab = vocab + ['<e>', '<unk>']
    vocab_size = len(vocab)
    
    probabilities = {}
    for word in vocab:
        probability = estimate_probability(word, previous_n_gram, n_gram_counts, n_plus1_gram_counts, vocab_size, k=k)
        probabilities[word] = probability
        
    return probabilities

In [40]:
sentences = tokenized_sentences[:1]
unique_words = list(set(sentences[0]))
unigram_counts = count_n_grams(sentences, 1)
bigram_counts = count_n_grams(sentences, 2)
estimate_probabilities("if", unigram_counts, bigram_counts, unique_words, k=1)

{'they': 0.030303030303030304,
 'in': 0.030303030303030304,
 'that': 0.030303030303030304,
 'is': 0.030303030303030304,
 ',': 0.030303030303030304,
 'general': 0.030303030303030304,
 'their': 0.030303030303030304,
 'im': 0.030303030303030304,
 'just': 0.030303030303030304,
 'if': 0.030303030303030304,
 'gateway': 0.030303030303030304,
 'not': 0.030303030303030304,
 'drug': 0.030303030303030304,
 'reach': 0.030303030303030304,
 'kids': 0.030303030303030304,
 'print': 0.030303030303030304,
 'worth': 0.030303030303030304,
 'fan': 0.030303030303030304,
 'i': 0.030303030303030304,
 'graphic': 0.030303030303030304,
 'who': 0.030303030303030304,
 'time': 0.030303030303030304,
 'a': 0.030303030303030304,
 '.': 0.030303030303030304,
 'novels': 0.030303030303030304,
 'be': 0.030303030303030304,
 'of': 0.030303030303030304,
 'to': 0.030303030303030304,
 'big': 0.030303030303030304,
 'think': 0.030303030303030304,
 'can': 0.030303030303030304,
 '<e>': 0.030303030303030304,
 '<unk>': 0.030303030303

### Count and probability matrices

In [41]:
def create_count_matrix(n_plus1_gram_counts, vocab):
    
    vocab = vocab + ['<e>', '<unk>']
    
    n_grams = []
    
    for n_plus1_gram in n_plus1_gram_counts.keys():
        n_gram = n_plus1_gram[0:-1]
        n_grams.append(n_gram)
    n_grams = list(set(n_grams))
    
    row_index = {n_gram:i for i, n_gram in enumerate(n_grams)}
    col_index = {word:j for j,word in enumerate(vocab)}
    
    nrow = len(n_grams)
    ncol = len(vocab)
    
    count_matrix = np.zeros((nrow,ncol))
    
    for n_plus1_gram, count in n_plus1_gram_counts.items():
        n_gram = n_plus1_gram[0:-1]
        word = n_plus1_gram[-1]
        
        if word not in vocab:
            continue
        i = row_index[n_gram]
        j = col_index[word]
        count_matrix[i, j] = count
        
    count_matrix = pd.DataFrame(count_matrix, index=n_grams, columns=vocab)
    
    return count_matrix

In [42]:
sentences = tokenized_sentences[1:2]
unique_words = list(set(sentences[0]))
bigram_counts = count_n_grams(sentences, 2)

print('bigram counts')
display(create_count_matrix(bigram_counts, unique_words))

bigram counts


Unnamed: 0,republican,every,.,percentage,points,years,four,more,<e>,<unk>
"(years,)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(four,)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"(.,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
"(more,)",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(every,)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
"(republican,)",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(<s>,)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
"(percentage,)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
"(points,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [43]:
def create_probability_matrix(n_plus1_gram_counts, vocab, k):
    
    count_matrix = create_count_matrix(n_plus1_gram_counts, unique_words)
    count_matrix += k
    prob_matrix = count_matrix.div(count_matrix.sum(axis=1), axis=0)
    
    return prob_matrix

In [44]:
sentences = tokenized_sentences[1:2]
unique_words = list(set(sentences[0]))
bigram_counts = count_n_grams(sentences, 2)
print("bigram probabilities")
display(create_probability_matrix(bigram_counts, unique_words, k=1))

bigram probabilities


Unnamed: 0,republican,every,.,percentage,points,years,four,more,<e>,<unk>
"(years,)",0.090909,0.090909,0.181818,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(four,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909,0.090909
"(.,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.181818,0.090909
"(more,)",0.181818,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(every,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909
"(republican,)",0.090909,0.181818,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(<s>,)",0.090909,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909
"(percentage,)",0.090909,0.090909,0.090909,0.090909,0.181818,0.090909,0.090909,0.090909,0.090909,0.090909
"(points,)",0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.090909,0.181818,0.090909,0.090909
