# Next Word Prediction

This is a simple application of how ngrams can be used in real word. In any messaging application, you tend to see suggestions for the probable next word from the application. A naive version of this can realised with ngrams. 

In [2]:
# Basic Libraries
import bs4 as bs
import urllib.request
import re

import nltk
from nltk import bigrams,trigrams
from collections import defaultdict

In [3]:
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/Tennis')
raw_html = raw_html.read()

article_html = bs.BeautifulSoup(raw_html, 'lxml')
article_paragraphs = article_html.find_all('p')
article_text = ''

for para in article_paragraphs:
    article_text += para.text

article_text = article_text.lower()
article_text = re.sub(r'[^A-Za-z. ]', '', article_text)
article_text



# Preprocessing

Lets Tokenize the text so that we can construct n-grams

In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
tokenized_sent = nltk.word_tokenize(article_text)
tokenized_sent

['tennis',
 'is',
 'a',
 'racket',
 'sport',
 'that',
 'can',
 'be',
 'played',
 'individually',
 'against',
 'a',
 'single',
 'opponent',
 'singles',
 'or',
 'between',
 'two',
 'teams',
 'of',
 'two',
 'players',
 'each',
 'doubles',
 '.',
 'each',
 'player',
 'uses',
 'a',
 'tennis',
 'racket',
 'that',
 'is',
 'strung',
 'with',
 'cord',
 'to',
 'strike',
 'a',
 'hollow',
 'rubber',
 'ball',
 'covered',
 'with',
 'felt',
 'over',
 'or',
 'around',
 'a',
 'net',
 'and',
 'into',
 'the',
 'opponents',
 'court',
 '.',
 'the',
 'object',
 'of',
 'the',
 'game',
 'is',
 'to',
 'manoeuvre',
 'the',
 'ball',
 'in',
 'such',
 'a',
 'way',
 'that',
 'the',
 'opponent',
 'is',
 'not',
 'able',
 'to',
 'play',
 'a',
 'valid',
 'return',
 '.',
 'the',
 'player',
 'who',
 'is',
 'unable',
 'to',
 'return',
 'the',
 'ball',
 'validly',
 'will',
 'not',
 'gain',
 'a',
 'point',
 'while',
 'the',
 'opposite',
 'player',
 'will.tennis',
 'is',
 'an',
 'olympic',
 'sport',
 'and',
 'is',
 'played',


---
# Generating N-Gram Model

---
# 2-Gram

In [6]:
# Create a placeholder for model
model_2_gram = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for w1, w2 in bigrams(tokenized_sent):
    model_2_gram[(w1)][w2] += 1

# Let's transform the counts to probabilities
for w1 in model_2_gram:
    total_count = float(sum(model_2_gram[w1].values()))
    for w2 in model_2_gram[w1]:
        model_2_gram[w1][w2] /= total_count

In [7]:
model_2_gram

defaultdict(<function __main__.<lambda>>,
            {'tennis': defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
                         {'.': 0.05970149253731343,
                          'a': 0.014925373134328358,
                          'after': 0.007462686567164179,
                          'almost': 0.007462686567164179,
                          'and': 0.022388059701492536,
                          'as': 0.007462686567164179,
                          'association': 0.05970149253731343,
                          'australia': 0.007462686567164179,
                          'balls': 0.022388059701492536,
                          'balls.tennis': 0.007462686567164179,
                          'ballwas': 0.007462686567164179,
                          'by': 0.007462686567164179,
                          'can': 0.007462686567164179,
                          'championships': 0.007462686567164179,
                          'channel': 0.007462686567164179,
          

---
# 3-Gram

In [8]:
# Create a placeholder for model
model_3_gram = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for w1, w2, w3 in trigrams(tokenized_sent, pad_right=True, pad_left=True):
    model_3_gram[(w1, w2)][w3] += 1

# Let's transform the counts to probabilities
for w1_w2 in model_3_gram:
    total_count = float(sum(model_3_gram[w1_w2].values()))
    for w3 in model_3_gram[w1_w2]:
        model_3_gram[w1_w2][w3] /= total_count

In [9]:
model_3_gram

defaultdict(<function __main__.<lambda>>,
            {(None,
              None): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {'tennis': 1.0}),
             (None,
              'tennis'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {'is': 1.0}),
             ('tennis',
              'is'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {'a': 0.5,
                          'played': 0.25,
                          'the': 0.25}),
             ('is',
              'a'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {'break': 0.06666666666666667,
                          'diagonal': 0.06666666666666667,
                          'group': 0.06666666666666667,
                          'let': 0.06666666666666667,
                          'player': 0.06666666666666667,
                          'racket': 0.06666666666666667,
                          'relevant': 0.06666666666666667,
                          'serve': 0.06666666666

---
# Provide the input

In [15]:
text_input = input('Enter the Text (Only 2 Words are Required) :')
text_tokenized = text_input.split(sep = ' ')

Enter the Text (Only 2 Words are Required) :volley is


In [16]:
token_length = len(text_tokenized)

In [17]:
values_3_gram = model_3_gram.get((text_tokenized[token_length-2], text_tokenized[token_length-1]), '<NA>')
values_3_gram

defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
            {'a': 0.3333333333333333,
             'hit': 0.3333333333333333,
             'made': 0.3333333333333333})

In [18]:
if(values_3_gram == '<NA>'):
    pred_ngrams = dict(model_2_gram[text_tokenized[token_length-1]])
else:
    pred_ngrams = dict(model_3_gram[text_tokenized[token_length-2],text_tokenized[token_length - 1]])

In [19]:
pred_ngrams

{'a': 0.3333333333333333,
 'hit': 0.3333333333333333,
 'made': 0.3333333333333333}

In [None]:
import random
# Generating a random number
rand_no = random.uniform(0, 1)
print('rand_no',rand_no)

import numpy as np
position = np.where(np.array(list(pred_ngrams.values())).cumsum() > rand_no)[0][0]
print('position',position)

rand_no 0.36483716388469767
position 20


In [None]:
list(pred_ngrams.keys())[position]

'referred'

---
# Suggested Next Word

In [None]:
print('Next Suggested Word is :', list(pred_ngrams.keys())[position])

Next Suggested Word is : referred
