In [None]:
import re
from collections import Counter
from math import log
import nltk


# Training text for the n-gram model: a short English passage about Mumbai.
# Every count and probability computed below is derived from this string alone.
corpus = """
Mumbai also known as Bombay /bɒmˈbeɪ/— the official name until 1995) is the capital city of the Indian state of Maharashtra. 
Mumbai is the de facto financial centre and the most populous city of India with an estimated city proper population of 12.5 million (1.25 crore).
Mumbai is the centre of Mumbai Metropolitan Region,
the sixth most populous metropolitan area in the world with a population of over 23 million (2.3 crore) living under the Brihanmumbai Municipal Corporation.
Mumbai lies on the Konkan coast on the west coast of India and has a deep natural harbour. In 2008, Mumbai was named an alpha world city.
"""

# word_tokenize relies on NLTK's Punkt data. Fetch it here when it is missing so
# this cell also runs on a fresh kernel/environment, where the download cell
# further down the notebook has not been executed yet. The lookup makes the
# guard a no-op (no network access) once the data is installed.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Normalise the text: lowercase it, then drop every character that is neither a
# word character nor whitespace. Note this also removes decimal points
# ("12.5" -> "125") and all sentence boundaries.
corpus = corpus.lower()
corpus = re.sub(r'[^\w\s]', '', corpus)
words = nltk.word_tokenize(corpus)

# Raw n-gram frequency tables, built by sliding a fixed-size window over the
# token list.
unigrams = Counter(words)
bigrams = Counter(tuple(words[i:i + 2]) for i in range(len(words) - 1))
trigrams = Counter(tuple(words[i:i + 3]) for i in range(len(words) - 2))

# Add-one (Laplace) smoothing for every observed n-gram:
#   P(w | context) = (count(context, w) + 1) / (count(context) + vocab_size)
vocab_size = len(unigrams)
smoothed_bigrams = {
    (first, second): (count + 1) / (unigrams[first] + vocab_size)
    for (first, second), count in bigrams.items()
}
smoothed_trigrams = {
    (first, second, third): (count + 1) / (bigrams[(first, second)] + vocab_size)
    for (first, second, third), count in trigrams.items()
}

def predict_next_word(previous_words, n=3):
    """Return the corpus word most likely to follow ``previous_words``.

    The input is normalised the same way as the corpus (lowercased, punctuation
    stripped with the same regex, tokenised with ``nltk.word_tokenize``). Every
    word of the corpus vocabulary is then scored with the add-one smoothed
    n-gram probability given the last ``n - 1`` input tokens, and the
    highest-scoring word is returned.

    Args:
        previous_words: Text whose continuation should be predicted.
        n: Order of the model to use; 2 (bigram) or 3 (trigram).

    Returns:
        The predicted word, or an explanatory message string when the input has
        fewer than ``n - 1`` tokens or ``n`` is not 2 or 3. On ties the word
        that comes first in ``unigrams`` iteration order wins.
    """
    previous_words = previous_words.lower()
    previous_words = re.sub(r'[^\w\s]', '', previous_words)
    tokens = nltk.word_tokenize(previous_words)
    if len(tokens) < n - 1:
        return "Please provide at least {} words.".format(n - 1)

    if n == 2:
        context = (tokens[-1],)
        context_count = unigrams[tokens[-1]]
        smoothed = smoothed_bigrams
    elif n == 3:
        context = (tokens[-2], tokens[-1])
        context_count = bigrams[context]
        smoothed = smoothed_trigrams
    else:
        return "n can only be 2 or 3."

    # Add-one estimate for an n-gram never seen after this context. It must use
    # the same denominator as the stored probabilities; a flat 1 / vocab_size
    # would let unseen words tie with or outrank observed ones whenever the
    # context is frequent (count(context) >= vocab_size). Counter lookups yield
    # 0 for contexts that never occurred, without inserting them.
    unseen_prob = 1 / (context_count + vocab_size)

    # max() returns the first maximal item, which matches taking the head of a
    # stable descending sort.
    return max(
        unigrams,
        key=lambda word: smoothed.get(context + (word,), unseen_prob),
    )


# Quick demonstration: trigram predictions for two prompts taken from the corpus.
for prompt in ("West Coast of", "centre of"):
    print(predict_next_word(prompt))


india
mumbai


In [None]:
# Download NLTK's 'punkt' tokenizer package.
# NOTE(review): the cell above already calls nltk.word_tokenize; unless that
# cell fetches the data itself, this download presumably has to run first on a
# fresh environment — confirm and consider moving this cell to the top.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True