# N-gram Model Implementation.

In [12]:
from collections import Counter

In [16]:
# Toy training corpus: six short English sentences.
corpus = [
    "I love natural language processing",
    "Natural language processing is fun",
    "I love coding in Python for natural language processing",
    "I currently live in nepal",
    "I love hiking in the mountains",
    "Python is a great programming language",
]

# Split every sentence on whitespace and lower-case each word, collecting
# everything into one flat token list.
tokens = []
for sentence in corpus:
    tokens.extend(word.lower() for word in sentence.split())

# Frequency of each distinct token, and the overall number of tokens
# (every token is counted exactly once, so this equals the sum of the counts).
word_counts = Counter(tokens)
total_words = len(tokens)
total_words

36

In [17]:
# Maximum-likelihood unigram estimate: P(word) = count(word) / total_words.
unigram_probs = dict(
    (token, occurrences / total_words)
    for token, occurrences in word_counts.items()
)
# Show the first ten (word, probability) pairs.
list(unigram_probs.items())[:10]

[('i', 0.1111111111111111),
 ('love', 0.08333333333333333),
 ('natural', 0.08333333333333333),
 ('language', 0.1111111111111111),
 ('processing', 0.08333333333333333),
 ('is', 0.05555555555555555),
 ('fun', 0.027777777777777776),
 ('coding', 0.027777777777777776),
 ('in', 0.08333333333333333),
 ('python', 0.05555555555555555)]

In [19]:
# Interactive prediction function
def predict_next_word(input_sentence):
    """Predict the next word with the unigram model.

    The unigram model has no notion of context, so ``input_sentence`` is only
    echoed in the printed message; the prediction is always the word with the
    highest probability in ``unigram_probs`` (the first such word on ties).

    Args:
        input_sentence: The text typed so far (used for display only).

    Returns:
        The most probable word in ``unigram_probs``.
    """
    # Pick the (word, probability) pair with the largest probability.
    next_word, _ = max(unigram_probs.items(), key=lambda pair: pair[1])
    print(f"Input: '{input_sentence}' → Predicted next word: '{next_word}' (unigram model)")
    return next_word

# Test: the unigram predictor never looks at the input sentence, so any prompt
# yields the same most-probable word.
predict_next_word("I love to")

Input: 'I love to' → Predicted next word: 'i' (unigram model)


'i'

**From this observation we can see that the unigram model is poor at next-word prediction: it ignores word order and context.**

**To make predictions context-aware, we need to use a bigram or trigram model.**

## Bigram Model

In [22]:
# Training corpus for the bigram model: the same six sentences as the corpus
# defined for the unigram model above, re-declared here.
corpus = [
    "I love natural language processing",
    "Natural language processing is fun",
    "I love coding in Python for natural language processing",
    "I currently live in nepal",
    "I love hiking in the mountains",
    "Python is a great programming language"
]

In [23]:
from nltk.tokenize import word_tokenize

In [24]:
# Tokenize each lower-cased sentence separately.
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in corpus]

# Wrap every sentence in its own <s> ... </s> markers before flattening.
# With a single pair of markers around the whole corpus, the last word of one
# sentence would be counted as context for the first word of the next
# (e.g. "processing" -> "natural"). With per-sentence markers the only pair
# that spans two sentences in the flat stream is the boundary pair
# ('</s>', '<s>') itself.
tokens = [
    token
    for sentence_tokens in tokenized_sentences
    for token in ['<s>', *sentence_tokens, '</s>']
]

In [25]:
from collections import defaultdict

# For every token, tally how often each other token immediately follows it:
# bigram_counts[previous][following] -> number of occurrences.
bigram_counts = defaultdict(Counter)
for current_token, following_token in zip(tokens, tokens[1:]):
    bigram_counts[current_token][following_token] += 1


In [29]:
# Predict next word
def predict_next_word(prev_word):
    """Return the word that most often follows ``prev_word`` in ``bigram_counts``.

    Args:
        prev_word: The context word; it is lower-cased before the lookup.

    Returns:
        The most frequent follower of ``prev_word``, or the string
        ``"Unknown context"`` when no follower has been recorded for it.
    """
    # .get() never inserts a key into the defaultdict. An empty Counter (left
    # behind by an earlier ``bigram_counts[word]`` read) has no followers
    # either, so it is treated like a missing context instead of raising
    # IndexError on ``most_common(1)[0]``.
    followers = bigram_counts.get(prev_word.lower())
    if not followers:
        return "Unknown context"
    return followers.most_common(1)[0][0]

# Test: look up the most frequent follower of a single context word
# (the function lower-cases its argument before the lookup).
print("Next word after 'I':", predict_next_word("I"))
print("Next word after 'love':", predict_next_word("love"))

Next word after 'I': love
Next word after 'love': natural


In [32]:
def predict_next_word(input_phrase):
    """Predict the word most likely to follow the last word of ``input_phrase``.

    The phrase is lower-cased and split on whitespace; only the final word is
    used as bigram context. The outcome is printed as well as returned.

    Args:
        input_phrase: One or more whitespace-separated words.

    Returns:
        The most frequent follower of the last word according to
        ``bigram_counts``; ``"No input"`` when the phrase contains no words;
        ``"Unknown"`` when no follower has been recorded for the last word.
    """
    words = input_phrase.lower().split()
    if not words:
        return "No input"
    # Use the last word as context
    last_word = words[-1]
    # .get() never inserts a key into the defaultdict. An empty Counter (left
    # behind by an earlier ``bigram_counts[word]`` read) has no followers
    # either, so it is reported as unknown instead of raising IndexError.
    followers = bigram_counts.get(last_word)
    if not followers:
        print(f"No prediction for '{last_word}'")
        return "Unknown"
    next_word = followers.most_common(1)[0][0]
    print(f"Next word after '{input_phrase}': {next_word}")
    return next_word

# Test: predict_next_word prints its own "Next word after ..." line, so the call
# is not wrapped in print() — doing so emitted the same line twice.
predict_next_word("My love is")

Next word after 'My love is': fun
Next word after 'My love is': fun
