In [6]:
from collections import defaultdict

##### Task 1: Data Preparation on the given corpus:

In [7]:
corpus = [
    "The sun sets over the horizon, painting the sky with hues of orange and pink.",
    "In the heart of the forest, a gentle breeze rustles the leaves.",
    "She walked along the sandy shore, feeling the cool water on her feet.",
    "The old bookshop on the corner is filled with stories waiting to be discovered.",
    "As the rain falls outside, I sit by the window with a cup of hot tea."
]

tokenized_corpus = [sentence.split() for sentence in corpus]

In [8]:
print(tokenized_corpus)

[['The', 'sun', 'sets', 'over', 'the', 'horizon,', 'painting', 'the', 'sky', 'with', 'hues', 'of', 'orange', 'and', 'pink.'], ['In', 'the', 'heart', 'of', 'the', 'forest,', 'a', 'gentle', 'breeze', 'rustles', 'the', 'leaves.'], ['She', 'walked', 'along', 'the', 'sandy', 'shore,', 'feeling', 'the', 'cool', 'water', 'on', 'her', 'feet.'], ['The', 'old', 'bookshop', 'on', 'the', 'corner', 'is', 'filled', 'with', 'stories', 'waiting', 'to', 'be', 'discovered.'], ['As', 'the', 'rain', 'falls', 'outside,', 'I', 'sit', 'by', 'the', 'window', 'with', 'a', 'cup', 'of', 'hot', 'tea.']]


##### Task 2: Build the N-Gram Model

In [9]:
def generate_ngrams(words, n):
    ngrams = []
    for i in range(len(words) - n + 1):
        ngram = tuple(words[i:i+n])
        ngrams.append(ngram)
    return ngrams

In [10]:
def build_ngram_model(tokenized_corpus, n):
    ngram_model = defaultdict(dict)
    
    for sentence in tokenized_corpus:
        ngrams = generate_ngrams(sentence, n)
        for i in range(len(ngrams) - 1):
            prefix = ngrams[i][:-1]
            next_word = ngrams[i][-1]
            if next_word in ngram_model[prefix]:
                ngram_model[prefix][next_word] += 1
            else:
                ngram_model[prefix][next_word] = 1
                
    return ngram_model

In [11]:
ngram_model = build_ngram_model(tokenized_corpus, n=2)

In [12]:
ngram_model

defaultdict(dict,
            {('The',): {'sun': 1, 'old': 1},
             ('sun',): {'sets': 1},
             ('sets',): {'over': 1},
             ('over',): {'the': 1},
             ('the',): {'horizon,': 1,
              'sky': 1,
              'heart': 1,
              'forest,': 1,
              'sandy': 1,
              'cool': 1,
              'corner': 1,
              'rain': 1,
              'window': 1},
             ('horizon,',): {'painting': 1},
             ('painting',): {'the': 1},
             ('sky',): {'with': 1},
             ('with',): {'hues': 1, 'stories': 1, 'a': 1},
             ('hues',): {'of': 1},
             ('of',): {'orange': 1, 'the': 1, 'hot': 1},
             ('orange',): {'and': 1},
             ('In',): {'the': 1},
             ('heart',): {'of': 1},
             ('forest,',): {'a': 1},
             ('a',): {'gentle': 1, 'cup': 1},
             ('gentle',): {'breeze': 1},
             ('breeze',): {'rustles': 1},
             ('rustles',): {'the':

##### Task 3: Generate Text with Unseen Data

In [13]:
def predict_next_word_with_smoothing(ngram_model, prefix):
    if prefix in ngram_model:
        next_words = ngram_model[prefix]
        total_count = sum(next_words.values())
        probabilities = {word: (count + 1) / (total_count + len(next_words)) for word, count in next_words.items()}
        max_prob_word = max(probabilities, key=probabilities.get)
        return max_prob_word
    else:
        return None

In [14]:
def generate_text_with_unseen_data(ngram_model, n, test_prefix):
    next_word = predict_next_word_with_smoothing(ngram_model, tuple(test_prefix[-n+1:]))
    return next_word if next_word is not None else "No prediction available"

In [15]:
user_input_1 = input("Enter a sentence for test prefix 1: ")
user_input_2 = input("Enter a sentence for test prefix 2: ")

In [16]:
tokenized_input_1 = user_input_1.split()
tokenized_input_2 = user_input_2.split()
generated_text_user_input_1 = generate_text_with_unseen_data(ngram_model, n=2, test_prefix=tokenized_input_1)
generated_text_user_input_2 = generate_text_with_unseen_data(ngram_model, n=2, test_prefix=tokenized_input_2)
generated_sentence_1 = " ".join(tokenized_input_1 + [generated_text_user_input_1])
generated_sentence_2 = " ".join(tokenized_input_2 + [generated_text_user_input_2])

In [17]:
print("Generated Text for User Input 1:", generated_sentence_1)
print("Generated Text for User Input 2:", generated_sentence_2)

Generated Text for User Input 1: The sun sets
Generated Text for User Input 2: She walked
