Code snippet for the trigram model, as tested in the full pipeline.

In [None]:
import nltk
from nltk.util import ngrams
from nltk.corpus import reuters
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import Counter
import random

# Fetch the sentence/word tokenizer data and the Reuters corpus.
# 'punkt' is the resource name older NLTK releases load; newer releases look
# up 'punkt_tab' for the same tokenizer, so both are requested.
# NOTE(review): nltk.download is expected to report a failure rather than
# raise when a name is missing from the index — confirm against the installed
# NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('reuters')

# Join every corpus word into one string, then re-split it into sentences and
# tokenize each sentence. reuters_sents is a list of token lists.
reuters_text = ' '.join(reuters.words())
reuters_sents = [word_tokenize(sent) for sent in sent_tokenize(reuters_text)]

# Flatten to a single lowercased token stream. Because the stream is flat,
# the n-grams below also span sentence boundaries.
tokens = [word.lower() for sentence in reuters_sents for word in sentence]
bigrams = list(ngrams(tokens, 2))  # Bigrams (n=2)
trigrams = list(ngrams(tokens, 3))  # Trigrams (n=3)

# Occurrence counts keyed by the n-gram tuple; these are the lookup tables
# used by the prediction functions.
bigram_freq = Counter(bigrams)
trigram_freq = Counter(trigrams)

def predict_next_word(start_sentence):
    """Predict the word most likely to follow *start_sentence*.

    The last two tokens of the input are matched against the first two
    positions of every entry in ``trigram_freq``; the third word of the most
    frequent match is returned. When the input has fewer than two tokens, or
    no trigram starts with that pair, the bigram model is used instead.

    Args:
        start_sentence: Text whose continuation is wanted.

    Returns:
        The predicted next word, or ``None`` when neither the trigram nor the
        bigram table has a candidate.
    """
    words = word_tokenize(start_sentence)
    if len(words) < 2:
        return predict_using_bigrams(start_sentence)

    # The n-gram tables are built from lowercased tokens, so the lookup key
    # has to be lowercased as well; otherwise any context containing an
    # uppercase letter could never match a trigram.
    last_bigram = tuple(word.lower() for word in words[-2:])
    trigram_candidates = {
        third: freq
        for (first, second, third), freq in trigram_freq.items()
        if (first, second) == last_bigram
    }

    if trigram_candidates:
        return max(trigram_candidates, key=trigram_candidates.get)
    # No trigram evidence for this context: back off to the bigram model.
    return predict_using_bigrams(start_sentence)

def predict_using_bigrams(start_sentence):
    """Return the most frequent follower of the last word of *start_sentence*.

    The final token of the input is lowercased and matched against the first
    position of every entry in ``bigram_freq``.

    Args:
        start_sentence: Text whose continuation is wanted.

    Returns:
        The second word of the most frequent matching bigram, or ``None``
        when the input has no tokens or no bigram starts with its last word.
    """
    sentence_tokens = word_tokenize(start_sentence)
    if len(sentence_tokens) == 0:
        return None

    previous = sentence_tokens[-1].lower()

    # Collect follower -> count for every bigram that starts with `previous`.
    followers = {}
    for (first, second), count in bigram_freq.items():
        if first == previous:
            followers[second] = count

    if not followers:
        return None
    return max(followers, key=followers.get)

# Fix the RNG seed so the same evaluation sample is drawn on every run.
random.seed(42)
# Draw 1000 tokenized sentences (without replacement) for evaluation.
# NOTE(review): the sample comes from reuters_sents, the same sentences the
# n-gram counts are built from, so the resulting score reflects fit to seen
# data rather than held-out accuracy.
test_data = random.sample(reuters_sents, 1000)

def evaluate_accuracy(test_data):
    """Measure how often the model predicts the final token of a sentence.

    For each tokenized sentence with at least three tokens, all tokens but
    the last are joined with spaces and passed to ``predict_next_word``; the
    prediction is compared with the lowercased last token.

    Args:
        test_data: Iterable of sentences, each a sequence of token strings.

    Returns:
        The fraction of scored sentences predicted correctly, or ``0.0`` when
        no sentence is long enough to be scored.
    """
    # Sentences shorter than three tokens are skipped and not counted.
    scored = [sentence for sentence in test_data if len(sentence) >= 3]
    if not scored:
        return 0.0

    hits = sum(
        predict_next_word(' '.join(sentence[:-1])) == sentence[-1].lower()
        for sentence in scored
    )
    return hits / len(scored)

# Example of a manual input: start_sentence = "The dictionary is filled with"
# NOTE(review): `generated_text` is not defined in this snippet; presumably it
# is produced by an earlier stage of the pipeline — verify before running
# this cell on its own.
predicted_word = predict_next_word(generated_text)
# Show the actual prediction (the message previously ended in the literal
# text "words" instead of the predicted word).
print(f"Predicted next word for '{generated_text}': {predicted_word}")

# Score the model on the sampled sentences and report it to four decimals.
accuracy = evaluate_accuracy(test_data)
print(f"Accuracy on 1000 test sentences from Reuters: {accuracy:.4f}")