In [4]:
import re
import nltk
import random
from collections import defaultdict, Counter
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
# Download the 'punkt' tokenizer package into the local nltk_data directory
# (needs network access the first time the notebook is run).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [7]:
# Download the 'punkt_tab' tokenizer package as well.
# NOTE(review): presumably required by word_tokenize in the installed NLTK
# release — confirm, and if so keep this next to the 'punkt' download.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
# Step 1: Preprocess the Text Data
def preprocess_text(text):
    """Normalise raw text and split it into word tokens.

    The text is lowercased, every character that is not an ASCII letter
    (a-z) or whitespace is deleted, and the remainder is tokenized with
    NLTK's ``word_tokenize``.

    Args:
        text: The raw input string.

    Returns:
        The tokens produced by ``word_tokenize`` for the cleaned text.
    """
    # Deleting (rather than replacing) non-letters means digits and punctuation vanish entirely.
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return word_tokenize(letters_only)

In [8]:
# Sample corpus: four short sentences about AI. This is only enough for a
# demonstration — use a larger dataset for better accuracy.
corpus = """
Artificial intelligence is revolutionizing the world.
AI is being used in speech recognition, translation, and automation.
Machine learning models are improving every day.
AI applications include healthcare, robotics, and smart assistants.
"""

# Tokens for the whole corpus, lowercased and with punctuation removed.
# Sentence boundaries are not preserved, so n-grams built from `words`
# can span the end of one sentence and the start of the next.
words = preprocess_text(corpus)

In [9]:
# Step 2: Build the N-gram Model
def build_ngram_model(words, n=2):
    """Build a conditional next-word probability table from a token sequence.

    Each n-gram is split into a prefix (its first ``n - 1`` tokens, as a
    tuple) and a next word (its last token). For every prefix the relative
    frequency of each next word is computed.

    Args:
        words: Sequence of tokens to learn from.
        n: Size of the n-grams (2 = bigram, 3 = trigram, ...). Defaults to 2.

    Returns:
        A dict mapping each prefix tuple to a dict ``{next_word: probability}``;
        the probabilities for one prefix sum to 1.

    Raises:
        ValueError: If ``n`` is smaller than 1, since an n-gram must contain
            at least the word being predicted.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    ngram_counts = defaultdict(Counter)

    # Consume the n-gram iterator directly; an intermediate list is not needed.
    for gram in ngrams(words, n):
        prefix, next_word = gram[:-1], gram[-1]
        ngram_counts[prefix][next_word] += 1

    # Convert counts to probabilities. The per-prefix total is computed once
    # instead of once per word.
    ngram_probabilities = {}
    for prefix, counts in ngram_counts.items():
        total = sum(counts.values())
        ngram_probabilities[prefix] = {word: count / total
                                       for word, count in counts.items()}

    return ngram_probabilities

In [10]:
# Train one model per n-gram order from the same token list:
# the bigram model keys on 1 preceding word, the trigram model on 2.
bigram_model = build_ngram_model(words, n=2)
trigram_model = build_ngram_model(words, n=3)

In [11]:
# Step 3: Predict the Next Word
def predict_next_word(previous_words, ngram_model):
    """Return the most probable word to follow ``previous_words``.

    Args:
        previous_words: Iterable of context tokens; it is converted to a
            tuple and used as the lookup key.
        ngram_model: Mapping from prefix tuples to ``{word: probability}``
            dicts.

    Returns:
        The word with the highest probability for that prefix (the first
        one encountered wins a tie), or ``None`` if the prefix is not in
        the model.
    """
    context = tuple(previous_words)

    # Unknown context: nothing to predict.
    if context not in ngram_model:
        return None

    candidates = ngram_model[context]
    best_word = max(candidates, key=lambda word: candidates[word])
    return best_word

In [12]:
# Step 4: Auto-Complete Function
def auto_complete(text, ngram_model):
    """Suggest the next word for ``text`` using the supplied n-gram model(s).

    Only the model(s) passed in are consulted; no module-level models are
    read, so the function behaves the same regardless of notebook state.

    Args:
        text: Raw user input; it is normalised with ``preprocess_text``.
        ngram_model: Either a single model as returned by
            ``build_ngram_model`` or a list/tuple of such models. Models are
            tried in the order given, so pass the highest-order model first
            to get back-off, e.g. ``[trigram_model, bigram_model]``.

    Returns:
        The predicted next word, or the string ``"No suggestion"`` when no
        model contains a matching context.
    """
    tokens = preprocess_text(text)

    # A plain dict is a single model; anything else is an ordered collection of models.
    if ngram_model is None:
        models = []
    elif isinstance(ngram_model, dict):
        models = [ngram_model]
    else:
        models = list(ngram_model)

    for model in models:
        if not model:
            continue  # An empty model has no contexts to match.

        # Models from build_ngram_model key every entry on a prefix of
        # n - 1 tokens, so any key reveals how much context this model needs.
        context_size = len(next(iter(model)))
        if len(tokens) < context_size:
            continue  # Not enough input words for this model; try the next one.

        # Slice from an explicit start index so a context size of 0 yields
        # an empty context rather than the whole token list.
        context = tokens[len(tokens) - context_size:]
        predicted_word = predict_next_word(context, model)
        if predicted_word:
            return predicted_word

    return "No suggestion"

In [17]:
# Step 5: Test the Model
# In this prompt "computer" and "science" never occur in the sample corpus,
# while "and" does, so the demo exercises a prompt that only partly overlaps
# with the training text.
user_input = "computer science and "
predicted_word = auto_complete(user_input, bigram_model)
print(f"Predicted next word: {predicted_word}")

Predicted next word: automation
