# unigram model without punctuation 


In [58]:
import re
from collections import Counter
import random

# Example text
text = "This is a simple example to demonstrate a unigram model. This example is simple and easy."

# Step 1: Convert to lowercase and tokenize
def preprocess(text):
    text = text.lower()
    words = re.findall(r'\b\w+\b', text)
    return words

words = preprocess(text)
print("Tokenized Words:", words)

# Step 2: Count word frequencies
word_counts = Counter(words)
print("Word Counts:", word_counts)

# Step 3: Calculate total number of words
total_words = sum(word_counts.values())
print("Total Words:", total_words)

# Step 4: Generate text based on unigram probabilities
unique_words = list(word_counts.keys())
probabilities = [word_counts[word] / total_words for word in unique_words]

def generate_text(num_words):
    generated_text = random.choices(unique_words, probabilities, k=num_words)
    return ' '.join(generated_text)

# Generate 10 words of text
generated_text = generate_text(5)
print("Generated Text:", generated_text)


Tokenized Words: ['this', 'is', 'a', 'simple', 'example', 'to', 'demonstrate', 'a', 'unigram', 'model', 'this', 'example', 'is', 'simple', 'and', 'easy']
Word Counts: Counter({'this': 2, 'is': 2, 'a': 2, 'simple': 2, 'example': 2, 'to': 1, 'demonstrate': 1, 'unigram': 1, 'model': 1, 'and': 1, 'easy': 1})
Total Words: 16
Generated Text: is easy example a simple


# unigram model with punctuation 


In [95]:
import re
from collections import Counter
import random

# Example text
text = "This is a simple example to demonstrate a unigram model. This example is simple and easy."

# Step 1: Convert to lowercase and tokenize, including punctuation
def preprocess(text):
    text = text.lower()
    words = re.findall(r'\b\w+\b|[.,!?;]', text)
    return words

# Preprocess the text without removing stop words
words = preprocess(text)
print("Tokenized Words:", words)

# Step 2: Count word frequencies
word_counts = Counter(words)
print("Word Counts:", word_counts)

# Step 3: Calculate total number of words
total_words = sum(word_counts.values())
print("Total Words:", total_words)

# Step 4: Generate text based on unigram probabilities
unique_words = list(word_counts.keys())
probabilities = [word_counts[word] / total_words for word in unique_words]

def generate_text(num_words):
    generated_text = random.choices(unique_words, probabilities, k=num_words)
    return ' '.join(generated_text)

# Generate 10 words of text
generated_text = generate_text(10)
print("Generated Text:", generated_text)


Tokenized Words: ['this', 'is', 'a', 'simple', 'example', 'to', 'demonstrate', 'a', 'unigram', 'model', '.', 'this', 'example', 'is', 'simple', 'and', 'easy', '.']
Word Counts: Counter({'this': 2, 'is': 2, 'a': 2, 'simple': 2, 'example': 2, '.': 2, 'to': 1, 'demonstrate': 1, 'unigram': 1, 'model': 1, 'and': 1, 'easy': 1})
Total Words: 18
Generated Text: example to model example unigram to a simple simple example


# simple bigram with punctuation 

In [15]:
import re
from collections import Counter, defaultdict
import random

# Example text
text = "This is a simple example to demonstrate a bigram model. This example is simple and easy."

# Step 1: Convert to lowercase and tokenize, preserving punctuation
def preprocess(text):
    text = text.lower()
    # Split words and punctuation while keeping punctuation adjacent to words
    words = re.findall(r'\b\w+\b|[.,!?;]', text)
    return words

# Preprocess the text
words = preprocess(text)
print("Tokenized Words:", words)

# Step 2: Count bigram frequencies
def build_bigrams(words):
    bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
    bigram_counts = Counter(bigrams)
    return bigram_counts

bigrams = build_bigrams(words)
print("Bigram Counts:", bigrams)

# Step 3: Calculate bigram probabilities
def calculate_bigram_probabilities(bigrams):
    bigram_probabilities = defaultdict(dict)
    word_counts = Counter(words)
    
    for (w1, w2), count in bigrams.items():
        bigram_probabilities[w1][w2] = count / word_counts[w1]
        
    return bigram_probabilities

bigram_probabilities = calculate_bigram_probabilities(bigrams)
print("Bigram Probabilities:", bigram_probabilities)

# Step 4: Generate text based on bigram probabilities
def generate_text(bigram_probabilities, start_word, num_words):
    current_word = start_word
    generated_text = [current_word]
    
    for _ in range(num_words - 1):
        next_words = list(bigram_probabilities[current_word].keys())
        if not next_words:
            break
        probabilities = list(bigram_probabilities[current_word].values())
        next_word = random.choices(next_words, probabilities)[0]
        # If next word is punctuation, it should be appended without a space
        if next_word in '.,!?;':
            generated_text[-1] += next_word
        else:
            generated_text.append(next_word)
        current_word = next_word
        
    return ' '.join(generated_text)

# Generate text starting with the word 'this'
start_word = 'this'
generated_text = generate_text(bigram_probabilities, start_word, 10)
print("Generated Text:", generated_text)


Tokenized Words: ['this', 'is', 'a', 'simple', 'example', 'to', 'demonstrate', 'a', 'bigram', 'model', '.', 'this', 'example', 'is', 'simple', 'and', 'easy', '.']
Bigram Counts: Counter({('this', 'is'): 1, ('is', 'a'): 1, ('a', 'simple'): 1, ('simple', 'example'): 1, ('example', 'to'): 1, ('to', 'demonstrate'): 1, ('demonstrate', 'a'): 1, ('a', 'bigram'): 1, ('bigram', 'model'): 1, ('model', '.'): 1, ('.', 'this'): 1, ('this', 'example'): 1, ('example', 'is'): 1, ('is', 'simple'): 1, ('simple', 'and'): 1, ('and', 'easy'): 1, ('easy', '.'): 1})
Bigram Probabilities: defaultdict(<class 'dict'>, {'this': {'is': 0.5, 'example': 0.5}, 'is': {'a': 0.5, 'simple': 0.5}, 'a': {'simple': 0.5, 'bigram': 0.5}, 'simple': {'example': 0.5, 'and': 0.5}, 'example': {'to': 0.5, 'is': 0.5}, 'to': {'demonstrate': 1.0}, 'demonstrate': {'a': 1.0}, 'bigram': {'model': 1.0}, 'model': {'.': 1.0}, '.': {'this': 0.5}, 'and': {'easy': 1.0}, 'easy': {'.': 1.0}})
Generated Text: this example to demonstrate a bigram

# class for simple bigram model

In [139]:

class BigramModel:
    import re
    from collections import Counter
    import random
    def __init__(self, text):
        self.text = text.lower()
        self.words = self.preprocess(self.text)
        self.bigrams = self.build_bigrams(self.words)
        self.bigram_probabilities = self.calculate_bigram_probabilities(self.bigrams, self.words)

    def preprocess(self, text):
        # Split words and punctuation while keeping punctuation adjacent to words
        words = re.findall(r'\b\w+\b', text)
        return words 

    def build_bigrams(self, words):
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        bigram_counts = Counter(bigrams)
        return bigram_counts

    def calculate_bigram_probabilities(self, bigrams, words):
        bigram_probabilities = {}
        word_counts = Counter(words)
        
        for (w1, w2), count in bigrams.items():
            if w1 not in bigram_probabilities:
                bigram_probabilities[w1] = {}
            bigram_probabilities[w1][w2] = count / word_counts[w1] #conditional probability of w2 given w1
            
        return bigram_probabilities

    def generate_text(self, start_word, num_words):
        current_word = start_word
        generated_text = [current_word]
        
        for _ in range(num_words - 1):
            if current_word not in self.bigram_probabilities:
                break
            next_words = list(self.bigram_probabilities[current_word].keys())
            probabilities = list(self.bigram_probabilities[current_word].values())
            next_word = random.choices(next_words, probabilities)[0]
            # If next word is punctuation, it should be appended without a space
            if next_word in '.,!?;':
                generated_text[-1] += next_word
            else:
                generated_text.append(next_word)
            current_word = next_word
            
        return ' '.join(generated_text)


# Example usage
text = "This is a simple example to demonstrate a bigram model. This example is simple and easy."
bigram_model = BigramModel(text)

# Generate text starting with the word 'this'
start_word = 'this'
generated_text = bigram_model.generate_text(start_word, 16)
print("Generated Text:", generated_text)


Generated Text: this is a simple example is simple example to demonstrate a simple example is a simple


## bigram with fallback method

In [185]:
import re
from collections import Counter
import random

class BigramModel:
    def __init__(self, text):
        self.text = text.lower()
        self.words = self.preprocess(self.text)
        self.bigrams = self.build_bigrams(self.words)
        self.bigram_probabilities = self.calculate_bigram_probabilities(self.bigrams, self.words)

    def preprocess(self, text):
        # Split words and punctuation while keeping punctuation adjacent to words
        words = re.findall(r'\b\w+\b', text)
        return words 

    def build_bigrams(self, words):
        bigrams = [(words[i], words[i+1]) for i in range(len(words)-1)]
        bigram_counts = Counter(bigrams)
        return bigram_counts

    def calculate_bigram_probabilities(self, bigrams, words):
        bigram_probabilities = {}
        word_counts = Counter(words)
        
        for (w1, w2), count in bigrams.items():
            if w1 not in bigram_probabilities:
                bigram_probabilities[w1] = {}
            bigram_probabilities[w1][w2] = count / word_counts[w1] #conditional probability of w2 given w1
            
        return bigram_probabilities

    def generate_text(self, start_word, num_words):
        current_word = start_word
        generated_text = [current_word]
        
        for _ in range(num_words - 1):
            if current_word not in self.bigram_probabilities: #or not self.bigram_probabilities[current_word]:
                # Fallback: Choose a new random start word
                current_word = random.choice(self.words)
                generated_text.append(current_word)
                continue
            next_words = list(self.bigram_probabilities[current_word].keys())
            probabilities = list(self.bigram_probabilities[current_word].values())
            next_word = random.choices(next_words, probabilities)[0]
            # If next word is punctuation, it should be appended without a space
            if next_word in '.,!?;':
                generated_text[-1] += next_word
            else:
                generated_text.append(next_word)
            current_word = next_word
            
        return ' '.join(generated_text)


# Example usage
text = "This is a simple example to demonstrate a bigram model. This example is simple and easy."
bigram_model = BigramModel(text)

# Generate text starting with the word 'this'
start_word = 'this'
generated_text = bigram_model.generate_text(start_word, 2)
print("Generated Text:", generated_text)


Generated Text: this is


# text generation using neural network 

In [13]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Text Preprocessing
text = "This is a simple example to demonstrate a bigram model. This example is simple and easy."

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Create sequences for n-gram (bigram in this case)
n_gram = 2
words = tokenizer.texts_to_sequences([text])[0]
sequences = []
for i in range(len(words) - n_gram + 1):
    sequences.append(words[i:i + n_gram])

# Calculate the maximum sequence length
max_sequence_len = max([len(seq) for seq in sequences])
total_words = len(tokenizer.word_index) + 1

# Step 2: Build the Model
model = Sequential()
model.add(Embedding(total_words, 10, input_length=max_sequence_len - 1))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 3: Train the Model
sequences = np.array(pad_sequences(sequences, maxlen=max_sequence_len, padding='pre'))
X, y = sequences[:,:-1], sequences[:,-1]
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
model.fit(X, y, epochs=10, verbose=1)

# Step 4: Generate Text
start_word = 'this is'
num_words = 10
current_words = start_word.split()
generated_text = current_words.copy()

for _ in range(num_words):
    token_list = tokenizer.texts_to_sequences([current_words])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    predicted = model.predict(token_list, verbose=0)
    predicted_word_index = np.argmax(predicted, axis=-1)[0]
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted_word_index:
            output_word = word
            break
    if not output_word:
        break
    current_words.append(output_word)
    generated_text.append(output_word)
    current_words = current_words[1:]

print("Generated Text:", ' '.join(generated_text))


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0667 - loss: 2.4849
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.1333 - loss: 2.4838
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.1333 - loss: 2.4826
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.1333 - loss: 2.4815
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.2000 - loss: 2.4803
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2000 - loss: 2.4791
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2000 - loss: 2.4779
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2000 - loss: 2.4767
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_text(text):
    # Tokenize the text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts([text])
    return tokenizer

def create_sequences(text, tokenizer, n_gram=2):
    words = tokenizer.texts_to_sequences([text])[0]
    sequences = []
    for i in range(len(words) - n_gram + 1):
        sequences.append(words[i:i + n_gram])
    return sequences

def build_model(total_words, max_sequence_len):
    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=max_sequence_len - 1))
    model.add(LSTM(100))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_model(model, sequences, total_words, max_sequence_len, epochs=100):
    sequences = np.array(pad_sequences(sequences, maxlen=max_sequence_len, padding='pre'))
    X, y = sequences[:,:-1], sequences[:,-1]
    y = tf.keras.utils.to_categorical(y, num_classes=total_words)
    model.fit(X, y, epochs=epochs, verbose=1)

def generate_text(model, tokenizer, start_word, num_words, max_sequence_len):
    current_words = start_word.split()
    generated_text = current_words.copy()

    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([current_words])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word_index = np.argmax(predicted, axis=-1)[0]
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_word_index:
                output_word = word
                break
        if not output_word:
            break
        current_words.append(output_word)
        generated_text.append(output_word)
        current_words = current_words[1:]

    return ' '.join(generated_text)

# Example usage
text = "This is a simple example to demonstrate a bigram model. This example is simple and easy."

# Preprocess the text
tokenizer = preprocess_text(text)

# Create sequences
n_gram = 2  # Change to 3 for trigram, etc.
sequences = create_sequences(text, tokenizer, n_gram)
max_sequence_len = max([len(seq) for seq in sequences])
total_words = len(tokenizer.word_index) + 1

# Build the model
model = build_model(total_words, max_sequence_len)

# Train the model
train_model(model, sequences, total_words, max_sequence_len, epochs=100)

# Generate text starting with the word 'this'
start_word = 'this is'
generated_text = generate_text(model, tokenizer, start_word, 10, max_sequence_len)
print("Generated Text:", generated_text)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.2000 - loss: 2.4846
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.2667 - loss: 2.4834
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.3333 - loss: 2.4822
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.3333 - loss: 2.4809
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2000 - loss: 2.4797
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2667 - loss: 2.4785
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.2667 - loss: 2.4772
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.2667 - loss: 2.4760
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

class RNNModel:
    def __init__(self, text, n_gram=2):
        self.text = text.lower()
        self.n_gram = n_gram
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts([self.text])
        self.total_words = len(self.tokenizer.word_index) + 1
        self.sequences = self.create_sequences(self.text)
        self.max_sequence_len = max([len(seq) for seq in self.sequences])
        self.model = self.build_model()

    def preprocess(self, text):
        # Tokenize the text
        return self.tokenizer.texts_to_sequences([text])[0]

    def create_sequences(self, text):
        words = self.preprocess(text)
        sequences = []
        for i in range(len(words) - self.n_gram + 1):
            sequences.append(words[i:i + self.n_gram])
        return sequences

    def build_model(self):
        model = Sequential()
        model.add(Embedding(self.total_words, 10, input_length=self.max_sequence_len - 1))
        model.add(LSTM(100))
        model.add(Dense(self.total_words, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def train(self, epochs=100):
        sequences = np.array(pad_sequences(self.sequences, maxlen=self.max_sequence_len, padding='pre'))
        X, y = sequences[:,:-1], sequences[:,-1]
        y = tf.keras.utils.to_categorical(y, num_classes=self.total_words)
        self.model.fit(X, y, epochs=epochs, verbose=1)

    def generate_text(self, start_word, num_words):
        current_words = start_word.split()
        generated_text = current_words.copy()

        for _ in range(num_words):
            token_list = self.tokenizer.texts_to_sequences([current_words])[0]
            token_list = pad_sequences([token_list], maxlen=self.max_sequence_len - 1, padding='pre')
            predicted = self.model.predict(token_list, verbose=0)
            predicted_word_index = np.argmax(predicted, axis=-1)[0]
            output_word = ""
            for word, index in self.tokenizer.word_index.items():
                if index == predicted_word_index:
                    output_word = word
                    break
            if not output_word:
                break
            current_words.append(output_word)
            generated_text.append(output_word)
            current_words = current_words[1:]

        return ' '.join(generated_text)




In [4]:
# Example usage
text = "This is a simple example to demonstrate a bigram model. This example is simple and easy."
rnn_model = RNNModel(text, n_gram=2)
rnn_model.train(epochs=100)

# Generate text starting with the word 'this'
start_word = 'this is'
generated_text = rnn_model.generate_text(start_word, 10)
print("Generated Text:", generated_text)

Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.1333 - loss: 2.4849
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.1333 - loss: 2.4838
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.1333 - loss: 2.4826
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.1333 - loss: 2.4815
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.1333 - loss: 2.4803
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.2000 - loss: 2.4792
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.3333 - loss: 2.4780
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.3333 - loss: 2.4768
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m