In [1]:
!pip install nltk numpy tensorflow





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# @title Building a Text Generation Model with Python


import nltk
import numpy as np
import re
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

# Download necessary NLTK resources:
#   'gutenberg' - corpus package read further down through nltk.corpus.gutenberg
#   'punkt'     - tokenizer data; presumably what word_tokenize relies on
#                 (confirm for the installed NLTK version)
nltk.download('gutenberg')
nltk.download('punkt')


[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Additional tokenizer data package, fetched separately from 'punkt'.
# NOTE(review): presumably required by word_tokenize on the installed NLTK
# release - confirm, and consider moving this call next to the other
# nltk.download calls so a fresh run fetches everything up front.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [8]:
# Load the text of Shakespeare's Hamlet from NLTK's Project Gutenberg corpus.
# The result is a single string that is handed to preprocess_text below.
raw_text = gutenberg.raw('shakespeare-hamlet.txt')

# Preprocessing function
def preprocess_text(text):
    """Normalise raw text and split it into word tokens.

    The text is lower-cased, every character that is not a letter, a digit
    or whitespace is deleted (nothing is inserted in its place, so words
    joined only by punctuation are merged), and the remainder is tokenised
    with NLTK's ``word_tokenize``.

    Args:
        text: Raw input string.

    Returns:
        The tokens produced by ``word_tokenize`` for the cleaned text.
    """
    lowered = text.lower()
    # Drop punctuation and any other non-alphanumeric, non-whitespace character.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    return word_tokenize(cleaned)

# Tokenise the whole text into a flat sequence of lower-cased, punctuation-free tokens.
tokens = preprocess_text(raw_text)


In [9]:
# Prepare sequences
sequence_length = 50  # tokens per training window: 49 context words + 1 target word
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokens)

# Fail early with a clear message instead of an opaque IndexError from the
# 2-D slicing further down when the corpus cannot fill a single window.
if len(tokens) < sequence_length:
    raise ValueError(
        f"Need at least {sequence_length} tokens to build a training window, "
        f"got {len(tokens)}"
    )

# Slide a window of `sequence_length` tokens over the corpus one token at a
# time. The upper bound is len(tokens) + 1 so that the final window - the one
# ending on the very last token - is included; stopping at len(tokens) would
# silently drop it and the last token would never be used as a target.
sequences = [
    tokens[end - sequence_length:end]
    for end in range(sequence_length, len(tokens) + 1)
]

# Convert sequences to integer values
sequences = tokenizer.texts_to_sequences(sequences)
sequences = np.array(sequences)

# Separate features and labels: the first 49 ids of each window are the
# context, the last id is the word to predict.
X, y = sequences[:, :-1], sequences[:, -1]
# One-hot encode the targets to match the 'categorical_crossentropy' loss.
# Note this materialises a dense (n_samples, vocab_size) array.
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index) + 1)


In [11]:
# Tokenizer word ids start at 1, so one extra slot is needed for id 0 (padding).
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

# Declare the input shape with an explicit Input layer rather than the
# deprecated `input_length` argument of Embedding. This also builds the model
# immediately, so model.summary() can report output shapes and parameter counts.
model = Sequential([
    tf.keras.Input(shape=(sequence_length - 1,), dtype="int32"),
    Embedding(vocab_size, embedding_dim),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(256, return_sequences=True),
    LSTM(256),
    # One probability per vocabulary entry for the next word.
    Dense(vocab_size, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Summary of the model
model.summary()




In [12]:
# Fit the network on the full (X, y) set; the returned History object is kept
# in `history` so the per-epoch metrics can be inspected afterwards.
history = model.fit(
    x=X,
    y=y,
    batch_size=256,
    epochs=50,
    verbose=1,
)


Epoch 1/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 1s/step - accuracy: 0.0304 - loss: 7.3659
Epoch 2/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 1s/step - accuracy: 0.0312 - loss: 6.5957
Epoch 3/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 1s/step - accuracy: 0.0298 - loss: 6.5148
Epoch 4/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 1s/step - accuracy: 0.0335 - loss: 6.5438
Epoch 5/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 971ms/step - accuracy: 0.0385 - loss: 6.2743
Epoch 6/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 1s/step - accuracy: 0.0470 - loss: 6.1709   
Epoch 7/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 1s/step - accuracy: 0.0497 - loss: 6.0704
Epoch 8/50
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 1s/step - accuracy: 0.0504 - loss: 5.9950
Epoch 9/50
[1m116/116[0m

In [13]:
# Inverse of tokenizer.word_index: integer id -> word.
reverse_word_map = {index: word for word, index in tokenizer.word_index.items()}

def generate_text(model, seed_text, max_length):
    """Greedily extend ``seed_text`` one word at a time.

    At every step the most recent ``sequence_length - 1`` token ids are fed to
    ``model`` and the highest-probability word is appended to the text.

    Relies on the notebook-level ``tokenizer``, ``sequence_length`` and
    ``reverse_word_map``.

    Args:
        model: Trained model whose ``predict`` returns one row of
            next-word probabilities per input row.
        seed_text: Prompt to continue. Words unknown to ``tokenizer`` are
            ignored when building the model input but stay in the returned text.
        max_length: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        Generation stops early if the model predicts an id that has no word
        (e.g. the padding id 0) or if the predicted word is ``'.'``.
    """
    context_len = sequence_length - 1
    # Tokenise the prompt once; generated ids are appended directly instead of
    # re-tokenising the whole growing string on every step.
    token_ids = tokenizer.texts_to_sequences([seed_text])[0]

    for _ in range(max_length):
        # Left-pad (or keep only the most recent ids) to the model's input length.
        window = pad_sequences([token_ids[-context_len:]], maxlen=context_len, padding='pre')
        # Single-row batch: take row 0 so argmax is a vocabulary id.
        probabilities = model.predict(window, verbose=0)[0]
        next_index = int(np.argmax(probabilities))

        # Id 0 is the padding id and has no entry in the map; indexing with []
        # would raise KeyError, so stop generating instead.
        next_word = reverse_word_map.get(next_index)
        if next_word is None:
            break

        seed_text += " " + next_word
        token_ids.append(next_index)

        # NOTE(review): punctuation is stripped during preprocessing, so this
        # presumably never triggers with the current tokenizer; kept for
        # tokenizers that retain sentence-ending periods.
        if next_word == '.':
            break
    return seed_text

# Continue a seed prompt by up to 50 generated words and show the result
print(generate_text(model, seed_text="To be or not to be", max_length=50))


To be or not to be to the dane and conuey the king and the opinions and the corner of the king and traitorous soules idoll to the shell to the carpenter and not moult i haue seene the king and the opinions and the corner of the king and traitorous soules idoll to the shell


In [14]:
# Persist the trained model to 'text_generation_model.h5' in the current working directory.
# NOTE(review): the '.h5' suffix presumably selects Keras' legacy HDF5 format; newer Keras
# releases recommend the native '.keras' format - confirm before changing the file name,
# since anything that loads this file depends on it.
model.save('text_generation_model.h5')




In [15]:
import math

def calculate_perplexity(model, X, y):
    """Compute the perplexity of ``model`` on the samples ``(X, y)``.

    Perplexity is ``exp`` of the mean negative log-probability that the model
    assigns to the correct target of each sample.

    Args:
        model: Object whose ``predict(X)`` returns a 2-D array with one row of
            class probabilities per sample.
        X: Model inputs; passed to ``model.predict`` unchanged.
        y: Targets, either one-hot rows (2-D) or a 1-D array of integer
            class ids.

    Returns:
        The perplexity as a Python ``float``.

    Raises:
        ValueError: If ``y`` is empty, is neither 1-D nor 2-D, or its length
            does not match the number of prediction rows.
    """
    # float64 keeps the log/mean/exp chain as precise as math.log on Python floats.
    predictions = np.asarray(model.predict(X), dtype=np.float64)
    targets = np.asarray(y)

    if targets.ndim == 2:
        # One-hot rows: the target id is the position of the largest entry.
        target_ids = targets.argmax(axis=1)
    elif targets.ndim == 1:
        target_ids = targets.astype(np.intp)
    else:
        raise ValueError(
            f"y must be 1-D class ids or 2-D one-hot rows, got {targets.ndim} dimensions"
        )

    total_samples = target_ids.shape[0]
    if total_samples == 0:
        raise ValueError("Cannot compute perplexity of an empty sample set")
    if predictions.ndim != 2 or predictions.shape[0] != total_samples:
        raise ValueError(
            f"predictions with shape {predictions.shape} do not match {total_samples} targets"
        )

    # Probability assigned to the correct class of every sample.
    target_probs = predictions[np.arange(total_samples), target_ids]
    # A softmax output can underflow to exactly 0.0; floor it at the smallest
    # normal float32 so log() stays finite instead of failing on log(0).
    target_probs = np.maximum(target_probs, np.finfo(np.float32).tiny)

    mean_nll = -np.mean(np.log(target_probs))
    return float(np.exp(mean_nll))

# Calculate perplexity on the training data.
# NOTE: X and y are the same arrays that were passed to model.fit, so this
# measures fit to the training set and is not a held-out estimate.
perplexity = calculate_perplexity(model, X, y)
print(f"Model Perplexity: {perplexity:.2f}")


[1m924/924[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 41ms/step
Model Perplexity: 42.27


In [17]:
# Seed prompts used to probe the trained model
test_prompts = [
    "Text Generation is",
    "Warewe is the best because",
    "Far away there is",
    "Artificial Intelligence is",
    "Atomic Habits are",
]

# Continue each prompt by up to 50 words and print the prompt/continuation pair
for prompt in test_prompts:
    generated_text = generate_text(model, seed_text=prompt, max_length=50)
    print(f"\nPrompt: '{prompt}'", f"Generated Text: '{generated_text}'", sep="\n")



Prompt: 'Text Generation is'
Generated Text: 'Text Generation is hecuba to tell him to the purgation and was plundge me to the tragedians of the king and traitorous soules is i haue deliuerd of the king and traitorous soules idoll to the shell to the carpenter and not moult i haue seene the king and the opinions and a'

Prompt: 'Warewe is the best because'
Generated Text: 'Warewe is the best because so bestowd not mend the frend and you beratled extreamity of the king and traitorous soules is i haue seene the king and the opinions and the corner of the king and traitorous soules idoll to the shell to the carpenter and not moult i haue seene the king and'

Prompt: 'Far away there is'
Generated Text: 'Far away there is not boudge you ham i haue not vndertake him to the exployt and humour the king and the attent death and sweete religion burnt and stoode the skirts of sables and horrible that carbuncles the hellish and blowne and profound and all a seale and a lender the bubbles is'

Pro