In [19]:

   pip install tensorflow gensim nltk numpy pandas scikit-learn tqdm


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [20]:
# Cell 2: Imports
import re
import string
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

print("TensorFlow version:", tf.__version__)
print("GPU Available:", tf.config.list_physical_devices('GPU'))

TensorFlow version: 2.20.0
GPU Available: []


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ADMIN\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [21]:
# Cell 3: Load data (adjust path if needed)
with open(r"C:\Users\ADMIN\Documents\shakespear.txt", encoding='utf-8') as f:
    text = f.read().lower()

print(f"Total characters: {len(text):,}")
print(f"Preview:\n{text[:500]}")

Total characters: 1,115,393
Preview:
first citizen:
before we proceed any further, hear me speak.

all:
speak, speak.

first citizen:
you are all resolved rather to die than to famish?

all:
resolved. resolved.

first citizen:
first, you know caius marcius is chief enemy to the people.

all:
we know't, we know't.

first citizen:
let us kill him, and we'll have corn at our own price.
is't a verdict?

all:
no more talking on't; let it be done: away, away!

second citizen:
one word, good citizens.

first citizen:
we are accounted poor


In [22]:
# 3. Text Preprocessing
#- Remove punctuation
#- Remove numbers
#- Tokenize
#- Remove stopwords
#- Lemmatize

In [23]:
# Cell 4: Preprocessing functions
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
punct = str.maketrans('', '', string.punctuation + '“”‘’—')

def clean_text(text):
    # Lowercase already done
    text = text.translate(punct)                 # Remove punctuation
    text = re.sub(r'\d+', '', text)              # Remove numbers
    text = re.sub(r'\s+', ' ', text).strip()     # Normalize spaces
    
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1]
    return tokens

print("Cleaning text...")
tokens = clean_text(text)
print(f"Vocabulary size (after cleaning): {len(set(tokens)):,}")
print(f"Total tokens: {len(tokens):,}")
print("Sample tokens:", tokens[:50])

Cleaning text...
Vocabulary size (after cleaning): 11,205
Total tokens: 107,148
Sample tokens: ['first', 'citizen', 'proceed', 'hear', 'speak', 'speak', 'speak', 'first', 'citizen', 'resolved', 'rather', 'die', 'famish', 'resolved', 'resolved', 'first', 'citizen', 'first', 'know', 'caius', 'marcius', 'chief', 'enemy', 'people', 'knowt', 'knowt', 'first', 'citizen', 'let', 'u', 'kill', 'well', 'corn', 'price', 'ist', 'verdict', 'talking', 'ont', 'let', 'done', 'away', 'away', 'second', 'citizen', 'one', 'word', 'good', 'citizen', 'first', 'citizen']


In [24]:
# 4. Train Word2Vec Embeddings (Skip-gram, 100d)

In [25]:
# Cell 5: Train Word2Vec
print("Training Word2Vec...")
w2v_model = Word2Vec(
    sentences=[tokens],
    vector_size=100,
    window=5,
    min_count=5,
    workers=8,
    sg=1,           # Skip-gram
    epochs=15
)
w2v_model.save("shakespeare_w2v.model")
print("Word2Vec trained and saved.")
print(f"Vocab size in Word2Vec: {len(w2v_model.wv)}")

Training Word2Vec...
Word2Vec trained and saved.
Vocab size in Word2Vec: 2944


In [26]:
# 5. Prepare Sequences for LSTM
#- Use top 10,000 words
#- Create n-gram sequences (seq_len = 30)

In [27]:
# Cell 6: Tokenizer with top 10k words
VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts([tokens])

sequences = tokenizer.texts_to_sequences([tokens])[0]
print(f"Sequence length: {len(sequences)}")

# Create n-grams
SEQ_LENGTH = 30
X, y = [], []

for i in range(SEQ_LENGTH, len(sequences)):
    seq = sequences[i-SEQ_LENGTH:i]
    target = sequences[i]
    X.append(seq)
    y.append(target)

X = np.array(X)
y = to_categorical(y, num_classes=VOCAB_SIZE)

print(f"X shape: {X.shape}, y shape: {y.shape}")

Sequence length: 107148


MemoryError: Unable to allocate 7.98 GiB for an array with shape (107118, 10000) and data type float64

In [28]:
# 6. Create Embedding Matrix from Word2Vec

In [29]:
# Cell 7: Embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((VOCAB_SIZE, embedding_dim))

word_index = tokenizer.word_index
hits = 0
misses = 0

for word, i in word_index.items():
    if i >= VOCAB_SIZE:
        continue
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]
        hits += 1
    else:
        misses += 1

print(f"Embedding hits: {hits}, misses: {misses}")

Embedding hits: 2944, misses: 7055


In [30]:
# 7. Build Best LSTM Model (Bidirectional + Dropout)

In [31]:
# Cell 8: Model Architecture
model = Sequential([
    Embedding(VOCAB_SIZE, embedding_dim, weights=[embedding_matrix], 
              input_length=SEQ_LENGTH, trainable=False),
    Bidirectional(LSTM(256, return_sequences=True)),
    Dropout(0.1),
    Bidirectional(LSTM(256)),
    Dropout(0.1),
    Dense(512, activation='relu'),
    Dropout(0.1),
    Dense(VOCAB_SIZE, activation='softmax')
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)
model.build(input_shape=(None,SEQ_LENGTH))
model.summary()



In [32]:
# 8. Train Model (GPU if available)

In [33]:
# Cell 9: Train
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=25,
    batch_size=128,
    callbacks=[
        tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
        tf.keras.callbacks.ReduceLROnPlateau(patience=3)
    ],
    verbose=1
)

print("Training completed.")

ValueError: Unrecognized data type: x=[[   3  261  783 ...   75  202  182]
 [  25  577  131 ... 2297  121  312]
 [ 526   94  233 ...   18  736 3406]
 ...
 [1377 5918 1581 ...  212   20 1353]
 [  35  437 1008 ... 4415 3526   84]
 [3798  691 1287 ...   65  241   36]] (of type <class 'numpy.ndarray'>)

In [34]:
# 9. Save Model

In [35]:
# Cell 10: Save
model.save("shakespeare_lstm_best.h5")
tokenizer_json = tokenizer.to_json()
import json
with open('tokenizer.json', 'w') as f:
    json.dump(tokenizer_json, f)
print("Model & tokenizer saved.")



Model & tokenizer saved.


In [36]:
# 10. Predict Next Word (Inference Function)

In [37]:
# Cell 11: Inference
from tensorflow.keras.models import load_model
import json

def predict_next_word(seed_text, model, tokenizer, seq_length=30, top_k=5):
    token_list = tokenizer.texts_to_sequences([clean_text(seed_text)])[0]
    token_list = token_list[-seq_length:]
    token_list = pad_sequences([token_list], maxlen=seq_length, truncating='pre')
    
    predicted = model.predict(token_list, verbose=0)
    top_indices = np.argsort(predicted[0])[-top_k:][::-1]
    
    output = []
    for idx in top_indices:
        word = tokenizer.index_word.get(idx, '?')
        prob = predicted[0][idx]
        output.append((word, prob))
    return output

# Load if needed
# model = load_model("shakespeare_lstm_best.h5")
# with open('tokenizer.json') as f:
#     tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(json.load(f))

# Test
seed = "to be or not to"
print(f"Seed: '{seed}'")
preds = predict_next_word(seed, model, tokenizer)
for word, prob in preds:
    print(f"  → {word} ({prob:.4f})")

Seed: 'to be or not to'
  → duke (0.0001)
  → ill (0.0001)
  → love (0.0001)
  → go (0.0001)
  → hath (0.0001)


In [38]:
# 11. Evaluate Accuracy (Top-1, Top-5)

In [39]:
# Cell 12: Top-k Accuracy
def top_k_accuracy(y_true, y_pred, k=5):
    top_k = np.argsort(y_pred, axis=1)[:, -k:]
    correct = 0
    for i, true_idx in enumerate(np.argmax(y_true, axis=1)):
        if true_idx in top_k[i]:
            correct += 1
    return correct / len(y_true)

val_pred = model.predict(X_val, verbose=0)
top1 = np.mean(np.argmax(val_pred, axis=1) == np.argmax(y_val, axis=1))
top5 = top_k_accuracy(y_val, val_pred, k=5)

print(f"Validation Top-1 Accuracy: {top1:.4f}")
print(f"Validation Top-5 Accuracy: {top5:.4f}")

AxisError: axis 1 is out of bounds for array of dimension 1