<a href="https://colab.research.google.com/github/mickeyrahm/Portfolio/blob/master/notebooks/starter_publishing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import requests
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# ---------------------------------------------------
# 1. Download The Great Gatsby
# ---------------------------------------------------

print("Downloading text...")
url = "https://www.gutenberg.org/cache/epub/64317/pg64317.txt"
response = requests.get(url)
text = response.text

# Find real novel beginning
start_phrase = "In my younger and more vulnerable years my father gave me some advice"
start_idx = text.find(start_phrase)

if start_idx == -1:
    print("Could not find novel start. Dumping first 3000 characters for inspection:")
    print(text[:3000])
    raise ValueError("Start phrase not found in text!")

text = text[start_idx:].strip()

# Truncate before Gutenberg footer if present
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"
end_idx = text.find(end_marker)
if end_idx != -1:
    text = text[:end_idx].strip()

if len(text) == 0:
    raise ValueError("Downloaded text is empty!")

print("Sample text:")
print(text[:1000])

# ---------------------------------------------------
# 2. Word-level Tokenization
# ---------------------------------------------------

vectorizer = layers.TextVectorization(
    max_tokens=10000,
    output_sequence_length=20,
    standardize='lower_and_strip_punctuation',
    split='whitespace'
)

text_ds = tf.data.Dataset.from_tensor_slices([text])
vectorizer.adapt(text_ds)

tokens = vectorizer(tf.constant([text])).numpy()[0]
tokens = tokens[tokens > 0]

print("Number of tokens:", len(tokens))
print("First tokens:", tokens[:20])

# ---------------------------------------------------
# 3. Create Training Sequences
# ---------------------------------------------------

seq_len = 10

if len(tokens) < seq_len + 1:
    raise ValueError(f"Not enough tokens ({len(tokens)}) for sequence length {seq_len}. "
                     f"Try lowering seq_len or using more text.")

inputs = []
targets = []

for i in range(len(tokens) - seq_len):
    inputs.append(tokens[i:i+seq_len])
    targets.append(tokens[i+seq_len])

inputs = np.stack(inputs).astype(np.int32)
targets = np.array(targets).astype(np.int32)

print("Shape of inputs:", inputs.shape)
print("Shape of targets:", targets.shape)

dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
dataset = dataset.shuffle(5000).batch(64).repeat()

# ---------------------------------------------------
# 4. Build Word-level Transformer Model
# ---------------------------------------------------

vocab_size = len(vectorizer.get_vocabulary())
embedding_dim = 512

inputs_layer = layers.Input(shape=(seq_len,), dtype=tf.int32)
x = layers.Embedding(vocab_size, embedding_dim)(inputs_layer)
x = layers.MultiHeadAttention(num_heads=4, key_dim=embedding_dim)(x, x)
x = layers.Dropout(0.3)(x)
x = layers.LayerNormalization()(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(vocab_size)(x)

model = tf.keras.Model(inputs=inputs_layer, outputs=outputs)

# Learning rate schedule
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=1000,
    decay_rate=0.9
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
)

print(model.summary())

# ---------------------------------------------------
# 5. Text Generation Function (with padding fix)
# ---------------------------------------------------

index2word = np.array(vectorizer.get_vocabulary())

def generate_text_word(model, seed_text, num_words=50, temperature=0.3):
    tokens = vectorizer(tf.constant([seed_text])).numpy()[0]
    tokens = tokens[tokens > 0].tolist()

    # Pad left if seed is too short
    if len(tokens) < seq_len:
        tokens = [0] * (seq_len - len(tokens)) + tokens

    for _ in range(num_words):
        input_seq = np.array(tokens[-seq_len:], dtype=np.int32).reshape(1, -1)
        logits = model.predict(input_seq, verbose=0)[0]
        logits = logits / temperature
        next_token = tf.random.categorical(tf.expand_dims(logits, 0), num_samples=1).numpy()[0, 0]
        tokens.append(next_token)

    words = [index2word[i] for i in tokens if i < len(index2word)]
    return " ".join(words)

class WordSampler(tf.keras.callbacks.Callback):
    def __init__(self, seed_text):
        super().__init__()
        self.seed_text = seed_text

    def on_epoch_end(self, epoch, logs=None):
        print(f"\n--- SAMPLE TEXT AFTER EPOCH {epoch+1} ---\n")
        sample = generate_text_word(self.model, self.seed_text, num_words=50, temperature=0.3)
        print(sample)
        print("\n-----------------------------------------\n")

# ---------------------------------------------------
# 6. Train the Model
# ---------------------------------------------------

steps_per_epoch = len(inputs) // 64
steps_per_epoch = max(1, steps_per_epoch)

print("Training...")
model.fit(
    dataset,
    epochs=200,
    steps_per_epoch=steps_per_epoch,
    callbacks=[WordSampler("Gatsby said")]
)

# ---------------------------------------------------
# 7. Generate Final Text
# ---------------------------------------------------

print("\nFinal Synthesized Text:\n")
print(generate_text_word(model, "Gatsby said", num_words=100, temperature=0.3))


Downloading text...
Sample text:
In my younger and more vulnerable years my father gave me some advice
that I’ve been turning over in my mind ever since.

“Whenever you feel like criticizing anyone,” he told me, “just
remember that all the people in this world haven’t had the advantages
that you’ve had.”

He didn’t say any more, but we’ve always been unusually communicative
in a reserved way, and I understood that he meant a great deal more
than that. In consequence, I’m inclined to reserve all judgements, a
habit that has opened up many curious natures to me and also made me
the victim of not a few veteran bores. The abnormal mind is quick to
detect and attach itself to this quality when it appears in a normal
person, and so it came about that in college I was unjustly accused of
being a politician, because I was privy to the secret griefs of wild,
unknown men. Most of the confidences were unsought—frequently I have
feigned sleep, preoccupation, or a hostile levity when I realized by


None
Training...
Epoch 1/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - loss: 8.8797
--- SAMPLE TEXT AFTER EPOCH 1 ---

        gatsby said provocation live crowd—when savage dying furnish vestibules stalked done” windy impressionability durable “see” ella shades my winebrenner’s nail shades my “sensuous” anchor steadily chefs kinds end” raspingly fact mine—do supposed seem echoing absurd here—this england despairing swept challengingly “either “remember swindler carnegie garage—then indiscreet store gilda attain “who singlemindedness listened

-----------------------------------------

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - loss: 8.8797
Epoch 2/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 8.0749
--- SAMPLE TEXT AFTER EPOCH 2 ---

        gatsby said war” americans knocking lavender bowl fender token final jerk echoes brand lawn “don’t endowing gus suppressed waved france” regular patron 