In [37]:
import tensorflow as tf

shakespeare_url = "https://homl.info/shakespeare" # shortcut URL
filepath = tf.keras.utils.get_file("shkespeare.txt", shakespeare_url)

with open(filepath) as f:
    shakespeare_text = f.read()

print(shakespeare_text[:80])


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


In [38]:
# se set split="charachter" to split the text into a list of single characters instead of words (the default)
text_vec_layer = tf.keras.layers.TextVectorization(split="character", standardize='lower')
text_vec_layer.adapt([shakespeare_text])
encoded = text_vec_layer([shakespeare_text])[0]

In [39]:
text_vec_layer(shakespeare_text)

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [40]:
text_vec_layer([shakespeare_text])[0]

<tf.Tensor: shape=(1115394,), dtype=int64, numpy=array([21,  7, 10, ..., 22, 28, 12])>

In [41]:
encoded[:5]

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([21,  7, 10,  9,  4])>

In [42]:
# we don't need the pad and unk tokens, so we can skip the first 2
encoded -= 2 # drop 0 (pad) and 1 (unkown) tokens

n_tokens = text_vec_layer.vocabulary_size() - 2 # number of distinct characters
dataset_size =  len(encoded) # total number of chars = 1,155,394
dataset_size

1115394

In [43]:
# utility function to convert a long sequence of characters into many small windows of text: convert long sequence of charcters IDs into a dataset of input/target pairs of small windows of text

def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
  ds = tf.data.Dataset.from_tensor_slices(sequence)
  ds = ds.window(length + 1, shift=1, drop_remainder=True)
  ds = ds.flat_map(lambda window: window.batch(length+1))
  if shuffle:
    ds = ds.shuffle(buffer_size=100_000, seed=seed)
  ds = ds.batch(batch_size)
  return ds.map(lambda window: (window[:, :-1], window[:, 1:])).prefetch(1)

In [90]:
# Now let's create the training (90%), validation (5%) and test (5%) sets
length = 100
tf.random.set_seed(42)
train_set = to_dataset(encoded[:dataset_size*90//100], length=length, shuffle=True, seed=42)
valid_set = to_dataset(encoded[dataset_size*90//100:dataset_size*90//100+dataset_size*5//100], length=length)
test_set = to_dataset(encoded[dataset_size*95//100:], length=length)

In [45]:
# Now let's create the model
model = tf.keras.Sequential([
  tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16),
  tf.keras.layers.GRU(128, return_sequences=True),
  tf.keras.layers.Dense(n_tokens, activation="softmax")
])

model.compile(loss="sparse_categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])
model_ckpt = tf.keras.callbacks.ModelCheckpoint("my_shakespeare_model", save_best_only=True)
history = model.fit(train_set, epochs=20, validation_data=valid_set, callbacks=[model_ckpt])

Epoch 1/20
   3192/Unknown - 157s 46ms/step - loss: 1.7656 - accuracy: 0.4772

KeyboardInterrupt: 

In [47]:
# This model does not handle text preprocessing, so we need to add a preprocessing layer to take care of that
# We also need to add a RNN layer to handle the long sequences of text

shakespeare_model = tf.keras.Sequential([
  text_vec_layer,
  tf.keras.layers.lambda(lambda x: x - 2), # no <PAD> and <UNK> tokens
  model
  ])


SyntaxError: invalid syntax (2314910303.py, line 6)

In [None]:
# Now we can use the model to generate some text
y_pred = model.predict_classes(['To be or nor to be'])[0, -1]
y_pred = tf.argmax(y_pred)
text_vec_layer.get_vocabulary()[y_pred + 2] # output 'e'

NameError: name 'model' is not defined

In [36]:
log_probas = tf.math.log([[0.5, 0.4, 0.1]])
tf.random.set_seed(42)
tf.random.categorical(log_probas, num_samples=8)

<tf.Tensor: shape=(1, 8), dtype=int64, numpy=array([[0, 1, 0, 2, 1, 0, 0, 1]])>

In [None]:
# tokanize the text
vocab_size = 1000
text_vec_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size)
text_vec_layer.adapt(train_set.map(lambda reviews, labels: reviews))

In [92]:
train_set = train_set.map(lambda reviews, labels: (text_vec_layer(reviews), labels))

ValueError: in user code:

    File "/var/folders/4m/_fbrmlzn7l3dlhlqf49qhgg00000gn/T/ipykernel_20811/2089277519.py", line 1, in None  *
        lambda reviews, labels: (text_vec_layer(reviews), labels)
    File "/Users/mehannioui/Documents/HOML/practice/homl/lib/python3.9/site-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/Users/mehannioui/Documents/HOML/practice/homl/lib/python3.9/site-packages/keras/src/layers/preprocessing/text_vectorization.py", line 588, in _preprocess
        raise ValueError(

    ValueError: Exception encountered when calling layer 'text_vectorization_8' (type TextVectorization).
    
    When using `TextVectorization` to tokenize strings, the input rank must be 1 or the last shape dimension must be 1. Received: inputs.shape=(None, None) with rank=2
    
    Call arguments received by layer 'text_vectorization_8' (type TextVectorization):
      • inputs=tf.Tensor(shape=(None, None), dtype=int64)


In [83]:
# Finally, we can create the model and train it
embed_size = 128
tf.random.set_seed(42)
model = tf.keras.Sequential([
  text_vec_layer,
  tf.keras.layers.Embedding(input_dim=vocab_size, embed_size=embed_size),
  tf.keras.layers.GRU(128),
  tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(train_set, epochs=2, validation_data=valid_set)

NameError: name 'vocab_size' is not defined

In [80]:
import nltk
nltk.download('punkt')
# Suppose you have the following text:
text = "I love my dog. I love my cat. You love my dog! Do you think my dog is amazing?"

# Tokenize the text into sentences
sentences = nltk.sent_tokenize(text)

# Tokenize the sentences into words and convert the words to integers
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Pad the sequences so they all have the same length
padded = tf.keras.preprocessing.sequence.pad_sequences(sequences, padding="post")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/mehannioui/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [81]:
sentences
tokenizer
sequences
padded

array([[ 5,  3,  2,  4,  0,  0,  0],
       [ 5,  3,  2,  7,  0,  0,  0],
       [ 6,  3,  2,  4,  0,  0,  0],
       [ 8,  6,  9,  2,  4, 10, 11]], dtype=int32)

In [74]:
tokenizer.word_index

{'<OOV>': 1,
 'my': 2,
 'love': 3,
 'dog': 4,
 'i': 5,
 'you': 6,
 'cat': 7,
 'do': 8,
 'think': 9,
 'is': 10,
 'amazing': 11}

In [None]:
tf.keras.layers.TextVectorization(max_tokens=1000).adapt(train_set.map(lambda reviews, labels: reviews))