# Character RNNs: Generating Shakespearean Text

### Luis G.

Dataset : [Shakespear Dataset](https://homl.info/shakespeare) 

----------------------------

## 1. Preparation

### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 1.1 Data Preparation

In [1]:
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

# Download Shakespeare's dataset
shakespeare_url = "https://homl.info/shakespeare"
file_name = "shakespeare.txt"
filepath = keras.utils.get_file(file_name, shakespeare_url)

# Read/Store text file in string
with open(filepath) as f:
    shakespeare_text = f.read()

# Print a small section of the dataset
print(shakespeare_text[: len(shakespeare_text) // 5000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is 


### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 1.2 Text/Word Pre-processing

In [2]:
# Tokenize - encode each CHARACTER as an integer/id
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts([shakespeare_text])

# Number of distinct characters
max_id = len(tokenizer.word_index)
print("Distinct Characters:", max_id)

# Total number of characters
dataset_size = tokenizer.document_count
print("Dataset size:", dataset_size)

# Verify tokenizer -> ex: word - "First"
tokenizer.texts_to_sequences(["First"])

Distinct Characters: 39
Dataset size: 1


[[20, 6, 9, 8, 3]]

In [3]:
# Encode the entire dataset so each char is represented by it's unique ID
# - Subtract by 1, to get vals 0-38 vs. 1-39
[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 1.3 Creating Training & Testing Data

In [4]:
# Training set = 90%
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

2023-11-10 05:21:14.898085: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-10 05:21:14.901547: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-10 05:21:14.901662: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-10 05:21:14.902616: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [5]:
n_steps = 100

# target = input shifted 1 character ahead
window_length = n_steps + 1
dataset = dataset.window(window_length, shift=1,drop_remainder=True)

In [6]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [7]:
batch_size = 32    
dataset = dataset.shuffle(10000).batch(batch_size)

In [8]:
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [9]:
# Encode each char using one-hot vector since there's only 39 distinct characters
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))

# Add prefetching to dataset 
dataset = dataset.prefetch(1)

-----------------------------

## 2. Modeling

### &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; 2.1 Exploring Recurrent Network Architectures

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, TimeDistributed, Dense

model = Sequencial()

model.add(GRU(128, return_sequences=True, input_shape=[None,max_id], dropout=0.2, recurrent_dropout=0.2))
model.add(GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
model.add(TimeDistributed(Dense(max_id, activation="softmax"))

# model = Sequential([
#             GRU(128, return_sequences=True, input_shape=[None,max_id], dropout=0.2, recurrent_dropout=0.2),
#             GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
#             TimeDistributed(Dense(max_id, activation="softmax"))
#         ])


model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
history = model.fit(dataset,epochs=20)

SyntaxError: invalid syntax (2645709473.py, line 17)