# Recursive Neural Network

### Import libraries and data

In [1]:
import numpy as np
import collections
from llm.config import DATA_DIR, IMAGES_DIR

### Data Preparation

I will use a set of functions defined in the `alice` subpackage and demonstrate for a `basic_text = 'Hello world, hello all, hello world'`

`word_index` = `{'hello': 1, 'world': 2, 'all': 3}`

`encoded` = `[1, 2, 1, 3, 1, 2]`

`X[-1]` = `[1, 2, 1, 3, 1]` -> 'Hello world, hello all, hello'

`y[-1]` = `[0., 0., 1., 0.]` -> 'world'

In [2]:
from llm.core.alice import encode_text, create_sequences, pad_sequences_to_same_length, split_sequences, one_hot_encode_labels

basic_text = 'Hello world, hello all, hello world'

# Encode the text into sequences of integers and retrieve the word index
encoded, word_index = encode_text(basic_text)

# Prepare the sequences used by the Neural Network
vocab_size = len(word_index) + 1  # Including zero index
sequences = create_sequences(encoded)

padded_sequences = pad_sequences_to_same_length(sequences)
X, y = split_sequences(padded_sequences)
y = one_hot_encode_labels(y, vocab_size)
print(X.shape)
print(y.shape)

2024-05-09 11:23:13.792292: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


(5, 5)
(5, 4)


### Real data

Let us read the first chapter of Lewis Carroll's ALICE'S ADVENTURES IN WONDERLAND

In [3]:
from llm.core.alice import prepare_data
from llm.core.functions import read_text_file

filepath = DATA_DIR.joinpath('alice_chap_one.txt')
chap_one = read_text_file(filepath)
X, y, vocab_size, max_length = prepare_data(chap_one)
print(X.shape)
print(y.shape)

(280, 280)
(280, 146)


[nltk_data] Downloading package punkt to /home/marco/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/marco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Create the RRN Model

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding

model = Sequential([
    Embedding(vocab_size, 10, input_length=max_length-1),  # 10-dimensional embeddings
    SimpleRNN(50),  # RNN with 50 units
    Dense(vocab_size, activation='softmax')  # Output layer
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




### Train the model

In [5]:
model.fit(X, y, epochs=100, verbose=2)

Epoch 1/100
9/9 - 2s - 251ms/step - accuracy: 0.0071 - loss: 4.9898
Epoch 2/100
9/9 - 0s - 54ms/step - accuracy: 0.0214 - loss: 4.9531
Epoch 3/100
9/9 - 0s - 46ms/step - accuracy: 0.0321 - loss: 4.9215
Epoch 4/100
9/9 - 0s - 45ms/step - accuracy: 0.0500 - loss: 4.8871
Epoch 5/100
9/9 - 0s - 46ms/step - accuracy: 0.0571 - loss: 4.8530
Epoch 6/100
9/9 - 1s - 66ms/step - accuracy: 0.0679 - loss: 4.8137
Epoch 7/100
9/9 - 0s - 45ms/step - accuracy: 0.0607 - loss: 4.7798
Epoch 8/100
9/9 - 0s - 46ms/step - accuracy: 0.0786 - loss: 4.7324
Epoch 9/100
9/9 - 0s - 48ms/step - accuracy: 0.0679 - loss: 4.6810
Epoch 10/100
9/9 - 0s - 47ms/step - accuracy: 0.0714 - loss: 4.6468
Epoch 11/100
9/9 - 0s - 53ms/step - accuracy: 0.0679 - loss: 4.5967
Epoch 12/100
9/9 - 1s - 65ms/step - accuracy: 0.0464 - loss: 4.5482
Epoch 13/100
9/9 - 0s - 49ms/step - accuracy: 0.0571 - loss: 4.5023
Epoch 14/100
9/9 - 1s - 66ms/step - accuracy: 0.0714 - loss: 4.4677
Epoch 15/100
9/9 - 0s - 49ms/step - accuracy: 0.0964 - l

<keras.src.callbacks.history.History at 0x71b0fe82a890>

### Generate Text

In [6]:
from llm.core.alice import generate_text

In [7]:
real_text = ('The rabbit-hole went straight on like a tunnel for some way,'
    'and then dipped suddenly down, so suddenly that Alice had not a'
    'moment to think about stopping herself before she found herself'
    'falling down a very deep well.')

In [8]:
(len(real_text.split()))

36

In [11]:
generate_text("The rabbit-hole went", 10, model, 10)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step


'The rabbit-hole went          '