## NLP by RNN

## Reading the data

In [1]:
# Location of the raw novel text, resolved relative to the working directory.
nov_path = 'gingatetsudono_yoru.txt'

# Read the entire file into one string.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm it matches the encoding the file was saved with.
with open(nov_path, mode='r') as novel_file:
    nov_text = novel_file.read()

## Preprocessing

In [2]:
import re

# Strip markup from the raw text, one pattern after another (order matters,
# because each substitution runs on the result of the previous one):
#   1. anything enclosed in 《…》 (presumably ruby/furigana readings — confirm)
#   2. anything enclosed in ［…］ (presumably editorial notes — confirm)
#   3. the ｜ marker, ASCII spaces and full-width spaces
text = nov_text
for markup_pattern in ("《[^》]+》", "［[^］]+］", "[｜ 　]"):
    text = re.sub(markup_pattern, "", text)

## Setting up the hyperparameters

In [3]:
# --- data windowing ---
# Length, in characters, of each input sequence fed to the recurrent layer.
n_rnn = 10

# --- network size ---
# Number of units in the recurrent layer.
n_hidden = 128

# --- optimisation ---
# Number of sequences per gradient step.
batch_size = 128
# Number of passes over the training data.
n_epochs = 60

## Converting characters to vectors (one-hot encoding)

In [4]:
import numpy as np

# Build the character vocabulary and the two lookup tables.
# The vocabulary is sorted so the character <-> index assignment is the same
# on every run: the iteration order of a set of strings changes between
# interpreter sessions (string hash randomization), which would otherwise
# give a different mapping after each kernel restart.
chars = sorted(set(text))
char_indices = {c: i for i, c in enumerate(chars)}  # character -> index
indices_char = dict(enumerate(chars))               # index -> character

# Cut the text into overlapping windows of n_rnn characters (stride 1).
# Each window is paired with the single character that follows it.
# range() is empty when the text is shorter than n_rnn, yielding no samples.
n_sequences = len(text) - n_rnn
time_chars = [text[i: i + n_rnn] for i in range(n_sequences)]
next_chars = [text[i + n_rnn] for i in range(n_sequences)]

# One-hot encode the windows and their targets.
#   X: (number of windows, n_rnn, vocabulary size)
#   y: (number of windows, vocabulary size)
# The builtin bool is used as dtype: the np.bool alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
X = np.zeros((len(time_chars), n_rnn, len(chars)), dtype=bool)
y = np.zeros((len(time_chars), len(chars)), dtype=bool)
for i, time_char in enumerate(time_chars):
    for t, char in enumerate(time_char):
        X[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True

## Creating the model
SimpleRNN, LSTM, GRU

In [5]:
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, LSTM, GRU

def _build_char_model(recurrent_layer):
    """Build, compile and summarize one next-character model.

    The network is a single recurrent layer of ``n_hidden`` units reading
    inputs of shape ``(n_rnn, len(chars))``, followed by a softmax Dense
    layer with one output per character in ``chars``.

    Args:
        recurrent_layer: Keras recurrent layer class to instantiate
            (SimpleRNN, LSTM or GRU).

    Returns:
        The compiled Sequential model (its summary is printed as a side effect).
    """
    net = Sequential()
    net.add(recurrent_layer(n_hidden, input_shape=(n_rnn, len(chars))))
    net.add(Dense(len(chars), activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam')
    net.summary()
    return net


# Same architecture three times; only the recurrent cell type differs.
model_rnn = _build_char_model(SimpleRNN)
model_lstm = _build_char_model(LSTM)
model_gru = _build_char_model(GRU)

2023-03-02 18:22:17.408904: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 128)               150784    
                                                                 
 dense (Dense)               (None, 1049)              135321    
                                                                 
Total params: 286,105
Trainable params: 286,105
Non-trainable params: 0
_________________________________________________________________
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 128)               603136    
                                                                 
 dense_1 (Dense)             (None, 1049)              135321    
                                                                 
Total params: 738,457
Traina

## Defining the sentence generator function

In [9]:
from keras.callbacks import LambdaCallback
def on_epoch_end(epochs, logs):
    """Generate and print a 400-character text sample with the global ``model``.

    Starting from the first ``n_rnn`` characters of ``text``, the function
    repeatedly one-hot encodes the current window, asks ``model`` for the
    next-character distribution, sharpens it by raising it to a fixed power,
    samples one character, and slides the window forward by one.

    Args:
        epochs: Epoch number, printed in the header line.
        logs: Unused; present so the function matches the callback signature.
    """
    print('----- Generating text after Epoch: %d' % epochs)
    sharpness = 5  # exponent on the probabilities; values > 1 favour likely characters
    window = text[0: n_rnn]  # seed: the opening characters of the corpus
    pieces = [window]

    print('----- Generating with seed: "' + window + '"')
    vocab_size = len(chars)
    for _ in range(400):
        # One-hot encode the current window as a batch of one sequence.
        X_pred = np.zeros((1, n_rnn, vocab_size))
        for position, symbol in enumerate(window):
            X_pred[0, position, char_indices[symbol]] = 1.

        # Predicted distribution over the next character, in float64.
        probs = np.asarray(model.predict(X_pred, verbose=0)[0]).astype('float64')
        weights = probs ** sharpness
        # Renormalize so the sharpened weights sum to one before sampling.
        picked = np.random.choice(len(weights), p=weights / np.sum(weights))
        sampled_char = indices_char[picked]

        pieces.append(sampled_char)
        window = window[1:] + sampled_char  # drop the oldest char, append the new one
    print(''.join(pieces))
    print()

# set up the callbacks
# Wrap on_epoch_end in a LambdaCallback so it can be handed to fit() through
# the `callbacks` argument.
epoch_end_callback = LambdaCallback(on_epoch_end=on_epoch_end)

## Training the model

In [10]:
# Train the SimpleRNN model.
# on_epoch_end samples text from the global `model`, so bind it to the
# network being trained before calling fit().
model = model_rnn
history_rnn = model_rnn.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=n_epochs,
    callbacks=[epoch_end_callback],
)

Epoch 1/60
----- Generating with seed: "「ではみなさんは、そ"


KeyboardInterrupt: 

In [None]:
# Train the LSTM model.
# on_epoch_end samples text from the global `model`, so bind it to the
# network being trained before calling fit().
model = model_lstm
history_lstm = model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=n_epochs,
    callbacks=[epoch_end_callback],
)

In [None]:
# Train the GRU model.
# on_epoch_end samples text from the global `model`, so bind it to the
# network being trained before calling fit().
model = model_gru
history_gru = model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=n_epochs,
    callbacks=[epoch_end_callback],
)

## History

In [None]:
import matplotlib.pyplot as plt

# Per-epoch training loss recorded by each fit() call.
loss_rnn = history_rnn.history['loss']
loss_lstm = history_lstm.history['loss']
loss_gru = history_gru.history['loss']

# Draw the three loss curves on one set of axes for comparison.
for curve, curve_label in ((loss_rnn, 'RNN'), (loss_lstm, 'LSTM'), (loss_gru, 'GRU')):
    plt.plot(curve, label=curve_label)
plt.legend()
plt.show()