# Лабораторная работа № 8

Генерация текста на основе “Алисы в стране чудес”
    
Задачи:

   1. Ознакомиться с генерацией текста
   2. Ознакомиться с системой Callback в Keras

# Цель работы:
Рекуррентные нейронные сети можно использовать не только для прогнозирования,
но и в качестве генеративных моделей.
Это означает, что, помимо построения прогнозов, они способны изучать
последовательности из предметной области, а затем генерировать совершенно новые
правдоподобные последовательности для этой области.
Такие генеративные модели полезны не только для оценки того, насколько хорошо
модель изучила задачу, но и для того, чтобы больше узнать о самой предметной
области.

In [1]:
import numpy
import sys
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

In [2]:
# Load the corpus; lower-casing merges upper- and lower-case letters into a
# single vocabulary entry.
filename = "wonderland.txt"
# A context manager closes the file handle deterministically instead of
# leaving it open until the garbage collector gets to it.
with open(filename) as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()

# Map every distinct character to an integer index (sorted so the mapping is
# stable between runs on the same text).
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

Total Characters:  144522
Total Vocab:  48


In [3]:
# Slide a fixed-size window over the text: every run of `seq_length`
# characters is one input pattern and the character right after it is the
# target the network has to predict.
seq_length = 100
encoded_text = [char_to_int[symbol] for symbol in raw_text]
window_starts = range(n_chars - seq_length)
dataX = [encoded_text[pos:pos + seq_length] for pos in window_starts]
dataY = [encoded_text[pos + seq_length] for pos in window_starts]

n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  144422


In [4]:
# Arrange the integer patterns as [samples, time steps, features] and scale
# the character indices by the vocabulary size.
X = numpy.array(dataX).reshape(n_patterns, seq_length, 1) / float(n_vocab)
# One-hot encode the target characters.
y = np_utils.to_categorical(dataY)

# Single LSTM layer -> dropout -> softmax over the output classes.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [5]:
# Save the weights after every epoch whose training loss is the lowest seen
# so far; the epoch number and loss value become part of the file name.
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]
model.fit(X, y, batch_size=128, epochs=50, callbacks=callbacks_list)

Epoch 1/50
Epoch 1: loss improved from inf to 2.96996, saving model to weights-improvement-01-2.9700.hdf5
Epoch 2/50
Epoch 2: loss improved from 2.96996 to 2.77092, saving model to weights-improvement-02-2.7709.hdf5
Epoch 3/50
Epoch 3: loss improved from 2.77092 to 2.66609, saving model to weights-improvement-03-2.6661.hdf5
Epoch 4/50
Epoch 4: loss improved from 2.66609 to 2.58714, saving model to weights-improvement-04-2.5871.hdf5
Epoch 5/50
Epoch 5: loss improved from 2.58714 to 2.52556, saving model to weights-improvement-05-2.5256.hdf5
Epoch 6/50
Epoch 6: loss improved from 2.52556 to 2.47070, saving model to weights-improvement-06-2.4707.hdf5
Epoch 7/50
Epoch 7: loss improved from 2.47070 to 2.41745, saving model to weights-improvement-07-2.4174.hdf5
Epoch 8/50
Epoch 8: loss improved from 2.41745 to 2.37087, saving model to weights-improvement-08-2.3709.hdf5
Epoch 9/50
Epoch 9: loss improved from 2.37087 to 2.32425, saving model to weights-improvement-09-2.3242.hdf5
Epoch 10/50
Ep

<keras.callbacks.History at 0x1d83ab75f40>

In [7]:
# Load the network weights saved by the checkpoint callback.
# NOTE: checkpoint names embed the epoch number and loss of a particular
# training run, so this literal has to match a file that run actually wrote.
filename = "weights-improvement-50-1.6400.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')
int_to_char = dict((i, c) for i, c in enumerate(chars))

# Pick a random seed pattern. `randint` excludes its upper bound, so
# `len(dataX)` (not `len(dataX) - 1`) is needed to make the last pattern
# selectable.
start = numpy.random.randint(0, len(dataX))
# Copy the seed: the loop appends to `pattern`, and appending to the list
# stored in `dataX` would leave that training pattern one element too long.
pattern = list(dataX[start])
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# Generate characters one at a time, feeding each prediction back in.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: append the new character and drop the oldest one.
    pattern.append(index)
    pattern = pattern[1:]

print ("\nDone.")

Seed:
" aced
along the course, here and there. there was no 'one, two, three, and
away,' but they began runn "
ing than the wan sutteig of the thaless on the was of the wasee ou a thrne tael to the whnt satelk, and she white rabbit war she fadl as the could, and salde lutten and muoked an toe to tee har in a lorg,eroglt, and she tas aownog to then so her  and then sas no hor ane thing it an once as the caue pf the dour of the was anl the wisl on tit oooe, and the tam to aenen in a lorent to be thry sile she was oote to sooe to sae it sas aalen hirtens, and the was sot the lopk thit hir hert to soo  she was the ras an ohc cineer oe the wasee oh the was a little bro oftilg that saed-toe oadd ant anoog thit sith ot hadd oo tiet so hav eore to the was a wiry sidel so eand oo the bool  sha was a little so two the rame ti her head to sene the rage ti head thitg whsh she had aali badlln the tiite she had oo havd the horpe oo tee thong the hadd afdin, 
'thet sere th tee,  said the konk turtle 

In [8]:
# Pick a random seed pattern. `randint` excludes its upper bound, so
# `len(dataX)` (not `len(dataX) - 1`) is needed to make the last pattern
# selectable.
start = numpy.random.randint(0, len(dataX))
# Copy the seed: the loop appends to `pattern`, and appending to the list
# stored in `dataX` would leave that training pattern one element too long.
pattern = list(dataX[start])
print ("Seed:")
print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# Generate characters one at a time, feeding each prediction back in.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the most probable next character.
    index = int(numpy.argmax(prediction))
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: append the new character and drop the oldest one.
    pattern.append(index)
    pattern = pattern[1:]

print ("\nDone.")

Seed:
" fully one can't hear
oneself speak--and they don't seem to have any rules in particular;
at least, i "
n a lortter of the kande of the saalit hn a lort ereat soink alout thme the radle  the rase that io the tas toink it oo a bor,  the had not the kant was sot the ladd of the dourt, and then sas anilt hord that she was not it tomh oo she shoe of the court, whe was not a boo of the sabli  she lase whit was to tary toted of the sablit on tee thale, and the was sotting on the taale  the dart was so as the lodst ruote to salk the whine whsh she sable  the nast wiihe io she sade toinl in was oo aalin hnr fonn that sae in tae tadd to seel the had bavi
task theer hor hem: and she was not al once an inde on the coore, and the was sutteig of the shaetesseone oe the white  an to aod thin ii the was oarking to teet to har and  'yhat as in so moooe,  she said to herself, and said an a lortee to cenned the sintese 
the was to tary tone, bnd saede to herself  tou know, 

'i dan t leke the har

In [9]:
import numpy
import sys
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# Load the corpus; lower-casing merges upper- and lower-case letters into a
# single vocabulary entry.
filename = "wonderland.txt"
# A context manager closes the file handle deterministically instead of
# leaving it open until the garbage collector gets to it.
with open(filename) as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()

# Map every distinct character to an integer index (sorted so the mapping is
# stable between runs on the same text).
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

# Slide a fixed-size window over the text: every run of `seq_length`
# characters is one input pattern and the character right after it is the
# target the network has to predict.
seq_length = 100
encoded_text = [char_to_int[symbol] for symbol in raw_text]
window_starts = range(n_chars - seq_length)
dataX = [encoded_text[pos:pos + seq_length] for pos in window_starts]
dataY = [encoded_text[pos + seq_length] for pos in window_starts]

n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

# Arrange the integer patterns as [samples, time steps, features] and scale
# the character indices by the vocabulary size.
X = numpy.array(dataX).reshape(n_patterns, seq_length, 1) / float(n_vocab)
# One-hot encode the target characters.
y = np_utils.to_categorical(dataY)

# Single LSTM layer -> dropout -> softmax over the output classes.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

# Write training logs to ./logs so they can be inspected in TensorBoard.
# Only arguments of the TF2 callback are passed: the legacy V1 options
# (batch_size, write_grads, embeddings_layer_names, embeddings_data) are
# ignored with a warning or rejected by current versions.
tb_callback = tf.keras.callbacks.TensorBoard(
    log_dir='./logs',
    histogram_freq=0,
    write_graph=True,
    write_images=False,
    update_freq='epoch',
    embeddings_freq=0,
    embeddings_metadata=None,
)

model.fit(X, y, epochs=10, batch_size=512, callbacks=[tb_callback])

Total Characters:  144522
Total Vocab:  48
Total Patterns:  144422
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1d84fc35400>

In [10]:
# Load the TensorBoard notebook extension and start TensorBoard pointed at
# the `logs` directory.
%load_ext tensorboard
%tensorboard --logdir logs

In [11]:
# Load the corpus; lower-casing merges upper- and lower-case letters into a
# single vocabulary entry.
filename = "wonderland.txt"
# A context manager closes the file handle deterministically instead of
# leaving it open until the garbage collector gets to it.
with open(filename) as text_file:
    raw_text = text_file.read()
raw_text = raw_text.lower()

# Map every distinct character to an integer index (sorted so the mapping is
# stable between runs on the same text).
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))

n_chars = len(raw_text)
n_vocab = len(chars)
print ("Total Characters: ", n_chars)
print ("Total Vocab: ", n_vocab)

# Slide a fixed-size window over the text: every run of `seq_length`
# characters is one input pattern and the character right after it is the
# target the network has to predict.
seq_length = 100
encoded_text = [char_to_int[symbol] for symbol in raw_text]
window_starts = range(n_chars - seq_length)
dataX = [encoded_text[pos:pos + seq_length] for pos in window_starts]
dataY = [encoded_text[pos + seq_length] for pos in window_starts]

n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

# Arrange the integer patterns as [samples, time steps, features] and scale
# the character indices by the vocabulary size.
X = numpy.array(dataX).reshape(n_patterns, seq_length, 1) / float(n_vocab)
# One-hot encode the target characters.
y = np_utils.to_categorical(dataY)

# Single LSTM layer -> dropout -> softmax over the output classes.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2])),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')


class CustomCallback(keras.callbacks.Callback):
    """Print a sample of generated text at the end of training epochs.

    Args:
        every_n_epochs: Generate a sample after every this-many epochs.
            The default of 1 samples after each epoch.
        sample_length: Number of characters to generate per sample.

    Raises:
        ValueError: If `every_n_epochs` is smaller than 1.

    The callback reads the notebook-level `dataX`, `chars` and `n_vocab`
    when an epoch ends, so they must be defined before training starts.
    """

    def __init__(self, every_n_epochs=1, sample_length=1000):
        super().__init__()
        if every_n_epochs < 1:
            raise ValueError("every_n_epochs must be at least 1")
        self.every_n_epochs = every_n_epochs
        self.sample_length = sample_length

    def on_epoch_end(self, epoch, logs=None):
        """Generate text from a random training pattern and print it."""
        # `epoch` is zero-based, hence the +1 when checking the interval.
        if (epoch + 1) % self.every_n_epochs != 0:
            return

        # Built here so the callback does not depend on a lookup table that
        # happens to be left over from another cell.
        int_to_char = dict(enumerate(chars))

        # `randint` excludes its upper bound, so `len(dataX)` is needed to
        # make the last pattern selectable.
        start = numpy.random.randint(0, len(dataX))
        # Copy the seed: appending to the list stored in `dataX` would leave
        # that training pattern one element too long.
        pattern = list(dataX[start])
        print ("Seed:")
        print ("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

        # Generate characters one at a time, feeding each prediction back in.
        for _ in range(self.sample_length):
            x = numpy.reshape(pattern, (1, len(pattern), 1))
            x = x / float(n_vocab)
            # Use the model this callback is attached to rather than a
            # notebook-level global.
            prediction = self.model.predict(x, verbose=0)
            index = int(numpy.argmax(prediction))
            sys.stdout.write(int_to_char[index])
            # Slide the window: append the new character, drop the oldest.
            pattern.append(index)
            pattern = pattern[1:]

        print ("\nDone.")
                
# Train for 20 epochs; CustomCallback prints a generated text sample from
# its on_epoch_end hook.
sample_callback = CustomCallback()
model.fit(X, y, epochs=20, batch_size=512, callbacks=[sample_callback])


Total Characters:  144522
Total Vocab:  48
Total Patterns:  144422
Epoch 1/20
" 
the table was a large one, but the three were all crowded together at
one corner of it: 'no room! n "
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 

<keras.callbacks.History at 0x1d83c46a400>