In [11]:
import keras
import numpy as np

path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = open(path).read().lower()
print ('Corpus length:', len(text))

Corpus length: 600893


In [12]:
maxlen = 60        # extract sequences of 60 chars
step = 3           # sample new sequence every 3 chars
sentences = []     # holds extracted sequences
next_chars = []    # holds the targets

for i in range (0, len(text) - maxlen, step):
    sentences.append(text[i:i+maxlen])
    next_chars.append(text[i+maxlen])
    
print ('Number of sequences:', len(sentences))

# list of unique chars in the corpus
chars = sorted(list(set(text)))                
print ('Unique characters:', len(chars))

# dictionary that maps unique characters to their index in the list 'chars'
char_indices = dict((char, chars.index(char)) for char in chars) 

print ('Vectorization...')
# One-hot encode the characters into binary arrays
# x dims = [sentences, sentence length, possible chars]
# y dims = [sentences, possible chars]
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1
    


Number of sequences: 200278
Unique characters: 57
Vectorization...


The following code just converts the x,y training data back into text, as a debug check

In [None]:
def bool_array_to_char(bool_array, chars):
    # chars is char index
    return (chars[np.argmax(bool_array.astype(int))])


def decode_line(x, chars):
    string = []
    for i in range (x.shape[0]):
#        string.append(chars[np.argmax(x[i].astype(int))])
        string.append(bool_array_to_char(x[i], chars))
    return (string)
                                 
def decode_example(x, y): 
    # decodes x, y from array type back into english
    print (''.join(decode_line(x, chars)), ":", # decode x
       bool_array_to_char(y, chars))   # decode y


In [None]:
for i in range (10):
    decode_example(x[i], y[i])

preface


supposing that truth is a woman--what then? is the : r
face


supposing that truth is a woman--what then? is there  : n
e


supposing that truth is a woman--what then? is there not :  

supposing that truth is a woman--what then? is there not gr : o
pposing that truth is a woman--what then? is there not groun : d
sing that truth is a woman--what then? is there not ground
f : o
g that truth is a woman--what then? is there not ground
for  : s
hat truth is a woman--what then? is there not ground
for sus : p
 truth is a woman--what then? is there not ground
for suspec : t
uth is a woman--what then? is there not ground
for suspectin : g


The network is a single LSTM followed by a Dense softmax classifier.

In [8]:
from keras import layers

model = keras.models.Sequential()
model.add(layers.LSTM(128, input_shape=(maxlen, len(chars))))
model.add(layers.Dense(len(chars), activation='softmax'))

# loss function is categorical_crossentropy as targets are one-hot encoded
optimizer = keras.optimizers.RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


## Training the language model and sampling from it
Given a trained model and a seed text snippet, can generate new text according to the following method:

1. Draw from the model a probability distribution for the next character, given the generated text available so far
2. Reweight the distribution to a certain temperature
3. Sample the next character at random according to the reweighted distribution
4. Add the new character at the end of the available text

In [9]:
# Function to sample next character given the model's 
# prediction (a probability distribution over 'chars'
# reweights the softmax probability dist using a 
# temperature parameter

def sample (preds, temperature = 1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

## Train the model, generate text
Use a range of temperatures after every epoch, to see how generated text evolves

In [10]:
import random
import sys


# uncomment to load saved model
#model = keras.models.load_model ('nietzsche_generate.h5')

model.optimizer.lr.assign(0.01) # reset the learning rate

# train the model for 60 epochs
for epoch in range (1, 60):
    print ('epoch', epoch)
    
    # fit the model for one iteration on the data
    model.fit (x, y, batch_size = 128, epochs = 1)
    #model.save('nietzsche_generate.h5')
    
    # select a text seed at random
    start_index = random.randint(0, len(text) - maxlen - 1)
    generated_text = text[start_index: start_index + maxlen]
    print ('--- Generating with seed: "' + generated_text + '"')
    
    # try a range of different sampling temperature
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print ('--------- temperature:', temperature)
        sys.stdout.write(generated_text)
        
        # generate 400 characters starting from the seed text
        for i in range (400):
            # one-hot encode the characters generated so far
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1
                
            # sample the next character
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]
            
            generated_text += next_char
            generated_text = generated_text[1:]
            
            sys.stdout.write(next_char)
        
        print ("\n")
    

epoch 1
Epoch 1/1
--- Generating with seed: "ing out a double meaning. but,
as in the case of books and l"
--------- temperature: 0.2
ing out a double meaning. but,
as in the case of books and l  oeoo  oo oooo ooo ooo  o e oeo o  o o    o  eoto oeo   o  oo  o  o ooo  ooooe   eoo o  ooooo  o ooo eo   o  o  oo   oo ooe  o ooo o   oo    oo  oo oe  o oooo  o  o o  ooooooo o   oo e ooo   e o ooooooooooo  eo eoeoootooo e ooo  oo t  o oo oo  e  o   o  ooo oeo ooooooo ooo oo  oo   oooooo oooo ooao o  e  eo oo ooo oooo oooooo oeooo  oo ooo  ooooo ooo  o   o  o o  ooo   oo o oo   o  o     e tooo o

--------- temperature: 0.5
oo  ooooo ooo  o   o  o o  ooo   oo o oo   o  o     e tooo o   oo  eaaooeeeo o ot o aeaoeo o ebo ttee  ehooo eoeot  oho  ooooeeeotho  o oooecoeo vh   oea  h ot ottew ooochoooo too o oo o oe  aoo oeoo eooot oo o   tr  co oot h  otooe ootoe ooto  o oebo o  o  oe  eo ou   eaee eoeoo  oaooeeooecoi  oe    oec"  eao eo oo  ceio o ho o ee  e   oehecho o oob teoooe  oreo ooee a eewo

  


on
n eiaenaeoetotaataoooea
aaaeeaoattia
taeenaatatt
eaeetnaaoaa
tootaooeaetoi
tteen
aeoitot eioanoiaataea
aata
a
aei
iaii
neo

aeaaeeeaotaaaoaoieeotaa
n
aoneeaioa
etoooeao

tien

eaeaianttitattotaoitoetotaoo
otoeaoi


atotoee
oa
ttoo
ataoatteeao tnotaaoeooiaiatattatoaaieetiaa
ata

enoatiateaatie a tteo
tont
ao aatneaaoaotanttta
e

aaaaati
otto
tni

a
eaao
t
iateaonoto
itete

--------- temperature: 0.5
eaaoaotanttta
e

aaaaati
otto
tni

a
eaao
t
iateaonoto
itete 
tea aeneintmve
et uyaattarnonha n
inieat eei
tniaententedeo
toaio

eateaiieo


t eo onaienoeioteotatar ae
taeoieoio
it

to
te
tnteoaaioetetyninooe
oiaiealtotao ntoeaaaemeatn
feea
ii
at ooeianeeiinaeio
oo
ot  iieaton
aatn
 oennotunocaeeaao 
eaoeaet
oaanntt
iteeeinoaae
taa
nrteo
ana
at 
oitnenae
ota
ai
 
onatietat
aniaeao
at
htiee
aneo
af  to t
aoitoi
nttaonnnolefoeoeneneci
tttneatt

itote

oieas


--------- temperature: 1.0
  to t
aoitoi
nttaonnnolefoeoeneneci
tttneatt

itote

oieas
aap 
imroountfh dtr atralpfeftidraorlt
nisn a


e nut h r e  rn ts t t i l   hrw nw  chh hhr g   rfirw tnti h ia nno gt na   tkitgoti rngr !tyoi! tr f t!ttoiny ig nwerbb ton" aadinnntdh s nob t i  q  g ,e ro trsrgah. btt.t   r n e et  i ,a ir !hui,nrriyehh r  fntgbtnot ay nlabu  i a  !t .rrgyque ttnnirclntgj aird ansreta op-ornirietas
togllnt kbnrll  t rtlqsranntci oncs i ttg hghhr tntrer  msnid ltry   harnrg   it vrgno,gg entii!ra of no  i   tnre!i   f  ergs rhrfnogelm ono derdn  r1  siouoh .g    lit  

epoch 10
Epoch 1/1
--- Generating with seed: " where do we not
find this cripple sitting nowadays! and yet"
--------- temperature: 0.2
 where do we not
find this cripple sitting nowadays! and yet    e e  tr r   e  reeer er er t r rtr re te   teretnrree etere   ete tt   r      reree  rerd  teetem rt   retr       r     e  t    t t      r e   er  e   e     e        e  ne   r er et  e rt e  ttte   ttt     rt e e   e eee ee eee r   e       t  tte  t  e    r  e re  ee   rr    rr  t  e e te e   t e ereee r  rerr  re  e       tmem  r   ettr

hfhhfhhfehhfffnhffhfhffnhhehfhhfhffhfhnifffihhffhnffhhffhhhhhiinnhfffith fhhfhhnfffnhfisifnshnihfithe fiftfffgfffneiffifhffnffhehh hheh hnfafhf fiihhfhfhfhnifhsnnhnhnsh hihhtiehfhi fhfnniifes fffnhfnfnfsnnfhhhnniffeshi ehhnn tnnh fhhhf  fi hin ifhihifhfnfshsseinis fhhhnfhhihhhfisnniinhsshhhhifhfnhfhnnifaffhshef fhhfhfh nnhsihfshhtefnfhhaht hifnfsffshhuihhusifihhifffffohfhfhfhf,nihn hiinfii nheffhhihnsahfhhsinhhfnnhheffiihhhhhfih fi iifsihhihfhhfifhhnffheif

--------- temperature: 1.0
ihnsahfhhsinhhfnnhheffiihhhhhfih fi iifsihhihfhhfifhhnffheifafushhu
ih hhnhnmyh
 flftfhshutsibhfsff sahfi ahene,,huiiisfiif hegtmihnhhnnha ntlniih fftt lsoifhpash,fiancn o
fhfhhhii hehhingfnihsnfnnnnhnsnhnef
iuiithhiyhylisshhnhh ho aohimi eofa-fue-fmff  sdhthhhtifhshinlefmnn ,tiffhhofhhtjnniihioesee  ihisoaig tnfoei eagfifrvhnn-socshiftnehiowthmwfnnhf,atf ehthofhnuntemafn fihfhoaiefvee-hhsnieniwshfsfef,iif,noiief ieenus-huhhnoeisfesni  iff h w i iju hfnuet

--------- temperature: 1.2
fsfef,iif,noiief ieenu

KeyboardInterrupt: 